Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

解决keras模式下,使用GPU训练时会爆显存的bug。 #1190

Merged
merged 3 commits into from
Oct 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added data/img001.bmp
Binary file not shown.
7 changes: 7 additions & 0 deletions src/TensorFlowNET.Core/APIs/tf.image.cs
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,13 @@ public Tensor decode_image(Tensor contents, int channels = 0, TF_DataType dtype
=> image_ops_impl.decode_image(contents, channels: channels, dtype: dtype,
name: name, expand_animations: expand_animations);

/// <summary>
/// Encodes an image tensor as PNG; thin wrapper over <see cref="image_ops_impl.encode_png"/>.
/// </summary>
/// <param name="contents">The image tensor to encode.</param>
/// <param name="name">Optional name for the operation.</param>
public Tensor encode_png(Tensor contents, string name = null)
{
    return image_ops_impl.encode_png(contents, name: name);
}

/// <summary>
/// Encodes an image tensor as JPEG; thin wrapper over <see cref="image_ops_impl.encode_jpeg"/>.
/// </summary>
/// <param name="contents">The image tensor to encode.</param>
/// <param name="name">Optional name for the operation.</param>
public Tensor encode_jpeg(Tensor contents, string name = null)
{
    return image_ops_impl.encode_jpeg(contents, name: name);
}


/// <summary>
/// Convenience function to check if the 'contents' encodes a JPEG image.
/// </summary>
Expand Down
7 changes: 7 additions & 0 deletions src/TensorFlowNET.Core/APIs/tf.io.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ limitations under the License.

using System.Collections.Generic;
using Tensorflow.IO;
using Tensorflow.Operations;

namespace Tensorflow
{
Expand Down Expand Up @@ -46,6 +47,12 @@ public Operation save_v2(Tensor prefix, string[] tensor_names,
public Tensor[] restore_v2(Tensor prefix, string[] tensor_names,
string[] shape_and_slices, TF_DataType[] dtypes, string name = null)
=> ops.restore_v2(prefix, tensor_names, shape_and_slices, dtypes, name: name);

/// <summary>
/// Writes <paramref name="contents"/> to the file at <paramref name="filename"/>
/// via the TensorFlow WriteFile op (string path overload).
/// </summary>
/// <param name="filename">Path of the file to write; converted to a TF_STRING tensor.</param>
/// <param name="contents">String-typed tensor holding the bytes to write.</param>
/// <param name="name">Optional name for the operation.</param>
// Fix: parameter was misspelled "conentes" in the original.
public Operation write_file(string filename, Tensor contents, string name = null)
    => write_file(Tensorflow.ops.convert_to_tensor(filename, TF_DataType.TF_STRING), contents, name);

/// <summary>
/// Writes <paramref name="contents"/> to the file named by the
/// <paramref name="filename"/> tensor via the TensorFlow WriteFile op.
/// </summary>
/// <param name="filename">Scalar string tensor holding the file path.</param>
/// <param name="contents">String-typed tensor holding the bytes to write.</param>
/// <param name="name">Optional name for the operation.</param>
// Fix: parameter was misspelled "conentes" in the original.
public Operation write_file(Tensor filename, Tensor contents, string name = null)
    => gen_ops.write_file(filename, contents, name);
}

public GFile gfile = new GFile();
Expand Down
5 changes: 5 additions & 0 deletions src/TensorFlowNET.Core/Eager/EagerRunner.RecordGradient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ BackwardFunction GetGradientFunction(string op_name,
Tensor[] op_outputs)
=> (out_grads, unneeded_gradients) =>
{
if(!ops.gradientFunctions.ContainsKey(op_name))
{
throw new Exception($"gradientFunctions not find op_name: {op_name}");
}

if (ops.gradientFunctions[op_name] == null)
return new Tensor[op_inputs.Length];

Expand Down
31 changes: 31 additions & 0 deletions src/TensorFlowNET.Core/Gradients/nn_grad.cs
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,37 @@ public static Tensor[] _Conv2DGrad(Operation op, Tensor[] grads)
};
}

/// <summary>
/// Gradient function for DepthwiseConv2dNative (the summary previously said
/// "Conv2D", which was copied from the block above).
/// </summary>
/// <param name="op">The forward DepthwiseConv2dNative operation.</param>
/// <param name="grads">Incoming gradients; grads[0] is the gradient w.r.t. the op's output.</param>
/// <returns>A two-element array: gradient w.r.t. the input, then w.r.t. the filter.</returns>
[RegisterGradient("DepthwiseConv2dNative")]
public static Tensor[] _DepthwiseConv2DGrad(Operation op, Tensor[] grads)
{
// Recover the forward op's attributes so the backprop ops are configured identically.
var dilations = op.get_attr_list<int>("dilations");
var strides = op.get_attr_list<int>("strides");
var padding = op.get_attr<string>("padding");
var explicit_paddings = op.get_attr_list<int>("explicit_paddings");
var data_format = op.get_attr<string>("data_format");
// Dynamic shapes of the input (inputs[0]) and the filter (inputs[1]) in one op.
var shape = gen_array_ops.shape_n(new Tensor[] { op.inputs[0], op.inputs[1] });

return new Tensor[]
{
gen_nn_ops.depthwise_conv2d_native_backprop_input(
shape[0], op.inputs[1], grads[0],
strides, padding, explicit_paddings,
dilations: dilations,
data_format: data_format),
// NOTE(review): explicit_paddings is passed positionally above but by name below —
// confirm both forms bind to the same parameter in gen_nn_ops.
gen_nn_ops.depthwise_conv2d_native_backprop_filter(op.inputs[0], shape[1], grads[0],
strides, padding,
dilations: dilations,
explicit_paddings: explicit_paddings,
data_format: data_format)
};
}

[RegisterGradient("FusedBatchNorm")]
public static Tensor[] _FusedBatchNormGrad(Operation op, Tensor[] grads)
=> _BaseFusedBatchNormGrad(op, 0, grads);
Expand Down
23 changes: 23 additions & 0 deletions src/TensorFlowNET.Core/Keras/Engine/IModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ ICallback fit(NDArray x, NDArray y,
List<ICallback> callbacks = null,
float validation_split = 0f,
ValidationDataPack validation_data = null,
int validation_step = 10,
bool shuffle = true,
Dictionary<int, float> class_weight = null,
NDArray sample_weight = null,
Expand All @@ -47,6 +48,20 @@ ICallback fit(IEnumerable<NDArray> x, NDArray y,
int workers = 1,
bool use_multiprocessing = false);

/// <summary>
/// Trains the model on a dataset, optionally evaluating against
/// <paramref name="validation_data"/> at the interval given by
/// <paramref name="validation_step"/>.
/// </summary>
public ICallback fit(IDatasetV2 dataset,
int batch_size = -1,
int epochs = 1,
int verbose = 1,
List<ICallback> callbacks = null,
IDatasetV2 validation_data = null,
int validation_step = 10, // how many steps between validation runs
bool shuffle = true,
Dictionary<int, float> class_weight = null,
int initial_epoch = 0,
int max_queue_size = 10,
int workers = 1,
bool use_multiprocessing = false);

void save(string filepath,
bool overwrite = true,
bool include_optimizer = true,
Expand Down Expand Up @@ -85,6 +100,14 @@ Tensors predict(Tensors x,
int workers = 1,
bool use_multiprocessing = false);

/// <summary>
/// Generates output predictions for the samples in <paramref name="dataset"/>.
/// </summary>
public Tensors predict(IDatasetV2 dataset,
int batch_size = -1,
int verbose = 0,
int steps = -1, // -1 presumably means "run until the dataset is exhausted" — TODO confirm
int max_queue_size = 10,
int workers = 1,
bool use_multiprocessing = false);

void summary(int line_length = -1, float[] positions = null);

IKerasConfig get_config();
Expand Down
19 changes: 19 additions & 0 deletions src/TensorFlowNET.Core/Keras/Layers/ILayersApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ public ILayer Conv1D(int filters,
string kernel_initializer = "glorot_uniform",
string bias_initializer = "zeros");

/// <summary>
/// Minimal Conv2D overload: filters, kernel size, strides and padding only;
/// all other settings fall back to the fuller overloads' conventions.
/// </summary>
public ILayer Conv2D(int filters,
Shape kernel_size = null,
Shape strides = null,
string padding = "valid"
);

public ILayer Conv2D(int filters,
Shape kernel_size = null,
Shape strides = null,
Expand Down Expand Up @@ -95,6 +101,19 @@ public ILayer Conv2D(int filters,
bool use_bias = true,
string kernel_initializer = "glorot_uniform",
string bias_initializer = "zeros");
/// <summary>
/// Creates a DepthwiseConv2D layer.
/// </summary>
// NOTE(review): use_bias defaults to false here, while the neighbouring Conv2D
// overloads default it to true — confirm the asymmetry is intentional.
public ILayer DepthwiseConv2D(Shape kernel_size = null,
Shape strides = null,
string padding = "valid",
string data_format = null,
Shape dilation_rate = null,
int groups = 1,
int depth_multiplier = 1, // filters per input channel
string activation = null,
bool use_bias = false,
string kernel_initializer = "glorot_uniform",
string bias_initializer = "zeros",
string depthwise_initializer = "glorot_uniform"
);

public ILayer Dense(int units);
public ILayer Dense(int units,
Expand Down
43 changes: 32 additions & 11 deletions src/TensorFlowNET.Core/Operations/image_ops_impl.cs
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,10 @@ internal static Operation[] _CheckAtLeast3DImage(Tensor image, bool require_stat
{
throw new ValueError("\'image\' must be fully defined.");
}
var dims = image_shape["-3:"];
var dims = new Shape(new[] {
image_shape.dims[image_shape.dims.Length - 3],
image_shape.dims[image_shape.dims.Length - 2],
image_shape.dims[image_shape.dims.Length - 1]});
foreach (var dim in dims.dims)
{
if (dim == 0)
Expand All @@ -112,16 +115,18 @@ internal static Operation[] _CheckAtLeast3DImage(Tensor image, bool require_stat
}

var image_shape_last_three_elements = new Shape(new[] {
image_shape.dims[image_shape.dims.Length - 1],
image_shape.dims[image_shape.dims.Length - 3],
image_shape.dims[image_shape.dims.Length - 2],
image_shape.dims[image_shape.dims.Length - 3]});
image_shape.dims[image_shape.dims.Length - 1]});
if (!image_shape_last_three_elements.IsFullyDefined)
{
Tensor image_shape_ = array_ops.shape(image);
var image_shape_return = tf.constant(new[] {
image_shape_.dims[image_shape.dims.Length - 1],
image_shape_.dims[image_shape.dims.Length - 2],
image_shape_.dims[image_shape.dims.Length - 3]});
var image_shape_return = tf.slice(image_shape_, new[] { Math.Max(image_shape.dims.Length - 3, 0) }, new[] { 3 });

//var image_shape_return = tf.constant(new[] {
// image_shape_.dims[image_shape_.dims.Length - 3],
// image_shape_.dims[image_shape_.dims.Length - 2],
// image_shape_.dims[image_shape_.dims.Length - 1]});

return new Operation[] {
check_ops.assert_positive(
Expand Down Expand Up @@ -209,10 +214,10 @@ internal static Tensor _random_flip(Tensor image, int flip_index, int seed, stri
}

public static Tensor flip_left_right(Tensor image)
=> _flip(image, 0, "flip_left_right");
=> _flip(image, 1, "flip_left_right");

public static Tensor flip_up_down(Tensor image)
=> _flip(image, 1, "flip_up_down");
=> _flip(image, 0, "flip_up_down");

internal static Tensor _flip(Tensor image, int flip_index, string scope_name)
{
Expand All @@ -223,11 +228,11 @@ internal static Tensor _flip(Tensor image, int flip_index, string scope_name)
Shape shape = image.shape;
if (shape.ndim == 3 || shape.ndim == Unknown)
{
return fix_image_flip_shape(image, gen_array_ops.reverse(image, ops.convert_to_tensor(new int[] { flip_index })));
return fix_image_flip_shape(image, gen_array_ops.reverse_v2(image, ops.convert_to_tensor(new int[] { flip_index })));
}
else if (shape.ndim == 4)
{
return gen_array_ops.reverse_v2(image, ops.convert_to_tensor(new[] { (flip_index + 1) % 2 }));
return gen_array_ops.reverse_v2(image, ops.convert_to_tensor(new[] { flip_index + 1 }));
}
else
{
Expand Down Expand Up @@ -2047,6 +2052,22 @@ internal static (Tensor, Tensor) non_max_suppression_padded_v1(Tensor boxes, Ten
});
}

/// <summary>
/// JPEG-encodes <paramref name="contents"/>, wrapping the underlying
/// <c>gen_ops.encode_jpeg</c> call in an "encode_jpeg" name scope.
/// </summary>
/// <param name="contents">The image tensor to encode.</param>
/// <param name="name">Optional name for the operation.</param>
public static Tensor encode_jpeg(Tensor contents, string name = null)
    => tf_with(ops.name_scope(name, "encode_jpeg"),
               _ => gen_ops.encode_jpeg(contents, name: name));

/// <summary>
/// PNG-encodes <paramref name="contents"/>, wrapping the underlying
/// <c>gen_ops.encode_png</c> call in an "encode_png" name scope.
/// </summary>
/// <param name="contents">The image tensor to encode.</param>
/// <param name="name">Optional name for the operation.</param>
public static Tensor encode_png(Tensor contents, string name = null)
    => tf_with(ops.name_scope(name, "encode_png"),
               _ => gen_ops.encode_png(contents, name: name));

public static Tensor is_jpeg(Tensor contents, string name = null)
{
return tf_with(ops.name_scope(name, "is_jpeg"), scope =>
Expand Down
5 changes: 4 additions & 1 deletion src/TensorFlowNET.Core/Tensors/tensor_util.cs
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,9 @@ public static TensorProto make_tensor_proto(object values, TF_DataType dtype = T
case sbyte val:
tensor_proto.IntVal.AddRange(new[] { (int)val });
break;
case byte val:
tensor_proto.IntVal.AddRange(new[] { (int)val });
break;
case int val:
tensor_proto.IntVal.AddRange(new[] { val });
break;
Expand All @@ -262,7 +265,7 @@ public static TensorProto make_tensor_proto(object values, TF_DataType dtype = T
tensor_proto.DoubleVal.AddRange(new[] { val });
break;
default:
throw new Exception("make_tensor_proto Not Implemented");
throw new Exception($"make_tensor_proto Not Implemented {values.GetType().Name}");
}
}

Expand Down
3 changes: 3 additions & 0 deletions src/TensorFlowNET.Keras/Engine/Model.Evaluate.cs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ Dictionary<string, float> evaluate(DataHandler data_handler, CallbackList callba
var end_step = step + data_handler.StepIncrement;
if (!is_val)
callbacks.on_test_batch_end(end_step, logs);
GC.Collect();
}
}
callbacks.on_test_end(logs);
Expand Down Expand Up @@ -167,7 +168,9 @@ Dictionary<string, float> test_step_multi_inputs_function(DataHandler data_handl
/// <summary>
/// Runs one evaluation step: expands 1-D targets, does a forward pass with
/// training: false, updates loss and metric state, and returns the current
/// value of every tracked metric keyed by name.
/// </summary>
/// <param name="data_handler">Supplies the data adapter used to expand the batch.</param>
/// <param name="x">Batch of inputs.</param>
/// <param name="y">Batch of targets.</param>
/// <returns>Metric name mapped to its current (float) value.</returns>
Dictionary<string, float> test_step(DataHandler data_handler, Tensors x, Tensors y)
{
(x, y) = data_handler.DataAdapter.Expand1d(x, y);
var y_pred = Apply(x, training: false);

// The returned loss tensor was previously bound to an unused local; the call
// is kept for its internal state updates, the binding is dropped.
compiled_loss.Call(y, y_pred);
compiled_metrics.update_state(y, y_pred);

// Use distinct lambda parameter names — the original reused `x`, shadowing
// the method's input-batch parameter.
return metrics.Select(m => (m.Name, m.result()))
              .ToDictionary(t => t.Item1, t => (float)t.Item2);
}
Expand Down
12 changes: 6 additions & 6 deletions src/TensorFlowNET.Keras/Engine/Model.Fit.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public ICallback fit(NDArray x, NDArray y,
List<ICallback> callbacks = null,
float validation_split = 0f,
ValidationDataPack validation_data = null,
int validation_step = 10,
bool shuffle = true,
Dictionary<int, float> class_weight = null,
NDArray sample_weight = null,
Expand Down Expand Up @@ -147,7 +148,7 @@ public ICallback fit(IEnumerable<NDArray> x, NDArray y,
}
}

public History fit(IDatasetV2 dataset,
public ICallback fit(IDatasetV2 dataset,
int batch_size = -1,
int epochs = 1,
int verbose = 1,
Expand All @@ -156,7 +157,6 @@ public History fit(IDatasetV2 dataset,
int validation_step = 10,
bool shuffle = true,
Dictionary<int, float> class_weight = null,
NDArray sample_weight = null,
int initial_epoch = 0,
int max_queue_size = 10,
int workers = 1,
Expand All @@ -170,7 +170,7 @@ public History fit(IDatasetV2 dataset,
InitialEpoch = initial_epoch,
Epochs = epochs,
Shuffle = shuffle,
SampleWeight = sample_weight,
ClassWeight = class_weight,
MaxQueueSize = max_queue_size,
Workers = workers,
UseMultiprocessing = use_multiprocessing,
Expand Down Expand Up @@ -218,6 +218,7 @@ History FitInternal(DataHandler data_handler, int epochs, int validation_step, i
var end_step = step + data_handler.StepIncrement;
End_step = end_step;
callbacks.on_train_batch_end(end_step, logs);
GC.Collect();
}

if (validation_data != null)
Expand All @@ -233,11 +234,10 @@ History FitInternal(DataHandler data_handler, int epochs, int validation_step, i
callbacks.on_train_batch_end(End_step, logs);
}

GC.Collect();

callbacks.on_epoch_end(epoch, logs);

GC.Collect();
GC.WaitForPendingFinalizers();
if (stop_training)
{
break;
Expand Down Expand Up @@ -282,6 +282,7 @@ History FitInternal(DataHandler data_handler, int epochs, int verbose, List<ICal
var end_step = step + data_handler.StepIncrement;
End_step = end_step;
callbacks.on_train_batch_end(end_step, logs);
GC.Collect();
}

if (validation_data != null)
Expand All @@ -301,7 +302,6 @@ History FitInternal(DataHandler data_handler, int epochs, int verbose, List<ICal
callbacks.on_epoch_end(epoch, logs);

GC.Collect();
GC.WaitForPendingFinalizers();
if (stop_training)
{
break;
Expand Down
2 changes: 1 addition & 1 deletion src/TensorFlowNET.Keras/Engine/Model.Predict.cs
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ Tensors PredictInternal(DataHandler data_handler, int verbose)
for (int i = 0; i < batch_outputs.Length; i++)
batch_outputs[i] = tf.concat(new Tensor[] { batch_outputs[i], tmp_batch_outputs[i] }, axis: 0);
}

var end_step = step + data_handler.StepIncrement;
callbacks.on_predict_batch_end(end_step, new Dictionary<string, Tensors> { { "outputs", batch_outputs } });
GC.Collect();
}
}

Expand Down
Loading
Loading