From dc1e9f71d4670e9156886ef2b398a9485a8dfaef Mon Sep 17 00:00:00 2001 From: asingh9530 Date: Sun, 16 Jul 2023 21:10:52 +0530 Subject: [PATCH 01/16] added keras utils -> keras-core utils --- keras_core/utils/dataset_utils.py | 383 ++++++++++++++++++++++++++++-- 1 file changed, 365 insertions(+), 18 deletions(-) diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py index e33a5788d..cb33921bc 100644 --- a/keras_core/utils/dataset_utils.py +++ b/keras_core/utils/dataset_utils.py @@ -1,3 +1,10 @@ +import tensorflow as tf +import torch +from torch.utils.data import Dataset as torchDataset +import numpy as np +import random +import time + from keras_core.api_export import keras_core_export from keras_core.utils.module_utils import tensorflow as tf @@ -6,21 +13,19 @@ def split_dataset( dataset, left_size=None, right_size=None, shuffle=False, seed=None ): - """Splits a dataset into a left half and a right half (e.g. train / test). + """Split a dataset into a left half and a right half (e.g. train / test). Args: dataset: A `tf.data.Dataset` object, or a list/tuple of arrays with the - same length. + same length. left_size: If float (in the range `[0, 1]`), it signifies - the fraction of the data to pack in the left dataset. - If integer, it signifies the number of samples to pack - in the left dataset. If `None`, it defaults to the complement - to `right_size`. + the fraction of the data to pack in the left dataset. If integer, it + signifies the number of samples to pack in the left dataset. If + `None`, it uses the complement to `right_size`. Defaults to `None`. right_size: If float (in the range `[0, 1]`), it signifies - the fraction of the data to pack in the right dataset. - If integer, it signifies the number of samples to pack - in the right dataset. If `None`, it defaults to the complement - to `left_size`. + the fraction of the data to pack in the right dataset. If integer, it + signifies the number of samples to pack in the right dataset. If + `None`, it uses the complement to `left_size`. Defaults to `None`. shuffle: Boolean, whether to shuffle the data before splitting it. seed: A random seed for shuffling. @@ -30,20 +35,362 @@ def split_dataset( Example: >>> data = np.random.random(size=(1000, 4)) - >>> left_ds, right_ds = split_dataset(data, left_size=0.8) + >>> left_ds, right_ds = tf.keras.utils.split_dataset(data, left_size=0.8) >>> int(left_ds.cardinality()) 800 >>> int(right_ds.cardinality()) 200 + """ - # TODO: long-term, port implementation. - return tf.keras.utils.split_dataset( - dataset, - left_size=left_size, - right_size=right_size, - shuffle=shuffle, - seed=seed, + + dataset_type_spec = _get_type_spec(dataset) + + if dataset_type_spec not in [torchDataset, tf.data.Dataset, list, tuple, np.ndarray]: + raise TypeError( + "The `dataset` argument must be either a `tf.data.Dataset` " + "object or a list/tuple of arrays. " + f"Received: dataset={dataset} of type {type(dataset)}" + ) + + if right_size is None and left_size is None: + raise ValueError( + "At least one of the `left_size` or `right_size` " + "must be specified. 
Received: left_size=None and " + "right_size=None" + ) + + dataset_as_list = _convert_dataset_to_list(dataset, dataset_type_spec) + + if shuffle: + if seed is None: + seed = random.randint(0, int(1e6)) + random.seed(seed) + random.shuffle(dataset_as_list) + + total_length = len(dataset_as_list) + + left_size, right_size = _rescale_dataset_split_sizes( + left_size, right_size, total_length + ) + left_split = list(dataset_as_list[:left_size]) + right_split = list(dataset_as_list[-right_size:]) + + left_split = _restore_dataset_from_list( + left_split, dataset_type_spec, dataset + ) + right_split = _restore_dataset_from_list( + right_split, dataset_type_spec, dataset + ) + + left_split = tf.data.Dataset.from_tensor_slices(left_split) + right_split = tf.data.Dataset.from_tensor_slices(right_split) + + # apply batching to the splits if the dataset is batched + if dataset_type_spec is tf.data.Dataset and is_batched(dataset): + batch_size = get_batch_size(dataset) + if batch_size is not None: + left_split = left_split.batch(batch_size) + right_split = right_split.batch(batch_size) + + left_split = left_split.prefetch(tf.data.AUTOTUNE) + right_split = right_split.prefetch(tf.data.AUTOTUNE) + + return left_split, right_split + + +def _convert_dataset_to_list( + dataset, + dataset_type_spec, + data_size_warning_flag=True, + ensure_shape_similarity=True, +): + """Convert `tf.data.Dataset` object or list/tuple of NumPy arrays to a list. + + Args: + dataset : A `tf.data.Dataset` object or a list/tuple of arrays. + dataset_type_spec : the type of the dataset + data_size_warning_flag (bool, optional): If set to True, a warning will + be issued if the dataset takes longer than 10 seconds to iterate. + Defaults to `True`. + ensure_shape_similarity (bool, optional): If set to True, the shape of + the first sample will be used to validate the shape of rest of the + samples. Defaults to `True`. + + Returns: + List: A list of tuples/NumPy arrays. + """ + dataset_iterator = _get_data_iterator_from_dataset( + dataset, dataset_type_spec ) + dataset_as_list = [] + + start_time = time.time() + for sample in _get_next_sample( + dataset_iterator, + ensure_shape_similarity, + data_size_warning_flag, + start_time, + ): + if dataset_type_spec in [tuple, list]: + # The try-except here is for NumPy 1.24 compatibility, see: + # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html + try: + arr = np.array(sample) + except ValueError: + arr = np.array(sample, dtype=object) + dataset_as_list.append(arr) + else: + dataset_as_list.append(sample) + + return dataset_as_list + +def _get_data_iterator_from_dataset(dataset, dataset_type_spec): + """Get the iterator from a dataset. + + Args: + dataset : A `tf.data.Dataset` object or a list/tuple of arrays. + dataset_type_spec : the type of the dataset + + Raises: + ValueError: + - If the dataset is empty. + - If the dataset is not a `tf.data.Dataset` object + or a list/tuple of arrays. + - If the dataset is a list/tuple of arrays and the + length of the list/tuple is not equal to the number + + Returns: + iterator: An `iterator` object. + """ + if dataset_type_spec == list: + if len(dataset) == 0: + raise ValueError( + "Received an empty list dataset. " + "Please provide a non-empty list of arrays." + ) + + if _get_type_spec(dataset[0]) is np.ndarray: + expected_shape = dataset[0].shape + for i, element in enumerate(dataset): + if np.array(element).shape[0] != expected_shape[0]: + raise ValueError( + "Received a list of NumPy arrays with different " + f"lengths. 
Mismatch found at index {i}, "
+                        f"Expected shape={expected_shape} "
+                        f"Received shape={np.array(element).shape}. "
+                        "Please provide a list of NumPy arrays with "
+                        "the same length."
+                    )
+        else:
+            raise ValueError(
+                "Expected a list of `numpy.ndarray` objects, "
+                f"Received: {type(dataset[0])}"
+            )
+
+        return iter(zip(*dataset))
+    elif dataset_type_spec == tuple:
+        if len(dataset) == 0:
+            raise ValueError(
+                "Received an empty tuple dataset. "
+                "Please provide a non-empty tuple of arrays."
+            )
+
+        if _get_type_spec(dataset[0]) is np.ndarray:
+            expected_shape = dataset[0].shape
+            for i, element in enumerate(dataset):
+                if np.array(element).shape[0] != expected_shape[0]:
+                    raise ValueError(
+                        "Received a tuple of NumPy arrays with different "
+                        f"lengths. Mismatch found at index {i}, "
+                        f"Expected shape={expected_shape} "
+                        f"Received shape={np.array(element).shape}. "
+                        "Please provide a tuple of NumPy arrays with "
+                        "the same length."
+                    )
+        else:
+            raise ValueError(
+                "Expected a tuple of `numpy.ndarray` objects, "
+                f"Received: {type(dataset[0])}"
+            )
+
+        return iter(zip(*dataset))
+    elif dataset_type_spec == tf.data.Dataset:
+        if is_batched(dataset):
+            dataset = dataset.unbatch()
+        return iter(dataset)
+    elif dataset_type_spec == np.ndarray:
+        return iter(dataset)
+
+def _rescale_dataset_split_sizes(left_size, right_size, total_length):
+    """Rescale the dataset split sizes.
+
+    We want to ensure that the sum of
+    the split sizes is equal to the total length of the dataset.
+
+    Args:
+        left_size : The size of the left dataset split.
+        right_size : The size of the right dataset split.
+        total_length : The total length of the dataset.
+
+    Raises:
+        TypeError: - If `left_size` or `right_size` is not an integer or float.
+        ValueError: - If `left_size` or `right_size` is negative or greater
+                      than 1 or greater than `total_length`.
+
+    Returns:
+        tuple: A tuple of rescaled left_size and right_size
+    """
+    left_size_type = type(left_size)
+    right_size_type = type(right_size)
+
+    # check both left_size and right_size are integers or floats
+    if (left_size is not None and left_size_type not in [int, float]) and (
+        right_size is not None and right_size_type not in [int, float]
+    ):
+        raise TypeError(
+            "Invalid `left_size` and `right_size` Types. Expected: "
+            "integer or float or None, Received: type(left_size)="
+            f"{left_size_type} and type(right_size)={right_size_type}"
+        )
+
+    # check left_size is an integer or float
+    if left_size is not None and left_size_type not in [int, float]:
+        raise TypeError(
+            "Invalid `left_size` Type. Expected: int or float or None, "
+            f"Received: type(left_size)={left_size_type}."
+        )
+
+    # check right_size is an integer or float
+    if right_size is not None and right_size_type not in [int, float]:
+        raise TypeError(
+            "Invalid `right_size` Type. "
+            "Expected: int or float or None, "
+            f"Received: type(right_size)={right_size_type}."
+        )
+
+    # check left_size and right_size are non-zero
+    if left_size == 0 and right_size == 0:
+        raise ValueError(
+            "Both `left_size` and `right_size` are zero. "
+            "At least one of the split sizes must be non-zero."
+        )
+
+    # check left_size is non-negative and less than 1 and less than total_length
+    if (
+        left_size_type == int
+        and (left_size <= 0 or left_size >= total_length)
+        or left_size_type == float
+        and (left_size <= 0 or left_size >= 1)
+    ):
+        raise ValueError(
+            "`left_size` should be either a positive integer "
+            f"smaller than {total_length}, or a float "
+            "within the range `[0, 1]`. Received: left_size="
+            f"{left_size}"
+        )
+
+    # check right_size is non-negative and less than 1 and less than
+    # total_length
+    if (
+        right_size_type == int
+        and (right_size <= 0 or right_size >= total_length)
+        or right_size_type == float
+        and (right_size <= 0 or right_size >= 1)
+    ):
+        raise ValueError(
+            "`right_size` should be either a positive integer "
+            f"smaller than {total_length}, or a float "
+            "within the range `[0, 1]`. Received: right_size="
+            f"{right_size}"
+        )
+
+    # check sum of left_size and right_size is less than or equal to
+    # total_length
+    if (
+        right_size_type == left_size_type == float
+        and right_size + left_size > 1
+    ):
+        raise ValueError(
+            "The sum of `left_size` and `right_size` is greater "
+            "than 1. It must be less than or equal to 1."
+        )
+
+    if left_size_type == float:
+        left_size = round(left_size * total_length)
+    elif left_size_type == int:
+        left_size = float(left_size)
+
+    if right_size_type == float:
+        right_size = round(right_size * total_length)
+    elif right_size_type == int:
+        right_size = float(right_size)
+
+    if left_size is None:
+        left_size = total_length - right_size
+    elif right_size is None:
+        right_size = total_length - left_size
+
+    if left_size + right_size > total_length:
+        raise ValueError(
+            "The sum of `left_size` and `right_size` should "
+            f"be smaller than the total length ({total_length}). "
+            f"Received: left_size + right_size = {left_size+right_size} "
+            f"and total_length = {total_length}"
+        )
+
+    for split, side in [(left_size, "left"), (right_size, "right")]:
+        if split == 0:
+            raise ValueError(
+                f"With `dataset` of length={total_length}, `left_size`="
+                f"{left_size} and `right_size`={right_size}. "
+                f"Resulting {side} side dataset split will be empty. "
+                "Adjust any of the aforementioned parameters."
+            )
+
+    left_size, right_size = int(left_size), int(right_size)
+    return left_size, right_size
+
+def _restore_dataset_from_list(
+    dataset_as_list, dataset_type_spec, original_dataset
+):
+    """Restore the dataset from the list of arrays."""
+    if dataset_type_spec in [tuple, list]:
+        return tuple(np.array(sample) for sample in zip(*dataset_as_list))
+    elif dataset_type_spec == tf.data.Dataset:
+        if isinstance(original_dataset.element_spec, dict):
+            restored_dataset = {}
+            for d in dataset_as_list:
+                for k, v in d.items():
+                    if k not in restored_dataset:
+                        restored_dataset[k] = [v]
+                    else:
+                        restored_dataset[k].append(v)
+            return restored_dataset
+        else:
+            return tuple(np.array(sample) for sample in zip(*dataset_as_list))
+    return dataset_as_list
+
+def is_batched(dataset):
+    """ "Check if the `tf.data.Dataset` is batched."""
+    return hasattr(dataset, "_batch_size")
+
+def _get_type_spec(dataset):
+    """Get the type spec of the dataset."""
+    if isinstance(dataset, tuple):
+        return tuple
+    elif isinstance(dataset, list):
+        return list
+    elif isinstance(dataset, np.ndarray):
+        return np.ndarray
+    elif isinstance(dataset, dict):
+        return dict
+    elif isinstance(dataset, tf.data.Dataset):
+        return tf.data.Dataset
+    elif isinstance(dataset, torchDataset):
+        return torchDataset
+    else:
+        return None
+
 
 @keras_core_export(

From feb40ca4d2366581c96265ee90c80ec3442309ed Mon Sep 17 00:00:00 2001
From: asingh9530
Date: Sun, 16 Jul 2023 21:58:01 +0530
Subject: [PATCH 02/16] added batched_dataset function

---
 keras_core/utils/dataset_utils.py | 79 +++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py
index cb33921bc..e18c2f715 100644
--- 
a/keras_core/utils/dataset_utils.py
+++ b/keras_core/utils/dataset_utils.py
@@ -2,6 +2,7 @@
 import torch
 from torch.utils.data import Dataset as torchDataset
 import numpy as np
+import warnings
 import random
 import time
 
@@ -220,6 +221,77 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec):
         return iter(dataset)
     elif dataset_type_spec == np.ndarray:
         return iter(dataset)
+
+def _get_next_sample(
+    dataset_iterator,
+    ensure_shape_similarity,
+    data_size_warning_flag,
+    start_time,
+):
+    """Yield data samples from the `dataset_iterator`.
+
+    Args:
+        dataset_iterator : An `iterator` object.
+        ensure_shape_similarity (bool, optional): If set to True, the shape of
+            the first sample will be used to validate the shape of rest of the
+            samples. Defaults to `True`.
+        data_size_warning_flag (bool, optional): If set to True, a warning will
+            be issued if the dataset takes longer than 10 seconds to iterate.
+            Defaults to `True`.
+        start_time (float): The start time of the dataset iteration. This is
+            used only if `data_size_warning_flag` is set to `True`.
+
+    Raises:
+        ValueError: - If the dataset is empty.
+                    - If `ensure_shape_similarity` is set to True and the
+                      shape of the first sample is not equal to the shape of
+                      at least one of the rest of the samples.
+
+    Yields:
+        data_sample: A tuple/list of numpy arrays.
+    """
+    try:
+        dataset_iterator = iter(dataset_iterator)
+        first_sample = next(dataset_iterator)
+        if isinstance(first_sample, (tf.Tensor, np.ndarray)):
+            first_sample_shape = np.array(first_sample).shape
+        else:
+            first_sample_shape = None
+            ensure_shape_similarity = False
+        yield first_sample
+    except StopIteration:
+        raise ValueError(
+            "Received an empty Dataset. `dataset` must "
+            "be a non-empty list/tuple of `numpy.ndarray` objects "
+            "or `tf.data.Dataset` objects."
+        )
+
+    for i, sample in enumerate(dataset_iterator):
+        if ensure_shape_similarity:
+            if first_sample_shape != np.array(sample).shape:
+                raise ValueError(
+                    "All `dataset` samples must have the same shape. "
+                    f"Expected shape: {np.array(first_sample).shape} "
+                    f"Received shape: {np.array(sample).shape} at index "
+                    f"{i}."
+                )
+        if data_size_warning_flag:
+            if i % 10 == 0:
+                cur_time = time.time()
+                # warns user if the dataset is too large to iterate within 10s
+                if int(cur_time - start_time) > 10 and data_size_warning_flag:
+                    warnings.warn(
+                        "The dataset is taking longer than 10 seconds to "
+                        "iterate over. This may be due to the size of the "
+                        "dataset. Keep in mind that the `split_dataset` "
+                        "utility is only for small in-memory datasets "
+                        "(e.g. < 10,000 samples).",
+                        category=ResourceWarning,
+                        source="split_dataset",
+                    )
+                    data_size_warning_flag = False
+        yield sample
+
 def _rescale_dataset_split_sizes(left_size, right_size, total_length):
     """Rescale the dataset split sizes.
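
Reviewer note: a minimal sketch of how `_convert_dataset_to_list` is expected to
drive the `_get_next_sample` generator added above. Calling the private helper
directly, and the toy samples, are assumptions for illustration only, not part
of the patch:

    import time

    import numpy as np

    from keras_core.utils.dataset_utils import _get_next_sample

    samples = [(np.zeros(4), np.ones(1)) for _ in range(5)]
    # The first sample is a tuple rather than a tensor/ndarray, so the
    # generator records no reference shape and disables shape checking.
    collected = list(
        _get_next_sample(
            iter(samples),
            ensure_shape_similarity=True,
            data_size_warning_flag=True,
            start_time=time.time(),
        )
    )
    assert len(collected) == 5
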
@@ -374,6 +446,13 @@ def is_batched(dataset): """ "Check if the `tf.data.Dataset` is batched.""" return hasattr(dataset, "_batch_size") +def get_batch_size(dataset): + """Get the batch size of the dataset.""" + if is_batched(dataset): + return dataset._batch_size + else: + return None + def _get_type_spec(dataset): """Get the type spec of the dataset.""" if isinstance(dataset, tuple): From 3f874ade97470e6507aed614d7e052f8e00ae1e7 Mon Sep 17 00:00:00 2001 From: asingh9530 Date: Wed, 19 Jul 2023 22:41:29 +0530 Subject: [PATCH 03/16] added torch decoupling --- keras_core/utils/dataset_utils.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py index e18c2f715..9325becda 100644 --- a/keras_core/utils/dataset_utils.py +++ b/keras_core/utils/dataset_utils.py @@ -1,6 +1,7 @@ import tensorflow as tf import torch from torch.utils.data import Dataset as torchDataset +from torch.utils.data import DataLoader as torchDataLoader import numpy as np import warnings import random @@ -17,7 +18,7 @@ def split_dataset( """Split a dataset into a left half and a right half (e.g. train / test). Args: - dataset: A `tf.data.Dataset` object, or a list/tuple of arrays with the + dataset: A `tf.data.Dataset, torchDataset` object, or a list/tuple of arrays with the same length. left_size: If float (in the range `[0, 1]`), it signifies the fraction of the data to pack in the left dataset. If integer, it @@ -83,6 +84,7 @@ def split_dataset( right_split, dataset_type_spec, dataset ) + return left_split, right_split left_split = tf.data.Dataset.from_tensor_slices(left_split) right_split = tf.data.Dataset.from_tensor_slices(right_split) @@ -105,7 +107,7 @@ def _convert_dataset_to_list( data_size_warning_flag=True, ensure_shape_similarity=True, ): - """Convert `tf.data.Dataset` object or list/tuple of NumPy arrays to a list. + """Convert `tf.data.Dataset torchDataset` object or list/tuple of NumPy arrays to a list. Args: dataset : A `tf.data.Dataset` object or a list/tuple of arrays. @@ -149,7 +151,7 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec): """Get the iterator from a dataset. Args: - dataset : A `tf.data.Dataset` object or a list/tuple of arrays. + dataset : A `tf.data.Dataset or torchDataset` object or a list/tuple of arrays. 
dataset_type_spec : the type of the dataset Raises: @@ -219,6 +221,11 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec): if is_batched(dataset): dataset = dataset.unbatch() return iter(dataset) + + # torch dataset iterator might be required to change + elif dataset_type_spec == torchDataset: + return torchDataLoader(dataset) + elif dataset_type_spec == np.ndarray: return iter(dataset) @@ -253,7 +260,7 @@ def _get_next_sample( try: dataset_iterator = iter(dataset_iterator) first_sample = next(dataset_iterator) - if isinstance(first_sample, (tf.Tensor, np.ndarray)): + if isinstance(first_sample, (tf.Tensor, np.ndarray)) or torch.is_tensor(first_sample): first_sample_shape = np.array(first_sample).shape else: first_sample_shape = None From c708b3e99cfada300857abae61d8dac3499a5d50 Mon Sep 17 00:00:00 2001 From: asingh9530 Date: Wed, 19 Jul 2023 22:42:34 +0530 Subject: [PATCH 04/16] added torch decoupling --- keras_core/utils/dataset_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py index 9325becda..183a5ec13 100644 --- a/keras_core/utils/dataset_utils.py +++ b/keras_core/utils/dataset_utils.py @@ -77,6 +77,7 @@ def split_dataset( left_split = list(dataset_as_list[:left_size]) right_split = list(dataset_as_list[-right_size:]) + return left_split, right_split left_split = _restore_dataset_from_list( left_split, dataset_type_spec, dataset ) @@ -84,7 +85,7 @@ def split_dataset( right_split, dataset_type_spec, dataset ) - return left_split, right_split + left_split = tf.data.Dataset.from_tensor_slices(left_split) right_split = tf.data.Dataset.from_tensor_slices(right_split) From 26ad5be9c0f46caa8e59b244fb1da6fa8a3381f5 Mon Sep 17 00:00:00 2001 From: asingh9530 Date: Thu, 20 Jul 2023 00:21:40 +0530 Subject: [PATCH 05/16] torch return logic --- keras_core/utils/dataset_utils.py | 41 ++++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py index 183a5ec13..6379bf413 100644 --- a/keras_core/utils/dataset_utils.py +++ b/keras_core/utils/dataset_utils.py @@ -18,7 +18,7 @@ def split_dataset( """Split a dataset into a left half and a right half (e.g. train / test). Args: - dataset: A `tf.data.Dataset, torchDataset` object, or a list/tuple of arrays with the + dataset: A `tf.data.Datasetor torchDataset` object, or a list/tuple of arrays with the same length. left_size: If float (in the range `[0, 1]`), it signifies the fraction of the data to pack in the left dataset. If integer, it @@ -32,7 +32,7 @@ def split_dataset( seed: A random seed for shuffling. Returns: - A tuple of two `tf.data.Dataset` objects: the left and right splits. + A tuple of two `tf.data.Dataset or torchDataset` objects: the left and right splits. 
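
Reviewer note: the torch return path introduced below rebuilds each half with
`dataset.__class__(*left_split)`, which quietly requires the user's `Dataset`
subclass to accept one positional constructor argument per field, in the order
`__getitem__` yields them. A sketch of a compatible class (the same shape as
the `Dataset` helper the unit tests later in this series define; the name
`XYDataset` is illustrative):

    from torch.utils.data import Dataset as torchDataset

    class XYDataset(torchDataset):
        # One constructor argument per field, in the same order that
        # __getitem__ yields them, so that __class__(*fields) round-trips.
        def __init__(self, x, y):
            self.x = x
            self.y = y

        def __len__(self):
            return len(self.x)

        def __getitem__(self, index):
            return self.x[index], self.y[index]
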
Example: @@ -77,29 +77,33 @@ def split_dataset( left_split = list(dataset_as_list[:left_size]) right_split = list(dataset_as_list[-right_size:]) - return left_split, right_split + left_split = _restore_dataset_from_list( left_split, dataset_type_spec, dataset ) right_split = _restore_dataset_from_list( right_split, dataset_type_spec, dataset ) - - left_split = tf.data.Dataset.from_tensor_slices(left_split) - right_split = tf.data.Dataset.from_tensor_slices(right_split) + if dataset_type_spec != torchDataset: + + left_split = tf.data.Dataset.from_tensor_slices(left_split) + right_split = tf.data.Dataset.from_tensor_slices(right_split) - # apply batching to the splits if the dataset is batched - if dataset_type_spec is tf.data.Dataset and is_batched(dataset): - batch_size = get_batch_size(dataset) - if batch_size is not None: - left_split = left_split.batch(batch_size) - right_split = right_split.batch(batch_size) + # apply batching to the splits if the dataset is batched + if dataset_type_spec is tf.data.Dataset and is_batched(dataset): + batch_size = get_batch_size(dataset) + if batch_size is not None: + left_split = left_split.batch(batch_size) + right_split = right_split.batch(batch_size) - left_split = left_split.prefetch(tf.data.AUTOTUNE) - right_split = right_split.prefetch(tf.data.AUTOTUNE) + left_split = left_split.prefetch(tf.data.AUTOTUNE) + right_split = right_split.prefetch(tf.data.AUTOTUNE) + return left_split, right_split + + elif dataset_type_spec == torchDataset: + return dataset.__class__(*left_split), dataset.__class__(*right_split) - return left_split, right_split def _convert_dataset_to_list( @@ -108,10 +112,10 @@ def _convert_dataset_to_list( data_size_warning_flag=True, ensure_shape_similarity=True, ): - """Convert `tf.data.Dataset torchDataset` object or list/tuple of NumPy arrays to a list. + """Convert `tf.data.Dataset or torchDataset` object or list/tuple of NumPy arrays to a list. Args: - dataset : A `tf.data.Dataset` object or a list/tuple of arrays. + dataset : A `tf.data.Dataset or torchDataset` object or a list/tuple of arrays. dataset_type_spec : the type of the dataset data_size_warning_flag (bool, optional): If set to True, a warning will be issued if the dataset takes longer than 10 seconds to iterate. 
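
Reviewer note: for batched `tf.data` input, the branch above splits on
unbatched samples and then re-applies the recovered batch size to both halves.
A sketch of the observable behaviour, assuming `split_dataset` from this
module (the 100-sample array and batch size of 10 are illustrative):

    import numpy as np
    import tensorflow as tf

    from keras_core.utils.dataset_utils import split_dataset

    data = np.random.random((100, 4)).astype("float32")
    batched = tf.data.Dataset.from_tensor_slices(data).batch(10)
    left, right = split_dataset(batched, left_size=0.2)
    # 20 and 80 samples, re-batched in groups of 10 -> 2 and 8 batches.
    assert int(left.cardinality()) == 2
    assert int(right.cardinality()) == 8
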
@@ -448,6 +452,9 @@ def _restore_dataset_from_list( return restored_dataset else: return tuple(np.array(sample) for sample in zip(*dataset_as_list)) + + elif dataset_type_spec == torchDataset: + return tuple(np.array(sample, dtype=object) for sample in zip(*dataset_as_list)) return dataset_as_list def is_batched(dataset): From f2e8afd92c3fc5eafbe3261d3a27215450deef87 Mon Sep 17 00:00:00 2001 From: asingh9530 Date: Thu, 20 Jul 2023 00:28:43 +0530 Subject: [PATCH 06/16] removed torchLoader dependency --- keras_core/utils/dataset_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py index 6379bf413..3718e5520 100644 --- a/keras_core/utils/dataset_utils.py +++ b/keras_core/utils/dataset_utils.py @@ -1,7 +1,6 @@ import tensorflow as tf import torch from torch.utils.data import Dataset as torchDataset -from torch.utils.data import DataLoader as torchDataLoader import numpy as np import warnings import random @@ -229,7 +228,7 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec): # torch dataset iterator might be required to change elif dataset_type_spec == torchDataset: - return torchDataLoader(dataset) + return iter(dataset) elif dataset_type_spec == np.ndarray: return iter(dataset) @@ -454,7 +453,7 @@ def _restore_dataset_from_list( return tuple(np.array(sample) for sample in zip(*dataset_as_list)) elif dataset_type_spec == torchDataset: - return tuple(np.array(sample, dtype=object) for sample in zip(*dataset_as_list)) + return tuple(np.array(sample) for sample in zip(*dataset_as_list)) return dataset_as_list def is_batched(dataset): From aed0729dd179cc0c2c926afa36136cc9ed11c3cf Mon Sep 17 00:00:00 2001 From: asingh9530 Date: Thu, 20 Jul 2023 18:57:43 +0530 Subject: [PATCH 07/16] added unittest --- keras_core/utils/dataset_utils.py | 10 +- keras_core/utils/dataset_utils_test.py | 151 +++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 4 deletions(-) create mode 100644 keras_core/utils/dataset_utils_test.py diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py index 3718e5520..9787947fb 100644 --- a/keras_core/utils/dataset_utils.py +++ b/keras_core/utils/dataset_utils.py @@ -1,11 +1,8 @@ -import tensorflow as tf import torch -from torch.utils.data import Dataset as torchDataset import numpy as np import warnings import random import time - from keras_core.api_export import keras_core_export from keras_core.utils.module_utils import tensorflow as tf @@ -17,7 +14,7 @@ def split_dataset( """Split a dataset into a left half and a right half (e.g. train / test). Args: - dataset: A `tf.data.Datasetor torchDataset` object, or a list/tuple of arrays with the + dataset: A `tf.data.Dataset or torchDataset` object, or a list/tuple of arrays with the same length. left_size: If float (in the range `[0, 1]`), it signifies the fraction of the data to pack in the left dataset. If integer, it @@ -43,6 +40,8 @@ def split_dataset( 200 """ + from torch.utils.data import Dataset as torchDataset + dataset_type_spec = _get_type_spec(dataset) @@ -169,6 +168,7 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec): Returns: iterator: An `iterator` object. 
""" + from torch.utils.data import Dataset as torchDataset if dataset_type_spec == list: if len(dataset) == 0: raise ValueError( @@ -436,6 +436,7 @@ def _rescale_dataset_split_sizes(left_size, right_size, total_length): def _restore_dataset_from_list( dataset_as_list, dataset_type_spec, original_dataset ): + from torch.utils.data import Dataset as torchDataset """Restore the dataset from the list of arrays.""" if dataset_type_spec in [tuple, list]: return tuple(np.array(sample) for sample in zip(*dataset_as_list)) @@ -468,6 +469,7 @@ def get_batch_size(dataset): return None def _get_type_spec(dataset): + from torch.utils.data import Dataset as torchDataset """Get the type spec of the dataset.""" if isinstance(dataset, tuple): return tuple diff --git a/keras_core/utils/dataset_utils_test.py b/keras_core/utils/dataset_utils_test.py new file mode 100644 index 000000000..d429c7869 --- /dev/null +++ b/keras_core/utils/dataset_utils_test.py @@ -0,0 +1,151 @@ +from keras_core.testing import test_case +from keras_core.utils import naming +from keras_core.utils.module_utils import tensorflow as tf +from keras_core.utils.dataset_utils import split_dataset +import numpy as np + +class DatasetUtilsTest(test_case.TestCase): + def test_split_dataset_list(self): + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + dataset = [np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred))] + dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + dataset = [np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred))] + dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + dataset = [np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred))] + dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + dataset = [np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred))] + dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols)) + + def test_split_dataset_tuple(self): + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + dataset = (np.random.sample((n_sample,n_cols)), 
np.random.sample((n_sample,n_pred))) + dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + dataset = (np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred))) + dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + dataset = (np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred))) + dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + dataset = (np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred))) + dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols)) + + def test_split_dataset_tensorflow(self): + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + features, labels = (np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred))) + tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels)) + dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + features, labels = (np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred))) + tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels)) + dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + features, labels = (np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred))) + tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels)) + dataset_left, dataset_right = split_dataset(tf_dataset, 
left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + features, labels = (np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred))) + tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels)) + dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols)) + + def test_split_dataset_torch(self): + + # sample torch dataset class + from torch.utils.data import Dataset as torchDataset + class Dataset(torchDataset): + 'Characterizes a dataset for PyTorch' + def __init__(self, x, y): + 'Initialization' + self.x = x + self.y = y + + def __len__(self): + 'Denotes the total number of samples' + return len(self.x) + + def __getitem__(self, index): + 'Generates one sample of data' + return self.x[index], self.y[index] + + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + features, labels = (np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred))) + torch_dataset = Dataset(features,labels) + dataset_left, dataset_right = split_dataset(torch_dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols,)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + features, labels = (np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred))) + torch_dataset = Dataset(features,labels) + dataset_left, dataset_right = split_dataset(torch_dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + features, labels = (np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred))) + torch_dataset = Dataset(features,labels) + dataset_left, dataset_right = split_dataset(torch_dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)) + + n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 + features, labels = (np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred))) + torch_dataset = Dataset(features,labels) + dataset_left, dataset_right = split_dataset(torch_dataset, left_size=left_size, right_size=right_size) + self.assertEqual(len([sample for sample in dataset_left]), 
int(n_sample * left_size)) + self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols)) + + From 6eec8cadc9672cbd56b1c9bc15743defd2fe0b39 Mon Sep 17 00:00:00 2001 From: asingh9530 Date: Thu, 20 Jul 2023 19:31:04 +0530 Subject: [PATCH 08/16] added unittest using cardinality --- keras_core/utils/dataset_utils_test.py | 48 +++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/keras_core/utils/dataset_utils_test.py b/keras_core/utils/dataset_utils_test.py index d429c7869..64f01dd3d 100644 --- a/keras_core/utils/dataset_utils_test.py +++ b/keras_core/utils/dataset_utils_test.py @@ -9,58 +9,58 @@ def test_split_dataset_list(self): n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 dataset = [np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred))] dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols)) n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 dataset = [np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred))] dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols)) n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 dataset = [np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred))] dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)) n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 dataset = [np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred))] dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols)) def test_split_dataset_tuple(self): 
n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 dataset = (np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred))) dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols)) n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 dataset = (np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred))) dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols)) n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 dataset = (np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred))) dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)) n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 dataset = (np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred))) dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols)) def test_split_dataset_tensorflow(self): @@ -68,32 +68,32 @@ def test_split_dataset_tensorflow(self): features, labels = (np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred))) tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels)) dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols)) n_sample, n_cols, n_pred, left_size, right_size = 100, 
2, 1, 0.2, 0.8 features, labels = (np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred))) tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels)) dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols)) n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 features, labels = (np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred))) tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels)) dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)) n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8 features, labels = (np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred))) tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels)) dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size) - self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size)) - self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size)) + self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size)) + self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size)) self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols)) def test_split_dataset_torch(self): From c3818a54b21ce30505b9b5fddf9a5e856c56a068 Mon Sep 17 00:00:00 2001 From: asingh9530 Date: Fri, 21 Jul 2023 19:36:26 +0530 Subject: [PATCH 09/16] reformatted --- keras_core/utils/dataset_utils.py | 38 ++- keras_core/utils/dataset_utils_test.py | 375 ++++++++++++++++++------- 2 files changed, 304 insertions(+), 109 deletions(-) diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py index 9787947fb..f66ad5428 100644 --- a/keras_core/utils/dataset_utils.py +++ b/keras_core/utils/dataset_utils.py @@ -1,4 +1,4 @@ -import torch +import torch import numpy as np import warnings import random @@ -42,10 +42,15 @@ def split_dataset( """ from torch.utils.data import Dataset as torchDataset - dataset_type_spec = _get_type_spec(dataset) - if dataset_type_spec not in [torchDataset, tf.data.Dataset, list, tuple, np.ndarray]: + if dataset_type_spec not in [ + torchDataset, + tf.data.Dataset, + list, + tuple, + np.ndarray, + ]: raise TypeError( "The `dataset` argument must be either a `tf.data.Dataset` " "object or a list/tuple of arrays. 
" @@ -75,16 +80,14 @@ def split_dataset( left_split = list(dataset_as_list[:left_size]) right_split = list(dataset_as_list[-right_size:]) - left_split = _restore_dataset_from_list( left_split, dataset_type_spec, dataset ) right_split = _restore_dataset_from_list( right_split, dataset_type_spec, dataset ) - + if dataset_type_spec != torchDataset: - left_split = tf.data.Dataset.from_tensor_slices(left_split) right_split = tf.data.Dataset.from_tensor_slices(right_split) @@ -103,7 +106,6 @@ def split_dataset( return dataset.__class__(*left_split), dataset.__class__(*right_split) - def _convert_dataset_to_list( dataset, dataset_type_spec, @@ -150,6 +152,7 @@ def _convert_dataset_to_list( return dataset_as_list + def _get_data_iterator_from_dataset(dataset, dataset_type_spec): """Get the iterator from a dataset. @@ -169,6 +172,7 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec): iterator: An `iterator` object. """ from torch.utils.data import Dataset as torchDataset + if dataset_type_spec == list: if len(dataset) == 0: raise ValueError( @@ -225,14 +229,15 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec): if is_batched(dataset): dataset = dataset.unbatch() return iter(dataset) - + # torch dataset iterator might be required to change elif dataset_type_spec == torchDataset: return iter(dataset) - + elif dataset_type_spec == np.ndarray: return iter(dataset) - + + def _get_next_sample( dataset_iterator, ensure_shape_similarity, @@ -264,7 +269,9 @@ def _get_next_sample( try: dataset_iterator = iter(dataset_iterator) first_sample = next(dataset_iterator) - if isinstance(first_sample, (tf.Tensor, np.ndarray)) or torch.is_tensor(first_sample): + if isinstance(first_sample, (tf.Tensor, np.ndarray)) or torch.is_tensor( + first_sample + ): first_sample_shape = np.array(first_sample).shape else: first_sample_shape = None @@ -433,10 +440,12 @@ def _rescale_dataset_split_sizes(left_size, right_size, total_length): left_size, right_size = int(left_size), int(right_size) return left_size, right_size + def _restore_dataset_from_list( dataset_as_list, dataset_type_spec, original_dataset ): from torch.utils.data import Dataset as torchDataset + """Restore the dataset from the list of arrays.""" if dataset_type_spec in [tuple, list]: return tuple(np.array(sample) for sample in zip(*dataset_as_list)) @@ -452,15 +461,17 @@ def _restore_dataset_from_list( return restored_dataset else: return tuple(np.array(sample) for sample in zip(*dataset_as_list)) - + elif dataset_type_spec == torchDataset: return tuple(np.array(sample) for sample in zip(*dataset_as_list)) return dataset_as_list + def is_batched(dataset): """ "Check if the `tf.data.Dataset` is batched.""" return hasattr(dataset, "_batch_size") + def get_batch_size(dataset): """Get the batch size of the dataset.""" if is_batched(dataset): @@ -468,8 +479,10 @@ def get_batch_size(dataset): else: return None + def _get_type_spec(dataset): from torch.utils.data import Dataset as torchDataset + """Get the type spec of the dataset.""" if isinstance(dataset, tuple): return tuple @@ -487,7 +500,6 @@ def _get_type_spec(dataset): return None - @keras_core_export( [ "keras_core.utils.image_dataset_from_directory", diff --git a/keras_core/utils/dataset_utils_test.py b/keras_core/utils/dataset_utils_test.py index 64f01dd3d..645441fb3 100644 --- a/keras_core/utils/dataset_utils_test.py +++ b/keras_core/utils/dataset_utils_test.py @@ -4,148 +4,331 @@ from keras_core.utils.dataset_utils import split_dataset import numpy as np + class 
DatasetUtilsTest(test_case.TestCase):
     def test_split_dataset_list(self):
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        dataset = [np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred))]
-        dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols))
+        dataset = [
+            np.random.sample((n_sample, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        ]
+        dataset_left, dataset_right = split_dataset(
+            dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (n_cols,)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        dataset = [np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred))]
-        dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols))
+        dataset = [
+            np.random.sample((n_sample, 100, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        ]
+        dataset_left, dataset_right = split_dataset(
+            dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (100, n_cols)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        dataset = [np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred))]
-        dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols))
+        dataset = [
+            np.random.sample((n_sample, 10, 10, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        ]
+        dataset_left, dataset_right = split_dataset(
+            dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        dataset = [np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred))]
-        dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols))
+        dataset = [
+            np.random.sample((n_sample, 100, 10, 30, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        ]
+        dataset_left, dataset_right = split_dataset(
+            dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape,
+            (100, 10, 30, n_cols),
+        )
 
     def test_split_dataset_tuple(self):
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        dataset = (np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred)))
-        dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols))
+        dataset = (
+            np.random.sample((n_sample, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        dataset_left, dataset_right = split_dataset(
+            dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (n_cols,)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        dataset = (np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred)))
-        dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols))
+        dataset = (
+            np.random.sample((n_sample, 100, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        dataset_left, dataset_right = split_dataset(
+            dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (100, n_cols)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        dataset = (np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred)))
-        dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols))
+        dataset = (
+            np.random.sample((n_sample, 10, 10, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        dataset_left, dataset_right = split_dataset(
+            dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        dataset = (np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred)))
-        dataset_left, dataset_right = split_dataset(dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols))
+        dataset = (
+            np.random.sample((n_sample, 100, 10, 30, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        dataset_left, dataset_right = split_dataset(
+            dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape,
+            (100, 10, 30, n_cols),
+        )
 
     def test_split_dataset_tensorflow(self):
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        features, labels = (np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred)))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels))
-        dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols))
+        features, labels = (
+            np.random.sample((n_sample, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+        dataset_left, dataset_right = split_dataset(
+            tf_dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (n_cols,)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        features, labels = (np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred)))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels))
-        dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols))
+        features, labels = (
+            np.random.sample((n_sample, 100, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+        dataset_left, dataset_right = split_dataset(
+            tf_dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (100, n_cols)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        features, labels = (np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred)))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels))
-        dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols))
+        features, labels = (
+            np.random.sample((n_sample, 10, 10, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+        dataset_left, dataset_right = split_dataset(
+            tf_dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        features, labels = (np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred)))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((features,labels))
-        dataset_left, dataset_right = split_dataset(tf_dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(int(dataset_left.cardinality()), int(n_sample * left_size))
-        self.assertEqual(int(dataset_right.cardinality()), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols))
+        features, labels = (
+            np.random.sample((n_sample, 100, 10, 30, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+        dataset_left, dataset_right = split_dataset(
+            tf_dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            int(dataset_left.cardinality()), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            int(dataset_right.cardinality()), int(n_sample * right_size)
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape,
+            (100, 10, 30, n_cols),
+        )
 
     def test_split_dataset_torch(self):
-        # sample torch dataset class
         from torch.utils.data import Dataset as torchDataset
+
         class Dataset(torchDataset):
-            'Characterizes a dataset for PyTorch'
+            "Characterizes a dataset for PyTorch"
+
             def __init__(self, x, y):
-                'Initialization'
+                "Initialization"
                 self.x = x
                 self.y = y
 
             def __len__(self):
-                'Denotes the total number of samples'
+                "Denotes the total number of samples"
                 return len(self.x)
 
             def __getitem__(self, index):
-                'Generates one sample of data'
+                "Generates one sample of data"
                 return self.x[index], self.y[index]
-
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        features, labels = (np.random.sample((n_sample,n_cols)), np.random.sample((n_sample,n_pred)))
-        torch_dataset = Dataset(features,labels)
-        dataset_left, dataset_right = split_dataset(torch_dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size))
-        self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (n_cols,))
+        features, labels = (
+            np.random.sample((n_sample, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        torch_dataset = Dataset(features, labels)
+        dataset_left, dataset_right = split_dataset(
+            torch_dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            len([sample for sample in dataset_left]), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            len([sample for sample in dataset_right]),
+            int(n_sample * right_size),
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (n_cols,)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        features, labels = (np.random.sample((n_sample, 100, n_cols)), np.random.sample((n_sample,n_pred)))
-        torch_dataset = Dataset(features,labels)
-        dataset_left, dataset_right = split_dataset(torch_dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size))
-        self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, n_cols))
+        features, labels = (
+            np.random.sample((n_sample, 100, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        torch_dataset = Dataset(features, labels)
+        dataset_left, dataset_right = split_dataset(
+            torch_dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            len([sample for sample in dataset_left]), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            len([sample for sample in dataset_right]),
+            int(n_sample * right_size),
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (100, n_cols)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        features, labels = (np.random.sample((n_sample,10, 10, n_cols)), np.random.sample((n_sample,n_pred)))
-        torch_dataset = Dataset(features,labels)
-        dataset_left, dataset_right = split_dataset(torch_dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size))
-        self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (10, 10, n_cols))
+        features, labels = (
+            np.random.sample((n_sample, 10, 10, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        torch_dataset = Dataset(features, labels)
+        dataset_left, dataset_right = split_dataset(
+            torch_dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            len([sample for sample in dataset_left]), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            len([sample for sample in dataset_right]),
+            int(n_sample * right_size),
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape, (10, 10, n_cols)
+        )
 
         n_sample, n_cols, n_pred, left_size, right_size = 100, 2, 1, 0.2, 0.8
-        features, labels = (np.random.sample((n_sample, 100, 10, 30, n_cols)), np.random.sample((n_sample,n_pred)))
-        torch_dataset = Dataset(features,labels)
-        dataset_left, dataset_right = split_dataset(torch_dataset, left_size=left_size, right_size=right_size)
-        self.assertEqual(len([sample for sample in dataset_left]), int(n_sample * left_size))
-        self.assertEqual(len([sample for sample in dataset_right]), int(n_sample * right_size))
-        self.assertEqual([sample for sample in dataset_right][0][0].shape, (100, 10, 30, n_cols))
-
-
+        features, labels = (
+            np.random.sample((n_sample, 100, 10, 30, n_cols)),
+            np.random.sample((n_sample, n_pred)),
+        )
+        torch_dataset = Dataset(features, labels)
+        dataset_left, dataset_right = split_dataset(
+            torch_dataset, left_size=left_size, right_size=right_size
+        )
+        self.assertEqual(
+            len([sample for sample in dataset_left]), int(n_sample * left_size)
+        )
+        self.assertEqual(
+            len([sample for sample in dataset_right]),
+            int(n_sample * right_size),
+        )
+        self.assertEqual(
+            [sample for sample in dataset_right][0][0].shape,
+            (100, 10, 30, n_cols),
+        )
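Note: by the end of this series, `split_dataset` accepts a `torch.utils.data.Dataset` and returns both splits as `tf.data.Dataset`. A minimal sketch of the same flow outside the test suite, assuming keras-core, TensorFlow, and PyTorch are all installed (`TensorDataset` is standard PyTorch, not part of this patch):

    import torch
    from torch.utils.data import TensorDataset

    from keras_core.utils.dataset_utils import split_dataset

    # 100 (feature, label) pairs; TensorDataset yields one tuple per index,
    # which is the per-sample iteration protocol split_dataset relies on.
    torch_ds = TensorDataset(torch.rand(100, 2), torch.rand(100, 1))

    left, right = split_dataset(torch_ds, left_size=0.2, right_size=0.8)
    print(int(left.cardinality()), int(right.cardinality()))  # 20 80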

From 47e4d4fdfd8c354e62d0b5bdc38a72b8fbcb4732 Mon Sep 17 00:00:00 2001
From: asingh9530
Date: Fri, 21 Jul 2023 19:46:18 +0530
Subject: [PATCH 10/16] reformatted

---
 keras_core/utils/dataset_utils.py      | 29 ++++++++++++++++++--------
 keras_core/utils/dataset_utils_test.py |  6 +++---
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py
index f66ad5428..e4d11c551 100644
--- a/keras_core/utils/dataset_utils.py
+++ b/keras_core/utils/dataset_utils.py
@@ -1,8 +1,10 @@
-import torch
-import numpy as np
-import warnings
 import random
 import time
+import warnings
+
+import numpy as np
+import torch
+
 from keras_core.api_export import keras_core_export
 from keras_core.utils.module_utils import tensorflow as tf
@@ -14,7 +16,9 @@ def split_dataset(
     """Split a dataset into a left half and a right half (e.g. train / test).
 
     Args:
-        dataset: A `tf.data.Dataset or torchDataset` object, or a list/tuple of arrays with the
+        dataset:
+            A `tf.data.Dataset or torchDataset` object,
+            or a list/tuple of arrays with the
             same length.
         left_size: If float (in the range `[0, 1]`), it signifies
             the fraction of the data to pack in the left dataset. If integer, it
@@ -28,7 +32,8 @@ def split_dataset(
         seed: A random seed for shuffling.
 
     Returns:
-        A tuple of two `tf.data.Dataset or torchDataset` objects: the left and right splits.
+        A tuple of two `tf.data.Dataset or torchDataset` objects:
+        the left and right splits.
 
     Example:
@@ -112,10 +117,13 @@ def _convert_dataset_to_list(
     data_size_warning_flag=True,
     ensure_shape_similarity=True,
 ):
-    """Convert `tf.data.Dataset or torchDataset` object or list/tuple of NumPy arrays to a list.
+    """Convert `tf.data.Dataset or torchDataset` object
+    or list/tuple of NumPy arrays to a list.
 
     Args:
-        dataset : A `tf.data.Dataset or torchDataset` object or a list/tuple of arrays.
+        dataset :
+            A `tf.data.Dataset or torchDataset` object
+            or a list/tuple of arrays.
         dataset_type_spec : the type of the dataset
         data_size_warning_flag (bool, optional): If set to True, a warning will
             be issued if the dataset takes longer than 10 seconds to iterate.
@@ -157,8 +165,11 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec):
     """Get the iterator from a dataset.
 
     Args:
-        dataset : A `tf.data.Dataset or torchDataset` object or a list/tuple of arrays.
-        dataset_type_spec : the type of the dataset
+        dataset :
+            A `tf.data.Dataset or torchDataset` object
+            or a list/tuple of arrays.
+        dataset_type_spec :
+            the type of the dataset
 
     Raises:
         ValueError:
diff --git a/keras_core/utils/dataset_utils_test.py b/keras_core/utils/dataset_utils_test.py
index 645441fb3..13939aa44 100644
--- a/keras_core/utils/dataset_utils_test.py
+++ b/keras_core/utils/dataset_utils_test.py
@@ -1,8 +1,8 @@
+import numpy as np
+
 from keras_core.testing import test_case
-from keras_core.utils import naming
-from keras_core.utils.module_utils import tensorflow as tf
 from keras_core.utils.dataset_utils import split_dataset
-import numpy as np
+from keras_core.utils.module_utils import tensorflow as tf
 
 
 class DatasetUtilsTest(test_case.TestCase):
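The import reshuffle above follows the isort/black layout the rest of keras-core uses: standard library first, then third-party packages, then first-party modules, each group separated by a blank line. Schematically, the target layout is:

    # standard library
    import random
    import time
    import warnings

    # third-party
    import numpy as np
    import torch

    # first-party
    from keras_core.api_export import keras_core_export
    from keras_core.utils.module_utils import tensorflow as tf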
From 78e48242eab0f6bbb03836a1e7270cb12a82e594 Mon Sep 17 00:00:00 2001
From: asingh9530
Date: Sat, 22 Jul 2023 00:01:39 +0530
Subject: [PATCH 11/16] removed tf.keras mentions

---
 keras_core/utils/dataset_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py
index e4d11c551..c0a407540 100644
--- a/keras_core/utils/dataset_utils.py
+++ b/keras_core/utils/dataset_utils.py
@@ -38,7 +38,7 @@ def split_dataset(
     Example:
 
     >>> data = np.random.random(size=(1000, 4))
-    >>> left_ds, right_ds = tf.keras.utils.split_dataset(data, left_size=0.8)
+    >>> left_ds, right_ds = keras_core.utils.split_dataset(data, left_size=0.8)
     >>> int(left_ds.cardinality())
     800
     >>> int(right_ds.cardinality())

From 4b12421378c22348b916f59362a941b5194d1530 Mon Sep 17 00:00:00 2001
From: asingh9530
Date: Sun, 23 Jul 2023 08:43:16 +0530
Subject: [PATCH 12/16] removed torch dependency

---
 keras_core/utils/dataset_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py
index c0a407540..21df833d1 100644
--- a/keras_core/utils/dataset_utils.py
+++ b/keras_core/utils/dataset_utils.py
@@ -3,7 +3,6 @@
 import warnings
 
 import numpy as np
-import torch
 
 from keras_core.api_export import keras_core_export
 from keras_core.utils.module_utils import tensorflow as tf
@@ -278,6 +277,7 @@ def _get_next_sample(
         data_sample: A tuple/list of numpy arrays.
     """
     try:
+        import torch
         dataset_iterator = iter(dataset_iterator)
         first_sample = next(dataset_iterator)
         if isinstance(first_sample, (tf.Tensor, np.ndarray)) or torch.is_tensor(
             first_sample

From d358e7220967b51076e9ba6574bfb008e546e92f Mon Sep 17 00:00:00 2001
From: asingh9530
Date: Sun, 23 Jul 2023 10:18:36 +0530
Subject: [PATCH 13/16] fixed indent issue

---
 keras_core/utils/dataset_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py
index 21df833d1..9e6e9b9ae 100644
--- a/keras_core/utils/dataset_utils.py
+++ b/keras_core/utils/dataset_utils.py
@@ -17,8 +17,7 @@ def split_dataset(
     Args:
         dataset:
             A `tf.data.Dataset or torchDataset` object,
-            or a list/tuple of arrays with the
-            same length.
+            or a list/tuple of arrays with the same length.
         left_size: If float (in the range `[0, 1]`), it signifies
             the fraction of the data to pack in the left dataset. If integer, it
             signifies the number of samples to pack in the left dataset. If
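A note on the pattern PATCH 12 introduces: moving `import torch` from module scope into the function body makes torch an optional dependency that is only resolved when a sample is actually inspected, so keras-core imports cleanly on machines without PyTorch. A hedged sketch of the same pattern with an explicit error message (the helper name and message are illustrative, not part of the patch):

    def _require_torch():
        # Deferred import: the module must load without torch installed;
        # the dependency is only paid for when a torch object shows up.
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Handling torch tensors/datasets requires PyTorch."
            ) from e
        return torch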
From dbedcfb8d6548097f33ec4cc4c06013067eab17a Mon Sep 17 00:00:00 2001
From: asingh9530
Date: Sun, 23 Jul 2023 17:23:06 +0530
Subject: [PATCH 14/16] only tf.data.dataset will be returned

---
 keras_core/utils/dataset_utils.py | 38 +++++++++++++++----------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py
index 9e6e9b9ae..d83f972de 100644
--- a/keras_core/utils/dataset_utils.py
+++ b/keras_core/utils/dataset_utils.py
@@ -30,7 +30,7 @@ def split_dataset(
         seed: A random seed for shuffling.
 
     Returns:
-        A tuple of two `tf.data.Dataset or torchDataset` objects:
+        A tuple of two `tf.data.Dataset` objects:
         the left and right splits.
 
     Example:
@@ -90,23 +90,20 @@ def split_dataset(
         right_split, dataset_type_spec, dataset
     )
 
-    if dataset_type_spec != torchDataset:
-        left_split = tf.data.Dataset.from_tensor_slices(left_split)
-        right_split = tf.data.Dataset.from_tensor_slices(right_split)
+
+    left_split = tf.data.Dataset.from_tensor_slices(left_split)
+    right_split = tf.data.Dataset.from_tensor_slices(right_split)
 
-        # apply batching to the splits if the dataset is batched
-        if dataset_type_spec is tf.data.Dataset and is_batched(dataset):
-            batch_size = get_batch_size(dataset)
-            if batch_size is not None:
-                left_split = left_split.batch(batch_size)
-                right_split = right_split.batch(batch_size)
+    # apply batching to the splits if the dataset is batched
+    if dataset_type_spec is tf.data.Dataset and is_batched(dataset):
+        batch_size = get_batch_size(dataset)
+        if batch_size is not None:
+            left_split = left_split.batch(batch_size)
+            right_split = right_split.batch(batch_size)
 
-        left_split = left_split.prefetch(tf.data.AUTOTUNE)
-        right_split = right_split.prefetch(tf.data.AUTOTUNE)
-        return left_split, right_split
-
-    elif dataset_type_spec == torchDataset:
-        return dataset.__class__(*left_split), dataset.__class__(*right_split)
+    left_split = left_split.prefetch(tf.data.AUTOTUNE)
+    right_split = right_split.prefetch(tf.data.AUTOTUNE)
+    return left_split, right_split
 
 
 def _convert_dataset_to_list(
@@ -276,10 +273,10 @@ def _get_next_sample(
         data_sample: A tuple/list of numpy arrays.
     """
     try:
-        import torch
+
         dataset_iterator = iter(dataset_iterator)
         first_sample = next(dataset_iterator)
-        if isinstance(first_sample, (tf.Tensor, np.ndarray)) or torch.is_tensor(
+        if isinstance(first_sample, (tf.Tensor, np.ndarray)) or is_torch_tensor(
             first_sample
         ):
             first_sample_shape = np.array(first_sample).shape
@@ -320,6 +317,9 @@ def _get_next_sample(
                 data_size_warning_flag = False
         yield sample
 
+def is_torch_tensor(value):
+    return value.__class__.__name__ == 'Tensor'
+
 
 def _rescale_dataset_split_sizes(left_size, right_size, total_length):
     """Rescale the dataset split sizes.
@@ -454,8 +454,8 @@ def _rescale_dataset_split_sizes(left_size, right_size, total_length):
 def _restore_dataset_from_list(
     dataset_as_list, dataset_type_spec, original_dataset
 ):
+    from torch.utils.data import Dataset as torchDataset
-
     """Restore the dataset from the list of arrays."""
     if dataset_type_spec in [tuple, list]:
         return tuple(np.array(sample) for sample in zip(*dataset_as_list))
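The `is_torch_tensor` added here matches on the class name alone, so any class that merely happens to be called `Tensor` is misclassified, and a subclass with a different name is missed; the next patch tightens the check by walking the MRO and requiring the defining module to be torch. A small self-contained illustration of the difference (the `Tensor` class below is contrived for demonstration and has nothing to do with torch):

    class Tensor:  # a user class that only shares the name
        pass

    def is_torch_tensor_by_name(value):
        return value.__class__.__name__ == "Tensor"

    def is_torch_tensor_by_mro(value):
        # Subclasses inherit torch.Tensor in their MRO, so they still match;
        # same-named foreign classes fail the module check.
        for parent in value.__class__.__mro__:
            if parent.__name__ == "Tensor" and str(parent.__module__).endswith(
                "torch"
            ):
                return True
        return False

    print(is_torch_tensor_by_name(Tensor()))  # True  -- false positive
    print(is_torch_tensor_by_mro(Tensor()))   # False -- module check rejects it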
From 84980e39f2a079f0170d75e4e2c289acf6dfb4d9 Mon Sep 17 00:00:00 2001
From: asingh9530
Date: Sun, 23 Jul 2023 17:59:40 +0530
Subject: [PATCH 15/16] torch only imported for get_type_spec

---
 keras_core/utils/dataset_utils.py | 100 ++++++++++++++++--------------
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py
index d83f972de..67c3f92c7 100644
--- a/keras_core/utils/dataset_utils.py
+++ b/keras_core/utils/dataset_utils.py
@@ -16,16 +16,16 @@ def split_dataset(
 
     Args:
         dataset:
-            A `tf.data.Dataset or torchDataset` object,
+            A `tf.data.Dataset or torch.utils.data.Dataset` object,
             or a list/tuple of arrays with the same length.
         left_size: If float (in the range `[0, 1]`), it signifies
-        the fraction of the data to pack in the left dataset. If integer, it
-        signifies the number of samples to pack in the left dataset. If
-        `None`, it uses the complement to `right_size`. Defaults to `None`.
+            the fraction of the data to pack in the left dataset. If integer, it
+            signifies the number of samples to pack in the left dataset. If
+            `None`, it uses the complement to `right_size`. Defaults to `None`.
         right_size: If float (in the range `[0, 1]`), it signifies
-        the fraction of the data to pack in the right dataset. If integer, it
-        signifies the number of samples to pack in the right dataset. If
-        `None`, it uses the complement to `left_size`. Defaults to `None`.
+            the fraction of the data to pack in the right dataset. If integer, it
+            signifies the number of samples to pack in the right dataset. If
+            `None`, it uses the complement to `left_size`. Defaults to `None`.
         shuffle: Boolean, whether to shuffle the data before splitting it.
         seed: A random seed for shuffling.
@@ -43,19 +43,11 @@ def split_dataset(
     200
     """
-    from torch.utils.data import Dataset as torchDataset
-
     dataset_type_spec = _get_type_spec(dataset)
 
-    if dataset_type_spec not in [
-        torchDataset,
-        tf.data.Dataset,
-        list,
-        tuple,
-        np.ndarray,
-    ]:
+    if dataset_type_spec is None:
         raise TypeError(
-            "The `dataset` argument must be either a `tf.data.Dataset` "
+            "The `dataset` argument must be either a `tf.data.Dataset` or `torch.utils.data.Dataset` "
             "object or a list/tuple of arrays. "
            f"Received: dataset={dataset} of type {type(dataset)}"
         )
@@ -112,20 +104,20 @@ def _convert_dataset_to_list(
     data_size_warning_flag=True,
     ensure_shape_similarity=True,
 ):
-    """Convert `tf.data.Dataset or torchDataset` object
+    """Convert `tf.data.Dataset` or `torch.utils.data.Dataset` object
     or list/tuple of NumPy arrays to a list.
 
     Args:
         dataset :
-            A `tf.data.Dataset or torchDataset` object
+            A `tf.data.Dataset` or `torch.utils.data.Dataset` object
             or a list/tuple of arrays.
         dataset_type_spec : the type of the dataset
         data_size_warning_flag (bool, optional): If set to True, a warning will
-        be issued if the dataset takes longer than 10 seconds to iterate.
-        Defaults to `True`.
+            be issued if the dataset takes longer than 10 seconds to iterate.
+            Defaults to `True`.
         ensure_shape_similarity (bool, optional): If set to True, the shape of
-        the first sample will be used to validate the shape of rest of the
-        samples. Defaults to `True`.
+            the first sample will be used to validate the shape of the rest of
+            the samples. Defaults to `True`.
 
     Returns:
         List: A list of tuples/NumPy arrays.
@@ -161,24 +153,22 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec):
     """Get the iterator from a dataset.
 
     Args:
         dataset :
-            A `tf.data.Dataset or torchDataset` object
+            A `tf.data.Dataset` or `torch.utils.data.Dataset` object
             or a list/tuple of arrays.
         dataset_type_spec :
             the type of the dataset
 
     Raises:
         ValueError:
-        - If the dataset is empty.
-        - If the dataset is not a `tf.data.Dataset` object
-        or a list/tuple of arrays.
-        - If the dataset is a list/tuple of arrays and the
-        length of the list/tuple is not equal to the number
+            - If the dataset is empty.
+            - If the dataset is not a `tf.data.Dataset` object
+            or a list/tuple of arrays.
+            - If the dataset is a list/tuple of arrays and the
+            length of the list/tuple is not equal to the number
 
     Returns:
         iterator: An `iterator` object.
     """
-    from torch.utils.data import Dataset as torchDataset
-
     if dataset_type_spec == list:
         if len(dataset) == 0:
             raise ValueError(
                 "Received an empty list dataset. "
                 "Please provide a non-empty list of arrays."
             )
@@ -237,7 +227,7 @@ def _get_data_iterator_from_dataset(dataset, dataset_type_spec):
         return iter(dataset)
 
     # torch dataset iterator might be required to change
-    elif dataset_type_spec == torchDataset:
+    elif is_torch_dataset(dataset):
         return iter(dataset)
 
     elif dataset_type_spec == np.ndarray:
@@ -255,19 +245,20 @@ def _get_next_sample(
     Args:
         dataset_iterator : An `iterator` object.
         ensure_shape_similarity (bool, optional): If set to True, the shape of
-        the first sample will be used to validate the shape of rest of the
-        samples. Defaults to `True`.
+            the first sample will be used to validate the shape of the rest of
+            the samples. Defaults to `True`.
         data_size_warning_flag (bool, optional): If set to True, a warning will
-        be issued if the dataset takes longer than 10 seconds to iterate.
-        Defaults to `True`.
+            be issued if the dataset takes longer than 10 seconds to iterate.
+            Defaults to `True`.
         start_time (float): the start time of the dataset iteration. this is
-        used only if `data_size_warning_flag` is set to true.
+            used only if `data_size_warning_flag` is set to true.
 
     Raises:
-        ValueError: - If the dataset is empty.
-        - If `ensure_shape_similarity` is set to True and the
-        shape of the first sample is not equal to the shape of
-        atleast one of the rest of the samples.
+        ValueError:
+            - If the dataset is empty.
+            - If `ensure_shape_similarity` is set to True and the
+            shape of the first sample is not equal to the shape of
+            at least one of the rest of the samples.
 
     Yields:
         data_sample: A tuple/list of numpy arrays.
@@ -318,7 +309,22 @@ def _get_next_sample(
                 data_size_warning_flag = False
         yield sample
 
 def is_torch_tensor(value):
-    return value.__class__.__name__ == 'Tensor'
+    if hasattr(value, "__class__"):
+        for parent in value.__class__.__mro__:
+            if parent.__name__ == "Tensor" and str(
+                parent.__module__
+            ).endswith("torch"):
+                return True
+    return False
+
+def is_torch_dataset(dataset):
+    if hasattr(dataset, "__class__"):
+        for parent in dataset.__class__.__mro__:
+            if parent.__name__ == "Dataset" and str(
+                parent.__module__
+            ).startswith("torch.utils.data"):
+                return True
+    return False
 
 
 def _rescale_dataset_split_sizes(left_size, right_size, total_length):
@@ -335,7 +341,7 @@ def _rescale_dataset_split_sizes(left_size, right_size, total_length):
     Raises:
         TypeError: - If `left_size` or `right_size` is not an integer or float.
         ValueError: - If `left_size` or `right_size` is negative or greater
-        than 1 or greater than `total_length`.
+            than 1 or greater than `total_length`.
 
     Returns:
         tuple: A tuple of rescaled left_size and right_size
@@ -455,7 +461,6 @@ def _rescale_dataset_split_sizes(left_size, right_size, total_length):
 def _restore_dataset_from_list(
     dataset_as_list, dataset_type_spec, original_dataset
 ):
-    from torch.utils.data import Dataset as torchDataset
 
     """Restore the dataset from the list of arrays."""
     if dataset_type_spec in [tuple, list]:
         return tuple(np.array(sample) for sample in zip(*dataset_as_list))
@@ -472,7 +477,7 @@ def _restore_dataset_from_list(
     else:
         return tuple(np.array(sample) for sample in zip(*dataset_as_list))
 
-    elif dataset_type_spec == torchDataset:
+    elif is_torch_dataset(original_dataset):
         return tuple(np.array(sample) for sample in zip(*dataset_as_list))
 
     return dataset_as_list
@@ -491,8 +496,6 @@ def get_batch_size(dataset):
 
 def _get_type_spec(dataset):
-    from torch.utils.data import Dataset as torchDataset
-
     """Get the type spec of the dataset."""
     if isinstance(dataset, tuple):
         return tuple
@@ -504,7 +507,8 @@ def _get_type_spec(dataset):
         return dict
     elif isinstance(dataset, tf.data.Dataset):
         return tf.data.Dataset
-    elif isinstance(dataset, torchDataset):
+    elif is_torch_dataset(dataset):
+        from torch.utils.data import Dataset as torchDataset
         return torchDataset
     else:
         return None
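With this patch in place, torch never has to be importable just to classify an input: `is_torch_dataset` inspects the class's MRO by name and module string alone, and `_get_type_spec` only performs the real import once a torch dataset has been positively identified. A minimal sketch of the resulting dispatch order, with the helper inlined so the snippet runs standalone (`describe_split_input` is an illustrative stand-in, not a function from the patch):

    import numpy as np

    def is_torch_dataset(dataset):
        # Same MRO walk as the patch: no torch import needed for the check.
        for parent in dataset.__class__.__mro__:
            if parent.__name__ == "Dataset" and str(parent.__module__).startswith(
                "torch.utils.data"
            ):
                return True
        return False

    def describe_split_input(dataset):
        # Rough stand-in for _get_type_spec's branch order.
        for spec in (tuple, list, np.ndarray, dict):
            if isinstance(dataset, spec):
                return spec.__name__
        if is_torch_dataset(dataset):
            return "torch.utils.data.Dataset"
        return "unsupported"  # _get_type_spec returns None here

    print(describe_split_input([1, 2, 3]))         # list
    print(describe_split_input(np.zeros((4, 2))))  # ndarray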
From daab3c14f1be65836ab91f3e8b3392925914f536 Mon Sep 17 00:00:00 2001
From: asingh9530
Date: Sun, 23 Jul 2023 18:10:08 +0530
Subject: [PATCH 16/16] fixed indentation

---
 keras_core/utils/dataset_utils.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/keras_core/utils/dataset_utils.py b/keras_core/utils/dataset_utils.py
index 67c3f92c7..7cc5bfb8c 100644
--- a/keras_core/utils/dataset_utils.py
+++ b/keras_core/utils/dataset_utils.py
@@ -23,9 +23,11 @@ def split_dataset(
             signifies the number of samples to pack in the left dataset. If
             `None`, it uses the complement to `right_size`. Defaults to `None`.
         right_size: If float (in the range `[0, 1]`), it signifies
-            the fraction of the data to pack in the right dataset. If integer, it
-            signifies the number of samples to pack in the right dataset. If
-            `None`, it uses the complement to `left_size`. Defaults to `None`.
+            the fraction of the data to pack in the right dataset.
+            If integer, it signifies the number of samples to pack
+            in the right dataset.
+            If `None`, it uses the complement to `left_size`.
+            Defaults to `None`.
         shuffle: Boolean, whether to shuffle the data before splitting it.
         seed: A random seed for shuffling.
@@ -47,7 +49,8 @@ def split_dataset(
 
     if dataset_type_spec is None:
         raise TypeError(
-            "The `dataset` argument must be either a `tf.data.Dataset` or `torch.utils.data.Dataset` "
+            "The `dataset` argument must be either "
+            "a `tf.data.Dataset` or `torch.utils.data.Dataset` "
             "object or a list/tuple of arrays. "
             f"Received: dataset={dataset} of type {type(dataset)}"
         )
@@ -82,7 +85,6 @@ def split_dataset(
         right_split, dataset_type_spec, dataset
     )
 
-
     left_split = tf.data.Dataset.from_tensor_slices(left_split)
     right_split = tf.data.Dataset.from_tensor_slices(right_split)
@@ -254,7 +256,7 @@ def _get_next_sample(
             used only if `data_size_warning_flag` is set to true.
 
     Raises:
-        ValueError: 
+        ValueError:
             - If the dataset is empty.
             - If `ensure_shape_similarity` is set to True and the
             shape of the first sample is not equal to the shape of
             at least one of the rest of the samples.
 
     Yields:
         data_sample: A tuple/list of numpy arrays.
     """
     try:
-
         dataset_iterator = iter(dataset_iterator)
         first_sample = next(dataset_iterator)
         if isinstance(first_sample, (tf.Tensor, np.ndarray)) or is_torch_tensor(
             first_sample
@@ -308,15 +309,17 @@ def _get_next_sample(
                 data_size_warning_flag = False
         yield sample
 
+
 def is_torch_tensor(value):
     if hasattr(value, "__class__"):
         for parent in value.__class__.__mro__:
-            if parent.__name__ == "Tensor" and str(
-                parent.__module__
-            ).endswith("torch"):
+            if parent.__name__ == "Tensor" and str(parent.__module__).endswith(
+                "torch"
+            ):
                 return True
     return False
 
+
 def is_torch_dataset(dataset):
@@ -460,7 +463,6 @@ def _rescale_dataset_split_sizes(left_size, right_size, total_length):
 def _restore_dataset_from_list(
     dataset_as_list, dataset_type_spec, original_dataset
 ):
-
     """Restore the dataset from the list of arrays."""
     if dataset_type_spec in [tuple, list]:
         return tuple(np.array(sample) for sample in zip(*dataset_as_list))
@@ -509,6 +511,7 @@ def _get_type_spec(dataset):
         return tf.data.Dataset
     elif is_torch_dataset(dataset):
         from torch.utils.data import Dataset as torchDataset
+
         return torchDataset
     else:
         return None
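With the full series applied, the docstring example works end to end and both halves come back as `tf.data.Dataset` regardless of the input type. A quick usage sketch, assuming the final state of this branch:

    import numpy as np

    from keras_core.utils.dataset_utils import split_dataset

    data = np.random.random(size=(1000, 4))
    left_ds, right_ds = split_dataset(data, left_size=0.8, shuffle=True, seed=42)

    # Both splits are tf.data.Dataset objects, already prefetched;
    # the same seed reproduces the same shuffle order.
    print(int(left_ds.cardinality()), int(right_ds.cardinality()))  # 800 200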