Scale transform #53

Merged (3 commits) on Jun 7, 2020

8 changes: 6 additions & 2 deletions deepdow/data/__init__.py
@@ -1,6 +1,7 @@
"""Module dealing with data."""

from .augment import (Compose, Dropout, Multiply, Noise)
from .augment import (Compose, Dropout, Multiply, Noise, Scale, prepare_robust_scaler,
prepare_standard_scaler)
from .load import (FlexibleDataLoader, InRAMDataset, RigidDataLoader)

__all__ = ['Compose',
@@ -9,4 +10,7 @@
            'InRAMDataset',
            'Multiply',
            'Noise',
-           'RigidDataLoader']
+           'RigidDataLoader',
+           'Scale',
+           'prepare_robust_scaler',
+           'prepare_standard_scaler']
156 changes: 156 additions & 0 deletions deepdow/data/augment.py
@@ -1,8 +1,85 @@
"""Collection of callable functions that augment deepdow tensors."""

import numpy as np
import torch


def prepare_standard_scaler(X, overlap=False, indices=None):
"""Compute mean and standard deviation for each channel.

Parameters
----------
X : np.ndarray
Full features array of shape `(n_samples, n_channels, lookback, n_assets)`.

overlap : bool
If False, then only using the most recent timestep. This will guarantee that not counting
the same thing multiple times.

indices : list or None
List of indices to consider from the `X.shape[0]` dimension. If None
then considering all the samples.

Returns
-------
means : np.ndarray
Mean of each channel. Shape `(n_channels,)`.

stds : np.ndarray
Standard deviation of each channel. Shape `(n_channels,)`.

"""
indices = indices if indices is not None else list(range(len(X)))
considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :]

means = considered_values.mean(axis=(0, 2, 3))
stds = considered_values.std(axis=(0, 2, 3))

return means, stds
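
# Illustrative note (not part of the diff): with overlap=False only the most recent
# lookback step of each sample enters the statistics, e.g. X of shape (10, 3, 5, 12)
# contributes values of shape (10, 3, 1, 12); with overlap=True every lookback step
# is used, so overlapping windows are counted repeatedly.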


def prepare_robust_scaler(X, overlap=False, indices=None, percentile_range=(25, 75)):
    """Compute median and percentile range for each channel.

    Parameters
    ----------
    X : np.ndarray
        Full features array of shape `(n_samples, n_channels, lookback, n_assets)`.

    overlap : bool
        If False, only the most recent timestep of each sample is used, which guarantees
        that the same value is not counted multiple times.

    indices : list or None
        List of indices to consider from the `X.shape[0]` dimension. If None,
        all samples are considered.

    percentile_range : tuple
        The left and right percentile to consider. Needs to be in [0, 100].

    Returns
    -------
    medians : np.ndarray
        Median of each channel. Shape `(n_channels,)`.

    ranges : np.ndarray
        Interquantile range for each channel. Shape `(n_channels,)`.

    """
    if not 0 <= percentile_range[0] < percentile_range[1] <= 100:
        raise ValueError('The percentile range needs to be in [0, 100] and left < right')

    indices = indices if indices is not None else list(range(len(X)))
    considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :]

    medians = np.median(considered_values, axis=(0, 2, 3))
    percentiles = np.percentile(considered_values, percentile_range, axis=(0, 2, 3))  # (2, n_channels)

    ranges = percentiles[1] - percentiles[0]

    return medians, ranges
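
# Illustrative note (not part of the diff): the default percentile_range=(25, 75)
# makes `ranges` the per-channel interquartile range, i.e. ranges = Q3 - Q1,
# mirroring scikit-learn's RobustScaler defaults.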


class Compose:
    """Meta transform inspired by torchvision.

@@ -191,3 +268,82 @@ def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        X_sample_new = self.frac * X_sample.std([1, 2], keepdim=True) * torch.randn_like(X_sample) + X_sample

        return X_sample_new, y_sample, timestamps_sample, asset_names


class Scale:
    """Scale input features.

    The input features are, per channel, centered to zero and scaled to one. We use the same
    terminology as scikit-learn. However, the equivalent in torchvision is `Normalize`.

    Parameters
    ----------
    center : np.ndarray
        1D array of shape `(n_channels,)` representing the center of the features (mean or median).
        Needs to be precomputed in advance.

    scale : np.ndarray
        1D array of shape `(n_channels,)` representing the scale of the features (standard deviation
        or quantile range). Needs to be precomputed in advance.

    See Also
    --------
    prepare_robust_scaler
    prepare_standard_scaler
    """

    def __init__(self, center, scale):
        if len(center) != len(scale):
            raise ValueError('The center and scale need to have the same size.')

        if np.any(scale <= 0):
            raise ValueError('The scale parameters need to be positive.')

        self.center = center
        self.scale = scale
        self.n_channels = len(self.center)

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` scaled appropriately.

        y_sample : torch.Tensor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.
        """
        n_channels = X_sample.shape[0]
        if n_channels != self.n_channels:
            raise ValueError('Expected {} channels in X, got {}'.format(self.n_channels, n_channels))

        X_sample_new = X_sample.clone()
        dtype, device = X_sample_new.dtype, X_sample_new.device

        center = torch.as_tensor(self.center, dtype=dtype, device=device)[:, None, None]
        scale = torch.as_tensor(self.scale, dtype=dtype, device=device)[:, None, None]

        X_sample_new.sub_(center).div_(scale)

        return X_sample_new, y_sample, timestamps_sample, asset_names
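
For context, a minimal usage sketch of the new helpers (not part of the diff; the shapes and the train split below are made up for illustration). Each sample is transformed per channel as `(X - center) / scale`:

import numpy as np
import torch

from deepdow.data import Scale, prepare_standard_scaler

# Hypothetical features array: (n_samples, n_channels, lookback, n_assets)
X = np.random.random((50, 2, 10, 8))

# Compute the per-channel statistics on the training samples only
means, stds = prepare_standard_scaler(X, indices=list(range(40)))

# Scale a single sample of shape (n_channels, lookback, n_assets)
tform = Scale(means, stds)
X_scaled, _, _, _ = tform(torch.as_tensor(X[0]), None, None, None)
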
8 changes: 5 additions & 3 deletions docs/source/data_loading.rst
@@ -261,10 +261,12 @@ Additionally, one can pass a transformation :code:`transform` that can serve as
 Currently implemented transforms under :code:`deepdow.data` are

 - :code:`Compose` - basically a copy of `Compose` from Torch Vision
-- :code:`Dropout` - randomly setting elements to zero (not in place)
-- :code:`Multiply` - multiplying all elements by a constant (not in place)
-- :code:`Noise` - add Gaussian noise (not in place)
+- :code:`Dropout` - randomly setting elements to zero
+- :code:`Multiply` - multiplying all elements by a constant
+- :code:`Noise` - add Gaussian noise
+- :code:`Scale` - centering and scaling (similar to scikit-learn :code:`StandardScaler` and :code:`RobustScaler`)
+
+All of the transforms are not in place.

 Dataloaders
 -----------
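
As a sketch of how the new transform plugs into data loading (illustrative only; it assumes :code:`InRAMDataset` forwards each sample through its :code:`transform` argument, as the docs above describe):

import numpy as np

from deepdow.data import InRAMDataset, Scale, prepare_robust_scaler

X = np.random.random((100, 2, 20, 8))  # (n_samples, n_channels, lookback, n_assets)
y = np.random.random((100, 2, 5, 8))   # (n_samples, n_channels, horizon, n_assets)

medians, ranges = prepare_robust_scaler(X)
dataset = InRAMDataset(X, y, transform=Scale(medians, ranges))
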
86 changes: 86 additions & 0 deletions tests/test_data/test_augment.py
@@ -0,0 +1,86 @@
"""Collection of tests focused on the `deepdow.data.augment`."""

import numpy as np
import pytest
import torch

from deepdow.data import Scale, prepare_robust_scaler, prepare_standard_scaler


@pytest.mark.parametrize('overlap', [True, False])
@pytest.mark.parametrize('indices', [None, [1, 4, 6]])
def test_prepare_standard_scaler(overlap, indices):
n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12

X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5

means, stds = prepare_standard_scaler(X, overlap=overlap, indices=indices)

assert means.shape == (n_channels,)
assert stds.shape == (n_channels,)
assert np.all(stds > 0)


class TestPrepareRobustScaler:

def test_error(self):
with pytest.raises(ValueError):
prepare_robust_scaler(np.ones((1, 2, 3, 4)), percentile_range=(20, 10))

with pytest.raises(ValueError):
prepare_robust_scaler(np.ones((1, 2, 3, 4)), percentile_range=(-2, 99))

@pytest.mark.parametrize('overlap', [True, False])
@pytest.mark.parametrize('indices', [None, [1, 4, 6]])
def test_basic(self, overlap, indices):
n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12

X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5

medians, ranges = prepare_robust_scaler(X, overlap=overlap, indices=indices)

assert medians.shape == (n_channels,)
assert ranges.shape == (n_channels,)
assert np.all(ranges > 0)

def test_sanity(self):
n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12

X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5

medians_1, ranges_1 = prepare_robust_scaler(X, percentile_range=(20, 80))
medians_2, ranges_2 = prepare_robust_scaler(X, percentile_range=(10, 90))

assert np.all(ranges_2 > ranges_1)


class TestScaler:
    def test_errors(self):
        with pytest.raises(ValueError):
            Scale(np.ones(3), np.ones(4))

        with pytest.raises(ValueError):
            Scale(np.array([1, -1]), np.array([9, -0.1]))

        tform = Scale(np.array([1, -1]), np.array([9, 10.]))
        with pytest.raises(ValueError):
            tform(torch.rand(3, 4, 5), None, None, None)

    def test_overall(self):
        n_channels, lookback, n_assets = 3, 5, 12

        X = np.random.random((n_channels, lookback, n_assets))
        X_torch = torch.as_tensor(X)
        dtype = X_torch.dtype

        center = X.mean(axis=(1, 2))
        scale = X.std(axis=(1, 2))

        tform = Scale(center, scale)
        X_scaled = tform(X_torch, None, None, None)[0]

        assert torch.is_tensor(X_scaled)
        assert X_torch.shape == X_scaled.shape
        assert not torch.allclose(X_torch, X_scaled)
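        # NumPy's std (used to compute `scale`) is the population estimator, hence unbiased=False below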
        assert torch.allclose(X_scaled.mean(dim=(1, 2)), torch.zeros(n_channels, dtype=dtype))
        assert torch.allclose(X_scaled.std(dim=(1, 2), unbiased=False), torch.ones(n_channels, dtype=dtype))
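
To check the new behaviour locally, the added tests can be run in isolation (assuming a standard pytest setup):

pytest tests/test_data/test_augment.py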