Scale transform #53

Merged (3 commits) on Jun 7, 2020

8 changes: 6 additions & 2 deletions deepdow/data/__init__.py
@@ -1,6 +1,7 @@
"""Module dealing with data."""

from .augment import (Compose, Dropout, Multiply, Noise)
from .augment import (Compose, Dropout, Multiply, Noise, Scale, prepare_robust_scaler,
prepare_standard_scaler)
from .load import (FlexibleDataLoader, InRAMDataset, RigidDataLoader)

__all__ = ['Compose',
@@ -9,4 +10,7 @@
            'InRAMDataset',
            'Multiply',
            'Noise',
-           'RigidDataLoader']
+           'RigidDataLoader',
+           'Scale',
+           'prepare_robust_scaler',
+           'prepare_standard_scaler']
156 changes: 156 additions & 0 deletions deepdow/data/augment.py
@@ -1,8 +1,85 @@
"""Collection of callable functions that augment deepdow tensors."""

import numpy as np
import torch


def prepare_standard_scaler(X, overlap=False, indices=None):
"""Compute mean and standard deviation for each channel.

Parameters
----------
X : np.ndarray
Full features array of shape `(n_samples, n_channels, lookback, n_assets)`.

overlap : bool
If False, then only using the most recent timestep. This will guarantee that not counting
the same thing multiple times.

indices : list or None
List of indices to consider from the `X.shape[0]` dimension. If None
then considering all the samples.

Returns
-------
means : np.ndarray
Mean of each channel. Shape `(n_channels,)`.

stds : np.ndarray
Standard deviation of each channel. Shape `(n_channels,)`.

"""
indices = indices if indices is not None else list(range(len(X)))
considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :]

means = considered_values.mean(axis=(0, 2, 3))
stds = considered_values.std(axis=(0, 2, 3))

return means, stds
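
# Illustrative note (not part of the diff): with overlap=False only the most recent
# lookback step of each sample enters the statistics, e.g. X of shape (10, 3, 5, 12)
# contributes values of shape (10, 3, 1, 12); with overlap=True every lookback step
# is used, so overlapping windows are counted repeatedly.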


def prepare_robust_scaler(X, overlap=False, indices=None, percentile_range=(25, 75)):
    """Compute median and percentile range for each channel.

    Parameters
    ----------
    X : np.ndarray
        Full features array of shape `(n_samples, n_channels, lookback, n_assets)`.

    overlap : bool
        If False, only the most recent timestep of each sample is used, which guarantees
        that the same value is not counted multiple times.

    indices : list or None
        List of indices to consider from the `X.shape[0]` dimension. If None,
        all samples are considered.

    percentile_range : tuple
        The left and right percentile to consider. Needs to be in [0, 100].

    Returns
    -------
    medians : np.ndarray
        Median of each channel. Shape `(n_channels,)`.

    ranges : np.ndarray
        Interquantile range for each channel. Shape `(n_channels,)`.

    """
    if not 0 <= percentile_range[0] < percentile_range[1] <= 100:
        raise ValueError('The percentile range needs to be in [0, 100] and left < right')

    indices = indices if indices is not None else list(range(len(X)))
    considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :]

    medians = np.median(considered_values, axis=(0, 2, 3))
    percentiles = np.percentile(considered_values, percentile_range, axis=(0, 2, 3))  # (2, n_channels)

    ranges = percentiles[1] - percentiles[0]

    return medians, ranges
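
# Illustrative note (not part of the diff): the default percentile_range=(25, 75)
# makes `ranges` the per-channel interquartile range, i.e. ranges = Q3 - Q1,
# mirroring scikit-learn's RobustScaler defaults.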


class Compose:
    """Meta transform inspired by torchvision.

@@ -191,3 +268,82 @@ def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        X_sample_new = self.frac * X_sample.std([1, 2], keepdim=True) * torch.randn_like(X_sample) + X_sample

        return X_sample_new, y_sample, timestamps_sample, asset_names


class Scale:
    """Scale input features.

    The input features are, per channel, centered to zero and scaled to one. We use the same
    terminology as scikit-learn. However, the equivalent in torchvision is `Normalize`.

    Parameters
    ----------
    center : np.ndarray
        1D array of shape `(n_channels,)` representing the center of the features (mean or median).
        Needs to be precomputed in advance.

    scale : np.ndarray
        1D array of shape `(n_channels,)` representing the scale of the features (standard deviation
        or quantile range). Needs to be precomputed in advance.

    See Also
    --------
    prepare_robust_scaler
    prepare_standard_scaler
    """

    def __init__(self, center, scale):
        if len(center) != len(scale):
            raise ValueError('The center and scale need to have the same size.')

        if np.any(scale <= 0):
            raise ValueError('The scale parameters need to be positive.')

        self.center = center
        self.scale = scale
        self.n_channels = len(self.center)

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` scaled appropriately.

        y_sample : torch.Tensor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.
        """
        n_channels = X_sample.shape[0]
        if n_channels != self.n_channels:
            raise ValueError('Expected {} channels in X, got {}'.format(self.n_channels, n_channels))

        X_sample_new = X_sample.clone()
        dtype, device = X_sample_new.dtype, X_sample_new.device

        center = torch.as_tensor(self.center, dtype=dtype, device=device)[:, None, None]
        scale = torch.as_tensor(self.scale, dtype=dtype, device=device)[:, None, None]

        X_sample_new.sub_(center).div_(scale)

        return X_sample_new, y_sample, timestamps_sample, asset_names
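
For context, a minimal usage sketch of the new helpers (not part of the diff; the shapes and the train split below are made up for illustration). Each sample is transformed per channel as `(X - center) / scale`:

import numpy as np
import torch

from deepdow.data import Scale, prepare_standard_scaler

# Hypothetical features array: (n_samples, n_channels, lookback, n_assets)
X = np.random.random((50, 2, 10, 8))

# Compute the per-channel statistics on the training samples only
means, stds = prepare_standard_scaler(X, indices=list(range(40)))

# Scale a single sample of shape (n_channels, lookback, n_assets)
tform = Scale(means, stds)
X_scaled, _, _, _ = tform(torch.as_tensor(X[0]), None, None, None)
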
8 changes: 5 additions & 3 deletions docs/source/data_loading.rst
@@ -261,10 +261,12 @@ Additionally, one can pass a transformation :code:`transform` that can serve as
 Currently implemented transforms under :code:`deepdow.data` are

 - :code:`Compose` - basically a copy of `Compose` from Torch Vision
-- :code:`Dropout` - randomly setting elements to zero (not in place)
-- :code:`Multiply` - multiplying all elements by a constant (not in place)
-- :code:`Noise` - add Gaussian noise (not in place)
+- :code:`Dropout` - randomly setting elements to zero
+- :code:`Multiply` - multiplying all elements by a constant
+- :code:`Noise` - add Gaussian noise
+- :code:`Scale` - centering and scaling (similar to scikit-learn :code:`StandardScaler` and :code:`RobustScaler`)
+
+All of the transforms are not in place.

 Dataloaders
 -----------
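
As a sketch of how the new transform plugs into data loading (illustrative only; it assumes :code:`InRAMDataset` forwards each sample through its :code:`transform` argument, as the docs above describe):

import numpy as np

from deepdow.data import InRAMDataset, Scale, prepare_robust_scaler

X = np.random.random((100, 2, 20, 8))  # (n_samples, n_channels, lookback, n_assets)
y = np.random.random((100, 2, 5, 8))   # (n_samples, n_channels, horizon, n_assets)

medians, ranges = prepare_robust_scaler(X)
dataset = InRAMDataset(X, y, transform=Scale(medians, ranges))
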
86 changes: 86 additions & 0 deletions tests/test_data/test_augment.py
@@ -0,0 +1,86 @@
"""Collection of tests focused on the `deepdow.data.augment`."""

import numpy as np
import pytest
import torch

from deepdow.data import Scale, prepare_robust_scaler, prepare_standard_scaler


@pytest.mark.parametrize('overlap', [True, False])
@pytest.mark.parametrize('indices', [None, [1, 4, 6]])
def test_prepare_standard_scaler(overlap, indices):
n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12

X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5

means, stds = prepare_standard_scaler(X, overlap=overlap, indices=indices)

assert means.shape == (n_channels,)
assert stds.shape == (n_channels,)
assert np.all(stds > 0)


class TestPrepareRobustScaler:

def test_error(self):
with pytest.raises(ValueError):
prepare_robust_scaler(np.ones((1, 2, 3, 4)), percentile_range=(20, 10))

with pytest.raises(ValueError):
prepare_robust_scaler(np.ones((1, 2, 3, 4)), percentile_range=(-2, 99))

@pytest.mark.parametrize('overlap', [True, False])
@pytest.mark.parametrize('indices', [None, [1, 4, 6]])
def test_basic(self, overlap, indices):
n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12

X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5

medians, ranges = prepare_robust_scaler(X, overlap=overlap, indices=indices)

assert medians.shape == (n_channels,)
assert ranges.shape == (n_channels,)
assert np.all(ranges > 0)

def test_sanity(self):
n_samples, n_channels, lookback, n_assets = 10, 3, 5, 12

X = np.random.random((n_samples, n_channels, lookback, n_assets)) - 0.5

medians_1, ranges_1 = prepare_robust_scaler(X, percentile_range=(20, 80))
medians_2, ranges_2 = prepare_robust_scaler(X, percentile_range=(10, 90))

assert np.all(ranges_2 > ranges_1)


class TestScaler:
    def test_errors(self):
        with pytest.raises(ValueError):
            Scale(np.ones(3), np.ones(4))

        with pytest.raises(ValueError):
            Scale(np.array([1, -1]), np.array([9, -0.1]))

        tform = Scale(np.array([1, -1]), np.array([9, 10.]))
        with pytest.raises(ValueError):
            tform(torch.rand(3, 4, 5), None, None, None)

    def test_overall(self):
        n_channels, lookback, n_assets = 3, 5, 12

        X = np.random.random((n_channels, lookback, n_assets))
        X_torch = torch.as_tensor(X)
        dtype = X_torch.dtype

        center = X.mean(axis=(1, 2))
        scale = X.std(axis=(1, 2))

        tform = Scale(center, scale)
        X_scaled = tform(X_torch, None, None, None)[0]

        assert torch.is_tensor(X_scaled)
        assert X_torch.shape == X_scaled.shape
        assert not torch.allclose(X_torch, X_scaled)
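        # NumPy's std (used to compute `scale`) is the population estimator, hence unbiased=False below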
        assert torch.allclose(X_scaled.mean(dim=(1, 2)), torch.zeros(n_channels, dtype=dtype))
        assert torch.allclose(X_scaled.std(dim=(1, 2), unbiased=False), torch.ones(n_channels, dtype=dtype))
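
To check the new behaviour locally, the added tests can be run in isolation (assuming a standard pytest setup):

pytest tests/test_data/test_augment.py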