diff --git a/README.md b/README.md index 3346cf8..0996d21 100644 --- a/README.md +++ b/README.md @@ -305,7 +305,26 @@ plt.show() ![](README_files/figure-gfm/compare-2d-5.png) -## 5. References +## 5. `DensityRatio.alpha_normalize` function +This is a supplementary function, that by applying the following linear transformation changes the lower `0` bound (infimum) of its `values` argument in the range of `[0, 1]` to `[alpha, 1]` : +```math +x' = (1 - alpha) * x + alpha = x + alpha * (1 - x) +``` +The function returns an array whose values are elements of the `[alpha, alpha^-1]` bounded set; the upper `alpha^-1` bound (supremum) is due to properties of the alpha-relative density ratio estimator. As long as the outcome is comprised of positive values only, when the `densratio_obj.alpha_normalize` function is applied to the `densratio_obj.compute_density_ratio` result, it renders the density ratio estimation eligible to be further processed on the logarithmic scale. + +### 5.1. Implementation specifics +To preserve vital estimator properties, there are 2 invariants always met by the function results: +1. The number of unique values must not changed. +2. The order of output values remains the same as in the input argument (as determined by [`numpy.argsort`](https://numpy.org/doc/stable/reference/generated/numpy.argsort.html)). + +Due to floating‑point arithmetic, if two consecutive input numbers that are not equal are to be transformed to the same outcome, in order to satisfy the constraints in the list above and yield different values as well, the [`numpy.nextafter`](https://numpy.org/doc/stable/reference/generated/numpy.nextafter.html) is employed in the direction of `0`. +By virtue of this implementation approach, in extreme cases there is a possibility of output values that: +- are less than `alpha`, or +- are nonpositive. + +Should it happen, a relevant warning is issued. + +## 6. References \[1\] Hido, S., Tsuboi, Y., Kashima, H., Sugiyama, M., & Kanamori, T. **Statistical outlier detection using direct density ratio estimation.** @@ -322,7 +341,7 @@ in Machine Learning.** Cambridge University Press 2012. Detection in Time-Series Data by Relative Density-Ratio Estimation** Neural Networks, 2013. -## 6. Related Work +## 7. Related Work - densratio for R - pykliep diff --git a/README.rmd b/README.rmd index 6441ae4..9009327 100644 --- a/README.rmd +++ b/README.rmd @@ -251,7 +251,26 @@ plt.title("Estimated Alpha-Relative Density Ratio") plt.show() ``` -## 5. References +## 5. `DensityRatio.alpha_normalize` function +This is a supplementary function, that by applying the following linear transformation changes the lower `0` bound (infimum) of its `values` argument in the range of `[0, 1]` to `[alpha, 1]` : +```math +x' = (1 - alpha) * x + alpha = x + alpha * (1 - x) +``` +The function returns an array whose values are elements of the `[alpha, alpha^-1]` bounded set; the upper `alpha^-1` bound (supremum) is due to properties of the alpha-relative density ratio estimator. As long as the outcome is comprised of positive values only, when the `densratio_obj.alpha_normalize` function is applied to the `densratio_obj.compute_density_ratio` result, it renders the density ratio estimation eligible to be further processed on the logarithmic scale. + +### 5.1. Implementation specifics +To preserve vital estimator properties, there are 2 invariants always met by the function results: +1. The number of unique values must not changed. +2. The order of output values remains the same as in the input argument (as determined by [`numpy.argsort`](https://numpy.org/doc/stable/reference/generated/numpy.argsort.html)). + +Due to floating‑point arithmetic, if two consecutive input numbers that are not equal are to be transformed to the same outcome, in order to satisfy the constraints in the list above and yield different values as well, the [`numpy.nextafter`](https://numpy.org/doc/stable/reference/generated/numpy.nextafter.html) is employed in the direction of `0`. +By virtue of this implementation approach, in extreme cases there is a possibility of output values that: +- are less than `alpha`, or +- are nonpositive. + +Should it happen, a relevant warning is issued. + +## 6. References [1] Hido, S., Tsuboi, Y., Kashima, H., Sugiyama, M., & Kanamori, T. **Statistical outlier detection using direct density ratio estimation.** @@ -268,7 +287,7 @@ Cambridge University Press 2012. **Change-Point Detection in Time-Series Data by Relative Density-Ratio Estimation** Neural Networks, 2013. -## 6. Related Work +## 7. Related Work - densratio for R https://github.com/hoxo-m/densratio - pykliep https://github.com/srome/pykliep diff --git a/densratio/RuLSIF.py b/densratio/RuLSIF.py index f80f2b8..63460c3 100644 --- a/densratio/RuLSIF.py +++ b/densratio/RuLSIF.py @@ -16,6 +16,7 @@ from warnings import warn from .density_ratio import DensityRatio, KernelInfo from .helpers import guvectorize_compute, np_float, to_ndarray +from .helpers import alpha_normalize as static_alpha_normalize def RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num=100, verbose=True): @@ -87,6 +88,21 @@ def alpha_density_ratio(coordinates): return alpha_density_ratio + def alpha_normalize(values: array) -> array: + """ + Normalizes values less than 1 so the minimum value to replace 0 is symmetrical to alpha^-1 + with respect to the natural logarithm. + + Arguments: + values (numpy.array): A vector to normalize. + + Returns: + Normalized numpy.array object that preserves the order and the number of unique input argument values. + """ + + return static_alpha_normalize(values, alpha) + + # Compute the approximate alpha-relative PE-divergence, given samples x and y from the respective distributions. def alpha_PE_divergence(x, y): # This is Y, in Reference 1. @@ -112,11 +128,11 @@ def alpha_KL_divergence(x, y): x = to_ndarray(x) # Obtain alpha-relative density ratio at these points. - g_x = alpha_density_ratio(x) + g_x = alpha_normalize(alpha_density_ratio(x)) # Compute the alpha-relative KL-divergence. n = x.shape[0] - divergence = log(g_x).sum(axis=0) / n + divergence = log(g_x).sum(axis=0) / n if (g_x > 0.).all() else '[not calculated]' return divergence alpha_PE = alpha_PE_divergence(x, y) @@ -124,11 +140,12 @@ def alpha_KL_divergence(x, y): if verbose: print("Approximate alpha-relative PE-divergence = {:03.2f}".format(alpha_PE)) - print("Approximate alpha-relative KL-divergence = {:03.2f}".format(alpha_KL)) + alpha_KL_format = '{}' if isinstance(alpha_KL, str) else '{:03.2f}' + print("Approximate alpha-relative KL-divergence =", alpha_KL_format.format(alpha_KL)) kernel_info = KernelInfo(kernel_type="Gaussian", kernel_num=kernel_num, sigma=sigma, centers=centers) result = DensityRatio(method="RuLSIF", alpha=alpha, theta=theta, lambda_=lambda_, alpha_PE=alpha_PE, alpha_KL=alpha_KL, - kernel_info=kernel_info, compute_density_ratio=alpha_density_ratio) + kernel_info=kernel_info, compute_density_ratio=alpha_density_ratio, alpha_normalize=alpha_normalize) if verbose: print("RuLSIF completed.") diff --git a/densratio/__init__.py b/densratio/__init__.py index 99da370..8dc01db 100644 --- a/densratio/__init__.py +++ b/densratio/__init__.py @@ -1,7 +1,9 @@ from warnings import filterwarnings from .core import densratio +from .helpers import alpha_normalize from .RuLSIF import set_compute_kernel_target filterwarnings('default', message='\'numba\'', category=ImportWarning, module='densratio') -__all__ = ['densratio', 'set_compute_kernel_target'] +filterwarnings('always', message='Normalized vector contains', category=RuntimeWarning, module=r'densratio\.helpers') +__all__ = ['alpha_normalize', 'densratio', 'set_compute_kernel_target'] diff --git a/densratio/density_ratio.py b/densratio/density_ratio.py index bd1016b..0725b4a 100644 --- a/densratio/density_ratio.py +++ b/densratio/density_ratio.py @@ -4,7 +4,7 @@ class DensityRatio: """Density Ratio.""" - def __init__(self, method, alpha, theta, lambda_, alpha_PE, alpha_KL, kernel_info, compute_density_ratio): + def __init__(self, method, alpha, theta, lambda_, alpha_PE, alpha_KL, kernel_info, compute_density_ratio, alpha_normalize): self.method = method self.alpha = alpha self.theta = theta @@ -13,6 +13,7 @@ def __init__(self, method, alpha, theta, lambda_, alpha_PE, alpha_KL, kernel_inf self.alpha_KL = alpha_KL self.kernel_info = kernel_info self.compute_density_ratio = compute_density_ratio + self.alpha_normalize = alpha_normalize def __str__(self): return """ diff --git a/densratio/helpers.py b/densratio/helpers.py index 311d79b..0dc5ff6 100644 --- a/densratio/helpers.py +++ b/densratio/helpers.py @@ -1,4 +1,7 @@ +import numpy as np + from numpy import array, matrix, ndarray, result_type +from warnings import warn np_float = result_type(float) @@ -33,3 +36,54 @@ def to_ndarray(x): raise ValueError("Cannot transform to numpy.matrix.") else: return to_ndarray(array(x)) + + +def alpha_normalize(values: ndarray, alpha: float) -> ndarray: + """ + Normalizes values less than 1 so the minimum value to replace 0 is symmetrical to alpha^-1 + with respect to the natural logarithm. + + Arguments: + values (numpy.array): A vector to normalize. + alpha (float): The nonnegative normalization term. + + Returns: + Normalized numpy.array object that preserves the order and the number of unique input argument values. + """ + values = np.asarray(values) + if values.ndim != 1: + raise ValueError('\'values\' must a 1d vector.') + + if alpha <= 0.: + if alpha < 0.: + warn('\'alpha\' is negative, normalization aborted.', RuntimeWarning) + + return values + + a = 1. - alpha + inserted = last_value = 1. + outcome = np.empty(values.shape, dtype=values.dtype) + + values_argsort = np.argsort(values) + for i in np.flip(values_argsort): + value = values[i] + if value >= 1.: + outcome[i] = value + continue + + if value < last_value: + new_value = inserted - a * (last_value - value) + inserted = np.nextafter(inserted, 0.) if new_value == inserted else new_value + last_value = value + else: + assert value == last_value + + outcome[i] = inserted + + if inserted <= 0.: + warn(f'Normalized vector contains at least one nonpositive [min={inserted}] value.', RuntimeWarning) + elif inserted < alpha: + warn(f'Normalized vector contains at least one value [min={inserted}] less than alpha [{alpha}].', + RuntimeWarning) + + return outcome diff --git a/tests/test_helpers.py b/tests/test_helpers.py index b3bfcd2..618503c 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,10 +1,13 @@ import unittest +import numpy as np from numpy import array, matrix from numpy.testing import assert_array_equal from pandas import DataFrame from .context import helpers + +# noinspection PyMethodMayBeStatic class BasicTestSuite(unittest.TestCase): """Tests for Helper Functions.""" @@ -46,5 +49,51 @@ def test_to_ndarray_pandas_DataFrame(self): x = helpers.to_ndarray(x) assert_array_equal(x, array([[1,2],[3,4]])) + def test_alpha_normalize_0d_input(self): + with self.assertRaises(ValueError): + helpers.alpha_normalize(np.array(None), 1E-3) + + def test_alpha_normalize_2d_input(self): + with self.assertRaises(ValueError): + helpers.alpha_normalize(np.arange(4).reshape(2, -1), 1E-3) + + def test_alpha_normalize_zero_alpha(self): + values = np.random.rand(5) + assert_array_equal(values, helpers.alpha_normalize(values, 0)) + + def test_alpha_normalize_negative_alpha(self): + values = np.random.rand(5) + with self.assertWarns(RuntimeWarning): + assert_array_equal(values, helpers.alpha_normalize(values, -1E-3)) + + def test_alpha_normalize_lower_changed(self): + alpha = .05 + lower, upper = np.arange(0, 1, alpha), np.arange(1, 21) + normalized_values = helpers.alpha_normalize(np.concatenate((lower, upper)), alpha) + self.assertRaises(AssertionError, assert_array_equal, lower, normalized_values[:lower.size]) + assert_array_equal(upper, normalized_values[-upper.size:]) + + def test_alpha_normalize_equal_unique_count(self): + alpha = .05 + values = np.concatenate((np.arange(0, 1, alpha), np.arange(1, 21))) + normalized_values = helpers.alpha_normalize(values, alpha) + self.assertEqual(np.unique(values).size, np.unique(normalized_values).size) + + def test_alpha_normalize_same_argsort(self): + alpha = .05 + values = np.concatenate((np.arange(0, 1, alpha), np.arange(1, 21))) + np.random.shuffle(values) + normalized_values = helpers.alpha_normalize(values, alpha) + self.assertTrue((np.argsort(values) == np.argsort(normalized_values)).all()) + + def test_alpha_normalize_alpha_warning(self): + values = np.array([0, np.nextafter(0, 1), 1]) + self.assertWarns(RuntimeWarning, helpers.alpha_normalize, values, .5) + + def test_alpha_normalize_nonpositive_warning(self): + values = np.array([0, np.nextafter(0, 1), 1]) + self.assertWarns(RuntimeWarning, helpers.alpha_normalize, values, values[1]) + + if __name__ == '__main__': unittest.main()