Commit c4d0f70

Merge pull request #13 from jameschapman19/_cca_tutorial_visualisation

partial correlation and simple sns pair plot

raamana committed Dec 13, 2021
2 parents 6591dac + 520a3f0
Showing 7 changed files with 424 additions and 14 deletions.
2 changes: 2 additions & 0 deletions confounds/__init__.py
@@ -11,6 +11,8 @@
if version_info.major >= 3:
    from confounds.base import Residualize, Augment, DummyDeconfounding, \
        ConfoundsException
    from confounds.visualize import pairs
    from confounds.metrics import partial_correlation, prediction_partial_correlation
else:
    raise NotImplementedError('confounds library requires Python 3 or higher!')
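
With these two lines, the new helpers become importable straight from the package root; a minimal sketch of the resulting public API (assuming the package is installed):

from confounds import pairs, partial_correlation, prediction_partial_correlation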

114 changes: 114 additions & 0 deletions confounds/metrics.py
@@ -6,3 +6,117 @@
degree of harmonization achieved (e.g. reduction in variance of means/medians)
"""

import numpy as np
from scipy import stats

from confounds import Residualize


def partial_correlation(X, C=None):
    """
    Calculates the pairwise partial correlations between all of the variables
    in X, with respect to some confounding variables C.

    Can be used as a measure of the strength of the relationship between two
    variables of interest while controlling for some other variables.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The training input samples.

    C : {array-like, sparse matrix}, shape (n_samples, n_confounds)
        Array of confounding variables.

    Returns
    -------
    partial_correlations : ndarray, shape (n_features, n_features)
        The pairwise partial correlations between the variables in X.

    Examples
    --------
    Observing the pairwise correlations of different MRI brain voxels in X in
    neuroimaging studies, while controlling for the effects of scanner type
    and participant age in C.
    """
    # residualize X against the confounds, then correlate the residuals
    resx = Residualize()
    resx.fit(X, y=C)
    deconfound_X = resx.transform(X, y=C)
    return np.corrcoef(deconfound_X, rowvar=False)
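
A minimal usage sketch for partial_correlation() on synthetic data (the variable names here are illustrative, not part of the library):

import numpy as np
from confounds.metrics import partial_correlation

rng = np.random.RandomState(42)
C = rng.normal(size=(200, 1))                 # one confounding variable
X = C + rng.normal(scale=0.5, size=(200, 3))  # three features, all driven by C
pcorr = partial_correlation(X, C=C)           # (3, 3) matrix of partial correlations
# off-diagonal entries shrink toward zero once the shared confound is removed
print(np.round(pcorr, 2))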


def partial_correlation_t_test(partial_correlations, n, g):
    """
    Calculates the t-statistic and p-value for pairwise partial correlations.

    The statistical significance of a partial correlation r can be obtained
    parametrically using a Student's t distribution, with
    t = r * sqrt(df / (1 - r**2)) on df = n - g - 2 degrees of freedom.
    This is equivalent to fitting and testing the significance of a linear
    regression model with model predictions p and confounds C as covariates.

    References
    ----------
    Dinga R, Schmaal L, Penninx BW, Veltman DJ, Marquand AF. Controlling for
    effects of confounding variables on machine learning predictions.
    BioRxiv. 2020 Jan 1.

    Parameters
    ----------
    partial_correlations : {array-like, sparse matrix}, shape (n_features, n_features)
        Partial correlations of the variables with respect to some confounds.

    n : int
        Number of samples.

    g : int
        Number of confounding variables.

    Returns
    -------
    t_statistic : ndarray
        The t-statistic associated with each pairwise partial correlation.

    p_value : ndarray
        The one-sided p-value associated with each pairwise partial correlation.
    """
    # clip perfect correlations (e.g. the diagonal) in place, to avoid division by zero below
    partial_correlations[partial_correlations == 1] = 1 - 1e-7
    # degrees of freedom: number of samples minus number of confounds minus 2
    df = n - 2 - g
    # t-statistic for each partial correlation, and its one-sided p-value
    t_statistic = partial_correlations * np.sqrt(df / (1 - partial_correlations ** 2))
    p_value = stats.t.sf(np.abs(t_statistic), df=df)
    return t_statistic, p_value
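
Continuing the sketch above, the significance of each pairwise partial correlation could then be assessed as follows (note the function clips the diagonal of its input in place):

t_stat, p_val = partial_correlation_t_test(pcorr, n=200, g=1)
print(p_val.shape)  # (3, 3); small off-diagonal p-values suggest association beyond C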


def prediction_partial_correlation(predictions, targets, confounds):
    """
    Returns the partial correlation between predictions and targets after
    residualizing the effect of confounds.

    This is a measure of the predictive power of the model while controlling
    for the effect of confounds; its statistical significance can be assessed
    with partial_correlation_t_test().

    References
    ----------
    Dinga R, Schmaal L, Penninx BW, Veltman DJ, Marquand AF. Controlling for
    effects of confounding variables on machine learning predictions.
    BioRxiv. 2020 Jan 1.

    Parameters
    ----------
    predictions : {array-like, sparse matrix}, shape (n_samples, n_targets)
        Predictions made by the model.

    targets : {array-like, sparse matrix}, shape (n_samples, n_targets)
        The target values the model was trained to predict.

    confounds : {array-like, sparse matrix}, shape (n_samples, n_confounds)
        Array of confounding variables.

    Returns
    -------
    corr_p : ndarray, shape (n_targets,)
        The partial correlation of each prediction with its associated target,
        with respect to the confounds.
    """
    if predictions.ndim == 1:
        predictions = predictions[:, np.newaxis]
    if targets.ndim == 1:
        targets = targets[:, np.newaxis]
    assert (predictions.shape == targets.shape), f"Dimensions of predictions and targets do not match. " \
                                                 f"predictions shape {predictions.shape}, " \
                                                 f"targets shape {targets.shape}"
    p = predictions.shape[1]
    corr_p = partial_correlation(np.hstack((predictions, targets)), C=confounds)
    # extract only the partial correlations between predictions and their associated targets
    return np.diag(corr_p[:p, p:])
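
A hedged end-to-end sketch of scoring a model with this metric (toy data; all names are illustrative):

import numpy as np
from sklearn.linear_model import LinearRegression
from confounds.metrics import prediction_partial_correlation

rng = np.random.RandomState(0)
conf = rng.normal(size=(150, 2))                                 # two confounds
X = rng.normal(size=(150, 5)) + conf.sum(axis=1, keepdims=True)  # features contaminated by conf
y = X[:, 0] + rng.normal(scale=0.1, size=150)

model = LinearRegression().fit(X, y)
# partial correlation between predictions and targets, controlling for conf
corr_p = prediction_partial_correlation(model.predict(X), y, conf)
print(corr_p)  # length-1 array, one entry per target column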
48 changes: 40 additions & 8 deletions confounds/tests/test_confounds.py
@@ -3,15 +3,18 @@

"""Tests for `confounds` package."""

import os

import numpy as np
from numpy.testing import assert_almost_equal, assert_array_less
from sklearn.datasets import make_classification, make_sparse_uncorrelated
from sklearn.linear_model import LinearRegression
from sklearn.utils.estimator_checks import check_estimator

from confounds import prediction_partial_correlation
from confounds.base import Augment, DummyDeconfounding, Residualize
from confounds.combat import ComBat
from confounds.metrics import partial_correlation_t_test, partial_correlation


def test_estimator_API():
@@ -86,14 +89,14 @@ def test_combat():
n_features = 2

# One effect of interest that we want to keep
X = rs.normal(size=(n_subj_per_batch + n_subj_per_batch,))

# Global mean and noise parameters
grand_mean = np.array([0.3, 0.2])
eps = rs.normal(scale=1e-5, size=(n_subj_per_batch * 2, n_features))

# Batch parameters
batch = [1] * n_subj_per_batch + [2] * n_subj_per_batch
shift_1 = np.array([0.05, -0.1])
shift_2 = np.array([-0.3, 0.15])
eps_1 = rs.normal(scale=1e-1, size=(n_subj_per_batch, n_features))
@@ -109,8 +112,8 @@ def test_combat():
Y += grand_mean

# Add dependence with the effect of interest
Y[:, 0] += 0.2 * X
Y[:, 1] += -0.16 * X

# Add global noise
Y += eps
@@ -147,7 +150,7 @@ def test_combat():
# Test that batches no longer have different variances
p_scale_after = np.array([bartlett(y[:n_subj_per_batch],
y[n_subj_per_batch:])[1]
for y in Y_trans.T]
)
assert np.all(p_scale_after > 0.05)

@@ -186,3 +189,32 @@ def test_combat_bladder():
batch=batch,
effects_interest=effects_interest)
assert np.allclose(Y_combat_effects, bladder_test['Y_combat_effects'])


def test_partial_correlation():
    """Check that partial correlations are lower than the raw correlations."""
    n_samples = 100
    train_all, train_y = make_classification(n_samples=n_samples, n_features=5)
    train_X, train_confounds = splitter_X_confounds(train_all, 2)
    # partial correlation with an all-zero confound equals the plain correlation from np.corrcoef
    assert_almost_equal(np.corrcoef(train_X, rowvar=False),
                        partial_correlation(train_X, C=np.zeros((train_X.shape[0], 1))))
    # a linear regression fit on all variables should achieve a partial correlation
    # (after removing the confounds) below its r**2 score on the full data
    lr = LinearRegression().fit(train_all, train_y)
    pred = lr.predict(train_all)
    corr_p = prediction_partial_correlation(pred, train_y, train_confounds)
    assert_array_less(corr_p, lr.score(train_all, train_y))


def test_partial_correlation_t_test():
    C = np.random.normal(size=(100, 1))
    C_dummy = np.zeros_like(C)
    X = C + np.random.normal(0, 0.1, size=(100, 10))
    corr_p = partial_correlation(X, C=C_dummy)
    t_statistic, statistical_significance = partial_correlation_t_test(corr_p, X.shape[0], C_dummy.shape[1])
    corr_pd = partial_correlation(X, C=C)
    t_statistic, statistical_significance_d = partial_correlation_t_test(corr_pd, X.shape[0], C.shape[1])
    # p-values should grow (significance should weaken) once the true confound C is controlled for
    idx = np.triu_indices_from(statistical_significance, 1)
    assert_array_less(statistical_significance[idx], statistical_significance_d[idx])
8 changes: 4 additions & 4 deletions confounds/visualize.py
@@ -3,6 +3,7 @@
Module for the visualization for the confounds and their effects.
"""
import seaborn as sns


def samplets(features,
@@ -23,8 +24,7 @@ def samplets(features,


def pairs(confounds,
          color_by=None):
"""
Plots each confound against the rest, one pair at a time, colored
by another confound (which is excluded from the plots).
@@ -36,8 +36,8 @@
Disease vs. Age, vs. Gender on the third row.
"""

pp = sns.pairplot(confounds, hue=color_by)
return pp
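
A minimal sketch of calling the now-implemented pairs() (assumes the confounds are held in a pandas DataFrame; the column names are illustrative):

import numpy as np
import pandas as pd
from confounds.visualize import pairs

rng = np.random.RandomState(0)
df = pd.DataFrame({'age': rng.uniform(20, 80, size=100),
                   'tiv': rng.normal(1400, 100, size=100),
                   'site': rng.choice(['site_A', 'site_B'], size=100)})
pp = pairs(df, color_by='site')  # returns a seaborn PairGrid
pp.savefig('confound_pairs.png')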


def before_after():