Commit 85951a2

Merge branch 'master' into master

joseortiz3 committed Feb 14, 2024
2 parents 610a005 + f5c2a1e

Showing 16 changed files with 146 additions and 60 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pythonpackage.yml
@@ -1,4 +1,4 @@
-name: Python package
+name: Python lint and test

on:
push:
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
-python-version: ['3.7', '3.8', '3.9', '3.10']
+python-version: ['3.9', '3.10', '3.11', '3.12']

steps:
- uses: actions/checkout@v2
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -33,11 +33,11 @@ repos:
files: ^ngboost
entry: flake8
- id: pylint-ngboost
-name: pylint on nboost*
+name: pylint on ngboost*
types: [file, python]
language: system
files: ^ngboost
-entry: pylint --disable=invalid-name,no-member,missing-docstring,no-self-argument,arguments-differ,R0801
+entry: pylint --disable=invalid-name,no-member,missing-docstring,no-self-argument,arguments-differ
- id: pylint-tests
name: pylint on tests*
language: system
8 changes: 6 additions & 2 deletions README.md
@@ -32,11 +32,15 @@ Probabilistic regression example on the Boston housing dataset:
```python
from ngboost import NGBRegressor

-from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

-X, Y = load_boston(True)
+import numpy as np
+import pandas as pd
+
+# Load Boston housing dataset (load_boston was removed from scikit-learn)
+data_url = "http://lib.stat.cmu.edu/datasets/boston"
+raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
+X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+Y = raw_df.values[1::2, 2]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

ngb = NGBRegressor().fit(X_train, Y_train)
```
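The README code block is truncated here by the diff view. For context, a minimal sketch of how the fitted regressor is typically evaluated (the names `Y_preds`, `Y_dists`, `test_MSE`, and `test_NLL` are illustrative; `predict`, `pred_dist`, and the distribution's `logpdf` are NGBoost API methods):

```python
# not part of the diff above: an illustrative continuation of the README example
Y_preds = ngb.predict(X_test)    # point predictions
Y_dists = ngb.pred_dist(X_test)  # full predictive distributions

# test Mean Squared Error of the point predictions
test_MSE = mean_squared_error(Y_preds, Y_test)
print("Test MSE", test_MSE)

# test Negative Log Likelihood under the predictive distributions
test_NLL = -Y_dists.logpdf(Y_test).mean()
print("Test NLL", test_NLL)
```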
16 changes: 16 additions & 0 deletions RELEASE_NOTES.md
@@ -1,6 +1,22 @@
# RELEASE NOTES

+## Version 0.5.0
+
+* Dropped support for Python 3.7 and 3.8
+* Added support for Python 3.11 and 3.12
+* Fixed use of the removed `np.bool` alias (now `np.bool_`)
+* Optimized memory usage in `pred_dist`
+* Removed pandas as a declared dependency
+* Significantly improved test run times during development
+* Minor enhancements to GitHub Actions workflows
+
+## Version 0.4.2
+
+* Fix deprecated NumPy type alias. This was causing a warning with NumPy >=1.20 and an error with NumPy >=1.24
+* Remove pandas as a declared dependency
+
## Version 0.4.1

### Added `partial_fit` method for incremental learning

NGBoost now includes a new `partial_fit` method that allows for incremental learning. This method appends new base models to the existing ones, which can be useful when new data becomes available over time or when the data is too large to fit in memory all at once.
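A rough sketch of that API, assuming `partial_fit` takes the same `(X, Y)` interface as `fit` per the description above (the synthetic data and the `n_estimators` value are illustrative):

```python
import numpy as np
from ngboost import NGBRegressor

rng = np.random.default_rng(0)
X1, Y1 = rng.normal(size=(100, 5)), rng.normal(size=100)  # initial batch
X2, Y2 = rng.normal(size=(100, 5)), rng.normal(size=100)  # batch arriving later

ngb = NGBRegressor(n_estimators=50)
ngb.fit(X1, Y1)          # trains the first 50 base models
ngb.partial_fit(X2, Y2)  # appends base models trained on the new batch
```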
4 changes: 3 additions & 1 deletion examples/experiments/survival_exp.py
@@ -24,7 +24,9 @@
def Y_join(T, E):
col_event = "Event"
col_time = "Time"
-y = np.empty(dtype=[(col_event, np.bool), (col_time, np.float64)], shape=T.shape[0])
+y = np.empty(
+    dtype=[(col_event, np.bool_), (col_time, np.float64)], shape=T.shape[0]
+)
y[col_event] = E.values
y[col_time] = T.values
return y
9 changes: 7 additions & 2 deletions examples/regression.py
@@ -1,13 +1,18 @@
-from sklearn.datasets import load_boston
+import numpy as np
+import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from ngboost import NGBRegressor
from ngboost.distns import Normal

if __name__ == "__main__":
+# Load Boston housing dataset (load_boston was removed from scikit-learn)
+data_url = "http://lib.stat.cmu.edu/datasets/boston"
+raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
+X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+Y = raw_df.values[1::2, 2]

-X, Y = load_boston(return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

ngb = NGBRegressor(Dist=Normal).fit(X_train, Y_train)
8 changes: 6 additions & 2 deletions examples/survival.py
@@ -1,14 +1,18 @@
import numpy as np
-from sklearn.datasets import load_boston
+import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from ngboost import NGBSurvival
from ngboost.distns import LogNormal

if __name__ == "__main__":
+# Load Boston housing dataset (load_boston was removed from scikit-learn)
+data_url = "http://lib.stat.cmu.edu/datasets/boston"
+raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
+X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+Y = raw_df.values[1::2, 2]

-X, Y = load_boston(return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# introduce administrative censoring
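The diff is cut off before the censoring step. A hedged sketch of what administrative censoring typically looks like at this point (the cutoff value and variable names are assumptions; `NGBSurvival.fit` taking observed times `T` and event indicators `E` is the library's survival interface):

```python
# not part of the diff: illustrative continuation
CUTOFF = 30.0                              # assumed censoring threshold
T_train = np.minimum(Y_train, CUTOFF)      # observed time: event time or censoring time
E_train = (Y_train <= CUTOFF).astype(int)  # 1 = event observed, 0 = censored

ngb = NGBSurvival(Dist=LogNormal).fit(X_train, T_train, E_train)
```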
6 changes: 4 additions & 2 deletions ngboost/__init__.py
@@ -6,7 +6,9 @@
# before python 3.8
from importlib_metadata import version

-from .api import NGBClassifier, NGBRegressor, NGBSurvival  # NOQA
-from .ngboost import NGBoost  # NOQA
+from .api import NGBClassifier, NGBRegressor, NGBSurvival
+from .ngboost import NGBoost

+__all__ = ["NGBClassifier", "NGBRegressor", "NGBSurvival", "NGBoost"]

__version__ = version(__name__)
43 changes: 32 additions & 11 deletions ngboost/distns/__init__.py
@@ -1,12 +1,33 @@
"""NGBoost distributions"""
-from .categorical import Bernoulli, k_categorical  # NOQA
-from .cauchy import Cauchy  # NOQA
-from .distn import ClassificationDistn, Distn, RegressionDistn  # NOQA
-from .exponential import Exponential  # NOQA
-from .gamma import Gamma  # NOQA
-from .laplace import Laplace  # NOQA
-from .lognormal import LogNormal  # NOQA
-from .multivariate_normal import MultivariateNormal  # NOQA
-from .normal import Normal, NormalFixedMean, NormalFixedVar  # NOQA
-from .poisson import Poisson  # NOQA
-from .t import T, TFixedDf, TFixedDfFixedVar  # NOQA
+from .categorical import Bernoulli, k_categorical
+from .cauchy import Cauchy
+from .distn import ClassificationDistn, Distn, RegressionDistn
+from .exponential import Exponential
+from .gamma import Gamma
+from .laplace import Laplace
+from .lognormal import LogNormal
+from .multivariate_normal import MultivariateNormal
+from .normal import Normal, NormalFixedMean, NormalFixedVar
+from .poisson import Poisson
+from .t import T, TFixedDf, TFixedDfFixedVar
+
+__all__ = [
+    "Bernoulli",
+    "k_categorical",
+    "Cauchy",
+    "ClassificationDistn",
+    "Distn",
+    "RegressionDistn",
+    "Exponential",
+    "Gamma",
+    "Laplace",
+    "LogNormal",
+    "MultivariateNormal",
+    "Normal",
+    "NormalFixedMean",
+    "NormalFixedVar",
+    "Poisson",
+    "T",
+    "TFixedDf",
+    "TFixedDfFixedVar",
+]
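Declaring `__all__` makes the package's public names explicit, which is why the per-import `# NOQA` markers could be dropped: the list now signals the intentional re-exports. A small sketch of the practical effect:

```python
# with __all__ defined, a star-import exposes exactly the listed names
from ngboost.distns import *

print(Normal, LogNormal, Bernoulli)  # available because they appear in __all__
```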
2 changes: 1 addition & 1 deletion ngboost/distns/categorical.py
@@ -1,5 +1,5 @@
"""The NGBoost categorial distribution and scores"""
-# pylint: disable=invalid-unary-operand-type, unused-argument, no-self-use
+# pylint: disable=invalid-unary-operand-type, unused-argument
import numpy as np
import scipy as sp

4 changes: 2 additions & 2 deletions ngboost/helpers.py
@@ -17,7 +17,7 @@ def Y_from_censored(T, E=None):
else:
E = check_array(E, ensure_2d=False)
E = E.reshape(E.shape[0])
-Y = np.empty(dtype=[("Event", np.bool), ("Time", np.float64)], shape=T.shape[0])
-Y["Event"] = E.astype(np.bool)
+Y = np.empty(dtype=[("Event", np.bool_), ("Time", np.float64)], shape=T.shape[0])
+Y["Event"] = E.astype(np.bool_)
Y["Time"] = T.astype(np.float64)
return Y
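For background on this substitution, a minimal sketch of the NumPy behavior involved (version thresholds as stated in the 0.4.2 release notes above):

```python
import numpy as np

# np.bool was a deprecated alias for the built-in bool: it warned on
# NumPy >=1.20 and was removed in NumPy 1.24, raising AttributeError.
# np.bool_ is the actual NumPy boolean scalar type and is safe to use
# in structured dtypes like the one built by Y_from_censored.
y = np.empty(dtype=[("Event", np.bool_), ("Time", np.float64)], shape=3)
y["Event"] = np.array([True, False, True])
y["Time"] = np.array([1.5, 2.0, 0.7])
print(y["Event"].dtype)  # bool
```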
26 changes: 11 additions & 15 deletions ngboost/ngboost.py
@@ -2,7 +2,7 @@
# pylint: disable=line-too-long,too-many-instance-attributes,too-many-arguments
# pylint: disable=unused-argument,too-many-locals,too-many-branches,too-many-statements
# pylint: disable=unused-variable,invalid-unary-operand-type,attribute-defined-outside-init
-# pylint: disable=redundant-keyword-arg,protected-access
+# pylint: disable=redundant-keyword-arg,protected-access,unnecessary-lambda-assignment
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split
@@ -315,7 +315,6 @@ def partial_fit(
# if early stopping is specified, split X,Y and sample weights (if given) into training and validation sets
# This will overwrite any X_val and Y_val values passed by the user directly.
if self.early_stopping_rounds is not None:
-
early_stopping_rounds = self.early_stopping_rounds

if sample_weight is None:
@@ -362,14 +361,14 @@ def partial_fit(
best_val_loss = np.inf

if not train_loss_monitor:
-train_loss_monitor = lambda D, Y, W: D.total_score(  # NOQA
+train_loss_monitor = lambda D, Y, W: D.total_score(  # noqa: E731
Y, sample_weight=W
)

if not val_loss_monitor:
-val_loss_monitor = lambda D, Y: D.total_score(  # NOQA
+val_loss_monitor = lambda D, Y: D.total_score(  # noqa: E731
Y, sample_weight=val_sample_weight
-)  # NOQA
+)

for itr in range(len(self.col_idxs), self.n_estimators + len(self.col_idxs)):
_, col_idx, X_batch, Y_batch, weight_batch, P_batch = self.sample(
@@ -386,7 +385,6 @@ def partial_fit(
proj_grad = self.fit_base(X_batch, grads, weight_batch)
scale = self.line_search(proj_grad, P_batch, Y_batch, weight_batch)

-# pdb.set_trace()
params -= (
self.learning_rate
* scale
@@ -490,13 +488,9 @@ def pred_dist(self, X, max_iter=None):

X = check_array(X, accept_sparse=True)

-if (
-    max_iter is not None
-):  # get prediction at a particular iteration if asked for
-    dist = self.staged_pred_dist(X, max_iter=max_iter)[-1]
-else:
-    params = np.asarray(self.pred_param(X, max_iter))
-    dist = self.Dist(params.T)
+params = np.asarray(self.pred_param(X, max_iter))
+dist = self.Dist(params.T)

return dist

def staged_pred_dist(self, X, max_iter=None):
@@ -587,8 +581,10 @@ def feature_importances_(self):

if not all_params_importances:
return np.zeros(
-len(self.base_models[0]),
-self.base_models[0][0].n_features_,
+(
+    len(self.base_models[0]),
+    self.base_models[0][0].n_features_,
+),
dtype=np.float64,
)

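The `feature_importances_` change above fixes an argument-passing bug: `np.zeros` takes the shape as its first argument and `dtype` as its second, so the two dimensions must be wrapped in a tuple. A quick illustration (standard NumPy semantics, not code from the repo):

```python
import numpy as np

ok = np.zeros((3, 4), dtype=np.float64)  # shape passed as a tuple: a 3x4 array
print(ok.shape)  # (3, 4)

# np.zeros(3, 4) would raise a TypeError, because the second positional
# argument of np.zeros is dtype, not a second dimension.
```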
9 changes: 4 additions & 5 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ngboost"
version = "0.4.1dev"
version = "0.5.0dev"
description = "Library for probabilistic predictions via gradient boosting."
authors = ["Stanford ML Group <avati@cs.stanford.edu>"]
readme = "README.md"
@@ -13,21 +13,20 @@ classifiers = [
license = "Apache License 2.0"

[tool.poetry.dependencies]
python = ">=3.7.1, <3.11"
python = ">=3.9, <3.13"
scikit-learn = ">=1.0.2"
numpy = ">=1.21.2"
scipy = ">=1.7.2"
tqdm = ">=4.3"
lifelines = ">=0.25"
-pandas = ">=1.3.5"

[tool.poetry.dev-dependencies]
pytest = "^6.1.2"
black = "^22.8.0"
pre-commit = "^2.0"
isort = "^5.6.4"
pylint = "^2.6.0"
flake8 = "^5.0.4"
pylint = "^3.0.3"
flake8 = "^7.0.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
9 changes: 7 additions & 2 deletions tests/conftest.py
@@ -5,8 +5,8 @@
from sklearn.datasets import fetch_california_housing, load_breast_cancer
from sklearn.model_selection import train_test_split

-Tuple4Array = Tuple[np.array, np.array, np.array, np.array]
-Tuple5Array = Tuple[np.array, np.array, np.array, np.array, np.array]
+Tuple4Array = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
+Tuple5Array = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]


def pytest_addoption(parser):
@@ -22,6 +22,11 @@ def pytest_configure(config):
config.addinivalue_line("markers", "slow: ")


+@pytest.fixture(scope="session", autouse=True)
+def set_seed():
+    np.random.seed(0)


@pytest.fixture(scope="session")
def california_housing_data() -> Tuple4Array:
X, Y = fetch_california_housing(return_X_y=True)
(Diff truncated: the remaining 2 of 16 changed files are not shown.)