Commit

Merge branch 'master' of https://github.com/jack-mcivor/ngboost into feature/py311
Jack McIvor committed Sep 10, 2023
2 parents 1dab475 + 8a33fdf commit f948c66
Showing 5 changed files with 90 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pythonpackage.yml
@@ -1,4 +1,4 @@
-name: Python package
+name: Python lint and test

 on:
   push:
2 changes: 1 addition & 1 deletion README.md
@@ -11,7 +11,7 @@

 </h4>

-ngboost is a Python library that implements Natural Gradient Boosting, as described in ["NGBoost: Natural Gradient Boosting for Probabilistic Prediction"](https://stanfordmlgroup.github.io/projects/ngboost/). It is built on top of [Scikit-Learn](https://scikit-learn.org/stable/), and is designed to be scalable and modular with respect to choice of proper scoring rule, distribution, and base learner. A didactic introduction to the methodology underlying NGBoost is available in this [slide deck](https://drive.google.com/file/d/183BWFAdFms81MKy6hSku8qI97OwS_JH_/view?usp=sharing).
+ngboost is a Python library that implements Natural Gradient Boosting, as described in ["NGBoost: Natural Gradient Boosting for Probabilistic Prediction"](https://stanfordmlgroup.github.io/projects/ngboost/). It is built on top of [Scikit-Learn](https://scikit-learn.org/stable/), and is designed to be scalable and modular with respect to choice of proper scoring rule, distribution, and base learner. A didactic introduction to the methodology underlying NGBoost is available in this [slide deck](https://docs.google.com/presentation/d/1Tn23Su0ygR6z11jy3xVNiLGv0ggiUQue/edit?usp=share_link&ouid=102290675300480810195&rtpof=true&sd=true).

 ## Installation
9 changes: 9 additions & 0 deletions RELEASE_NOTES.md
@@ -1,5 +1,14 @@
 # RELEASE NOTES

+## Version 0.4.1
+### Added `partial_fit` method for incremental learning
+
+NGBoost now includes a new `partial_fit` method that allows for incremental learning. This method appends new base models to the existing ones, which can be useful when new data becomes available over time or when the data is too large to fit in memory all at once.
+
+The `partial_fit` method takes similar parameters to the `fit` method, including predictors `X`, outcomes `Y`, and validation sets `X_val` and `Y_val`. It also supports custom weights for the training and validation sets, as well as early stopping and custom loss monitoring.
+
+Please note that the `partial_fit` method is not yet fully tested and may not work as expected in all cases. Use it with caution and thoroughly test its behavior in your specific use case before relying on it in production.
+
 ## Version 0.4.0

 * Added support for the gamma distribution
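The `fit`/`partial_fit` relationship described in the release note above can be illustrated with a small self-contained toy. `ToyBooster` is an invented stand-in, not NGBoost code: `fit()` resets the ensemble and then delegates to `partial_fit()`, which only ever appends base models.

```python
# Toy sketch (assumption, not the real NGBoost implementation) of the
# fit/partial_fit contract: fit() resets, partial_fit() appends stages.
class ToyBooster:
    def __init__(self, n_estimators=5, lr=0.5):
        self.n_estimators = n_estimators
        self.lr = lr
        self.base_models = []  # mirrors NGBoost's list of fitted stages

    def fit(self, X, Y):
        self.base_models = []          # fit() starts from scratch...
        return self.partial_fit(X, Y)  # ...then delegates, as in 0.4.1

    def partial_fit(self, X, Y):
        for _ in range(self.n_estimators):
            pred = [self.predict_one(x) for x in X]
            resid = [y - p for y, p in zip(Y, pred)]
            # each "base model" here is just the mean residual (a constant)
            self.base_models.append(sum(resid) / len(resid))
        return self

    def predict_one(self, x):
        return sum(self.lr * m for m in self.base_models)

model = ToyBooster().fit([0, 1], [10.0, 10.0])
n_after_fit = len(model.base_models)
model.partial_fit([0, 1], [10.0, 10.0])   # appends, does not reset
n_after_partial = len(model.base_models)
print(n_after_fit, n_after_partial)  # 5 10
```

Calling `partial_fit` a second time doubles the number of stages rather than retraining from scratch, which is the behavior the release note warns to test carefully before production use.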
84 changes: 78 additions & 6 deletions ngboost/ngboost.py
@@ -162,9 +162,12 @@ def sample(self, X, Y, sample_weight, params):
         )

     def fit_base(self, X, grads, sample_weight=None):
-        models = [
-            clone(self.Base).fit(X, g, sample_weight=sample_weight) for g in grads.T
-        ]
+        if sample_weight is None:
+            models = [clone(self.Base).fit(X, g) for g in grads.T]
+        else:
+            models = [
+                clone(self.Base).fit(X, g, sample_weight=sample_weight) for g in grads.T
+            ]
         fitted = np.array([m.predict(X) for m in models]).T
         self.base_models.append(models)
         return fitted
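A hedged sketch of why this `fit_base` branch matters (`WeightlessLearner` and `fit_base_model` are invented illustrations, not ngboost code): scikit-learn-style base learners that do not declare a `sample_weight` parameter raise `TypeError` even when it is passed as `None`, so the keyword is only forwarded when actually set.

```python
# Sketch (assumption): some base learners have no sample_weight parameter,
# so fit(..., sample_weight=None) fails for them with a TypeError.
class WeightlessLearner:
    def fit(self, X, y):          # no sample_weight in the signature
        self.mean_ = sum(y) / len(y)
        return self

def fit_base_model(base, X, y, sample_weight=None):
    # mirror the patched fit_base: only forward the keyword when it is set
    if sample_weight is None:
        return base.fit(X, y)
    return base.fit(X, y, sample_weight=sample_weight)

model = fit_base_model(WeightlessLearner(), [[0], [1]], [2.0, 4.0])
print(model.mean_)  # 3.0

try:
    WeightlessLearner().fit([[0]], [1.0], sample_weight=None)
except TypeError:
    print("TypeError: learner does not accept sample_weight")
```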
@@ -224,9 +227,9 @@ def fit(
         Y_val : DataFrame object or List or
             numpy array of validation-set outcomes in numeric format
         sample_weight : how much to weigh each example in the training set.
-            numpy array of size (n) (defaults to 1)
+            numpy array of size (n) (defaults to None)
         val_sample_weight : how much to weigh each example in the validation set.
-            (defaults to 1)
+            (defaults to None)
         train_loss_monitor : a custom score or set of scores to track on the training set
             during training. Defaults to the score defined in the NGBoost
             constructor
@@ -240,6 +243,75 @@ def fit(
         A fit NGBRegressor object
         """

+        self.base_models = []
+        self.scalings = []
+        self.col_idxs = []
+
+        return self.partial_fit(
+            X,
+            Y,
+            X_val=X_val,
+            Y_val=Y_val,
+            sample_weight=sample_weight,
+            val_sample_weight=val_sample_weight,
+            train_loss_monitor=train_loss_monitor,
+            val_loss_monitor=val_loss_monitor,
+            early_stopping_rounds=early_stopping_rounds,
+        )
+
+    def partial_fit(
+        self,
+        X,
+        Y,
+        X_val=None,
+        Y_val=None,
+        sample_weight=None,
+        val_sample_weight=None,
+        train_loss_monitor=None,
+        val_loss_monitor=None,
+        early_stopping_rounds=None,
+    ):
+        """
+        Fits an NGBoost model to the data, appending base models to the existing ones.
+        NOTE: This method is not yet fully tested and may not work as expected. For example,
+        the first call to partial_fit will be the most significant, and later calls will just
+        retune the model to newer data at the cost of making it more expensive. Use with caution.
+        Parameters:
+            X : DataFrame object or List or
+                numpy array of predictors (n x p) in Numeric format
+            Y : DataFrame object or List or numpy array of outcomes (n)
+                in numeric format. Should be floats for regression and
+                integers from 0 to K-1 for K-class classification
+            X_val : DataFrame object or List or
+                numpy array of validation-set predictors in numeric format
+            Y_val : DataFrame object or List or
+                numpy array of validation-set outcomes in numeric format
+            sample_weight : how much to weigh each example in the training set.
+                numpy array of size (n) (defaults to None)
+            val_sample_weight : how much to weigh each example in the validation set.
+                (defaults to None)
+            train_loss_monitor : a custom score or set of scores to track on the training set
+                during training. Defaults to the score defined in the NGBoost
+                constructor
+            val_loss_monitor : a custom score or set of scores to track on the validation set
+                during training. Defaults to the score defined in the NGBoost
+                constructor
+            early_stopping_rounds : the number of consecutive boosting iterations during which
+                the loss has to increase before the algorithm stops early.
+        Output:
+            A fit NGBRegressor object
+        """
+
+        if len(self.base_models) != len(self.scalings) or len(self.base_models) != len(
+            self.col_idxs
+        ):
+            raise RuntimeError(
+                "Base models, scalings, and col_idxs are not the same length"
+            )
+
         # if early stopping is specified, split X,Y and sample weights (if given) into training and validation sets
         # This will overwrite any X_val and Y_val values passed by the user directly.
         if self.early_stopping_rounds is not None:
@@ -299,7 +371,7 @@ def fit(
                 Y, sample_weight=val_sample_weight
             )

-        for itr in range(self.n_estimators):
+        for itr in range(len(self.col_idxs), self.n_estimators + len(self.col_idxs)):
             _, col_idx, X_batch, Y_batch, weight_batch, P_batch = self.sample(
                 X, Y, sample_weight, params
             )
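The changed loop range above keeps boosting-iteration indices continuous across repeated `partial_fit` calls: instead of restarting at 0, the counter continues from the number of already-fitted stages (`len(self.col_idxs)`). A small sketch, where `stage_indices` is an invented helper that mirrors the range expression:

```python
# Sketch of the loop-range change: iteration indices continue from the
# number of stages fitted by earlier calls, so each stage gets a unique index.
def stage_indices(n_existing, n_estimators):
    return list(range(n_existing, n_estimators + n_existing))

print(stage_indices(0, 3))  # first fit:          [0, 1, 2]
print(stage_indices(3, 3))  # next partial_fit:   [3, 4, 5]
```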
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ngboost"
-version = "0.4.0dev"
+version = "0.4.1dev"
 description = "Library for probabilistic predictions via gradient boosting."
 authors = ["Stanford ML Group <avati@cs.stanford.edu>"]
 readme = "README.md"
