Commit 85951a2

Merge branch 'master' into master

joseortiz3 committed Feb 14, 2024
2 parents 610a005 + f5c2a1e

Showing 16 changed files with 146 additions and 60 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pythonpackage.yml
@@ -1,4 +1,4 @@
-name: Python package
+name: Python lint and test

on:
push:
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
-python-version: ['3.7', '3.8', '3.9', '3.10']
+python-version: ['3.9', '3.10', '3.11', '3.12']

steps:
- uses: actions/checkout@v2
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -33,11 +33,11 @@ repos:
files: ^ngboost
entry: flake8
- id: pylint-ngboost
-name: pylint on nboost*
+name: pylint on ngboost*
types: [file, python]
language: system
files: ^ngboost
-entry: pylint --disable=invalid-name,no-member,missing-docstring,no-self-argument,arguments-differ,R0801
+entry: pylint --disable=invalid-name,no-member,missing-docstring,no-self-argument,arguments-differ
- id: pylint-tests
name: pylint on tests*
language: system
8 changes: 6 additions & 2 deletions README.md
@@ -32,11 +32,15 @@ Probabilistic regression example on the Boston housing dataset:
```python
from ngboost import NGBRegressor

-from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

-X, Y = load_boston(True)
+import numpy as np
+import pandas as pd
+
+# Load Boston housing dataset (load_boston was removed from scikit-learn)
+data_url = "http://lib.stat.cmu.edu/datasets/boston"
+raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
+X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+Y = raw_df.values[1::2, 2]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

ngb = NGBRegressor().fit(X_train, Y_train)
```
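The README code block is truncated here by the diff view. For context, a minimal sketch of how the fitted regressor is typically evaluated (the names `Y_preds`, `Y_dists`, `test_MSE`, and `test_NLL` are illustrative; `predict`, `pred_dist`, and the distribution's `logpdf` are NGBoost API methods):

```python
# not part of the diff above: an illustrative continuation of the README example
Y_preds = ngb.predict(X_test)    # point predictions
Y_dists = ngb.pred_dist(X_test)  # full predictive distributions

# test Mean Squared Error of the point predictions
test_MSE = mean_squared_error(Y_preds, Y_test)
print("Test MSE", test_MSE)

# test Negative Log Likelihood under the predictive distributions
test_NLL = -Y_dists.logpdf(Y_test).mean()
print("Test NLL", test_NLL)
```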
16 changes: 16 additions & 0 deletions RELEASE_NOTES.md
@@ -1,6 +1,22 @@
# RELEASE NOTES

+## Version 0.5.0
+
+* Dropped support for Python 3.7 and 3.8
+* Added support for Python 3.11 and 3.12
+* Fixed use of the removed `np.bool` alias (now `np.bool_`)
+* Optimized memory usage in `pred_dist`
+* Removed pandas as a declared dependency
+* Significantly improved test run times during development
+* Minor enhancements to GitHub Actions workflows
+
+## Version 0.4.2
+
+* Fix deprecated NumPy type alias. This was causing a warning with NumPy >=1.20 and an error with NumPy >=1.24
+* Remove pandas as a declared dependency
+
## Version 0.4.1

### Added `partial_fit` method for incremental learning

NGBoost now includes a new `partial_fit` method that allows for incremental learning. This method appends new base models to the existing ones, which can be useful when new data becomes available over time or when the data is too large to fit in memory all at once.
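A rough sketch of that API, assuming `partial_fit` takes the same `(X, Y)` interface as `fit` per the description above (the synthetic data and the `n_estimators` value are illustrative):

```python
import numpy as np
from ngboost import NGBRegressor

rng = np.random.default_rng(0)
X1, Y1 = rng.normal(size=(100, 5)), rng.normal(size=100)  # initial batch
X2, Y2 = rng.normal(size=(100, 5)), rng.normal(size=100)  # batch arriving later

ngb = NGBRegressor(n_estimators=50)
ngb.fit(X1, Y1)          # trains the first 50 base models
ngb.partial_fit(X2, Y2)  # appends base models trained on the new batch
```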
4 changes: 3 additions & 1 deletion examples/experiments/survival_exp.py
@@ -24,7 +24,9 @@
def Y_join(T, E):
col_event = "Event"
col_time = "Time"
-y = np.empty(dtype=[(col_event, np.bool), (col_time, np.float64)], shape=T.shape[0])
+y = np.empty(
+    dtype=[(col_event, np.bool_), (col_time, np.float64)], shape=T.shape[0]
+)
y[col_event] = E.values
y[col_time] = T.values
return y
9 changes: 7 additions & 2 deletions examples/regression.py
@@ -1,13 +1,18 @@
-from sklearn.datasets import load_boston
+import numpy as np
+import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from ngboost import NGBRegressor
from ngboost.distns import Normal

if __name__ == "__main__":
+# Load Boston housing dataset (load_boston was removed from scikit-learn)
+data_url = "http://lib.stat.cmu.edu/datasets/boston"
+raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
+X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+Y = raw_df.values[1::2, 2]

-X, Y = load_boston(return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

ngb = NGBRegressor(Dist=Normal).fit(X_train, Y_train)
8 changes: 6 additions & 2 deletions examples/survival.py
@@ -1,14 +1,18 @@
import numpy as np
-from sklearn.datasets import load_boston
+import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from ngboost import NGBSurvival
from ngboost.distns import LogNormal

if __name__ == "__main__":
+# Load Boston housing dataset (load_boston was removed from scikit-learn)
+data_url = "http://lib.stat.cmu.edu/datasets/boston"
+raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
+X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+Y = raw_df.values[1::2, 2]

-X, Y = load_boston(return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# introduce administrative censoring
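The diff is cut off before the censoring step. A hedged sketch of what administrative censoring typically looks like at this point (the cutoff value and variable names are assumptions; `NGBSurvival.fit` taking observed times `T` and event indicators `E` is the library's survival interface):

```python
# not part of the diff: illustrative continuation
CUTOFF = 30.0                              # assumed censoring threshold
T_train = np.minimum(Y_train, CUTOFF)      # observed time: event time or censoring time
E_train = (Y_train <= CUTOFF).astype(int)  # 1 = event observed, 0 = censored

ngb = NGBSurvival(Dist=LogNormal).fit(X_train, T_train, E_train)
```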
6 changes: 4 additions & 2 deletions ngboost/__init__.py
@@ -6,7 +6,9 @@
# before python 3.8
from importlib_metadata import version

-from .api import NGBClassifier, NGBRegressor, NGBSurvival  # NOQA
-from .ngboost import NGBoost  # NOQA
+from .api import NGBClassifier, NGBRegressor, NGBSurvival
+from .ngboost import NGBoost

+__all__ = ["NGBClassifier", "NGBRegressor", "NGBSurvival", "NGBoost"]

__version__ = version(__name__)
43 changes: 32 additions & 11 deletions ngboost/distns/__init__.py
@@ -1,12 +1,33 @@
"""NGBoost distributions"""
-from .categorical import Bernoulli, k_categorical  # NOQA
-from .cauchy import Cauchy  # NOQA
-from .distn import ClassificationDistn, Distn, RegressionDistn  # NOQA
-from .exponential import Exponential  # NOQA
-from .gamma import Gamma  # NOQA
-from .laplace import Laplace  # NOQA
-from .lognormal import LogNormal  # NOQA
-from .multivariate_normal import MultivariateNormal  # NOQA
-from .normal import Normal, NormalFixedMean, NormalFixedVar  # NOQA
-from .poisson import Poisson  # NOQA
-from .t import T, TFixedDf, TFixedDfFixedVar  # NOQA
+from .categorical import Bernoulli, k_categorical
+from .cauchy import Cauchy
+from .distn import ClassificationDistn, Distn, RegressionDistn
+from .exponential import Exponential
+from .gamma import Gamma
+from .laplace import Laplace
+from .lognormal import LogNormal
+from .multivariate_normal import MultivariateNormal
+from .normal import Normal, NormalFixedMean, NormalFixedVar
+from .poisson import Poisson
+from .t import T, TFixedDf, TFixedDfFixedVar
+
+__all__ = [
+    "Bernoulli",
+    "k_categorical",
+    "Cauchy",
+    "ClassificationDistn",
+    "Distn",
+    "RegressionDistn",
+    "Exponential",
+    "Gamma",
+    "Laplace",
+    "LogNormal",
+    "MultivariateNormal",
+    "Normal",
+    "NormalFixedMean",
+    "NormalFixedVar",
+    "Poisson",
+    "T",
+    "TFixedDf",
+    "TFixedDfFixedVar",
+]
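Declaring `__all__` makes the package's public names explicit, which is why the per-import `# NOQA` markers could be dropped: the list now signals the intentional re-exports. A small sketch of the practical effect:

```python
# with __all__ defined, a star-import exposes exactly the listed names
from ngboost.distns import *

print(Normal, LogNormal, Bernoulli)  # available because they appear in __all__
```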
2 changes: 1 addition & 1 deletion ngboost/distns/categorical.py
@@ -1,5 +1,5 @@
"""The NGBoost categorial distribution and scores"""
-# pylint: disable=invalid-unary-operand-type, unused-argument, no-self-use
+# pylint: disable=invalid-unary-operand-type, unused-argument
import numpy as np
import scipy as sp

4 changes: 2 additions & 2 deletions ngboost/helpers.py
@@ -17,7 +17,7 @@ def Y_from_censored(T, E=None):
else:
E = check_array(E, ensure_2d=False)
E = E.reshape(E.shape[0])
-Y = np.empty(dtype=[("Event", np.bool), ("Time", np.float64)], shape=T.shape[0])
-Y["Event"] = E.astype(np.bool)
+Y = np.empty(dtype=[("Event", np.bool_), ("Time", np.float64)], shape=T.shape[0])
+Y["Event"] = E.astype(np.bool_)
Y["Time"] = T.astype(np.float64)
return Y
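For background on this substitution, a minimal sketch of the NumPy behavior involved (version thresholds as stated in the 0.4.2 release notes above):

```python
import numpy as np

# np.bool was a deprecated alias for the built-in bool: it warned on
# NumPy >=1.20 and was removed in NumPy 1.24, raising AttributeError.
# np.bool_ is the actual NumPy boolean scalar type and is safe to use
# in structured dtypes like the one built by Y_from_censored.
y = np.empty(dtype=[("Event", np.bool_), ("Time", np.float64)], shape=3)
y["Event"] = np.array([True, False, True])
y["Time"] = np.array([1.5, 2.0, 0.7])
print(y["Event"].dtype)  # bool
```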
26 changes: 11 additions & 15 deletions ngboost/ngboost.py
@@ -2,7 +2,7 @@
# pylint: disable=line-too-long,too-many-instance-attributes,too-many-arguments
# pylint: disable=unused-argument,too-many-locals,too-many-branches,too-many-statements
# pylint: disable=unused-variable,invalid-unary-operand-type,attribute-defined-outside-init
-# pylint: disable=redundant-keyword-arg,protected-access
+# pylint: disable=redundant-keyword-arg,protected-access,unnecessary-lambda-assignment
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split
@@ -315,7 +315,6 @@ def partial_fit(
# if early stopping is specified, split X,Y and sample weights (if given) into training and validation sets
# This will overwrite any X_val and Y_val values passed by the user directly.
if self.early_stopping_rounds is not None:
-
early_stopping_rounds = self.early_stopping_rounds

if sample_weight is None:
@@ -362,14 +361,14 @@ def partial_fit(
best_val_loss = np.inf

if not train_loss_monitor:
-train_loss_monitor = lambda D, Y, W: D.total_score(  # NOQA
+train_loss_monitor = lambda D, Y, W: D.total_score(  # noqa: E731
Y, sample_weight=W
)

if not val_loss_monitor:
-val_loss_monitor = lambda D, Y: D.total_score(  # NOQA
+val_loss_monitor = lambda D, Y: D.total_score(  # noqa: E731
Y, sample_weight=val_sample_weight
-)  # NOQA
+)

for itr in range(len(self.col_idxs), self.n_estimators + len(self.col_idxs)):
_, col_idx, X_batch, Y_batch, weight_batch, P_batch = self.sample(
@@ -386,7 +385,6 @@ def partial_fit(
proj_grad = self.fit_base(X_batch, grads, weight_batch)
scale = self.line_search(proj_grad, P_batch, Y_batch, weight_batch)

-# pdb.set_trace()
params -= (
self.learning_rate
* scale
@@ -490,13 +488,9 @@ def pred_dist(self, X, max_iter=None):

X = check_array(X, accept_sparse=True)

-if (
-    max_iter is not None
-):  # get prediction at a particular iteration if asked for
-    dist = self.staged_pred_dist(X, max_iter=max_iter)[-1]
-else:
-    params = np.asarray(self.pred_param(X, max_iter))
-    dist = self.Dist(params.T)
+params = np.asarray(self.pred_param(X, max_iter))
+dist = self.Dist(params.T)

return dist

def staged_pred_dist(self, X, max_iter=None):
@@ -587,8 +581,10 @@ def feature_importances_(self):

if not all_params_importances:
return np.zeros(
-len(self.base_models[0]),
-self.base_models[0][0].n_features_,
+(
+    len(self.base_models[0]),
+    self.base_models[0][0].n_features_,
+),
dtype=np.float64,
)

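The `feature_importances_` change above fixes an argument-passing bug: `np.zeros` takes the shape as its first argument and `dtype` as its second, so the two dimensions must be wrapped in a tuple. A quick illustration (standard NumPy semantics, not code from the repo):

```python
import numpy as np

ok = np.zeros((3, 4), dtype=np.float64)  # shape passed as a tuple: a 3x4 array
print(ok.shape)  # (3, 4)

# np.zeros(3, 4) would raise a TypeError, because the second positional
# argument of np.zeros is dtype, not a second dimension.
```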
9 changes: 4 additions & 5 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ngboost"
version = "0.4.1dev"
version = "0.5.0dev"
description = "Library for probabilistic predictions via gradient boosting."
authors = ["Stanford ML Group <avati@cs.stanford.edu>"]
readme = "README.md"
@@ -13,21 +13,20 @@ classifiers = [
license = "Apache License 2.0"

[tool.poetry.dependencies]
python = ">=3.7.1, <3.11"
python = ">=3.9, <3.13"
scikit-learn = ">=1.0.2"
numpy = ">=1.21.2"
scipy = ">=1.7.2"
tqdm = ">=4.3"
lifelines = ">=0.25"
-pandas = ">=1.3.5"

[tool.poetry.dev-dependencies]
pytest = "^6.1.2"
black = "^22.8.0"
pre-commit = "^2.0"
isort = "^5.6.4"
pylint = "^2.6.0"
flake8 = "^5.0.4"
pylint = "^3.0.3"
flake8 = "^7.0.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
9 changes: 7 additions & 2 deletions tests/conftest.py
@@ -5,8 +5,8 @@
from sklearn.datasets import fetch_california_housing, load_breast_cancer
from sklearn.model_selection import train_test_split

-Tuple4Array = Tuple[np.array, np.array, np.array, np.array]
-Tuple5Array = Tuple[np.array, np.array, np.array, np.array, np.array]
+Tuple4Array = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
+Tuple5Array = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]


def pytest_addoption(parser):
@@ -22,6 +22,11 @@ def pytest_configure(config):
config.addinivalue_line("markers", "slow: ")


+@pytest.fixture(scope="session", autouse=True)
+def set_seed():
+    np.random.seed(0)


@pytest.fixture(scope="session")
def california_housing_data() -> Tuple4Array:
X, Y = fetch_california_housing(return_X_y=True)
(Diff truncated: the remaining 2 of 16 changed files are not shown.)