diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..9db7ff8 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.4.2 + hooks: + - id: ruff + args: [ --fix ] + - id: ruff-format +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.10.0 + hooks: + - id: mypy + files: ^src diff --git a/src/spxmod/model.py b/src/spxmod/model.py index 92d2386..d84a0c7 100644 --- a/src/spxmod/model.py +++ b/src/spxmod/model.py @@ -105,8 +105,9 @@ def __init__( data=dict(col_obs=obs, col_weights=weights), variables=[], linear_gpriors=[], - param_specs=param_specs or {}, ) + if param_specs is not None: + self.core_config.update(param_specs) self.spaces = spaces self.var_builders = var_builders diff --git a/src/spxmod/regmod_builder.py b/src/spxmod/regmod_builder.py index 64759e6..5c7820c 100644 --- a/src/spxmod/regmod_builder.py +++ b/src/spxmod/regmod_builder.py @@ -6,9 +6,19 @@ from msca.optim.solver import IPSolver, NTCGSolver from regmod.data import Data from regmod.models import BinomialModel, GaussianModel, PoissonModel -from regmod.prior import GaussianPrior, LinearGaussianPrior, UniformPrior -from regmod.variable import Variable +from regmod.parameter import Parameter +from regmod.prior import ( + GaussianPrior, + LinearGaussianPrior, + LinearUniformPrior, + SplineGaussianPrior, + SplinePrior, + SplineUniformPrior, + UniformPrior, +) +from regmod.variable import SplineVariable, Variable from scipy.stats import norm +from xspline import XSpline from spxmod.linalg import get_pred_var from spxmod.typing import Callable, DataFrame, NDArray, RegmodModel @@ -38,25 +48,109 @@ def msca_optimize( return result.x +class SparseParameter(Parameter): + def get_linear_gmat(self) -> Matrix: + """Get the linear Gaussian prior matrix. + + Returns + ------- + Matrix + Gaussian prior design matrix. + + """ + if len(self.variables) == 0: + return np.empty(shape=(0, 0)) + gmat = sp.block_diag( + [ + ( + var.get_linear_gmat() + if isinstance(var, SplineVariable) + else np.empty((0, 1)) + ) + for var in self.variables + ] + ) + if len(self.linear_gpriors) > 0: + gmat = sp.vstack( + [gmat] + [prior.mat for prior in self.linear_gpriors] + ) + return asmatrix(sp.csc_matrix(gmat)) + + def get_linear_umat(self) -> Matrix: + """Get the linear Uniform prior matrix. + + Returns + ------- + Matrix + Uniform prior design matrix. + + """ + if len(self.variables) == 0: + return np.empty(shape=(0, 0)) + umat = sp.block_diag( + [ + ( + var.get_linear_umat() + if isinstance(var, SplineVariable) + else np.empty((0, 1)) + ) + for var in self.variables + ] + ) + if len(self.linear_upriors) > 0: + umat = sp.vstack( + [umat] + [prior.mat for prior in self.linear_upriors] + ) + return asmatrix(sp.csc_matrix(umat)) + + class SparseRegmodModel(RegmodModel): + def __init__( + self, + data: dict, + variables: list[dict], + linear_gpriors: list[dict] | None = None, + linear_upriors: list[dict] | None = None, + **kwargs, + ): + data = Data(**data) + + # build variables + variables = [_build_regmod_variable(**v) for v in variables] + + linear_gpriors = linear_gpriors or [] + linear_upriors = linear_upriors or [] + + # build smoothing prior + if linear_gpriors: + for i, prior in enumerate(linear_gpriors): + if prior["mat"].size > 0: + linear_gpriors[i] = LinearGaussianPrior(**prior) + + if linear_upriors: + for i, prior in enumerate(linear_upriors): + if prior["mat"].size > 0: + linear_upriors[i] = LinearUniformPrior(**prior) + + param = SparseParameter( + name=self.param_names[0], + variables=variables, + linear_gpriors=linear_gpriors, + linear_upriors=linear_upriors, + **kwargs, + ) + super().__init__(data, params=[param]) + def attach_df(self, df: DataFrame, encode: Callable) -> None: + param = self.params[0] self.data.attach_df(df) self.mat = [asmatrix(sp.csc_matrix(encode(df)))] - self.uvec = self.get_uvec() - self.gvec = self.get_gvec() - self.linear_uvec = self.get_linear_uvec() - self.linear_gvec = self.get_linear_gvec() - self.linear_umat = asmatrix(sp.csc_matrix((0, self.size))) - self.linear_gmat = asmatrix(sp.csc_matrix((0, self.size))) - param = self.params[0] - if self.params[0].linear_gpriors: - self.linear_gmat = asmatrix( - sp.csc_matrix(param.linear_gpriors[0].mat) - ) - if self.params[0].linear_upriors: - self.linear_umat = asmatrix( - sp.csc_matrix(param.linear_upriors[0].mat) - ) + self.uvec = param.get_uvec() + self.gvec = param.get_gvec() + self.linear_uvec = param.get_linear_uvec() + self.linear_gvec = param.get_linear_gvec() + self.linear_gmat = param.get_linear_gmat() + self.linear_umat = param.get_linear_umat() # parse constraints cmat = asmatrix( @@ -180,11 +274,8 @@ class SparseBinomialModel(SparseRegmodModel, BinomialModel): """Sparse Binomial model.""" def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.params[0].inv_link.name != "expit": - raise ValueError( - "Sparse Binomial model can only work with expit inv_link function" - ) + kwargs["inv_link"] = "expit" + SparseRegmodModel.__init__(self, *args, **kwargs) def objective(self, coefs: NDArray) -> float: weights = self.data.weights * self.data.trim_weights @@ -231,11 +322,8 @@ class SparseGaussianModel(SparseRegmodModel, GaussianModel): """Sparse Gaussian model.""" def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.params[0].inv_link.name != "identity": - raise ValueError( - "Sparse Gaussian model can only work with identity inv_link function" - ) + kwargs["inv_link"] = "identity" + SparseRegmodModel.__init__(self, *args, **kwargs) def objective(self, coefs: NDArray) -> float: weights = self.data.weights * self.data.trim_weights @@ -280,11 +368,8 @@ class SparsePoissonModel(SparseRegmodModel, PoissonModel): """Sparse Poisson model.""" def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.params[0].inv_link.name != "exp": - raise ValueError( - "Sparse Poisson model can only work with exp inv_link function" - ) + kwargs["inv_link"] = "exp" + SparseRegmodModel.__init__(self, *args, **kwargs) def objective(self, coefs: NDArray) -> float: weights = self.data.weights * self.data.trim_weights @@ -338,40 +423,49 @@ def build_regmod_model( model_type: str, data: dict, variables: list[dict], - linear_gpriors: list[dict], - param_specs: dict, + linear_gpriors: list[dict] | None = None, + linear_upriors: list[dict] | None = None, + **kwargs, ) -> RegmodModel: - # build data - data = Data(**data) - - # build variables - variables = [_build_regmod_variable(**kwargs) for kwargs in variables] - - # build smoothing prior - for i, prior in enumerate(linear_gpriors): - if prior["mat"].size > 0: - linear_gpriors[i] = LinearGaussianPrior(**prior) - - # buid regmod model - model_class = _model_dict[model_type] - model_param = model_class.param_names[0] - - return model_class( + return _model_dict[model_type]( data, - param_specs={ - model_param: { - "variables": variables, - "linear_gpriors": linear_gpriors, - **param_specs, - } - }, + variables, + linear_gpriors or [], + linear_upriors or [], + **kwargs, ) -def _build_regmod_variable(name: str, gprior: dict, uprior: dict) -> Variable: - gprior = GaussianPrior(**gprior) - uprior = UniformPrior(**uprior) - return Variable(name=name, priors=[gprior, uprior]) +def _build_regmod_variable( + name: str, + gprior: dict | None = None, + uprior: dict | None = None, + spline: XSpline | None = None, + spline_gpriors: list[dict] | None = None, + spline_upriors: list[dict] | None = None, +) -> Variable: + priors = [] + if gprior is not None: + priors.append(GaussianPrior(**gprior)) + if uprior is not None: + priors.append(UniformPrior(**uprior)) + if spline_gpriors is not None: + priors += [ + SplineGaussianPrior(**spline_gprior) + for spline_gprior in spline_gpriors + ] + if spline_upriors is not None: + priors += [ + SplineUniformPrior(**spline_uprior) + for spline_uprior in spline_upriors + ] + if spline is None: + return Variable(name=name, priors=priors) + else: + for prior in priors: + if isinstance(prior, SplinePrior): + prior.attach_spline(spline) + return SplineVariable(name=name, spline=spline, priors=priors) def get_vcov(hessian: Matrix, jacobian2: Matrix) -> NDArray: diff --git a/src/spxmod/space.py b/src/spxmod/space.py index cb78384..ffecdaf 100644 --- a/src/spxmod/space.py +++ b/src/spxmod/space.py @@ -84,41 +84,43 @@ def build_encoded_names(self, column: str) -> list[str]: return [column] return [f"{column}_{self.name}_{i}" for i in range(self.size)] - def encode(self, data: DataFrame, column: str = "intercept") -> DataFrame: + def encode(self, mat: NDArray, coords: DataFrame) -> coo_matrix: """Encode the data into the space grid. Parameters ---------- - data : DataFrame - Dataframe that contains the coordinate information and the column - to encode. - column : str, optional - Column name to encode. Default is "intercept". + mat + Design matrix to be encoded, it should have the same number of rows + with `coords`. + coords + Coordinate data frame for each row of the design matrix. It's + columns should match with the space dimension name. Returns ------- - DataFrame - Encoded dataframe. + coo_matrix + Encoded design matrix. """ - val = ( - np.ones(len(data)) - if column == "intercept" - else data[column].to_numpy() - ) - row = np.arange(len(data), dtype=int) - col = np.zeros(len(data), dtype=int) + vals = mat.ravel() + nrow, ncol = mat.shape + + row = np.repeat(np.arange(nrow, dtype=int), ncol) + col = np.zeros(nrow, dtype=int) if self.dims: col = ( - data[self.dim_names] - .merge(self.span.reset_index(), how="left", on=self.dim_names) + coords.merge( + self.span.reset_index(), how="left", on=self.dim_names + ) .eval("index") .to_numpy() ) - return coo_matrix((val, (row, col)), shape=(len(data), self.size)) + col = np.add.outer(ncol * col, np.arange(ncol)).ravel() + return coo_matrix((vals, (row, col)), shape=(nrow, self.size * ncol)) def build_smoothing_prior( self, + size: int, lam: float | dict[str, float] = 0.0, lam_mean: float = 0.0, scale_by_distance: bool = False, @@ -143,7 +145,7 @@ def build_smoothing_prior( """ if isinstance(lam, float): lam = {name: lam for name in self.dim_names} - mat, sd = coo_matrix((0, self.size)), np.empty((0,)) + mat, sd = coo_matrix((0, self.size * size)), np.empty((0,)) mats_default = list(map(identity, self.dim_sizes)) sds_default = list(map(np.ones, self.dim_sizes)) @@ -152,16 +154,23 @@ def build_smoothing_prior( lam_i = lam[dim.name] if lam_i > 0.0 and isinstance(dim, NumericalDimension): mats = mats_default.copy() - mats[i] = dim.build_smoothing_mat() + mats[i] = kron(dim.build_smoothing_mat(), identity(size)) mat = vstack([mat, functools.reduce(kron, mats)]) sds = sds_default.copy() - sds[i] = dim.build_smoothing_sd(lam_i, scale_by_distance) + sds[i] = np.repeat( + dim.build_smoothing_sd(lam_i, scale_by_distance), size + ) sd = np.hstack([sd, functools.reduce(_flatten_outer, sds)]) if lam_mean > 0.0: - mat = vstack([mat, np.repeat(1 / self.size, self.size)]) - sd = np.hstack([sd, [1 / np.sqrt(lam_mean)]]) + mat = vstack( + [ + mat, + kron(np.repeat(1 / self.size, self.size), identity(size)), + ] + ) + sd = np.hstack([sd, np.repeat(1 / np.sqrt(lam_mean), size)]) # TODO: regmod cannot recognize sparse array as prior, this shouldn't # be necessary in the future diff --git a/src/spxmod/variable_builder.py b/src/spxmod/variable_builder.py index 3e0b2a9..672b8ce 100644 --- a/src/spxmod/variable_builder.py +++ b/src/spxmod/variable_builder.py @@ -1,6 +1,7 @@ from __future__ import annotations import numpy as np +from xspline import XSpline from spxmod.dimension import CategoricalDimension from spxmod.space import Space @@ -13,12 +14,12 @@ class VariableBuilder: Parameters ---------- - name : str + name Name of the variable column in the data. - space : Space, optional + space Space to partition the variable on. If None, the variable is not partitioned. - lam : float, optional + lam Regularization parameter for the coefficients in the variable group. Default is 0. If the dimension is numerical, a Gaussian prior with mean 0 and standard deviation 1/sqrt(lam) is set on @@ -26,21 +27,31 @@ class VariableBuilder: dimension. If the dimension is categorical, a Gaussian prior with mean 0 and standard deviation 1/sqrt(lam) is set on the coefficients. - lam_mean : float, optional + lam_mean Regularization parameter for the mean of the coefficients in the variable group. Default is 1e-8. A Gaussian prior with mean 0 and standard deviation 1/sqrt(lam_mean) is set on the mean of the coefficients. - gprior : tuple, optional + gprior Gaussian prior for the variable. Default is (0, np.inf). Argument is overwritten with (0, 1/sqrt(lam)) if dimension is categorical. - uprior : tuple, optional + uprior Uniform prior for the variable. Default is (-np.inf, np.inf). scale_by_distance : bool, optional Whether to scale the prior standard deviation by the distance between the neighboring values along the dimension. For numerical dimensions only. Default is False. + spline + Spline configuration for the variable. If None, variable will be parsed + as an instance of Variable, otherwise, it will be parsed as an instance + of SplineVariable. + spline_gpriors + Gaussian priors for the spline coefficients. For details please check + https://github.com/ihmeuw-msca/regmod/blob/release/0.1.2/src/regmod/prior.py + spline_upriors + Uniform priors for the spline coefficients. For details please check + https://github.com/ihmeuw-msca/regmod/blob/release/0.1.2/src/regmod/prior.py """ @@ -53,6 +64,9 @@ def __init__( gprior: dict[str, float] | None = None, uprior: dict[str, float] | None = None, scale_by_distance: bool = False, + spline: dict | None = None, + spline_gpriors: list[dict] | None = None, + spline_upriors: list[dict] | None = None, ) -> None: self.name = name self.space = space @@ -62,6 +76,12 @@ def __init__( self.uprior = uprior or dict(lb=-np.inf, ub=np.inf) self.scale_by_distance = scale_by_distance + if spline is not None: + spline = XSpline(**spline) + self.spline: XSpline = spline + self.spline_gpriors = spline_gpriors + self.spline_upriors = spline_upriors + # transfer lam to gprior when dim is categorical if isinstance(self.lam, float): self.lam = {name: self.lam for name in self.space.dim_names} @@ -76,6 +96,9 @@ def __init__( ) if lam_cat > 0: self.gprior["sd"] = 1.0 / np.sqrt(lam_cat) + if self.spline is not None: + self.gprior["size"] = self.spline.num_spline_bases + self.uprior["size"] = self.spline.num_spline_bases @classmethod def from_config( @@ -89,12 +112,21 @@ def from_config( @property def size(self) -> int: """Number of variables in the variable group.""" - return self.space.size + if self.spline is None: + return self.space.size + return self.spline.num_spline_bases * self.space.size def build_variables(self) -> list[dict]: """Returns the list of variables in the variable group.""" variables = [ - dict(name=name, gprior=self.gprior, uprior=self.uprior) + dict( + name=name, + gprior=self.gprior, + uprior=self.uprior, + spline=self.spline, + spline_gpriors=self.spline_gpriors, + spline_upriors=self.spline_upriors, + ) for name in self.space.build_encoded_names(self.name) ] return variables @@ -115,8 +147,9 @@ def build_smoothing_prior(self) -> dict[str, NDArray]: Smoothing Gaussian prior matrix and vector. """ + size = 1 if self.spline is None else self.spline.num_spline_bases return self.space.build_smoothing_prior( - self.lam, self.lam_mean, self.scale_by_distance + size, self.lam, self.lam_mean, self.scale_by_distance ) def encode(self, data: DataFrame) -> DataFrame: @@ -133,4 +166,17 @@ def encode(self, data: DataFrame) -> DataFrame: Encoded variable columns. """ - return self.space.encode(data, column=self.name) + val = ( + np.ones(len(data)) + if self.name == "intercept" + else data[self.name].to_numpy() + ) + + if self.spline is not None: + mat = self.spline.design_mat(val) + else: + mat = val[:, np.newaxis] + + coords = data[self.space.dim_names] + + return self.space.encode(mat, coords) diff --git a/tests/test_regmod_builder.py b/tests/test_regmod_builder.py index 304afaf..30b089b 100644 --- a/tests/test_regmod_builder.py +++ b/tests/test_regmod_builder.py @@ -60,13 +60,15 @@ def ref_model(data, variables, linear_upriors) -> BinomialModel: @pytest.fixture def alt_model(data, variables, linear_upriors) -> SparseBinomialModel: - return SparseBinomialModel( - data, - param_specs={ - "p": {"variables": variables, "linear_upriors": [linear_upriors[1]]} - }, + data = dict(col_obs=data.col_obs, col_weights=data.col_weights) + variables = [dict(name=v.name) for v in variables] + linear_uprior = linear_upriors[1] + linear_uprior = dict( + mat=linear_uprior.mat, lb=linear_uprior.lb, ub=linear_uprior.ub ) + return SparseBinomialModel(data, variables, linear_upriors=[linear_uprior]) + @pytest.fixture def encode(variables) -> Callable: diff --git a/tests/test_space.py b/tests/test_space.py index c8c920d..238b165 100644 --- a/tests/test_space.py +++ b/tests/test_space.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest + from spxmod.space import Space @@ -41,24 +42,32 @@ def test_set_span(space, data): def test_encode(space, data): space.set_span(data=data) - mat = space.encode(data=data, column="sdi") + mat = data[["sdi"]].to_numpy() + coords = data[space.dim_names] + mat = space.encode(mat, coords) assert np.allclose(mat.toarray(), np.diag(np.arange(1, 7, dtype=float))[:5]) def test_build_smoothing_prior(space, data): space.set_span(data=data) - prior = space.build_smoothing_prior(lam=1.0, lam_mean=0.0) + prior = space.build_smoothing_prior(size=2, lam=1.0, lam_mean=0.0) assert np.allclose( prior["mat"].toarray(), np.array( [ - [1.0, -1.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 1.0, -1.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 1.0, -1.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 1.0, -1.0], + [1.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 1.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -1.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -1.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -1.0], ] ), ) - assert np.allclose(prior["sd"], np.array([1.0, 1.0, 1.0, 1.0])) + assert np.allclose( + prior["sd"], np.repeat(np.array([1.0, 1.0, 1.0, 1.0]), 2) + ) diff --git a/tests/test_variable_builder.py b/tests/test_variable_builder.py index fce7008..abdbffd 100644 --- a/tests/test_variable_builder.py +++ b/tests/test_variable_builder.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest + from spxmod.space import Space from spxmod.variable_builder import VariableBuilder @@ -63,3 +64,31 @@ def test_encode(data, dimensions): assert np.allclose(mat[:, 0], [1, 0, 0, 1, 0, 0]) assert np.allclose(mat[:, 1], [0, 0, 2, 0, 0, 2]) assert np.allclose(mat[:, 2], [0, 3, 0, 0, 3, 0]) + + +def test_encode_spline_variable(data, dimensions): + space = Space.from_config(dict(dims=[dimensions["loc"], dimensions["age"]])) + space.set_span(data) + spline = dict(knots=[1, 2, 3], degree=2) + var_builder = VariableBuilder( + name="sdi", space=space, lam=1.0, spline=spline + ) + mat = var_builder.encode(data).toarray() + + assert mat.shape == (len(data), var_builder.size) + + spline_mat = var_builder.spline.design_mat(data["sdi"]) + position_mat = np.array( + [ + [1, 0, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 1, 0], + ] + ) + my_mat = np.array( + [np.outer(x, y).ravel() for x, y in zip(position_mat, spline_mat)] + ) + assert np.allclose(my_mat, mat)