Skip to content

Commit

Permalink
Merge pull request #128 from uzh-dqbm-cmi/model_with_live_data
Browse files Browse the repository at this point in the history
Pipeline
  • Loading branch information
mcmahom5 committed Feb 27, 2023
2 parents d1d6241 + 9ec585a commit 18b2220
Show file tree
Hide file tree
Showing 10 changed files with 114 additions and 39 deletions.
10 changes: 10 additions & 0 deletions conf/base/catalog/live_data.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,13 @@ live_data:
type: pandas.ParquetDataSet
filepath: /data/mridle/data/kedro_data_catalog/04_feature/live_data.parquet
layer: feature

train_data_with_live:
type: pandas.CSVDataSet
filepath: /data/mridle/data/kedro_data_catalog/05_model_input/train_data_with_live.csv
layer: model input

val_data_with_live:
type: pandas.CSVDataSet
filepath: /data/mridle/data/kedro_data_catalog/05_model_input/val_data_with_live.csv
layer: model input
1 change: 0 additions & 1 deletion conf/base/catalog/xgboost.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,3 @@ xgboost_model_metrics_plot:
type: mridle.extras.datasets.altair_dataset.AltairDataSet
filepath: /data/mridle/data/kedro_data_catalog/08_reporting/xgboost/xgboost_model_metrics_plot.html
layer: reporting

9 changes: 9 additions & 0 deletions conf/base/catalog/xgboost_with_live.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
xgboost_model_with_live:
type: pickle.PickleDataSet
filepath: /data/mridle/data/kedro_data_catalog/06_models/xgboost_with_live/xgboost_model_with_live.pkl
layer: models

xgboost_model_results_with_live:
type: pandas.CSVDataSet
filepath: /data/mridle/data/kedro_data_catalog/07_model_output/xgboost_with_live/xgboost_model_results_with_live.csv
layer: model output
76 changes: 42 additions & 34 deletions conf/base/parameters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,9 @@ models:
config:
features:
- 'no_show_before'
- 'appts_before'
- 'show_before'
- 'no_show_rate'
- 'sched_days_advanced'
- 'month'
- 'age'
Expand All @@ -394,6 +397,7 @@ models:
- 'distance_to_usz'
- 'day_of_week_str'
- 'marital'
- 'times_rescheduled'
target: NoShow
Stratifier:
flavor: PartitionedLabelStratifier
Expand All @@ -418,6 +422,10 @@ models:
- 'age'
- 'hour_sched'
- 'distance_to_usz'
- 'times_rescheduled'
- 'appts_before'
- 'show_before'
- 'no_show_rate'
- name: 'onehot'
flavor: sklearn.preprocessing.OneHotEncoder
config:
Expand Down Expand Up @@ -453,40 +461,40 @@ models:
grow_policy: 'depthwise' # less overfit w/ hist vs. lossguide
Trainer:
flavor: Trainer
Tuner:
flavor: BayesianTuner
config:
hyperparameters:
classifier__n_estimators:
parse_hp_uniformint:
start: 10
end: 400
classifier__max_depth:
parse_hp_uniformint:
start: 2
end: 20
classifier__learning_rate:
parse_hp_loguniform:
start: -5.5
end: -1.5
classifier__gamma:
parse_hp_uniform:
start: 0
end: 0.1
classifier__subsample:
parse_hp_uniform:
start: 0.5
end: 1
classifier__reg_lambda:
parse_hp_loguniform:
start: -3
end: 2
search_type: bayesian
num_cv_folds: 5
num_iters: 50 # 150
scoring_function: log_loss
verbose: 0
hyperopt_timeout: 10000000
# Tuner:
# flavor: BayesianTuner
# config:
# hyperparameters:
# classifier__n_estimators:
# parse_hp_uniformint:
# start: 10
# end: 400
# classifier__max_depth:
# parse_hp_uniformint:
# start: 2
# end: 10
# classifier__learning_rate:
# parse_hp_loguniform:
# start: -5.5
# end: -1.5
# classifier__gamma:
# parse_hp_uniform:
# start: 0
# end: 0.1
# classifier__subsample:
# parse_hp_uniform:
# start: 0.5
# end: 1
# classifier__reg_lambda:
# parse_hp_loguniform:
# start: -3
# end: 2
# search_type: bayesian
# num_cv_folds: 5
# num_iters: 50 # 150
# scoring_function: log_loss
# verbose: 0
# hyperopt_timeout: 10000000
Metrics:
- flavor: F1_Macro
config:
Expand Down
6 changes: 5 additions & 1 deletion src/mridle/pipeline_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from kedro.pipeline import Pipeline
from mridle.pipelines.data_engineering import ris, dicom, dispo
from mridle.pipelines.data_science import harvey, feature_engineering, descriptive_viz, random_forest, xgboost, \
logistic_regression, neural_net, model_comparison
logistic_regression, neural_net, model_comparison, live_data, xgboost_with_live


def register_pipelines() -> Dict[str, Pipeline]:
Expand All @@ -47,10 +47,12 @@ def register_pipelines() -> Dict[str, Pipeline]:
dispo_pipeline = dispo.create_pipeline()
descriptive_viz_pipeline = descriptive_viz.create_pipeline()
feature_engineering_pipeline = feature_engineering.create_pipeline()
live_data_pipeline = live_data.create_pipeline()
harvey_pipeline = harvey.create_pipeline()
logistic_regression_pipeline = logistic_regression.create_pipeline()
random_forest_pipeline = random_forest.create_pipeline()
xgboost_pipeline = xgboost.create_pipeline()
xgboost_with_live_pipeline = xgboost_with_live.create_pipeline()
neural_net_pipeline = neural_net.create_pipeline()
model_comparison_pipeline = model_comparison.create_pipeline()

Expand All @@ -66,10 +68,12 @@ def register_pipelines() -> Dict[str, Pipeline]:
"dispo": dispo_pipeline,
"descriptive_viz": descriptive_viz_pipeline,
"feature_engineering": feature_engineering_pipeline,
"live_data": live_data_pipeline,
"harvey": harvey_pipeline,
"logistic_regression": logistic_regression_pipeline,
"random_forest": random_forest_pipeline,
"xgboost": xgboost_pipeline,
"xgboost_with_live": xgboost_with_live_pipeline,
"neural_net": neural_net_pipeline,
"model_comparison": model_comparison_pipeline,
"models": harvey_pipeline + logistic_regression_pipeline + random_forest_pipeline + xgboost_pipeline
Expand Down
25 changes: 24 additions & 1 deletion src/mridle/pipelines/data_science/live_data/nodes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
import numpy as np
import datetime


def get_slt_with_outcome():
Expand All @@ -17,6 +18,28 @@ def get_slt_with_outcome():
how='left')
slt_with_outcome['NoShow'].fillna(False, inplace=True)

most_recent_actuals = np.max(actuals['start_time']) # .date()
most_recent_actuals = np.max(actuals['start_time'])
slt_with_outcome = slt_with_outcome[slt_with_outcome['start_time'] <= most_recent_actuals]
return slt_with_outcome


def concat_master_data(master_feature_set_na_removed, live_data, today=None):
    """Split live data at a date cutoff and build training / validation sets.

    The cutoff is five weeks before the Monday of the week containing
    ``today``. Live rows whose ``start_time`` date is strictly before the
    cutoff are concatenated with the master feature set to form the training
    data; live rows on or after the cutoff are returned as validation data.

    Args:
        master_feature_set_na_removed: DataFrame holding the master feature set.
        live_data: DataFrame of live data. Must contain a datetime-like
            ``start_time`` column (it is accessed through ``.dt``).
        today: Optional ``datetime.date`` used as the reference day for the
            cutoff. Defaults to ``datetime.date.today()``; pass a fixed date
            to make the split reproducible.

    Returns:
        Tuple ``(train_data_with_live, val_data_with_live)``. The training
        frame only keeps columns present in both inputs (``join="inner"``);
        the validation frame keeps all live-data columns.
    """
    mfs_df = master_feature_set_na_removed.copy()
    l_df = live_data.copy()

    # Cast the shared columns of the master set to the live-data dtypes so the
    # concatenation below does not end up with mixed dtypes per column.
    for col in set(l_df.columns) & set(mfs_df.columns):
        mfs_df[col] = mfs_df[col].astype(l_df[col].dtype.name)

    # Read the clock once so both uses below refer to the same day.
    if today is None:
        today = datetime.date.today()
    monday_this_week = today - datetime.timedelta(days=today.weekday())
    five_weeks_ago = monday_this_week - datetime.timedelta(weeks=5)

    appt_dates = l_df['start_time'].dt.date
    is_validation = appt_dates >= five_weeks_ago

    live_data_train = l_df[appt_dates < five_weeks_ago]
    val_data_with_live = l_df[is_validation]

    train_data_with_live = pd.concat([mfs_df, live_data_train], join="inner")

    return train_data_with_live, val_data_with_live
10 changes: 8 additions & 2 deletions src/mridle/pipelines/data_science/live_data/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
from kedro.pipeline import Pipeline, node
from .nodes import get_slt_with_outcome
from .nodes import get_slt_with_outcome, concat_master_data


def create_pipeline(**kwargs):
return Pipeline(
[
node(
func=get_slt_with_outcome,
inputs=[""],
inputs=[],
outputs="live_data",
name="get_slt_with_outcome",
),
node(
func=concat_master_data,
inputs=["master_feature_set_na_removed", 'live_data'],
outputs=["train_data_with_live", "val_data_with_live"],
name="concat_master_data",
)
]
)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .pipeline import create_pipeline # NOQA
Empty file.
15 changes: 15 additions & 0 deletions src/mridle/pipelines/data_science/xgboost_with_live/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from kedro.pipeline import Pipeline, node
from mridle.utilities.modeling import run_experiment


def create_pipeline(**kwargs):
    """Assemble the single-node pipeline for the XGBoost model with live data.

    The node calls ``run_experiment`` with the ``train_data_with_live`` dataset
    and the ``params:models.xgboost`` parameters, and writes its two outputs to
    ``xgboost_model_with_live`` and ``xgboost_model_results_with_live``.

    Args:
        **kwargs: Accepted but not used by this function.

    Returns:
        A kedro ``Pipeline`` containing the one training node.
    """
    training_node = node(
        func=run_experiment,
        inputs=["train_data_with_live", "params:models.xgboost"],
        outputs=["xgboost_model_with_live", "xgboost_model_results_with_live"],
        name="train_xgboost_model_with_live",
    )
    return Pipeline([training_node])

0 comments on commit 18b2220

Please sign in to comment.