Merge pull request #128 from uzh-dqbm-cmi/model_with_live_data

Pipeline
uzh-dqbm-cmi · Feb 27, 2023 · 18b2220 · 18b2220
2 parents d1d6241 + 9ec585a
commit 18b2220
Show file tree

Hide file tree

Showing 10 changed files with 114 additions and 39 deletions.
diff --git a/conf/base/catalog/live_data.yml b/conf/base/catalog/live_data.yml
@@ -2,3 +2,13 @@ live_data:
   type: pandas.ParquetDataSet
   filepath: /data/mridle/data/kedro_data_catalog/04_feature/live_data.parquet
   layer: feature
+
+train_data_with_live:
+  type: pandas.CSVDataSet
+  filepath: /data/mridle/data/kedro_data_catalog/05_model_input/train_data_with_live.csv
+  layer: model input
+
+val_data_with_live:
+  type: pandas.CSVDataSet
+  filepath: /data/mridle/data/kedro_data_catalog/05_model_input/val_data_with_live.csv
+  layer: model input
diff --git a/conf/base/catalog/xgboost.yml b/conf/base/catalog/xgboost.yml
@@ -17,4 +17,3 @@ xgboost_model_metrics_plot:
   type: mridle.extras.datasets.altair_dataset.AltairDataSet
   filepath: /data/mridle/data/kedro_data_catalog/08_reporting/xgboost/xgboost_model_metrics_plot.html
   layer: reporting
-
diff --git a/conf/base/catalog/xgboost_with_live.yml b/conf/base/catalog/xgboost_with_live.yml
@@ -0,0 +1,9 @@
+xgboost_model_with_live:
+  type: pickle.PickleDataSet
+  filepath: /data/mridle/data/kedro_data_catalog/06_models/xgboost_with_live/xgboost_model_with_live.pkl
+  layer: models
+
+xgboost_model_results_with_live:
+  type: pandas.CSVDataSet
+  filepath: /data/mridle/data/kedro_data_catalog/07_model_output/xgboost_with_live/xgboost_model_results_with_live.csv
+  layer: model output
diff --git a/conf/base/parameters.yml b/conf/base/parameters.yml
@@ -383,6 +383,9 @@ models:
             config:
                 features:
                     - 'no_show_before'
+                    - 'appts_before'
+                    - 'show_before'
+                    - 'no_show_rate'
                     - 'sched_days_advanced'
                     - 'month'
                     - 'age'
@@ -394,6 +397,7 @@ models:
                     - 'distance_to_usz'
                     - 'day_of_week_str'
                     - 'marital'
+                    - 'times_rescheduled'
                 target: NoShow
         Stratifier:
             flavor: PartitionedLabelStratifier
@@ -418,6 +422,10 @@ models:
                                   - 'age'
                                   - 'hour_sched'
                                   - 'distance_to_usz'
+                                  - 'times_rescheduled'
+                                  - 'appts_before'
+                                  - 'show_before'
+                                  - 'no_show_rate'
                           - name: 'onehot'
                             flavor: sklearn.preprocessing.OneHotEncoder
                             config:
@@ -453,40 +461,40 @@ models:
                         grow_policy: 'depthwise' # less overfit w/ hist vs. lossguide
         Trainer:
             flavor: Trainer
-        Tuner:
-            flavor: BayesianTuner
-            config:
-                hyperparameters:
-                    classifier__n_estimators:
-                        parse_hp_uniformint:
-                            start: 10
-                            end: 400
-                    classifier__max_depth:
-                        parse_hp_uniformint:
-                            start: 2
-                            end: 20
-                    classifier__learning_rate:
-                        parse_hp_loguniform:
-                            start: -5.5
-                            end: -1.5
-                    classifier__gamma:
-                        parse_hp_uniform:
-                            start: 0
-                            end: 0.1
-                    classifier__subsample:
-                        parse_hp_uniform:
-                            start: 0.5
-                            end: 1
-                    classifier__reg_lambda:
-                        parse_hp_loguniform:
-                            start: -3
-                            end: 2
-                search_type: bayesian
-                num_cv_folds: 5
-                num_iters: 50  # 150
-                scoring_function: log_loss
-                verbose: 0
-                hyperopt_timeout: 10000000
+#        Tuner:
+#            flavor: BayesianTuner
+#            config:
+#                hyperparameters:
+#                    classifier__n_estimators:
+#                        parse_hp_uniformint:
+#                            start: 10
+#                            end: 400
+#                    classifier__max_depth:
+#                        parse_hp_uniformint:
+#                            start: 2
+#                            end: 10
+#                    classifier__learning_rate:
+#                        parse_hp_loguniform:
+#                            start: -5.5
+#                            end: -1.5
+#                    classifier__gamma:
+#                        parse_hp_uniform:
+#                            start: 0
+#                            end: 0.1
+#                    classifier__subsample:
+#                        parse_hp_uniform:
+#                            start: 0.5
+#                            end: 1
+#                    classifier__reg_lambda:
+#                        parse_hp_loguniform:
+#                            start: -3
+#                            end: 2
+#                search_type: bayesian
+#                num_cv_folds: 5
+#                num_iters: 50  # 150
+#                scoring_function: log_loss
+#                verbose: 0
+#                hyperopt_timeout: 10000000
         Metrics:
             - flavor: F1_Macro
               config:

diff --git a/src/mridle/pipeline_registry.py b/src/mridle/pipeline_registry.py
@@ -32,7 +32,7 @@
 from kedro.pipeline import Pipeline
 from mridle.pipelines.data_engineering import ris, dicom, dispo
 from mridle.pipelines.data_science import harvey, feature_engineering, descriptive_viz, random_forest, xgboost, \
-    logistic_regression, neural_net, model_comparison
+    logistic_regression, neural_net, model_comparison, live_data, xgboost_with_live
 
 
 def register_pipelines() -> Dict[str, Pipeline]:
@@ -47,10 +47,12 @@ def register_pipelines() -> Dict[str, Pipeline]:
     dispo_pipeline = dispo.create_pipeline()
     descriptive_viz_pipeline = descriptive_viz.create_pipeline()
     feature_engineering_pipeline = feature_engineering.create_pipeline()
+    live_data_pipeline = live_data.create_pipeline()
     harvey_pipeline = harvey.create_pipeline()
     logistic_regression_pipeline = logistic_regression.create_pipeline()
     random_forest_pipeline = random_forest.create_pipeline()
     xgboost_pipeline = xgboost.create_pipeline()
+    xgboost_with_live_pipeline = xgboost_with_live.create_pipeline()
     neural_net_pipeline = neural_net.create_pipeline()
     model_comparison_pipeline = model_comparison.create_pipeline()
 
@@ -66,10 +68,12 @@ def register_pipelines() -> Dict[str, Pipeline]:
         "dispo": dispo_pipeline,
         "descriptive_viz": descriptive_viz_pipeline,
         "feature_engineering": feature_engineering_pipeline,
+        "live_data": live_data_pipeline,
         "harvey": harvey_pipeline,
         "logistic_regression": logistic_regression_pipeline,
         "random_forest": random_forest_pipeline,
         "xgboost": xgboost_pipeline,
+        "xgboost_with_live": xgboost_with_live_pipeline,
         "neural_net": neural_net_pipeline,
         "model_comparison": model_comparison_pipeline,
         "models": harvey_pipeline + logistic_regression_pipeline + random_forest_pipeline + xgboost_pipeline

diff --git a/src/mridle/pipelines/data_science/live_data/nodes.py b/src/mridle/pipelines/data_science/live_data/nodes.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import datetime
 
 
 def get_slt_with_outcome():
@@ -17,6 +18,28 @@ def get_slt_with_outcome():
                                    how='left')
     slt_with_outcome['NoShow'].fillna(False, inplace=True)
 
-    most_recent_actuals = np.max(actuals['start_time'])  # .date()
+    most_recent_actuals = np.max(actuals['start_time'])
     slt_with_outcome = slt_with_outcome[slt_with_outcome['start_time'] <= most_recent_actuals]
     return slt_with_outcome
+
+
+def concat_master_data(master_feature_set_na_removed, live_data):
+    """Concat live data from before the cutoff (Monday of the current week minus five weeks) with the master feature
+    set, keeping only the columns both share, as training data. Live data from the cutoff onwards is validation data"""
+
+    mfs_df = master_feature_set_na_removed.copy()
+    l_df = live_data.copy()
+
+    for col in list(set(l_df.columns) & set(mfs_df.columns)):
+
+        mfs_df[col] = mfs_df[col].astype(l_df[col].dtypes.name)
+
+    last_monday = datetime.date.today() + datetime.timedelta(days=-datetime.date.today().weekday())
+    five_weeks_ago = last_monday - datetime.timedelta(weeks=5)
+
+    live_data_train = l_df[l_df['start_time'].dt.date < five_weeks_ago]
+    val_data_with_live = l_df[l_df['start_time'].dt.date >= five_weeks_ago]
+
+    train_data_with_live = pd.concat([mfs_df, live_data_train], join="inner")
+
+    return train_data_with_live, val_data_with_live
diff --git a/src/mridle/pipelines/data_science/live_data/pipeline.py b/src/mridle/pipelines/data_science/live_data/pipeline.py
@@ -1,15 +1,21 @@
 from kedro.pipeline import Pipeline, node
-from .nodes import get_slt_with_outcome
+from .nodes import get_slt_with_outcome, concat_master_data
 
 
 def create_pipeline(**kwargs):
     return Pipeline(
         [
             node(
                 func=get_slt_with_outcome,
-                inputs=[""],
+                inputs=[],
                 outputs="live_data",
                 name="get_slt_with_outcome",
+            ),
+            node(
+                func=concat_master_data,
+                inputs=["master_feature_set_na_removed", 'live_data'],
+                outputs=["train_data_with_live", "val_data_with_live"],
+                name="concat_master_data",
             )
         ]
     )
diff --git a/src/mridle/pipelines/data_science/xgboost_with_live/__init__.py b/src/mridle/pipelines/data_science/xgboost_with_live/__init__.py
@@ -0,0 +1 @@
+from .pipeline import create_pipeline  # NOQA
diff --git a/src/mridle/pipelines/data_science/xgboost_with_live/nodes.py b/src/mridle/pipelines/data_science/xgboost_with_live/nodes.py
diff --git a/src/mridle/pipelines/data_science/xgboost_with_live/pipeline.py b/src/mridle/pipelines/data_science/xgboost_with_live/pipeline.py
@@ -0,0 +1,15 @@
+from kedro.pipeline import Pipeline, node
+from mridle.utilities.modeling import run_experiment
+
+
+def create_pipeline(**kwargs):
+    return Pipeline(
+        [
+            node(
+                func=run_experiment,
+                inputs=["train_data_with_live", "params:models.xgboost"],
+                outputs=["xgboost_model_with_live", "xgboost_model_results_with_live"],
+                name="train_xgboost_model_with_live"
+            )
+        ]
+    )