From 8c39f9cad08117449c646304d628c43ed271bc33 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 27 Feb 2023 15:02:38 +0100 Subject: [PATCH 01/53] Send email code --- src/mridle/utilities/intervention.py | 48 ++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 src/mridle/utilities/intervention.py diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py new file mode 100644 index 00000000..9a494667 --- /dev/null +++ b/src/mridle/utilities/intervention.py @@ -0,0 +1,48 @@ +import smtplib +import datetime +from email.mime.application import MIMEApplication +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from email.utils import COMMASPACE, formatdate +import configparser + +# Read the configuration file +config = configparser.ConfigParser() +config.read('config.ini') + +# Access the values in the configuration file +username = config['DEFAULT']['username'] +password = config['DEFAULT']['password'] +recipients = config['DEFAULT']['recipients'] + +# create an SMTP object +smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) + +# establish a secure connection +smtp_obj.starttls() + +# login to the email server using your email address and password +smtp_obj.login(username, password) + +# create the email message +msg = MIMEMultipart() +msg['From'] = username +msg['To'] = COMMASPACE.join([recipients]) +msg['Date'] = formatdate(localtime=True) +msg['Subject'] = 'DataFrame as csv attachment' + +body = "Time sent {}".format(datetime.datetime.now()) +msg.attach(MIMEText(body, 'plain')) + +# Add the CSV attachment to the email +with open('/data/mridle/data/silent_live_test/my_dataframe.csv', 'rb') as csv_file: + csv_attachment = MIMEApplication(csv_file.read(), _subtype='csv') + csv_attachment.add_header('Content-Disposition', 'attachment', filename='my_dataframe.csv') + msg.attach(csv_attachment) + + +# send the email +smtp_obj.sendmail(msg['From'], msg['To'], msg.as_string()) + +# close the SMTP connection +smtp_obj.quit() From a1dd9a50f4584c11c8ad171f0d269b1d81d34bf3 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 27 Feb 2023 15:04:41 +0100 Subject: [PATCH 02/53] Send email code --- src/mridle/utilities/intervention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 9a494667..35f63e5f 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -8,7 +8,7 @@ # Read the configuration file config = configparser.ConfigParser() -config.read('config.ini') +config.read('/home/USZ/mcmamacc/config.ini') # Access the values in the configuration file username = config['DEFAULT']['username'] From 9112ef4a9f20d54384f7df90028bba202dfec6af Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 27 Feb 2023 15:15:50 +0100 Subject: [PATCH 03/53] Send email code --- src/mridle/utilities/intervention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 35f63e5f..88d0fd2b 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -42,7 +42,7 @@ # send the email -smtp_obj.sendmail(msg['From'], msg['To'], msg.as_string()) +smtp_obj.sendmail(username, recipients, msg.as_string()) # close the SMTP connection smtp_obj.quit() From efba3ba826a6603df98795174919fb310834f953 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 27 Feb 2023 15:17:28 +0100 Subject: [PATCH 04/53] Send email code --- src/mridle/utilities/intervention.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 88d0fd2b..0e8a2bf9 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -3,7 +3,7 @@ from email.mime.application import MIMEApplication from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText -from email.utils import COMMASPACE, formatdate +from email.utils import formatdate import configparser # Read the configuration file @@ -27,10 +27,9 @@ # create the email message msg = MIMEMultipart() msg['From'] = username -msg['To'] = COMMASPACE.join([recipients]) +msg['To'] = ", ".join(recipients) msg['Date'] = formatdate(localtime=True) msg['Subject'] = 'DataFrame as csv attachment' - body = "Time sent {}".format(datetime.datetime.now()) msg.attach(MIMEText(body, 'plain')) @@ -42,7 +41,7 @@ # send the email -smtp_obj.sendmail(username, recipients, msg.as_string()) +smtp_obj.sendmail(msg['From'], msg['To'], msg.as_string()) # close the SMTP connection smtp_obj.quit() From 930f349cc62463ca7d008ff85a9bdd4bf3a6963a Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 27 Feb 2023 15:18:55 +0100 Subject: [PATCH 05/53] Send email code --- src/mridle/utilities/intervention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 0e8a2bf9..93aaf0ea 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -23,7 +23,7 @@ # login to the email server using your email address and password smtp_obj.login(username, password) - +print(recipients) # create the email message msg = MIMEMultipart() msg['From'] = username @@ -41,7 +41,7 @@ # send the email -smtp_obj.sendmail(msg['From'], msg['To'], msg.as_string()) +smtp_obj.sendmail(username, recipients, msg.as_string()) # close the SMTP connection smtp_obj.quit() From 8c370388912c720d9c22d3eeb9b809dfa3a30d29 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 27 Feb 2023 15:20:27 +0100 Subject: [PATCH 06/53] Send email code --- src/mridle/utilities/intervention.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 93aaf0ea..48774a80 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -13,7 +13,7 @@ # Access the values in the configuration file username = config['DEFAULT']['username'] password = config['DEFAULT']['password'] -recipients = config['DEFAULT']['recipients'] +recipients = ['mark.mcmahon@uzh.ch', 'markronan.mcmahon@usz.ch'] # config['DEFAULT']['recipients'] # create an SMTP object smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) @@ -23,6 +23,7 @@ # login to the email server using your email address and password smtp_obj.login(username, password) +print(username) print(recipients) # create the email message msg = MIMEMultipart() From a50c1a8c1506db4b9cb100cc1b82e3f3f89173a2 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 27 Feb 2023 15:22:45 +0100 Subject: [PATCH 07/53] Send email code --- src/mridle/utilities/intervention.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 48774a80..83a487cc 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -13,7 +13,8 @@ # Access the values in the configuration file username = config['DEFAULT']['username'] password = config['DEFAULT']['password'] -recipients = ['mark.mcmahon@uzh.ch', 'markronan.mcmahon@usz.ch'] # config['DEFAULT']['recipients'] +# recipients = ['mark.mcmahon@uzh.ch', 'markronan.mcmahon@usz.ch'] +recipients = config['DEFAULT']['recipients'].split(',') # create an SMTP object smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) From 91f5d12bc02de01b62cd9cb619ce97e9cff67300 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 27 Feb 2023 17:28:37 +0100 Subject: [PATCH 08/53] Send email code --- src/mridle/pipelines/data_science/feature_engineering/nodes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mridle/pipelines/data_science/feature_engineering/nodes.py b/src/mridle/pipelines/data_science/feature_engineering/nodes.py index ff79e593..4b3fb6af 100644 --- a/src/mridle/pipelines/data_science/feature_engineering/nodes.py +++ b/src/mridle/pipelines/data_science/feature_engineering/nodes.py @@ -201,7 +201,8 @@ def build_feature_set(status_df: pd.DataFrame, valid_date_range: List[str], mast 'distance_to_usz_sq': 'last', 'close_to_usz': 'last', 'times_rescheduled': 'last', - 'start_time': 'last' + 'start_time': 'last', + 'Telefon': 'max' } slot_df = build_slot_df(status_df, valid_date_range, agg_dict, build_future_slots=build_future_slots, From e1f45cf1a94da78465b96877489a14aad8ff902c Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Tue, 28 Feb 2023 17:06:00 +0100 Subject: [PATCH 09/53] Change order of date filename --- src/mridle/utilities/process_live_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index 81692a1f..74e6e6e5 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -201,7 +201,7 @@ def process_live_data(): data_path = '/data/mridle/data/silent_live_test/live_files/all/out/{}'.format(filename_row['filename']) model_dir = '/data/mridle/data/kedro_data_catalog/06_models/' output_path = '/data/mridle/data/silent_live_test/live_files/all/' \ - 'out_features_data/features_{}_{}_{}.csv'.format(out_day, out_month, out_year) + 'out_features_data/features_{}_{}_{}.csv'.format(out_year, out_month, out_day) make_out_prediction(data_path, model_dir, output_path, valid_date_range=out_valid_date_range, file_encoding='utf-16', master_feature_set=master_feature_set, rfs_df=rfs, From 693db30be3784312fa3c39ef9fd66f256abb2f62 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Tue, 28 Feb 2023 17:11:05 +0100 Subject: [PATCH 10/53] Change order of date filename --- src/mridle/utilities/process_live_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index 74e6e6e5..9c411899 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -160,7 +160,7 @@ def process_live_data(): ago_features_df['file'] = filename master_ago_filepath = '/data/mridle/data/silent_live_test/live_files/all/' \ - 'actuals/master_actuals_with_filename.csv' + 'actuals/master_actuals.csv' if os.path.exists(master_ago_filepath): master_ago = pd.read_csv(master_ago_filepath) else: @@ -171,7 +171,7 @@ def process_live_data(): master_ago_updated.to_csv(master_ago_filepath, index=False) ago_features_df.to_csv( - '/data/mridle/data/silent_live_test/live_files/all/actuals/actuals_{}_{}_{}_with_filename.csv'.format( + '/data/mridle/data/silent_live_test/live_files/all/actuals/actuals_{}_{}_{}.csv'.format( ago_day, ago_month, ago_year)) From a4575910d2b81d513d31a7c71e774c3489764fb6 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Tue, 28 Feb 2023 19:49:06 +0100 Subject: [PATCH 11/53] Change order of date filename --- src/mridle/pipelines/data_science/live_data/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/pipelines/data_science/live_data/nodes.py b/src/mridle/pipelines/data_science/live_data/nodes.py index fdc49bb5..6d5459bb 100644 --- a/src/mridle/pipelines/data_science/live_data/nodes.py +++ b/src/mridle/pipelines/data_science/live_data/nodes.py @@ -8,7 +8,7 @@ def get_slt_with_outcome(): '/data/mridle/data/silent_live_test/live_files/all/out_features_data/features_master_slt_features.csv', parse_dates=['start_time', 'end_time']) preds.drop(columns=['NoShow'], inplace=True) - actuals = pd.read_csv('/data/mridle/data/silent_live_test/live_files/all/actuals/master_actuals_with_filename.csv', + actuals = pd.read_csv('/data/mridle/data/silent_live_test/live_files/all/actuals/master_actuals.csv', parse_dates=['start_time', 'end_time']) preds['MRNCmpdId'] = preds['MRNCmpdId'].astype(str) From 33e9cba30e6f6f656d226c7836af25737fde93cf Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 10:50:22 +0100 Subject: [PATCH 12/53] Adding code --- src/mridle/utilities/intervention.py | 142 +++++++++++++++++++-------- 1 file changed, 101 insertions(+), 41 deletions(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 83a487cc..a7c19c34 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -1,3 +1,6 @@ +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages import smtplib import datetime from email.mime.application import MIMEApplication @@ -6,44 +9,101 @@ from email.utils import formatdate import configparser -# Read the configuration file -config = configparser.ConfigParser() -config.read('/home/USZ/mcmamacc/config.ini') - -# Access the values in the configuration file -username = config['DEFAULT']['username'] -password = config['DEFAULT']['password'] -# recipients = ['mark.mcmahon@uzh.ch', 'markronan.mcmahon@usz.ch'] -recipients = config['DEFAULT']['recipients'].split(',') - -# create an SMTP object -smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) - -# establish a secure connection -smtp_obj.starttls() - -# login to the email server using your email address and password -smtp_obj.login(username, password) -print(username) -print(recipients) -# create the email message -msg = MIMEMultipart() -msg['From'] = username -msg['To'] = ", ".join(recipients) -msg['Date'] = formatdate(localtime=True) -msg['Subject'] = 'DataFrame as csv attachment' -body = "Time sent {}".format(datetime.datetime.now()) -msg.attach(MIMEText(body, 'plain')) - -# Add the CSV attachment to the email -with open('/data/mridle/data/silent_live_test/my_dataframe.csv', 'rb') as csv_file: - csv_attachment = MIMEApplication(csv_file.read(), _subtype='csv') - csv_attachment.add_header('Content-Disposition', 'attachment', filename='my_dataframe.csv') - msg.attach(csv_attachment) - - -# send the email -smtp_obj.sendmail(username, recipients, msg.as_string()) - -# close the SMTP connection -smtp_obj.quit() +from mridle.src.mridle.pipelines.data_science.feature_engineering.nodes import add_business_days + + +def intervention(): + """ + df: dataframe with appointments that need to be called for that day. Both intervention and control included . i.e. + just the top 20 (or above threshold...). Should have a col called 'control' indicating if it is control or + intervention. + """ + + filename_date = add_business_days(datetime.datetime.today(), 3).date().strftime('%Y_%m_%d') + filename = '/data/mridle/data/silent_live_test/live_files/all/out_features_data/features_{}.csv'.format( + filename_date) + + preds = pd.read_csv(filename, parse_dates=['start_time']) + preds.rename(columns={"prediction_xgboost": "prediction"}, inplace=True) + preds.drop(columns=[x for x in preds.columns if 'prediction_' in x], inplace=True) + preds.drop(columns=[x for x in preds.columns if 'Unnamed:' in x], inplace=True) + + # Take the top X appts + # preds = preds.sort_values("prediction", ascending=False)[:split_config[day_of_week_from_filename]['num_preds']] + + # Take appts above a certain threshold + preds = preds[preds['prediction'] > 0.185] + + preds['control'] = 'control' + + # use the index of a sampling to change ~50% of the labels to 'intervention' + preds.loc[preds.sample(frac=0.5, replace=False).index, 'control'] = 'intervention' + + intervention_df = preds[preds['control'] == 'intervention'][['MRNCmpdId', 'FillerOrderNo', 'start_time', 'Telefon']] + + # Save the original as csv, and then the intervention one as PDF to be emailed + preds.to_csv("/data/mridle/data/intervention/intervention_{}.csv".format(filename_date), index=False) + + fig, ax = plt.subplots(figsize=(12, 4)) + ax.axis('tight') + ax.axis('off') + ax.table(cellText=intervention_df.values, colLabels=intervention_df.columns, loc='center') + + pp = PdfPages("/data/mridle/data/intervention/intervention_{}.pdf".format(filename_date)) + pp.savefig(fig, bbox_inches='tight') + pp.close() + + # Read the configuration file + config = configparser.ConfigParser() + config.read('/data/mridle/data/intervention/config.ini') + + # Access the values in the configuration file + username = config['DEFAULT']['username'] + password = config['DEFAULT']['password'] + recipients = config['DEFAULT']['recipients'].split(',') + + # create an SMTP object + smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) + + # establish a secure connection + smtp_obj.starttls() + + # login to the email server using your email address and password + smtp_obj.login(username, password) + + # create the email message + msg = MIMEMultipart() + msg['From'] = username + msg['To'] = ", ".join(recipients) + msg['Date'] = formatdate(localtime=True) + msg['Subject'] = 'Intervention Study - {}'.format(filename_date) + body = """ + Dear you, + + Here are the upcoming appointments which we would like to include in the study. + + As discussed, could you please: + + 1. Try to call these patients today and remind them of their appointment + 2. Send me an email with some feedback (i.e. whether you could get talking with the patient, what they said, etc.) \ + in whatever form that suits you. + + Let me know if you have any questions. + + Regards, + Mark + """ + msg.attach(MIMEText(body, 'plain')) + + path_to_pdf = '/data/mridle/data/intervention/intervention_{}.pdf'.format(filename_date) + + with open(path_to_pdf, "rb") as f: + attach = MIMEApplication(f.read(), _subtype="pdf") + attach.add_header('Content-Disposition', 'attachment', filename=str(path_to_pdf)) + msg.attach(attach) + + # send the email + smtp_obj.sendmail(username, recipients, msg.as_string()) + + # close the SMTP connection + smtp_obj.quit() From 27b710be73ff0e57d481a55a183c17ba8fb8f4f7 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 10:51:03 +0100 Subject: [PATCH 13/53] VM changes --- src/mridle/utilities/intervention.oy | 0 src/mridle/utilities/intervention.py | 4 +++- 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 src/mridle/utilities/intervention.oy diff --git a/src/mridle/utilities/intervention.oy b/src/mridle/utilities/intervention.oy new file mode 100644 index 00000000..e69de29b diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 83a487cc..00092fde 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -14,7 +14,7 @@ username = config['DEFAULT']['username'] password = config['DEFAULT']['password'] # recipients = ['mark.mcmahon@uzh.ch', 'markronan.mcmahon@usz.ch'] -recipients = config['DEFAULT']['recipients'].split(',') +recipients = config['DEFAULT']['recipients'].split(', ') # create an SMTP object smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) @@ -47,3 +47,5 @@ # close the SMTP connection smtp_obj.quit() + + From cbf20a5dee179c74bf66be277261e988dbe3fdc5 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 10:58:10 +0100 Subject: [PATCH 14/53] Adding code --- src/mridle/utilities/intervention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index a7c19c34..a290c389 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -9,7 +9,7 @@ from email.utils import formatdate import configparser -from mridle.src.mridle.pipelines.data_science.feature_engineering.nodes import add_business_days +from mridle.pipelines.data_science.feature_engineering.nodes import add_business_days def intervention(): From c2e77ab311ae22e0e28df40ae5d35d4218b4d6b0 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 11:19:31 +0100 Subject: [PATCH 15/53] Adding code --- src/mridle/utilities/intervention.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index a290c389..35de5242 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -12,14 +12,14 @@ from mridle.pipelines.data_science.feature_engineering.nodes import add_business_days -def intervention(): +def intervention(dt): """ df: dataframe with appointments that need to be called for that day. Both intervention and control included . i.e. just the top 20 (or above threshold...). Should have a col called 'control' indicating if it is control or intervention. """ - filename_date = add_business_days(datetime.datetime.today(), 3).date().strftime('%Y_%m_%d') + filename_date = add_business_days(dt, 3).date().strftime('%Y_%m_%d') filename = '/data/mridle/data/silent_live_test/live_files/all/out_features_data/features_{}.csv'.format( filename_date) @@ -107,3 +107,7 @@ def intervention(): # close the SMTP connection smtp_obj.quit() + + +if __name__ == '__main__': + intervention(dt=datetime.datetime.today()) From fa834e6902b0783f113d304f0241f080aaba1045 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 11:22:25 +0100 Subject: [PATCH 16/53] Adding code --- src/mridle/utilities/intervention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 35de5242..96e14013 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -85,7 +85,7 @@ def intervention(dt): As discussed, could you please: 1. Try to call these patients today and remind them of their appointment - 2. Send me an email with some feedback (i.e. whether you could get talking with the patient, what they said, etc.) \ + 2. Send me an email with some feedback (i.e. whether you could get talking with the patient, what they said, etc.)\ in whatever form that suits you. Let me know if you have any questions. From 4302a4651c9ccec21f9a154889a2947df9bd5c27 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 13:17:04 +0100 Subject: [PATCH 17/53] Adding code --- src/mridle/utilities/process_live_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index 9c411899..2940c67d 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -301,11 +301,12 @@ def remove_redundant(df): features_df[f'prediction_{model_name}'] = preds_proba features_df.to_csv(output_path, index=False) + print(features_df.shape) new_appts = features_df.merge(historic_data[['FillerOrderNo', 'start_time']], how='left', indicator=True) new_appts = new_appts[new_appts['_merge'] == 'left_only'] new_appts.drop(columns=['_merge'], inplace=True) - + print(new_appts.shape) master_slt_updated = pd.concat([master_slt, new_appts], axis=0) master_slt_updated.drop_duplicates(inplace=True) master_slt_updated.to_csv(master_slt_filepath, index=False) From 896c8db5326a811c52265eebfcd78d43d6c8fc8e Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 13:47:57 +0100 Subject: [PATCH 18/53] Adding code --- src/mridle/utilities/process_live_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index 2940c67d..2bbaec7b 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -113,7 +113,6 @@ def get_sorted_filenames(file_dir): def process_live_data(): - # process_live_data() function already_processed_filename = '/data/mridle/data/silent_live_test/live_files/already_processed.txt' master_feature_set = pd.read_parquet( '/data/mridle/data/kedro_data_catalog/04_feature/master_feature_set_na_removed.parquet') @@ -274,7 +273,7 @@ def remove_redundant(df): master_slt_filepath = '/data/mridle/data/silent_live_test/live_files/all/' \ 'out_features_data/features_master_slt_features.csv' if os.path.exists(master_slt_filepath): - master_slt = pd.read_parquet('/data/mridle/data/kedro_data_catalog/04_feature/live_data.parquet') + master_slt = pd.read_parquet(master_slt_filepath) else: master_slt = pd.DataFrame() historic_data = pd.concat([master_df, master_slt], axis=0) From fdcf21b8a328c12ee52d1fe0e995cb560137af31 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 13:50:09 +0100 Subject: [PATCH 19/53] Adding code --- src/mridle/utilities/process_live_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index 2bbaec7b..935f3121 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -273,7 +273,7 @@ def remove_redundant(df): master_slt_filepath = '/data/mridle/data/silent_live_test/live_files/all/' \ 'out_features_data/features_master_slt_features.csv' if os.path.exists(master_slt_filepath): - master_slt = pd.read_parquet(master_slt_filepath) + master_slt = pd.read_csv(master_slt_filepath) else: master_slt = pd.DataFrame() historic_data = pd.concat([master_df, master_slt], axis=0) From dac882f50e1400451c28c545a62e8667f95537bb Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 13:53:18 +0100 Subject: [PATCH 20/53] Adding code --- src/mridle/utilities/process_live_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index 935f3121..0f0cc7e6 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -273,7 +273,7 @@ def remove_redundant(df): master_slt_filepath = '/data/mridle/data/silent_live_test/live_files/all/' \ 'out_features_data/features_master_slt_features.csv' if os.path.exists(master_slt_filepath): - master_slt = pd.read_csv(master_slt_filepath) + master_slt = pd.read_csv(master_slt_filepath, parse_dates=['start_time']) else: master_slt = pd.DataFrame() historic_data = pd.concat([master_df, master_slt], axis=0) From f5f27138d39593fd7000458b0299b420b9785529 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 1 Mar 2023 15:18:10 +0100 Subject: [PATCH 21/53] fix csv --- src/mridle/utilities/process_live_data.py | 34 ++++++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index 0f0cc7e6..c1979fb9 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -10,7 +10,8 @@ from dateutil.relativedelta import relativedelta from pathlib import Path import pickle - +import numpy as np +import csv AGO_DIR = '/data/mridle/data/silent_live_test/live_files/all/ago/' OUT_DIR = '/data/mridle/data/silent_live_test/live_files/all/out/' @@ -232,10 +233,11 @@ def remove_redundant(df): & (st_df['now_sched_for_date'] == st_df['was_sched_for_date']))] return st_df - if file_encoding: + try: + raw_df = pd.read_csv(data_path, encoding=file_encoding) + except pd.errors.ParserError: + fix_csv_file(data_path) raw_df = pd.read_csv(data_path, encoding=file_encoding) - else: - raw_df = pd.read_csv(data_path) exclude_pat_ids = list() # TODO! @@ -406,3 +408,27 @@ def get_silent_live_test_actuals(all_columns=True): else: all_actuals = pd.concat([all_actuals, actuals], axis=0) return all_actuals + + +def fix_csv_file(filename_to_fix): + + res = [] + + with open(filename_to_fix, 'r', encoding='utf-16') as read_obj: + # pass the file object to reader() to get the reader object + # csv_reader = reader(read_obj, skipinitialspace=True) + csv_reader = csv.DictReader(read_obj, restkey='ReasonForStudy2') + for row in csv_reader: + # row variable is a list that represents a row in csv + res.append(row) + + res_df = pd.DataFrame(res) + if 'ReasonForStudy2' in res_df.columns: + res_df['ReasonForStudy'] = np.where(res_df['ReasonForStudy2'].isna(), res_df['ReasonForStudy'], + res_df['ReasonForStudy'].astype(str) + res_df['ReasonForStudy2'].astype( + str)) + res_df.drop(columns=['ReasonForStudy2'], inplace=True) + res_df['ReasonForStudy'].replace('"|,', " ", inplace=True) + + res_df.to_csv(filename_to_fix, encoding='utf-16') + return None From 384e34550244bb84c5b09335dd2b410f6810d498 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Thu, 2 Mar 2023 10:35:46 +0100 Subject: [PATCH 22/53] ignore some models --- src/mridle/utilities/process_live_data.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index c1979fb9..3700073f 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -291,15 +291,16 @@ def remove_redundant(df): model_dirs = Path(model_dir).glob('*') for model_dir in model_dirs: - model_paths = model_dir.glob('*') - for model_path in model_paths: - with open(model_path, "rb+") as f: - serialized_model = pickle.load(f) - exp = Experiment.deserialize(serialized_model) - data_set = DataSet(exp.stratified_dataset.config, features_df) - preds_proba = exp.final_predictor.predict_proba(data_set.x) - model_name = exp.metadata.get('name', model_path.name) - features_df[f'prediction_{model_name}'] = preds_proba + if model_dir != '/data/mridle/data/kedro_data_catalog/06_models/xgboost_with_live': + model_paths = model_dir.glob('*') + for model_path in model_paths: + with open(model_path, "rb+") as f: + serialized_model = pickle.load(f) + exp = Experiment.deserialize(serialized_model) + data_set = DataSet(exp.stratified_dataset.config, features_df) + preds_proba = exp.final_predictor.predict_proba(data_set.x) + model_name = exp.metadata.get('name', model_path.name) + features_df[f'prediction_{model_name}'] = preds_proba features_df.to_csv(output_path, index=False) print(features_df.shape) From 1eaf74661e5f8fbfc6aa02fa4c03507d7ec3ed0a Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Thu, 2 Mar 2023 10:38:41 +0100 Subject: [PATCH 23/53] ignore some models --- src/mridle/utilities/process_live_data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index 3700073f..8824cb5a 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -291,7 +291,9 @@ def remove_redundant(df): model_dirs = Path(model_dir).glob('*') for model_dir in model_dirs: - if model_dir != '/data/mridle/data/kedro_data_catalog/06_models/xgboost_with_live': + if str(model_dir) in ['/data/mridle/data/kedro_data_catalog/06_models/xgboost', + '/data/mridle/data/kedro_data_catalog/06_models/random_forest', + '/data/mridle/data/kedro_data_catalog/06_models/logistic_regression']: model_paths = model_dir.glob('*') for model_path in model_paths: with open(model_path, "rb+") as f: From 89790d7108fed840d5cd618a61267c8b5f785781 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Fri, 3 Mar 2023 16:31:59 +0100 Subject: [PATCH 24/53] Fix no show before --- src/mridle/utilities/process_live_data.py | 24 ++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index 8824cb5a..b176bdd2 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -3,6 +3,7 @@ from mridle.pipelines.data_engineering.ris.nodes import build_status_df, prep_raw_df_for_parquet, build_slot_df from mridle.pipelines.data_science.feature_engineering.nodes import remove_na, \ generate_3_5_days_ahead_features, add_business_days, subtract_business_days, feature_no_show_before +from mridle.pipelines.data_science.live_data.nodes import get_slt_with_outcome from mridle.experiment.experiment import Experiment from mridle.experiment.dataset import DataSet import os @@ -272,13 +273,9 @@ def remove_redundant(df): # Get number of previous no shows from historical data and add to data set master_df = master_feature_set.copy() master_df = master_df[master_df['MRNCmpdId'] != 'SMS0016578'] - master_slt_filepath = '/data/mridle/data/silent_live_test/live_files/all/' \ - 'out_features_data/features_master_slt_features.csv' - if os.path.exists(master_slt_filepath): - master_slt = pd.read_csv(master_slt_filepath, parse_dates=['start_time']) - else: - master_slt = pd.DataFrame() - historic_data = pd.concat([master_df, master_slt], axis=0) + + master_slt_with_outcome = get_slt_with_outcome() + historic_data = pd.concat([master_df, master_slt_with_outcome], axis=0) historic_data['MRNCmpdId'] = historic_data['MRNCmpdId'].astype(str) features_df['MRNCmpdId'] = features_df['MRNCmpdId'].astype(str) @@ -311,9 +308,18 @@ def remove_redundant(df): new_appts = new_appts[new_appts['_merge'] == 'left_only'] new_appts.drop(columns=['_merge'], inplace=True) print(new_appts.shape) - master_slt_updated = pd.concat([master_slt, new_appts], axis=0) + + master_slt_feature_filepath = '/data/mridle/data/silent_live_test/live_files/all/' \ + 'out_features_data/features_master_slt_features.csv' + + if os.path.exists(master_slt_feature_filepath): + master_feature_slt = pd.read_csv(master_slt_feature_filepath, parse_dates=['start_time']) + else: + master_feature_slt = pd.DataFrame() + + master_slt_updated = pd.concat([master_feature_slt, new_appts], axis=0) master_slt_updated.drop_duplicates(inplace=True) - master_slt_updated.to_csv(master_slt_filepath, index=False) + master_slt_updated.to_csv(master_slt_feature_filepath, index=False) def get_silent_live_test_predictions(model_str='prediction_xgboost', all_columns=True): From fc10165405c6e23765f586c795329c6b7cfda0d8 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Fri, 3 Mar 2023 16:38:48 +0100 Subject: [PATCH 25/53] Fix no show before --- src/mridle/utilities/process_live_data.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index b176bdd2..dd8a8cbc 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -274,7 +274,16 @@ def remove_redundant(df): master_df = master_feature_set.copy() master_df = master_df[master_df['MRNCmpdId'] != 'SMS0016578'] - master_slt_with_outcome = get_slt_with_outcome() + master_slt_feature_filepath = '/data/mridle/data/silent_live_test/live_files/all/' \ + 'out_features_data/features_master_slt_features.csv' + + if os.path.exists(master_slt_feature_filepath): + master_slt_with_outcome = get_slt_with_outcome() + master_feature_slt = pd.read_csv(master_slt_feature_filepath, parse_dates=['start_time']) + else: + master_slt_with_outcome = pd.DataFrame() + master_feature_slt = pd.DataFrame() + historic_data = pd.concat([master_df, master_slt_with_outcome], axis=0) historic_data['MRNCmpdId'] = historic_data['MRNCmpdId'].astype(str) @@ -309,14 +318,6 @@ def remove_redundant(df): new_appts.drop(columns=['_merge'], inplace=True) print(new_appts.shape) - master_slt_feature_filepath = '/data/mridle/data/silent_live_test/live_files/all/' \ - 'out_features_data/features_master_slt_features.csv' - - if os.path.exists(master_slt_feature_filepath): - master_feature_slt = pd.read_csv(master_slt_feature_filepath, parse_dates=['start_time']) - else: - master_feature_slt = pd.DataFrame() - master_slt_updated = pd.concat([master_feature_slt, new_appts], axis=0) master_slt_updated.drop_duplicates(inplace=True) master_slt_updated.to_csv(master_slt_feature_filepath, index=False) From 027e0849a509c73788d8b5eb1ae244b22b6df91f Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Fri, 3 Mar 2023 17:33:06 +0100 Subject: [PATCH 26/53] Fix no show before --- src/mridle/utilities/process_live_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index dd8a8cbc..e57e00d7 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -317,7 +317,8 @@ def remove_redundant(df): new_appts = new_appts[new_appts['_merge'] == 'left_only'] new_appts.drop(columns=['_merge'], inplace=True) print(new_appts.shape) - + print(features_df[features_df['MRNCmpdId'] == '60184934'][['MRNCmpdId', 'FillerOrderNo', 'no_show_before', + 'start_time', 'no_show_rate', 'NoShow']]) master_slt_updated = pd.concat([master_feature_slt, new_appts], axis=0) master_slt_updated.drop_duplicates(inplace=True) master_slt_updated.to_csv(master_slt_feature_filepath, index=False) From 784ee3f079c2cff76a2a17202101a99225d29b4b Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Fri, 3 Mar 2023 17:48:01 +0100 Subject: [PATCH 27/53] Fix no show before --- src/mridle/utilities/process_live_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index e57e00d7..e535f87d 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -317,8 +317,8 @@ def remove_redundant(df): new_appts = new_appts[new_appts['_merge'] == 'left_only'] new_appts.drop(columns=['_merge'], inplace=True) print(new_appts.shape) - print(features_df[features_df['MRNCmpdId'] == '60184934'][['MRNCmpdId', 'FillerOrderNo', 'no_show_before', - 'start_time', 'no_show_rate', 'NoShow']]) + # print(features_df[features_df['MRNCmpdId'] == '60184934'][['MRNCmpdId', 'FillerOrderNo', 'no_show_before', + # 'start_time', 'no_show_rate', 'NoShow']]) master_slt_updated = pd.concat([master_feature_slt, new_appts], axis=0) master_slt_updated.drop_duplicates(inplace=True) master_slt_updated.to_csv(master_slt_feature_filepath, index=False) From a3a82ef28aca1849e6f0bf70f49f761a630f59b4 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 6 Mar 2023 17:13:04 +0100 Subject: [PATCH 28/53] XGB config --- conf/base/parameters.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/conf/base/parameters.yml b/conf/base/parameters.yml index cff23cfc..e96cba55 100644 --- a/conf/base/parameters.yml +++ b/conf/base/parameters.yml @@ -382,17 +382,17 @@ models: flavor: DataSet config: features: - - 'no_show_before' + # - 'no_show_before' - 'appts_before' - - 'show_before' + # - 'show_before' - 'no_show_rate' - 'sched_days_advanced' - - 'month' + # - 'month' - 'age' - 'modality' - 'occupation' - 'reason' - - 'sex' + # - 'sex' - 'hour_sched' - 'distance_to_usz' - 'day_of_week_str' @@ -417,14 +417,14 @@ models: with_mean: True args: columns: - - 'no_show_before' + # - 'no_show_before' - 'sched_days_advanced' - 'age' - 'hour_sched' - 'distance_to_usz' - 'times_rescheduled' - 'appts_before' - - 'show_before' + # - 'show_before' - 'no_show_rate' - name: 'onehot' flavor: sklearn.preprocessing.OneHotEncoder @@ -437,13 +437,13 @@ models: - 'reason' - 'modality' - 'day_of_week_str' - - name: 'cyc' - flavor: mridle.utilities.modeling.CyclicalTransformer - config: - period: 12 - args: - columns: - - 'month' + #- name: 'cyc' + # flavor: mridle.utilities.modeling.CyclicalTransformer + # config: + # period: 12 + # args: + # columns: + # - 'month' - flavor: XGBClassifier name: 'classifier' config: From b3408ed2fa5f0c337faa07b58374595da28c497f Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 8 Mar 2023 11:33:20 +0100 Subject: [PATCH 29/53] Change filename --- src/mridle/utilities/intervention.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 96e14013..436681ce 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -78,7 +78,7 @@ def intervention(dt): msg['Date'] = formatdate(localtime=True) msg['Subject'] = 'Intervention Study - {}'.format(filename_date) body = """ - Dear you, + Dear Namka, Here are the upcoming appointments which we would like to include in the study. @@ -86,7 +86,7 @@ def intervention(dt): 1. Try to call these patients today and remind them of their appointment 2. Send me an email with some feedback (i.e. whether you could get talking with the patient, what they said, etc.)\ - in whatever form that suits you. + in whichever form suits you. Let me know if you have any questions. @@ -99,7 +99,7 @@ def intervention(dt): with open(path_to_pdf, "rb") as f: attach = MIMEApplication(f.read(), _subtype="pdf") - attach.add_header('Content-Disposition', 'attachment', filename=str(path_to_pdf)) + attach.add_header('Content-Disposition', 'attachment', filename=str(filename_date)) msg.attach(attach) # send the email From edecdbc5f277783c68f5487e3c20d1614e6ba3ec Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 8 Mar 2023 11:35:41 +0100 Subject: [PATCH 30/53] Change filename --- src/mridle/utilities/intervention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 436681ce..afda4365 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -99,7 +99,7 @@ def intervention(dt): with open(path_to_pdf, "rb") as f: attach = MIMEApplication(f.read(), _subtype="pdf") - attach.add_header('Content-Disposition', 'attachment', filename=str(filename_date)) + attach.add_header('Content-Disposition', 'attachment', filename='Intervention_{}'.format(filename_date)) msg.attach(attach) # send the email From 45fd770ca92afd48923cd5d886e68be2b72fbe86 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 8 Mar 2023 11:43:58 +0100 Subject: [PATCH 31/53] Change --- src/mridle/utilities/intervention.py | 32 +++++++++++++++------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index afda4365..e4a6448e 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -19,6 +19,17 @@ def intervention(dt): intervention. """ + # Read the configuration file + config = configparser.ConfigParser() + config.read('/data/mridle/data/intervention/config.ini') + + # Access the values in the configuration file + username = config['DEFAULT']['username'] + password = config['DEFAULT']['password'] + recipients = config['DEFAULT']['recipients'].split(',') + threshold = float(config['DEFAULT']['threshold']) + + today = dt.strftime('%Y_%m_%d') filename_date = add_business_days(dt, 3).date().strftime('%Y_%m_%d') filename = '/data/mridle/data/silent_live_test/live_files/all/out_features_data/features_{}.csv'.format( filename_date) @@ -32,7 +43,7 @@ def intervention(dt): # preds = preds.sort_values("prediction", ascending=False)[:split_config[day_of_week_from_filename]['num_preds']] # Take appts above a certain threshold - preds = preds[preds['prediction'] > 0.185] + preds = preds[preds['prediction'] > threshold] preds['control'] = 'control' @@ -42,26 +53,17 @@ def intervention(dt): intervention_df = preds[preds['control'] == 'intervention'][['MRNCmpdId', 'FillerOrderNo', 'start_time', 'Telefon']] # Save the original as csv, and then the intervention one as PDF to be emailed - preds.to_csv("/data/mridle/data/intervention/intervention_{}.csv".format(filename_date), index=False) + preds.to_csv("/data/mridle/data/intervention/intervention_{}.csv".format(today), index=False) fig, ax = plt.subplots(figsize=(12, 4)) ax.axis('tight') ax.axis('off') ax.table(cellText=intervention_df.values, colLabels=intervention_df.columns, loc='center') - pp = PdfPages("/data/mridle/data/intervention/intervention_{}.pdf".format(filename_date)) + pp = PdfPages("/data/mridle/data/intervention/intervention_{}.pdf".format(today)) pp.savefig(fig, bbox_inches='tight') pp.close() - # Read the configuration file - config = configparser.ConfigParser() - config.read('/data/mridle/data/intervention/config.ini') - - # Access the values in the configuration file - username = config['DEFAULT']['username'] - password = config['DEFAULT']['password'] - recipients = config['DEFAULT']['recipients'].split(',') - # create an SMTP object smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) @@ -76,7 +78,7 @@ def intervention(dt): msg['From'] = username msg['To'] = ", ".join(recipients) msg['Date'] = formatdate(localtime=True) - msg['Subject'] = 'Intervention Study - {}'.format(filename_date) + msg['Subject'] = 'Intervention Study - {}'.format(today) body = """ Dear Namka, @@ -95,11 +97,11 @@ def intervention(dt): """ msg.attach(MIMEText(body, 'plain')) - path_to_pdf = '/data/mridle/data/intervention/intervention_{}.pdf'.format(filename_date) + path_to_pdf = '/data/mridle/data/intervention/intervention_{}.pdf'.format(today) with open(path_to_pdf, "rb") as f: attach = MIMEApplication(f.read(), _subtype="pdf") - attach.add_header('Content-Disposition', 'attachment', filename='Intervention_{}'.format(filename_date)) + attach.add_header('Content-Disposition', 'attachment', filename='Intervention_{}'.format(today)) msg.attach(attach) # send the email From e45571654c1504508e9f5f74c415fd6e1c260c82 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 8 Mar 2023 15:21:11 +0100 Subject: [PATCH 32/53] Change occupation feature --- .../data_science/feature_engineering/nodes.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/mridle/pipelines/data_science/feature_engineering/nodes.py b/src/mridle/pipelines/data_science/feature_engineering/nodes.py index 4b3fb6af..990707fa 100644 --- a/src/mridle/pipelines/data_science/feature_engineering/nodes.py +++ b/src/mridle/pipelines/data_science/feature_engineering/nodes.py @@ -629,10 +629,6 @@ def feature_occupation(df): df_remap.loc[df_remap['Beruf'] == 'nan', 'occupation'] = 'none_given' df_remap.loc[df_remap['Beruf'] == '-', 'occupation'] = 'none_given' - df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='rentner|Renter|pensioniert|pens.|rente'), - 'occupation'] = 'retired' - df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='keine Angaben|keine Ang'), - 'occupation'] = 'none_given' df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='Angestellte|ang.|baue|angest.|Hauswart|dozent|designer|^KV$|' 'masseu|Raumpflegerin|Apothekerin|Ing.|fotog|Psycholog|' @@ -650,14 +646,24 @@ def feature_occupation(df): 'ingenieur|Kauf|mitarbeiter|Verkäufer|Informatiker|koch|' 'lehrer|arbeiter|architekt'), 'occupation'] = 'employed' + df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='rentner|Renter|pensioniert|pens.|rente'), + 'occupation'] = 'retired' + df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='IV-Rentner'), + 'occupation'] = 'iv_retired' + + df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='keine Angaben|keine Ang'), + 'occupation'] = 'none_given' + df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='student|Schüler|Doktorand|' 'Kind|Stud.|Ausbildung|^MA$'), 'occupation'] = 'student' df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='^IV$|^IV-Bezüger|^$|arbeitslos|ohne Arbeit|' 'ohne|o.A.|nicht Arbeitstätig|' 'Sozialhilfeempfänger|o. Arbeit|keine Arbeit|' - 'Asyl|RAV|Hausfrau|Hausmann'), + 'Asyl|RAV'), 'occupation'] = 'unemployed' + df_remap.loc[ + df_remap['Beruf'].apply(regex_search, search_str='Hausfrau|Hausmann'), 'occupation'] = 'stay_at_home_parent' df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='selbst'), 'occupation'] = 'self_employed' df_remap.loc[df_remap['Beruf'].apply(regex_search, search_str='arzt|aerzt|ärzt|pflegefachfrau|Pflegehelfer|' 'MTRA|Erzieherin|Fachfrau Betreuung|' @@ -668,7 +674,9 @@ def feature_occupation(df): df_remap.loc[df_remap['occupation'] == '', 'occupation'] = 'other' df_remap.loc[df_remap['occupation'].isna(), 'occupation'] = 'other' + df_remap = df_remap.drop('Beruf', axis=1) + return df_remap From f0c55cbf82a8db2c72c8519684d16aceb3e49678 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 8 Mar 2023 17:21:18 +0100 Subject: [PATCH 33/53] Fix telefon --- src/mridle/pipelines/data_science/feature_engineering/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/pipelines/data_science/feature_engineering/nodes.py b/src/mridle/pipelines/data_science/feature_engineering/nodes.py index 990707fa..fae12966 100644 --- a/src/mridle/pipelines/data_science/feature_engineering/nodes.py +++ b/src/mridle/pipelines/data_science/feature_engineering/nodes.py @@ -202,7 +202,7 @@ def build_feature_set(status_df: pd.DataFrame, valid_date_range: List[str], mast 'close_to_usz': 'last', 'times_rescheduled': 'last', 'start_time': 'last', - 'Telefon': 'max' + 'Telefon': 'last' } slot_df = build_slot_df(status_df, valid_date_range, agg_dict, build_future_slots=build_future_slots, From 7e2b42304d3900010c54394ae88f4c316b482c5a Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Thu, 9 Mar 2023 11:16:01 +0100 Subject: [PATCH 34/53] Fix telefon --- .../pipelines/data_science/feature_engineering/nodes.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mridle/pipelines/data_science/feature_engineering/nodes.py b/src/mridle/pipelines/data_science/feature_engineering/nodes.py index fae12966..5a3a232e 100644 --- a/src/mridle/pipelines/data_science/feature_engineering/nodes.py +++ b/src/mridle/pipelines/data_science/feature_engineering/nodes.py @@ -15,6 +15,13 @@ def daterange(date1, date2): yield date1 + timedelta(n) +def get_last_non_na(x): + if x.last_valid_index() is None: + return '0' + else: + return x[x.last_valid_index()] + + def generate_training_data(status_df, valid_date_range, append_outcome=True, add_no_show_before=True): """ Build data for use in models by trying to replicate the conditions under which the model would be used in reality @@ -202,7 +209,7 @@ def build_feature_set(status_df: pd.DataFrame, valid_date_range: List[str], mast 'close_to_usz': 'last', 'times_rescheduled': 'last', 'start_time': 'last', - 'Telefon': 'last' + 'Telefon': lambda x: get_last_non_na(x) } slot_df = build_slot_df(status_df, valid_date_range, agg_dict, build_future_slots=build_future_slots, From dad5aaca44923a3d11e20a51ff647e79320b11f2 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Thu, 9 Mar 2023 15:17:16 +0100 Subject: [PATCH 35/53] Fix telefon --- conf/base/parameters.yml | 4 +-- src/mridle/experiment/stratifier.py | 25 ++++++++++++++++++- .../data_science/feature_engineering/nodes.py | 6 ++++- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/conf/base/parameters.yml b/conf/base/parameters.yml index e96cba55..3da5bd71 100644 --- a/conf/base/parameters.yml +++ b/conf/base/parameters.yml @@ -400,9 +400,9 @@ models: - 'times_rescheduled' target: NoShow Stratifier: - flavor: PartitionedLabelStratifier + flavor: PartitionedFeatureStratifier config: - n_partitions: 5 + split_feature: 'year' Architecture: flavor: Pipeline config: diff --git a/src/mridle/experiment/stratifier.py b/src/mridle/experiment/stratifier.py index ed1ba078..1c082578 100644 --- a/src/mridle/experiment/stratifier.py +++ b/src/mridle/experiment/stratifier.py @@ -58,7 +58,7 @@ def materialize_partition(self, partition_id: int, data_set: DataSet) -> Tuple[p class PartitionedLabelStratifier(Stratifier): def partition_data(self, data_set: DataSet) -> List[Tuple[List[int], List[int]]]: - """Randomly shuffle and split the doc_list into n_partitions roughly equal lists, stratified by label.""" + """Randomly shuffle and split the data_set into n_partitions roughly equal lists, stratified by label.""" label_list = data_set.y skf = StratifiedKFold(n_splits=self.config['n_partitions'], random_state=42, shuffle=True) x = np.zeros(len(label_list)) # split takes a X argument for backwards compatibility and is not used @@ -100,6 +100,29 @@ def validate_config(cls, config): return True +class PartitionedFeatureStratifier(Stratifier): + + def partition_data(self, data_set: DataSet) -> List[Tuple[List[int], List[int]]]: + """Split dataset by feature values of provided column.""" + data_set_copy = data_set.data.copy() + data_set_copy = data_set_copy.reset_index() + label_list = data_set_copy[self.config['split_feature']].unique() + partitions = [] + for l_id, f_label in enumerate(label_list): + print(f_label) + train_ids = np.array(data_set_copy[data_set_copy[self.config['split_feature']] != f_label].index) + test_ids = np.array(data_set_copy[data_set_copy[self.config['split_feature']] == f_label].index) + partitions.append([train_ids, test_ids]) + return partitions + + @classmethod + def validate_config(cls, config): + for key in ['split_feature', ]: + if key not in config: + raise ValueError(f"{cls.__name__} config must contain entry '{key}'.") + return True + + class StratifierInterface(ComponentInterface): registered_flavors = { diff --git a/src/mridle/pipelines/data_science/feature_engineering/nodes.py b/src/mridle/pipelines/data_science/feature_engineering/nodes.py index 5a3a232e..e0871217 100644 --- a/src/mridle/pipelines/data_science/feature_engineering/nodes.py +++ b/src/mridle/pipelines/data_science/feature_engineering/nodes.py @@ -225,7 +225,7 @@ def build_feature_set(status_df: pd.DataFrame, valid_date_range: List[str], mast slot_df = feature_cyclical_month(slot_df) slot_df = slot_df[slot_df['day_of_week_str'].isin(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'])] slot_df = slot_df[slot_df['sched_days_advanced'] > 2] - + slot_df = limit_to_day_hours(slot_df) return slot_df @@ -731,5 +731,9 @@ def feature_duration(dicom_df: pd.DataFrame) -> pd.DataFrame: return dicom_df +def limit_to_day_hours(df): + return df[(df['hour_sched'] < 18) & (df['hour_sched'] > 6)] + + def regex_search(x, search_str): return bool(re.search(search_str, x, re.IGNORECASE)) From 7225f1bbe3601e9d5d065ecf07285da66696eb8f Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Thu, 9 Mar 2023 15:20:25 +0100 Subject: [PATCH 36/53] Partition by features --- src/mridle/experiment/stratifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mridle/experiment/stratifier.py b/src/mridle/experiment/stratifier.py index 1c082578..6acda4be 100644 --- a/src/mridle/experiment/stratifier.py +++ b/src/mridle/experiment/stratifier.py @@ -126,6 +126,7 @@ def validate_config(cls, config): class StratifierInterface(ComponentInterface): registered_flavors = { + 'PartitionedFeatureStratifier': PartitionedFeatureStratifier, 'PartitionedLabelStratifier': PartitionedLabelStratifier, 'TrainTestStratifier': TrainTestStratifier, } From 63fb08c5b4deab0e7d129e6b002d399a90427390 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Thu, 9 Mar 2023 15:21:37 +0100 Subject: [PATCH 37/53] Partition by features --- .../data_science/feature_engineering/nodes.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/mridle/pipelines/data_science/feature_engineering/nodes.py b/src/mridle/pipelines/data_science/feature_engineering/nodes.py index e0871217..5c55e96d 100644 --- a/src/mridle/pipelines/data_science/feature_engineering/nodes.py +++ b/src/mridle/pipelines/data_science/feature_engineering/nodes.py @@ -216,6 +216,7 @@ def build_feature_set(status_df: pd.DataFrame, valid_date_range: List[str], mast include_id_cols=True) slot_df = feature_days_scheduled_in_advance(status_df, slot_df) + slot_df = feature_year(slot_df) slot_df = feature_month(slot_df) slot_df = feature_hour_sched(slot_df) slot_df = feature_day_of_week(slot_df) @@ -282,6 +283,20 @@ def feature_month(slot_df: pd.DataFrame) -> pd.DataFrame: return slot_df +def feature_year(slot_df: pd.DataFrame) -> pd.DataFrame: + """ + Append the year feature to the dataframe. + + Args: + slot_df: A dataframe containing appointment slots. + + Returns: A row-per-status-change dataframe with additional column 'year'. + + """ + slot_df['month'] = slot_df['start_time'].dt.year + return slot_df + + def feature_hour_sched(slot_df: pd.DataFrame) -> pd.DataFrame: """ Append the hour_sched feature to the dataframe using was_sched_for_date. From 4b94dd65249de4f80956697b892540d338149223 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Thu, 9 Mar 2023 16:46:07 +0100 Subject: [PATCH 38/53] Partition by features --- src/mridle/pipelines/data_science/feature_engineering/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/pipelines/data_science/feature_engineering/nodes.py b/src/mridle/pipelines/data_science/feature_engineering/nodes.py index 5c55e96d..a9938013 100644 --- a/src/mridle/pipelines/data_science/feature_engineering/nodes.py +++ b/src/mridle/pipelines/data_science/feature_engineering/nodes.py @@ -293,7 +293,7 @@ def feature_year(slot_df: pd.DataFrame) -> pd.DataFrame: Returns: A row-per-status-change dataframe with additional column 'year'. """ - slot_df['month'] = slot_df['start_time'].dt.year + slot_df['year'] = slot_df['start_time'].dt.year return slot_df From cb86ff407b88e829d58e3aab0b9bfb61ab4ae534 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 20 Mar 2023 17:02:43 +0100 Subject: [PATCH 39/53] Add feedback fileg --- src/mridle/utilities/intervention.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index e4a6448e..dd3759e1 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -64,6 +64,11 @@ def intervention(dt): pp.savefig(fig, bbox_inches='tight') pp.close() + for_feedback_file = intervention_df[['MRNCmpdId', 'FillerOrderNo', 'start_time']] + feedback_file = "/data/mridle/data/intervention/feedback.txt" + with open(feedback_file, 'a') as ap_f: + ap_f.write(f'\n{for_feedback_file}') + # create an SMTP object smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) From c742aa9f0f90dea8738e5655324f1f8d6b2749fd Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 20 Mar 2023 17:05:02 +0100 Subject: [PATCH 40/53] Add feedback fileg --- src/mridle/utilities/intervention.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index dd3759e1..9ca9777f 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -8,6 +8,7 @@ from email.mime.text import MIMEText from email.utils import formatdate import configparser +import numpy as np from mridle.pipelines.data_science.feature_engineering.nodes import add_business_days @@ -67,7 +68,7 @@ def intervention(dt): for_feedback_file = intervention_df[['MRNCmpdId', 'FillerOrderNo', 'start_time']] feedback_file = "/data/mridle/data/intervention/feedback.txt" with open(feedback_file, 'a') as ap_f: - ap_f.write(f'\n{for_feedback_file}') + np.savetxt(ap_f, for_feedback_file.values, fmt='%d') # create an SMTP object smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) From db26f5cf36ea23be343b223428e61d8c10e92e8f Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 20 Mar 2023 17:07:58 +0100 Subject: [PATCH 41/53] Add feedback fileg --- src/mridle/utilities/intervention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 9ca9777f..4acff6f6 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -68,7 +68,7 @@ def intervention(dt): for_feedback_file = intervention_df[['MRNCmpdId', 'FillerOrderNo', 'start_time']] feedback_file = "/data/mridle/data/intervention/feedback.txt" with open(feedback_file, 'a') as ap_f: - np.savetxt(ap_f, for_feedback_file.values, fmt='%d') + np.savetxt(ap_f, for_feedback_file.values) # create an SMTP object smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) From a38de7db3fc5449c1494eb1e3af53579d3680c7a Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 20 Mar 2023 17:13:43 +0100 Subject: [PATCH 42/53] Add feedback fileg --- src/mridle/utilities/intervention.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 4acff6f6..6ac7618d 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -8,7 +8,6 @@ from email.mime.text import MIMEText from email.utils import formatdate import configparser -import numpy as np from mridle.pipelines.data_science.feature_engineering.nodes import add_business_days @@ -67,8 +66,8 @@ def intervention(dt): for_feedback_file = intervention_df[['MRNCmpdId', 'FillerOrderNo', 'start_time']] feedback_file = "/data/mridle/data/intervention/feedback.txt" - with open(feedback_file, 'a') as ap_f: - np.savetxt(ap_f, for_feedback_file.values) + with open(feedback_file, 'a') as f: + for_feedback_file.to_csv(f, header=False, index=False, sep=',') # create an SMTP object smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) From ab8e09e601403ea5ee234629da5605a414ebddd9 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Tue, 11 Apr 2023 14:41:50 +0200 Subject: [PATCH 43/53] S --- src/mridle/experiment/dataset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mridle/experiment/dataset.py b/src/mridle/experiment/dataset.py index 28e5b721..9a58961c 100644 --- a/src/mridle/experiment/dataset.py +++ b/src/mridle/experiment/dataset.py @@ -36,8 +36,12 @@ def validate_config(config, data): if col not in data.columns: raise ValueError(f'Feature column {col} not found in dataset.') - if config['target'] not in data.columns: - raise ValueError(f"Target column {config['target']} not found in dataset.") + if isinstance(config['target'], str): + if config['target'] not in data.columns: + raise ValueError(f"Target column {config['target']} not found in dataset.") + elif isinstance(config['target'], list): + if not set(config['target']).issubset(data.columns): + raise ValueError(f"Target columns {config['target']} not found in dataset.") return True From b01c945f0c6b100107f295eed269d2fd6455b2bc Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Tue, 11 Apr 2023 14:45:00 +0200 Subject: [PATCH 44/53] S --- src/mridle/experiment/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mridle/experiment/dataset.py b/src/mridle/experiment/dataset.py index 9a58961c..1d3281b3 100644 --- a/src/mridle/experiment/dataset.py +++ b/src/mridle/experiment/dataset.py @@ -41,7 +41,8 @@ def validate_config(config, data): raise ValueError(f"Target column {config['target']} not found in dataset.") elif isinstance(config['target'], list): if not set(config['target']).issubset(data.columns): - raise ValueError(f"Target columns {config['target']} not found in dataset.") + not_in_list = list(set(config['target']).difference(data.columns)) + raise ValueError(f"Target columns {not_in_list} not found in dataset.") return True From 1ba2c3a81f7862b67432680619c9ad86f5c61959 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Thu, 20 Apr 2023 09:11:05 +0200 Subject: [PATCH 45/53] Add emir to email message --- src/mridle/utilities/intervention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 6ac7618d..642d9ac0 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -85,7 +85,7 @@ def intervention(dt): msg['Date'] = formatdate(localtime=True) msg['Subject'] = 'Intervention Study - {}'.format(today) body = """ - Dear Namka, + Dear Namka, Dear Emir, Here are the upcoming appointments which we would like to include in the study. From 6cd0c356f2327c67cf591e77a2a4a2a3c2810dfa Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 8 May 2023 14:24:40 +0200 Subject: [PATCH 46/53] Auto send results --- src/mridle/utilities/intervention.py | 68 ++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 642d9ac0..a6c71b0b 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -8,6 +8,7 @@ from email.mime.text import MIMEText from email.utils import formatdate import configparser +import os from mridle.pipelines.data_science.feature_engineering.nodes import add_business_days @@ -116,5 +117,72 @@ def intervention(dt): smtp_obj.quit() +def send_results(): + """ + Sends updated results by email to me + + """ + all_actuals = pd.read_csv("/data/mridle/data/silent_live_test/live_files/all/actuals/master_actuals.csv", + parse_dates=['start_time']) + most_recent_data = all_actuals['start_time'].max() + file_dir = '/data/mridle/data/intervention/' + + intervention_df = pd.DataFrame() + + for filename in os.listdir(file_dir): + if filename.endswith(".csv"): + i_df = pd.read_csv(os.path.join(file_dir, filename), parse_dates=['start_time']) + i_df['file'] = filename + intervention_df = pd.concat([intervention_df, i_df]) + + intervention_df = intervention_df[intervention_df['start_time'] < most_recent_data] + intervention_df.drop(columns=['NoShow'], inplace=True) + intervention_df = intervention_df.merge(all_actuals[['FillerOrderNo', 'start_time', 'NoShow']], how='left') + feedback = pd.read_csv('/data/mridle/data/intervention/feedback.txt', sep=",") + feedback['start_time'] = pd.to_datetime(feedback['start_time']) + intervention_df = intervention_df.merge(feedback, how='left') + intervention_df = intervention_df[intervention_df['feedback'] != 'appt not found'] + intervention_df['NoShow'].fillna(False, inplace=True) + intervention_df.loc[intervention_df['control'] == 'control', 'feedback'] = 'control' + r_1 = intervention_df.groupby('control').agg({'NoShow': ['count', 'sum', 'mean']}).reset_index() + r_2 = intervention_df.groupby(['control', 'feedback']).agg({'NoShow': ['count', 'sum', 'mean']}).reset_index() + + # Read the configuration file + config = configparser.ConfigParser() + config.read('/data/mridle/data/intervention/config.ini') + + # Access the values in the configuration file + username = config['DEFAULT']['username'] + password = config['DEFAULT']['password'] + + # create an SMTP object + smtp_obj = smtplib.SMTP('outlook.usz.ch', 587) + + # establish a secure connection + smtp_obj.starttls() + + # login to the email server using your email address and password + smtp_obj.login(username, password) + + # create the email message + msg = MIMEMultipart() + msg['From'] = username + msg['To'] = 'markronan.mcmahon@usz.ch' + msg['Date'] = formatdate(localtime=True) + msg['Subject'] = 'Intervention results - {}'.format(datetime.datetime.today().strftime('%Y_%m_%d')) + body = """ + {} + + {} + """.format(r_1.to_html(), r_2.to_html()) + msg.attach(MIMEText(body, 'html')) + + # send the email + smtp_obj.sendmail(username, username, msg.as_string()) + + # close the SMTP connection + smtp_obj.quit() + + if __name__ == '__main__': intervention(dt=datetime.datetime.today()) From 4fbe212f2e8988c7323cfbf3c973dcefa317a42a Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 15 May 2023 11:49:56 +0200 Subject: [PATCH 47/53] Some appts were too late to call - remove these --- src/mridle/utilities/intervention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index a6c71b0b..0ed9ad55 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -141,7 +141,7 @@ def send_results(): feedback = pd.read_csv('/data/mridle/data/intervention/feedback.txt', sep=",") feedback['start_time'] = pd.to_datetime(feedback['start_time']) intervention_df = intervention_df.merge(feedback, how='left') - intervention_df = intervention_df[intervention_df['feedback'] != 'appt not found'] + intervention_df = intervention_df[~(intervention_df['feedback'].isin(['appt not found', 'delete']))] intervention_df['NoShow'].fillna(False, inplace=True) intervention_df.loc[intervention_df['control'] == 'control', 'feedback'] = 'control' r_1 = intervention_df.groupby('control').agg({'NoShow': ['count', 'sum', 'mean']}).reset_index() From d6e4a4c125cb54700c5050dbed956716cb7fb365 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Tue, 16 May 2023 10:22:18 +0200 Subject: [PATCH 48/53] S --- src/mridle/utilities/intervention.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 0ed9ad55..987d785a 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -45,6 +45,11 @@ def intervention(dt): # Take appts above a certain threshold preds = preds[preds['prediction'] > threshold] + # We don't want overlap (i.e. appts being in both the Monday batch of calls as well as the thursday batch, so for + # the Thursday batch we remove the appts for following Thursday (which would be the day where a potential overlap + # would occur) + if dt.strftime("%A") == 'Thursday': + preds = preds[preds['start_time'].dt.strftime("%A") != "Thursday"] preds['control'] = 'control' @@ -144,6 +149,10 @@ def send_results(): intervention_df = intervention_df[~(intervention_df['feedback'].isin(['appt not found', 'delete']))] intervention_df['NoShow'].fillna(False, inplace=True) intervention_df.loc[intervention_df['control'] == 'control', 'feedback'] = 'control' + + # remove duplicates (some appts were included in both monday and thursday's intervention/control group) + intervention_df = intervention_df.sort_values('file').groupby(['FillerOrderNo', 'start_time']).last().reset_index() + r_1 = intervention_df.groupby('control').agg({'NoShow': ['count', 'sum', 'mean']}).reset_index() r_2 = intervention_df.groupby(['control', 'feedback']).agg({'NoShow': ['count', 'sum', 'mean']}).reset_index() From 8b9eeb77331c71b7575766cddc56c3d1b58cf4f4 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Thu, 1 Jun 2023 16:47:04 +0200 Subject: [PATCH 49/53] Add rf as flavor --- src/mridle/experiment/architecture.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mridle/experiment/architecture.py b/src/mridle/experiment/architecture.py index b71dbaad..1f4f290a 100644 --- a/src/mridle/experiment/architecture.py +++ b/src/mridle/experiment/architecture.py @@ -2,7 +2,7 @@ import skorch from sklearn.base import BaseEstimator -from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer @@ -34,6 +34,7 @@ class ArchitectureInterface(ComponentInterface): registered_flavors = { 'RandomForestClassifier': RandomForestClassifier, # TODO enable auto-loading from sklearn + 'RandomForestRegressor': RandomForestRegressor, # TODO enable auto-loading from sklearn 'LogisticRegression': LogisticRegression, 'XGBClassifier': xgb.XGBClassifier, 'Pipeline': Pipeline, From 22dd445e528961f1fa734dccc086b544ab47913b Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 5 Jun 2023 11:21:18 +0200 Subject: [PATCH 50/53] Adding metrics --- src/mridle/experiment/metric.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mridle/experiment/metric.py b/src/mridle/experiment/metric.py index bc11f93b..e1ca9052 100644 --- a/src/mridle/experiment/metric.py +++ b/src/mridle/experiment/metric.py @@ -135,6 +135,9 @@ class MetricInterface(ComponentInterface): 'AUPRC': AUPRC, 'AUROC': AUROC, 'LogLoss': LogLoss, + 'MSE': MSE, + 'RMSE': RMSE, + 'MedianAbsoluteError': MedianAbsoluteError } @classmethod From 70e0d52ca9e52fe8a349285b903a2769c7451e73 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Mon, 5 Jun 2023 11:22:31 +0200 Subject: [PATCH 51/53] Adding metrics --- src/mridle/experiment/metric.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mridle/experiment/metric.py b/src/mridle/experiment/metric.py index e1ca9052..d61f6d77 100644 --- a/src/mridle/experiment/metric.py +++ b/src/mridle/experiment/metric.py @@ -135,6 +135,7 @@ class MetricInterface(ComponentInterface): 'AUPRC': AUPRC, 'AUROC': AUROC, 'LogLoss': LogLoss, + 'MAE': MAE, 'MSE': MSE, 'RMSE': RMSE, 'MedianAbsoluteError': MedianAbsoluteError From e87a20a37d41dbd4c961b62995977910468156e8 Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Wed, 28 Jun 2023 10:06:43 +0200 Subject: [PATCH 52/53] More results --- src/mridle/utilities/intervention.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/mridle/utilities/intervention.py b/src/mridle/utilities/intervention.py index 987d785a..afb76323 100644 --- a/src/mridle/utilities/intervention.py +++ b/src/mridle/utilities/intervention.py @@ -152,9 +152,23 @@ def send_results(): # remove duplicates (some appts were included in both monday and thursday's intervention/control group) intervention_df = intervention_df.sort_values('file').groupby(['FillerOrderNo', 'start_time']).last().reset_index() + intervention_df['reached'] = intervention_df['feedback'].map({ + 'control': 'control', + 'comes': 'reached', + 'not reached': 'not reached', + 'to be rescheduled': 'reached' + }) + intervention_df['reached_2'] = intervention_df['feedback'].map({ + 'control': 'control', + 'comes': 'reached', + 'not reached': 'control', + 'to be rescheduled': 'reached' + }) r_1 = intervention_df.groupby('control').agg({'NoShow': ['count', 'sum', 'mean']}).reset_index() r_2 = intervention_df.groupby(['control', 'feedback']).agg({'NoShow': ['count', 'sum', 'mean']}).reset_index() + r_3 = intervention_df.groupby(['reached']).agg({'NoShow': ['count', 'sum', 'mean']}) + r_4 = intervention_df.groupby(['reached_2']).agg({'NoShow': ['count', 'sum', 'mean']}) # Read the configuration file config = configparser.ConfigParser() @@ -183,7 +197,11 @@ def send_results(): {} {} - """.format(r_1.to_html(), r_2.to_html()) + + {} + + {} + """.format(r_1.to_html(), r_2.to_html(), r_3.to_html(), r_4.to_html()) msg.attach(MIMEText(body, 'html')) # send the email From 77288f12c118507fb8c10397cb89446721080d4a Mon Sep 17 00:00:00 2001 From: Mark McMahon Date: Tue, 8 Aug 2023 14:23:34 +0200 Subject: [PATCH 53/53] Fix testsg --- src/mridle/utilities/process_live_data.py | 2 +- .../pipelines/data_engineering/test_feature_engineering.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mridle/utilities/process_live_data.py b/src/mridle/utilities/process_live_data.py index e535f87d..e43c8648 100644 --- a/src/mridle/utilities/process_live_data.py +++ b/src/mridle/utilities/process_live_data.py @@ -55,7 +55,7 @@ def get_slt_features_delete_if_ok_to_do_so(): rfs_df = pd.read_csv('/data/mridle/data/silent_live_test/live_files/all/' 'retrospective_reasonforstudy/content/[dbo].[MRIdle_retrospective].csv') - rfs_df[['FillerOrderNo', 'ReasonForStudy']].drop_duplicates() + rfs_df[['FillerOrderNo', 'ReasonForStudy']].drop_duplicates(inplace=True) # add on proper noshow ago_st = get_slt_status_data('ago') diff --git a/src/tests/pipelines/data_engineering/test_feature_engineering.py b/src/tests/pipelines/data_engineering/test_feature_engineering.py index b64ddf33..09f59053 100644 --- a/src/tests/pipelines/data_engineering/test_feature_engineering.py +++ b/src/tests/pipelines/data_engineering/test_feature_engineering.py @@ -543,6 +543,7 @@ def test_future_appointments_one_row(self): raw_df = self._fill_out_static_columns(raw_df, create_fon=True) status_df = build_status_df(raw_df, exclude_patient_ids=[]) + status_df['Telefon'] = '0' feature_df = build_feature_set(status_df, valid_date_range) cols = [c for c in expected_feature_df.columns.values] feature_df = feature_df.loc[:, feature_df.columns.isin(cols)] @@ -569,6 +570,7 @@ def test_future_appointments_moved_forward(self): raw_df = self._fill_out_static_columns(raw_df, create_fon=True) status_df = build_status_df(raw_df, exclude_patient_ids=[]) + status_df['Telefon'] = '0' test_dt = day(2) feature_df = generate_3_5_days_ahead_features(status_df, test_dt) @@ -600,6 +602,7 @@ def test_future_appointments_rescheduled(self): raw_df = self._fill_out_static_columns(raw_df, create_fon=True) status_df = build_status_df(raw_df, exclude_patient_ids=[]) + status_df['Telefon'] = '0' feature_df = generate_training_data(status_df, valid_date_range) cols = [c for c in expected_feature_df.columns.values] @@ -701,6 +704,7 @@ def test_multiple_appts(self): raw_df = pd.concat([raw_df_1, raw_df_2, raw_df_3, raw_df_4], axis=0) status_df = build_status_df(raw_df, exclude_patient_ids=[]) + status_df['Telefon'] = '0' # Set valid date range to as if we were generating this data on day(12) test_vdr = [pd.Timestamp(year=2019, month=1, day=1, hour=0, minute=0), @@ -780,6 +784,7 @@ def test_appointment_one_row(self): raw_df = self._fill_out_static_columns(raw_df, create_fon=True) status_df = build_status_df(raw_df, exclude_patient_ids=[]) + status_df['Telefon'] = '0' model_data_df = generate_training_data(status_df, valid_date_range) @@ -887,6 +892,7 @@ def test_multiple_appts(self): raw_df = pd.concat([raw_df_1, raw_df_2, raw_df_3, raw_df_4], axis=0) status_df = build_status_df(raw_df, exclude_patient_ids=[]) + status_df['Telefon'] = '0' # Set valid date range to as if we were generating this data on day(12) test_vdr = [pd.Timestamp(year=2019, month=1, day=1, hour=0, minute=0),