-
Notifications
You must be signed in to change notification settings - Fork 2
/
stacking.py
124 lines (102 loc) · 5.01 KB
/
stacking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import numpy as np
import pandas as pd
from utility import *
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
def stacking_regression(models, df_encoded, y_train, n_linear, ver_no, meta_model=None, n_folds=5, average_fold=True):
    '''
    Build out-of-fold stacking features from 1-st level models and either
    save them or train a 2-nd level meta model and write a submission file.

    Parameters
    ----------
    models : list
        List of 1-st level models. Any model following the sklearn convention
        (methods 'fit' and 'predict' on numpy arrays) can be used.
        Must be ordered as
        [tree model 1, ..., tree model n, linear model 1, ..., linear model m];
        the last 'n_linear' models are trained on the normalized features.
    df_encoded : dataframe
        Train set (without label) and test set together. Rows with
        'Id' <= 1460 are the train set, rows with 'Id' > 1460 the test set.
    y_train : series, dataframe or array-like
        Target values for the train rows, in the same row order. A
        single-column dataframe is accepted and flattened to 1-D.
    n_linear : int
        Number of linear models at the end of 'models'.
    ver_no : int
        Version number for bookkeeping; used in the output file names.
    meta_model : model, default None
        If None, no 2nd layer is trained; the stacking features are saved in
        './stacking_submissions/raw/'.
        If a model is provided, it is fitted on the stacking features and its
        test predictions are saved in './stacking_submissions/'.
    n_folds : int, default 5
        Number of folds in cross-validation.
    average_fold : boolean, default True
        If True, the test-set feature of a model is the average of the
        predictions made by each fold's model.
        If False, the model is refitted on the whole train set and that fit
        predicts the test set. NOTE: in this case the model object passed in
        'models' is itself fitted (it is not cloned).

    Returns
    -------
    None. Results are written to CSV files.

    Raises
    ------
    ValueError
        If 'n_linear' is not between 0 and len(models).
    '''
    # Local import: only needed to make sure the output directories exist.
    import os

    n_models = len(models)
    if not 0 <= n_linear <= n_models:
        raise ValueError(
            'n_linear must be between 0 and len(models) (%d), got %r'
            % (n_models, n_linear))

    # Metric used to display.
    metric = mean_squared_error

    # Highest 'Id' that belongs to the train set.
    last_train_id = 1460
    train_mask = df_encoded['Id'] <= last_train_id
    test_mask = df_encoded['Id'] > last_train_id

    # Keep the real test ids so the submission rows are labelled from the data
    # rather than from a hard-coded range.
    test_ids = df_encoded.loc[test_mask, 'Id'].values.astype(int)

    # Non-inplace drop: an inplace drop on a filtered frame triggers pandas'
    # SettingWithCopyWarning.
    X_train_df = df_encoded[train_mask].drop('Id', axis=1)
    X_test_df = df_encoded[test_mask].drop('Id', axis=1)

    # Linear models get the output of normalizeDf (helper from 'utility';
    # presumably feature scaling - confirm against its definition).
    X_train_lin_df, X_test_lin_df = normalizeDf(X_train_df, X_test_df)

    # Plain ndarrays are indexed positionally, which is what KFold yields.
    X_train_tree = X_train_df.values
    X_test_tree = X_test_df.values
    X_train_lin = X_train_lin_df.values
    X_test_lin = X_test_lin_df.values

    # Flatten the target to a 1-D array so positional fold indices work for a
    # Series, a single-column DataFrame or a plain sequence alike.
    y_values = np.asarray(y_train).ravel()

    # The split depends only on the number of rows and the fixed seed, so the
    # folds are identical for every model: compute them once.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    folds = list(kf.split(X_train_tree))

    n_train_rows = X_train_tree.shape[0]
    n_test_rows = X_test_tree.shape[0]

    # Stacking features: one column per 1-st level model.
    S_train = np.zeros((n_train_rows, n_models))
    S_test = np.zeros((n_test_rows, n_models))

    first_linear = n_models - n_linear

    # Loop across models
    for model_counter, model in enumerate(models):
        print('model %d: [%s]' % (model_counter+1, model.__class__.__name__))

        # The trailing n_linear models use the normalized features.
        if model_counter >= first_linear:
            X_fit, X_pred = X_train_lin, X_test_lin
        else:
            X_fit, X_pred = X_train_tree, X_test_tree

        # Per-fold test predictions are only needed when they get averaged.
        S_test_temp = np.zeros((n_test_rows, n_folds)) if average_fold else None

        # Loop across folds
        for fold_counter, (tr_index, te_index) in enumerate(folds):
            # Clone the model because fit will mutate the model.
            instance = clone(model)
            # Fit 1-st level model on the in-fold part of the train set
            instance.fit(X_fit[tr_index], y_values[tr_index])
            # Predict out-of-fold part of train set
            S_train[te_index, model_counter] = instance.predict(X_fit[te_index])
            if average_fold:
                # Predict full test set
                S_test_temp[:, fold_counter] = instance.predict(X_pred)
            # Drop the temporary model
            del instance

        if average_fold:
            # Average the per-fold predictions for the test set
            S_test[:, model_counter] = np.mean(S_test_temp, axis=1)
        else:
            # Refit on the whole train set; this fits the caller's model object.
            model.fit(X_fit, y_values)
            S_test[:, model_counter] = model.predict(X_pred)

        print(' ----')
        # The value printed under the 'MEAN' label is the mean squared error of
        # the out-of-fold predictions.
        print(' MEAN: [%.8f]\n' % (metric(y_values, S_train[:, model_counter])))

    # Identity check instead of truthiness: unfitted sklearn ensembles define
    # __len__ over 'estimators_', so 'not meta_model' can raise AttributeError.
    if meta_model is None:
        print('Logging model results in ./stacking_submissions/raw/')
        os.makedirs('./stacking_submissions/raw/', exist_ok=True)
        # Save S_train, S_test for later use
        S_tr = pd.DataFrame(S_train)
        S_te = pd.DataFrame(S_test)
        S_tr.to_csv('./stacking_submissions/raw/stack_raw_train'+str(ver_no)+'.csv', index=False)
        S_te.to_csv('./stacking_submissions/raw/stack_raw_test'+str(ver_no)+'.csv', index=False)
    else:
        print('Training 2nd layer meta model.')
        os.makedirs('./stacking_submissions/', exist_ok=True)
        # Fit our second layer meta model
        meta_model.fit(S_train, y_values)
        # Score on the same stacking features the meta model was fitted on
        final_score = metric(y_values, meta_model.predict(S_train))
        # Make our final prediction. expm1 undoes a log1p transform; this
        # assumes the target was log1p-transformed upstream - confirm.
        stacking_prediction = np.expm1(np.ravel(meta_model.predict(S_test)))
        submission = pd.DataFrame({'Id': test_ids, 'SalePrice': stacking_prediction})
        submission.to_csv('./stacking_submissions/stack'+str(ver_no)+'_'+str(final_score)+'.csv', index=False)