%%html
<style>
@import url('https://fonts.googleapis.com/css?family=Orbitron');
body {background-color: gainsboro;}
a {color: #228B22; font-family: Orbitron;}
h1, h2 {color: #348ABD; font-family: Orbitron; text-shadow: 4px 4px 4px #aaa;}
h3, h4 {color: #228B22; font-family: Orbitron; text-shadow: 4px 4px 4px #aaa;}
</style>
import numpy as np
import pandas as pd
import scipy
# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import HTML
from matplotlib import rcParams
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.model_selection import KFold, ParameterGrid, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_error
from sklearn.metrics import r2_score, explained_variance_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV, BayesianRidge
from sklearn.linear_model import HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
import keras as ks
from keras.models import Sequential, load_model, Model
from keras.optimizers import SGD, RMSprop
from keras.layers import Dense, Dropout, LSTM
from keras.layers import Activation, Flatten, Input, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D
from keras.layers.embeddings import Embedding
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import ModelCheckpoint, EarlyStopping
def regression(regressor, x_train, x_test, y_train):
    reg = regressor
    reg.fit(x_train, y_train)
    y_train_reg = reg.predict(x_train)
    y_test_reg = reg.predict(x_test)
    return y_train_reg, y_test_reg
def loss_plot(fit_history):
    plt.figure(figsize=(18, 4))
    plt.plot(fit_history.history['loss'], color='#348ABD', label='train')
    plt.plot(fit_history.history['val_loss'], color='#228B22', label='test')
    plt.legend()
    plt.title('Loss Function');
def mae_plot(fit_history):
    plt.figure(figsize=(18, 4))
    plt.plot(fit_history.history['mean_absolute_error'], color='#348ABD', label='train')
    plt.plot(fit_history.history['val_mean_absolute_error'], color='#228B22', label='test')
    plt.legend()
    plt.title('Mean Absolute Error');
def scores(regressor, y_train, y_test, y_train_reg, y_test_reg):
    print("_______________________________________")
    print(regressor)
    print("_______________________________________")
    print("EV score. Train: ", explained_variance_score(y_train, y_train_reg))
    print("EV score. Test: ", explained_variance_score(y_test, y_test_reg))
    print("---------")
    print("R2 score. Train: ", r2_score(y_train, y_train_reg))
    print("R2 score. Test: ", r2_score(y_test, y_test_reg))
    print("---------")
    print("MSE score. Train: ", mean_squared_error(y_train, y_train_reg))
    print("MSE score. Test: ", mean_squared_error(y_test, y_test_reg))
    print("---------")
    print("MAE score. Train: ", mean_absolute_error(y_train, y_train_reg))
    print("MAE score. Test: ", mean_absolute_error(y_test, y_test_reg))
    print("---------")
    print("MdAE score. Train: ", median_absolute_error(y_train, y_train_reg))
    print("MdAE score. Test: ", median_absolute_error(y_test, y_test_reg))
def scores2(regressor, target, target_predict):
    print("_______________________________________")
    print(regressor)
    print("_______________________________________")
    print("EV score:", explained_variance_score(target, target_predict))
    print("---------")
    print("R2 score:", r2_score(target, target_predict))
    print("---------")
    print("MSE score:", mean_squared_error(target, target_predict))
    print("---------")
    print("MAE score:", mean_absolute_error(target, target_predict))
    print("---------")
    print("MdAE score:", median_absolute_error(target, target_predict))
Sberbank is challenging competitors to develop algorithms that use a broad spectrum of features to predict realty prices. Competitors rely on a rich dataset that combines housing data with macroeconomic patterns. An accurate forecasting model will allow Sberbank to provide more certainty to its customers in an uncertain economy.
HTML('''<div id="data">
<p><iframe src="data_dictionary.txt" frameborder="0" height="300" width="97%"></iframe></p>
</div>''')
macro = pd.read_csv('kaggle_sberbank_macro.csv')
train = pd.read_csv('kaggle_sberbank_train.csv')
test = pd.read_csv('kaggle_sberbank_test.csv')
macro[100:110].T[1:10]
train[200:210].T[1:10]
X_list_num = ['timestamp',
'full_sq', 'num_room', 'area_m',
'kremlin_km', 'big_road2_km', 'big_road1_km',
'workplaces_km',
'stadium_km', 'swim_pool_km', 'fitness_km',
'detention_facility_km', 'cemetery_km',
'radiation_km', 'oil_chemistry_km',
'theater_km', 'exhibition_km', 'museum_km',
'park_km', 'public_healthcare_km',
'metro_min_walk','metro_km_avto',
'bus_terminal_avto_km', 'public_transport_station_min_walk',
'railroad_station_walk_min', 'railroad_station_avto_km',
'kindergarten_km', 'school_km', 'preschool_km',
'university_km', 'additional_education_km',
'shopping_centers_km', 'big_market_km',
'ekder_all', 'work_all', 'young_all']
X_list_cat = ['sub_area', 'ID_metro',
'office_raion', 'sport_objects_raion',
'raion_popul', 'healthcare_centers_raion',
'school_education_centers_raion',
'preschool_education_centers_raion']
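# Quick sanity check (a minimal sketch, not part of the original pipeline): confirm that
# every selected column name actually exists in the raw train and test tables.
missing_cols = [c for c in X_list_num + X_list_cat
                if c not in train.columns or c not in test.columns]
print('Columns missing from the raw data:', missing_cols)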
target_train = train['price_doc']
plt.style.use('seaborn-whitegrid')
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(18, 6))
sns.distplot(target_train, bins=200, color='#228B22', ax=ax1)
ax1.set_xlabel("Prices")
sns.distplot(np.log(target_train), bins=200, color='#228B22', ax=ax2)
ax2.set_xlabel("Logarithm of the variable 'Prices'")
plt.suptitle('Sberbank Russian Housing Data');
print ("Sberbank Russian Housing Dataset Statistics: \n")
print ("Number of houses = ", len(target_train))
print ("Number of features = ", len(list(train[X_list_num+X_list_cat].keys())))
print ("Minimum house price = ", np.min(target_train))
print ("Maximum house price = ", np.max(target_train))
print ("Mean house price = ", "%.2f" % np.mean(target_train))
print ("Median house price = ", "%.2f" % np.median(target_train))
print ("Standard deviation of house prices =", "%.2f" % np.std(target_train))
train[X_list_num].isnull().sum()
test[X_list_num].isnull().sum()
df_train = pd.DataFrame(train, columns=X_list_num)
df_train_cat = pd.DataFrame(train, columns=X_list_num+X_list_cat)
df_test = pd.DataFrame(test, columns=X_list_num)
df_test_cat = pd.DataFrame(test, columns=X_list_num+X_list_cat)
df_train['prices'] = target_train
df_train_cat['prices'] = target_train
df_train = df_train.dropna(subset=['num_room'])
df_train_cat = df_train_cat.dropna(subset=['num_room'])
df_train['metro_min_walk'] = \
df_train['metro_min_walk'].interpolate(method='linear')
df_train_cat['metro_min_walk'] = \
df_train_cat['metro_min_walk'].interpolate(method='linear')
df_train['railroad_station_walk_min'] = \
df_train['railroad_station_walk_min'].interpolate(method='linear')
df_train_cat['railroad_station_walk_min'] = \
df_train_cat['railroad_station_walk_min'].interpolate(method='linear')
df_test['metro_min_walk'] = \
df_test['metro_min_walk'].interpolate(method='linear')
df_test_cat['metro_min_walk'] = \
df_test_cat['metro_min_walk'].interpolate(method='linear')
df_test['railroad_station_walk_min'] = \
df_test['railroad_station_walk_min'].interpolate(method='linear')
df_test_cat['railroad_station_walk_min'] = \
df_test_cat['railroad_station_walk_min'].interpolate(method='linear')
len(df_train)
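# Optional check (a sketch): after dropping rows without 'num_room' and interpolating the
# two walking-time columns, the numeric features should contain no remaining gaps.
for name, frame in [('df_train', df_train), ('df_test', df_test)]:
    remaining = frame[X_list_num].isnull().sum()
    print(name, '- columns still containing NaN:', list(remaining[remaining > 0].index))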
# Add the Macro Feature
usdrub_pairs = dict(zip(list(macro['timestamp']), list(macro['usdrub'])))
# salary_pairs = dict(zip(list(macro['timestamp']), list(macro['salary'])))
df_train['timestamp'].replace(usdrub_pairs,inplace=True)
df_train_cat['timestamp'].replace(usdrub_pairs,inplace=True)
df_test['timestamp'].replace(usdrub_pairs,inplace=True)
df_test_cat['timestamp'].replace(usdrub_pairs,inplace=True)
df_train.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_train_cat.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_test.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_test_cat.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
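# An equivalent, merge-based way to attach the macro indicator (a sketch only; it assumes
# frames whose 'timestamp' column has not yet been replaced, so it is not applied here).
def add_usdrub(frame, macro_frame):
    merged = frame.merge(macro_frame[['timestamp', 'usdrub']], on='timestamp', how='left')
    return merged.drop('timestamp', axis=1)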
# Preprocess Categorical Features
for df in [df_train_cat, df_test_cat]:
    print("____________________________________________")
    print('sub area')
    print('Number of categories:', len(set(df['sub_area'])))
    print(set(df['sub_area']))
    print('\nID metro')
    print('Number of categories:', len(set(df['ID_metro'])))
    print(set(df['ID_metro']))
    print('\noffice raion')
    print('Number of categories:', len(set(df['office_raion'])))
    print(set(df['office_raion']))
    print('\nsport objects raion')
    print('Number of categories:', len(set(df['sport_objects_raion'])))
    print(set(df['sport_objects_raion']))
    print('\nraion popul')
    print('Number of categories:', len(set(df['raion_popul'])))
    print(set(df['raion_popul']))
    print('\nhealthcare centers raion')
    print('Number of categories:', len(set(df['healthcare_centers_raion'])))
    print(set(df['healthcare_centers_raion']))
    print('\nschool education centers raion')
    print('Number of categories:', len(set(df['school_education_centers_raion'])))
    print(set(df['school_education_centers_raion']))
    print('\npreschool education centers raion')
    print('Number of categories:', len(set(df['preschool_education_centers_raion'])))
    print(set(df['preschool_education_centers_raion']))
for feature in X_list_cat:
    for element in list(set(df_test_cat[feature])):
        if element not in list(set(df_train_cat[feature])):
            print(feature, element)
ID_metro_cat = pd.factorize(df_train_cat['ID_metro'])
df_train_cat['ID_metro'] = ID_metro_cat[0]
ID_metro_pairs = dict(zip(list(ID_metro_cat[1]), list(set(ID_metro_cat[0]))))
ID_metro_pairs[224] = 219
df_test_cat['ID_metro'].replace(ID_metro_pairs,inplace=True)
for feature in X_list_cat:
    if feature != 'ID_metro':
        feature_cat = pd.factorize(df_train_cat[feature])
        df_train_cat[feature] = feature_cat[0]
        feature_pairs = dict(zip(list(feature_cat[1]), list(set(feature_cat[0]))))
        df_test_cat[feature].replace(feature_pairs, inplace=True)
for df in [df_train_cat, df_test_cat]:
    print("____________________________________________")
    print('sub area')
    print('Number of categories:', len(set(df['sub_area'])))
    print(set(df['sub_area']))
    print('\nID metro')
    print('Number of categories:', len(set(df['ID_metro'])))
    print(set(df['ID_metro']))
    print('\noffice raion')
    print('Number of categories:', len(set(df['office_raion'])))
    print(set(df['office_raion']))
    print('\nsport objects raion')
    print('Number of categories:', len(set(df['sport_objects_raion'])))
    print(set(df['sport_objects_raion']))
    print('\nraion popul')
    print('Number of categories:', len(set(df['raion_popul'])))
    print(set(df['raion_popul']))
    print('\nhealthcare centers raion')
    print('Number of categories:', len(set(df['healthcare_centers_raion'])))
    print(set(df['healthcare_centers_raion']))
    print('\nschool education centers raion')
    print('Number of categories:', len(set(df['school_education_centers_raion'])))
    print(set(df['school_education_centers_raion']))
    print('\npreschool education centers raion')
    print('Number of categories:', len(set(df['preschool_education_centers_raion'])))
    print(set(df['preschool_education_centers_raion']))
df_train_cat1 = df_train_cat
encode = OneHotEncoder(sparse=False)
for column in X_list_cat:
    encode.fit(df_train_cat[[column]])
    transform = encode.transform(df_train_cat[[column]])
    transform = pd.DataFrame(transform,
                             columns=[(column + "_" + str(i)) for i in df_train_cat[column].value_counts().index])
    transform = transform.set_index(df_train_cat.index.values)
    df_train_cat1 = pd.concat([df_train_cat1, transform], axis=1)
    df_train_cat1 = df_train_cat1.drop(column, axis=1)
df_test_cat1 = df_test_cat
encode = OneHotEncoder(sparse=False)
for column in X_list_cat:
    encode.fit(df_test_cat[[column]])
    transform = encode.transform(df_test_cat[[column]])
    transform = pd.DataFrame(transform,
                             columns=[(column + "_" + str(i)) for i in df_test_cat[column].value_counts().index])
    transform = transform.set_index(df_test_cat.index.values)
    df_test_cat1 = pd.concat([df_test_cat1, transform], axis=1)
    df_test_cat1 = df_test_cat1.drop(column, axis=1)
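# A shorter route to the same kind of dummy columns (a sketch, not used below, since the
# column labels produced by pd.get_dummies differ from the ones constructed above).
df_train_cat_dummies = pd.get_dummies(df_train_cat, columns=X_list_cat)
df_test_cat_dummies = pd.get_dummies(df_test_cat, columns=X_list_cat)
print(df_train_cat_dummies.shape, df_test_cat_dummies.shape)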
# Check Encoding
df_train_cat1.iloc[:, 623:636][:3].as_matrix()
df_train_cat['preschool_education_centers_raion'][:3]
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
print("Features in the train data, but not in the test data:")
for element in list(df_train_cat1):
    if element not in list(df_test_cat1):
        print(element)
print("Features in the test data, but not in the train data:")
for element in list(df_test_cat1):
    if element not in list(df_train_cat1):
        print(element)
for column in ['sub_area_136', 'ID_metro_188', 'ID_metro_205', 'ID_metro_216', 'ID_metro_214',
               'ID_metro_183', 'ID_metro_179', 'ID_metro_153', 'ID_metro_217', 'raion_popul_136']:
    df_test_cat1[column] = 0
df_train_cat1['ID_metro_219'] = 0
print('Columns with zero values were added.\n')
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
pearson = df_train.corr(method='pearson')
corr_with_prices = pearson.ix[-1][:-1]
corr_with_prices[abs(corr_with_prices).argsort()[::-1]]
top_features_list = corr_with_prices[abs(corr_with_prices).argsort()[::-1]][:16].index.values.tolist()
print(top_features_list)
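# The ranked list above could be used to build a reduced numeric design matrix
# (a sketch with a hypothetical name; the models below keep the full feature set).
features_train_top = df_train[top_features_list].values
print(features_train_top.shape)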
target_train = df_train['prices'].as_matrix()
features_train = df_train.drop('prices', 1).as_matrix()
features_test = df_test.as_matrix()
features_train_cat = df_train_cat.drop('prices', 1).as_matrix()
features_test_cat = df_test_cat.as_matrix()
features_train_cat_enc = df_train_cat1.drop('prices', 1).as_matrix()
features_test_cat_enc = df_test_cat1.as_matrix()
print('Numeric Features')
X_train, X_test, y_train, y_test = \
train_test_split(features_train, target_train, test_size = 0.2, random_state = 1)
X_train.shape, X_test.shape
print('Numeric and Categorical Features')
X_train_cat, X_test_cat, y_train_cat, y_test_cat = \
train_test_split(features_train_cat, target_train, test_size = 0.2, random_state = 1)
X_train_cat.shape, X_test_cat.shape
print('Numeric and Encoded Categorical Features')
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc, y_test_cat_enc = \
train_test_split(features_train_cat_enc, target_train, test_size = 0.2, random_state = 1)
X_train_cat_enc.shape, X_test_cat_enc.shape
scale_X = RobustScaler()
X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)
scale_y = RobustScaler()
y_train = scale_y.fit_transform(y_train.reshape(-1,1))
y_test = scale_y.transform(y_test.reshape(-1,1))
scale_X_cat = RobustScaler()
X_train_cat = scale_X_cat.fit_transform(X_train_cat)
X_test_cat = scale_X_cat.transform(X_test_cat)
scale_y_cat = RobustScaler()
y_train_cat = scale_y_cat.fit_transform(y_train_cat.reshape(-1,1))
y_test_cat = scale_y_cat.transform(y_test_cat.reshape(-1,1))
scale_X_cat_enc = RobustScaler()
X_train_cat_enc = scale_X_cat_enc.fit_transform(X_train_cat_enc)
X_test_cat_enc = scale_X_cat_enc.transform(X_test_cat_enc)
scale_y_cat_enc = RobustScaler()
y_train_cat_enc = scale_y_cat_enc.fit_transform(y_train_cat_enc.reshape(-1,1))
y_test_cat_enc = scale_y_cat_enc.transform(y_test_cat_enc.reshape(-1,1))
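# Note (a sketch): the scaled targets can be mapped back to rubles at any point with the
# fitted scaler; 'y_test_rub' is a hypothetical name used only for this illustration.
y_test_rub = scale_y.inverse_transform(y_test)
print(y_test_rub[:3].reshape(-1))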
print('Numeric Features')
print ('Gradient Boosting Regressor')
param_grid_gbr = {'max_depth': [3, 4, 5], 'n_estimators': range(36, 361, 36)}
gridsearch_gbr = GridSearchCV(GradientBoostingRegressor(),
param_grid_gbr, n_jobs=5).fit(X_train, y_train)
gridsearch_gbr.best_params_
print ('Bagging Regressor')
param_grid_br = {'n_estimators': range(36, 361, 36)}
gridsearch_br = GridSearchCV(BaggingRegressor(),
param_grid_br, n_jobs=5).fit(X_train, y_train)
gridsearch_br.best_params_
print('Numeric and Categorical Features')
print ('Gradient Boosting Regressor')
param_grid_gbr_cat = {'max_depth': [3, 4, 5], 'n_estimators': range(44, 441, 44)}
gridsearch_gbr_cat = GridSearchCV(GradientBoostingRegressor(),
param_grid_gbr_cat, n_jobs=5).fit(X_train_cat, y_train_cat)
gridsearch_gbr_cat.best_params_
print ('Bagging Regressor')
param_grid_br_cat = {'n_estimators': range(44, 441, 44)}
gridsearch_br_cat = GridSearchCV(BaggingRegressor(),
param_grid_br_cat, n_jobs=5).fit(X_train_cat, y_train_cat)
gridsearch_br_cat.best_params_
print('Numeric and Encoded Categorical Features')
print ('Gradient Boosting Regressor')
param_grid_gbr_cat_enc = {'max_depth': [3, 4, 5], 'n_estimators': [159, 318, 636]}
gridsearch_gbr_cat_enc = GridSearchCV(GradientBoostingRegressor(),
param_grid_gbr_cat_enc,
n_jobs=5).fit(X_train_cat_enc, y_train_cat_enc)
gridsearch_gbr_cat_enc.best_params_
print ('Bagging Regressor')
param_grid_br_cat_enc = {'n_estimators': [159, 318, 636]}
gridsearch_br_cat_enc = GridSearchCV(BaggingRegressor(),
param_grid_br_cat_enc,
n_jobs=5).fit(X_train_cat_enc, y_train_cat_enc)
gridsearch_br_cat_enc.best_params_
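# Besides best_params_, each fitted search also stores the mean cross-validated score of the
# winning configuration (with the default scoring this is the regressor's R^2); a quick sketch:
for name, search in [('GBR', gridsearch_gbr), ('BR', gridsearch_br),
                     ('GBR cat', gridsearch_gbr_cat), ('BR cat', gridsearch_br_cat),
                     ('GBR cat enc', gridsearch_gbr_cat_enc), ('BR cat enc', gridsearch_br_cat_enc)]:
    print(name, 'best CV score:', search.best_score_)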
print('Numeric Features')
y_train_gbr, y_test_gbr = regression(GradientBoostingRegressor(max_depth=4, n_estimators=216),
X_train, X_test, y_train)
y_train_br, y_test_br = regression(BaggingRegressor(n_estimators=252),
X_train, X_test, y_train)
scores('GradientBoostingRegressor', y_train, y_test, y_train_gbr, y_test_gbr)
scores('BaggingRegressor', y_train, y_test, y_train_br, y_test_br)
print('Numeric and Categorical Features')
y_train_cat_gbr, y_test_cat_gbr = \
regression(GradientBoostingRegressor(max_depth=3, n_estimators=396), X_train_cat, X_test_cat, y_train_cat)
y_train_cat_br, y_test_cat_br = \
regression(BaggingRegressor(n_estimators=220), X_train_cat, X_test_cat, y_train_cat)
scores('GradientBoostingRegressor',
y_train_cat, y_test_cat, y_train_cat_gbr, y_test_cat_gbr)
scores('BaggingRegressor',
y_train_cat, y_test_cat, y_train_cat_br, y_test_cat_br)
print('Numeric and Encoded Categorical Features')
y_train_cat_enc_gbr, y_test_cat_enc_gbr = \
regression(GradientBoostingRegressor(max_depth=3, n_estimators=159),
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)
y_train_cat_enc_br, y_test_cat_enc_br = \
regression(BaggingRegressor(n_estimators=159),
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)
scores('GradientBoostingRegressor',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_gbr, y_test_cat_enc_gbr)
scores('BaggingRegressor',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_br, y_test_cat_enc_br)
mlpr = MLPRegressor(hidden_layer_sizes=(324,), max_iter=200,
solver='lbfgs', alpha=0.01)
mlpr.fit(X_train, y_train)
y_train_mlpr = mlpr.predict(X_train)
y_test_mlpr = mlpr.predict(X_test)
scores('MLP Regressor; Numeric Features',
y_train, y_test, y_train_mlpr, y_test_mlpr)
mlpr_cat = MLPRegressor(hidden_layer_sizes=(396,), max_iter=200,
solver='lbfgs', alpha=0.01)
mlpr_cat.fit(X_train_cat, y_train_cat)
y_train_cat_mlpr = mlpr_cat.predict(X_train_cat)
y_test_cat_mlpr = mlpr_cat.predict(X_test_cat)
scores('MLP Regressor; Numeric and Categorical Features',
y_train_cat, y_test_cat, y_train_cat_mlpr, y_test_cat_mlpr)
mlpr_cat_enc = MLPRegressor(hidden_layer_sizes=(318,), max_iter=200,
solver='lbfgs', alpha=0.01)
mlpr_cat_enc.fit(X_train_cat_enc, y_train_cat_enc)
y_train_cat_enc_mlpr = mlpr_cat_enc.predict(X_train_cat_enc)
y_test_cat_enc_mlpr = mlpr_cat_enc.predict(X_test_cat_enc)
scores('MLP Regressor; Numeric and Encoded Categorical Features',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlpr, y_test_cat_enc_mlpr)
plt.figure(figsize = (18, 6))
plt.plot(y_test[1:50], color = 'black', label='Real Data')
plt.plot(y_test_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_br[1:50], label='Bagging Regressor')
plt.plot(y_test_mlpr[1:50], label='MLP Regressor')
plt.legend()
plt.title("Numeric Features; Regressor Predictions vs Real Data");
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_mlpr[1:50], label='MLP Regressor')
plt.legend()
plt.title("Numeric and Categorical Features; Regressor Predictions vs Real Data");
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat_enc[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_enc_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_enc_mlpr[1:50], label='MLP Regressor')
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Regressor Predictions vs Real Data");
def mlp_model():
    model = Sequential()
    model.add(Dense(1152, activation='relu', input_dim=36))
    model.add(Dense(288, activation='relu'))
    model.add(Dense(72, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
mlp_model = mlp_model()
mlp_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_reg.sberbank.hdf5',
verbose=2, save_best_only=True)
mlp_history = mlp_model.fit(X_train, y_train, validation_data=(X_test, y_test),
epochs=10, batch_size=128, verbose=0, callbacks=[mlp_checkpointer])
loss_plot(mlp_history)
mae_plot(mlp_history)
mlp_model.load_weights('weights.best.mlp_reg.sberbank.hdf5')
y_train_mlp = mlp_model.predict(X_train)
y_test_mlp = mlp_model.predict(X_test)
scores('MLP Model; Numeric Features', y_train, y_test, y_train_mlp, y_test_mlp)
mlp_model.save('kaggle_sberbank_mlp_reg_model.h5')
def mlp_cat_model():
    model = Sequential()
    model.add(Dense(88*16, activation='relu', input_dim=44))
    model.add(Dense(88*4, activation='relu'))
    model.add(Dense(88, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
mlp_cat_model = mlp_cat_model()
mlp_cat_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_cat_reg.sberbank.hdf5',
verbose=2, save_best_only=True)
mlp_cat_history = mlp_cat_model.fit(X_train_cat, y_train_cat,
validation_data=(X_test_cat, y_test_cat),
epochs=10, batch_size=128, verbose=0, callbacks=[mlp_cat_checkpointer])
loss_plot(mlp_cat_history)
mae_plot(mlp_cat_history)
mlp_cat_model.load_weights('weights.best.mlp_cat_reg.sberbank.hdf5')
y_train_cat_mlp = mlp_cat_model.predict(X_train_cat)
y_test_cat_mlp = mlp_cat_model.predict(X_test_cat)
scores('MLP Model; Numeric and Categorical Features',
y_train_cat, y_test_cat, y_train_cat_mlp, y_test_cat_mlp)
mlp_cat_model.save('kaggle_sberbank_mlp_cat_reg_model.h5')
def mlp_cat_enc_model():
    model = Sequential()
    model.add(Dense(636*16, activation='relu', input_dim=636))
    model.add(Dense(636*4, activation='relu'))
    model.add(Dense(636, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
mlp_cat_enc_model = mlp_cat_enc_model()
mlp_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_cat_enc_reg.sberbank.hdf5',
verbose=2, save_best_only=True)
mlp_cat_enc_history = mlp_cat_enc_model.fit(X_train_cat_enc, y_train_cat_enc,
validation_data=(X_test_cat_enc, y_test_cat_enc),
epochs=5, batch_size=128, verbose=0,
callbacks=[mlp_cat_enc_checkpointer])
loss_plot(mlp_cat_enc_history)
mae_plot(mlp_cat_enc_history)
mlp_cat_enc_model.load_weights('weights.best.mlp_cat_enc_reg.sberbank.hdf5')
y_train_cat_enc_mlp = mlp_cat_enc_model.predict(X_train_cat_enc)
y_test_cat_enc_mlp = mlp_cat_enc_model.predict(X_test_cat_enc)
scores('MLP Model; Numeric and Encoded Categorical Features',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlp, y_test_cat_enc_mlp)
mlp_cat_enc_model.save('kaggle_sberbank_mlp_cat_enc_reg_model.h5')
def cnn_model():
    model = Sequential()
    model.add(Conv1D(36, 5, padding='valid', activation='relu', input_shape=(36, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Conv1D(144, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(576, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
cnn_model = cnn_model()
cnn_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_reg.sberbank.hdf5',
verbose=2, save_best_only=True)
cnn_history = cnn_model.fit(X_train.reshape(16719, 36, 1), y_train,
epochs=20, batch_size=128, verbose=0, callbacks=[cnn_checkpointer],
validation_data=(X_test.reshape(4180, 36, 1), y_test))
loss_plot(cnn_history)
mae_plot(cnn_history)
cnn_model.load_weights('weights.best.cnn_reg.sberbank.hdf5')
y_train_cnn = cnn_model.predict(X_train.reshape(16719, 36, 1))
y_test_cnn = cnn_model.predict(X_test.reshape(4180, 36, 1))
scores('CNN Model; Numeric Features', y_train, y_test, y_train_cnn, y_test_cnn)
cnn_model.save('kaggle_sberbank_cnn_reg_model.h5')
def cnn_cat_model():
    model = Sequential()
    model.add(Conv1D(44, 5, padding='valid', activation='relu', input_shape=(44, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Conv1D(156, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(624, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
cnn_cat_model = cnn_cat_model()
cnn_cat_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_cat_reg.sberbank.hdf5',
verbose=2, save_best_only=True)
cnn_cat_history = cnn_cat_model.fit(X_train_cat.reshape(16719, 44, 1), y_train_cat,
epochs=20, batch_size=128, verbose=0, callbacks=[cnn_cat_checkpointer],
validation_data=(X_test_cat.reshape(4180, 44, 1), y_test_cat))
loss_plot(cnn_cat_history)
mae_plot(cnn_cat_history)
cnn_cat_model.load_weights('weights.best.cnn_cat_reg.sberbank.hdf5')
y_train_cat_cnn = cnn_cat_model.predict(X_train_cat.reshape(16719, 44, 1))
y_test_cat_cnn = cnn_cat_model.predict(X_test_cat.reshape(4180, 44, 1))
scores('CNN Model; Numeric and Categorical Features',
y_train_cat, y_test_cat, y_train_cat_cnn, y_test_cat_cnn)
cnn_cat_model.save('kaggle_sberbank_cnn_cat_reg_model.h5')
def cnn_cat_enc_model():
    model = Sequential()
    model.add(Conv1D(159, 5, padding='valid', activation='relu', input_shape=(636, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Conv1D(318, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(636, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
cnn_cat_enc_model = cnn_cat_enc_model()
cnn_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_cat_enc_reg.sberbank.hdf5',
verbose=2, save_best_only=True)
cnn_cat_enc_history = \
cnn_cat_enc_model.fit(X_train_cat_enc.reshape(16719, 636, 1), y_train_cat_enc,
epochs=10, batch_size=128, verbose=2, callbacks=[cnn_cat_enc_checkpointer],
validation_data=(X_test_cat_enc.reshape(4180, 636, 1), y_test_cat_enc))
loss_plot(cnn_cat_enc_history)
mae_plot(cnn_cat_enc_history)
cnn_cat_enc_model.load_weights('weights.best.cnn_cat_enc_reg.sberbank.hdf5')
y_train_cat_enc_cnn = cnn_cat_enc_model.predict(X_train_cat_enc.reshape(16719, 636, 1))
y_test_cat_enc_cnn = cnn_cat_enc_model.predict(X_test_cat_enc.reshape(4180, 636, 1))
scores('CNN Model; Numeric and Encoded Categorical Features',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_cnn, y_test_cat_enc_cnn)
cnn_cat_enc_model.save('kaggle_sberbank_cnn_cat_enc_reg_model.h5')
def rnn_model():
    model = Sequential()
    model.add(LSTM(144, return_sequences=True, input_shape=(1, 36)))
    model.add(LSTM(576, return_sequences=False))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model
rnn_model = rnn_model()
rnn_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_reg.sberbank.hdf5',
verbose=2, save_best_only=True)
rnn_history = rnn_model.fit(X_train.reshape(16719, 1, 36), y_train.reshape(16719),
epochs=7, verbose=2, callbacks=[rnn_checkpointer],
validation_data=(X_test.reshape(4180, 1, 36), y_test.reshape(4180)))
loss_plot(rnn_history)
mae_plot(rnn_history)
rnn_model.load_weights('weights.best.rnn_reg.sberbank.hdf5')
y_train_rnn = rnn_model.predict(X_train.reshape(16719, 1, 36))
y_test_rnn = rnn_model.predict(X_test.reshape(4180, 1, 36))
scores('RNN Model; Numeric Features', y_train, y_test, y_train_rnn, y_test_rnn)
rnn_model.save('kaggle_sberbank_rnn_reg_model.h5')
def rnn_cat_model():
    model = Sequential()
    model.add(LSTM(156, return_sequences=True, input_shape=(1, 44)))
    model.add(LSTM(624, return_sequences=False))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model
rnn_cat_model = rnn_cat_model()
rnn_cat_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_cat_reg.sberbank.hdf5',
verbose=2, save_best_only=True)
rnn_cat_history = rnn_cat_model.fit(X_train_cat.reshape(16719, 1, 44), y_train_cat.reshape(16719),
epochs=10, verbose=2, callbacks=[rnn_cat_checkpointer],
validation_data=(X_test_cat.reshape(4180, 1, 44), y_test_cat.reshape(4180)))
loss_plot(rnn_cat_history)
mae_plot(rnn_cat_history)
rnn_cat_model.load_weights('weights.best.rnn_cat_reg.sberbank.hdf5')
y_train_cat_rnn = rnn_cat_model.predict(X_train_cat.reshape(16719, 1, 44))
y_test_cat_rnn = rnn_cat_model.predict(X_test_cat.reshape(4180, 1, 44))
scores('RNN Model; Numeric and Categorical Features',
y_train_cat, y_test_cat, y_train_cat_rnn, y_test_cat_rnn)
rnn_cat_model.save('kaggle_sberbank_rnn_cat_reg_model.h5')
def rnn_cat_enc_model():
    model = Sequential()
    model.add(LSTM(159, return_sequences=True, input_shape=(1, 636)))
    model.add(LSTM(636, return_sequences=False))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model
rnn_cat_enc_model = rnn_cat_enc_model()
rnn_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_cat_enc_reg.sberbank.hdf5',
verbose=2, save_best_only=True)
rnn_cat_enc_history = \
rnn_cat_enc_model.fit(X_train_cat_enc.reshape(16719, 1, 636), y_train_cat_enc.reshape(16719),
epochs=10, verbose=2, callbacks=[rnn_cat_enc_checkpointer],
validation_data=(X_test_cat_enc.reshape(4180, 1, 636), y_test_cat_enc.reshape(4180)))
loss_plot(rnn_cat_enc_history)
mae_plot(rnn_cat_enc_history)
rnn_cat_enc_model.load_weights('weights.best.rnn_cat_enc_reg.sberbank.hdf5')
y_train_cat_enc_rnn = rnn_cat_enc_model.predict(X_train_cat_enc.reshape(16719, 1, 636))
y_test_cat_enc_rnn = rnn_cat_enc_model.predict(X_test_cat_enc.reshape(4180, 1, 636))
scores('RNN Model; Numeric and Encoded Categorical Features',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_rnn, y_test_cat_enc_rnn)
rnn_cat_enc_model.save('kaggle_sberbank_rnn_cat_enc_reg_model.h5')
plt.figure(figsize = (18, 6))
plt.plot(y_test[1:50], color = 'black', label='Real Data')
plt.plot(y_test_mlp[1:50], label='MLP')
plt.plot(y_test_cnn[1:50], label='CNN')
plt.plot(y_test_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric Features; Neural Network Predictions vs Real Data");
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_mlp[1:50], label='MLP')
plt.plot(y_test_cat_cnn[1:50], label='CNN')
plt.plot(y_test_cat_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric and Categorical Features; Neural Network Predictions vs Real Data");
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat_enc[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_enc_mlp[1:50], label='MLP')
plt.plot(y_test_cat_enc_cnn[1:50], label='CNN')
plt.plot(y_test_cat_enc_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Neural Network Predictions vs Real Data");
# Scale
target_scale = RobustScaler()
s_target_train = target_scale.fit_transform(target_train.reshape(-1,1))
#########################################################################################
feature_scale = RobustScaler()
s_features_train = feature_scale.fit_transform(features_train)
s_features_test = feature_scale.transform(features_test)
########################################################################################
feature_cat_scale = RobustScaler()
s_features_train_cat = feature_cat_scale.fit_transform(features_train_cat)
s_features_test_cat = feature_cat_scale.transform(features_test_cat)
########################################################################################
feature_cat_enc_scale = RobustScaler()
s_features_train_cat_enc = feature_cat_enc_scale.fit_transform(features_train_cat_enc)
s_features_test_cat_enc = feature_cat_enc_scale.transform(features_test_cat_enc)
gbr = GradientBoostingRegressor(max_depth=4, n_estimators=216)
gbr.fit(s_features_train, s_target_train)
s_target_train_gbr = gbr.predict(s_features_train)
s_target_test_gbr = gbr.predict(s_features_test)
scores2('Gradient Boosting Regressor', s_target_train, s_target_train_gbr)
br = BaggingRegressor(n_estimators=252)
br.fit(s_features_train, s_target_train)
s_target_train_br = br.predict(s_features_train)
s_target_test_br = br.predict(s_features_test)
scores2('Bagging Regressor', s_target_train, s_target_train_br)
s_target_train_mlpr = mlpr.predict(s_features_train)
s_target_test_mlpr = mlpr.predict(s_features_test)
scores2('MLP Regressor', s_target_train, s_target_train_mlpr)
gbr_cat = GradientBoostingRegressor(max_depth=3, n_estimators=396)
gbr_cat.fit(s_features_train_cat, s_target_train)
s_target_train_cat_gbr = gbr_cat.predict(s_features_train_cat)
s_target_test_cat_gbr = gbr_cat.predict(s_features_test_cat)
scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_gbr)
br_cat = BaggingRegressor(n_estimators=220)
br_cat.fit(s_features_train_cat, s_target_train)
s_target_train_cat_br = br_cat.predict(s_features_train_cat)
s_target_test_cat_br = br_cat.predict(s_features_test_cat)
scores2('Bagging Regressor', s_target_train, s_target_train_cat_br)
s_target_train_cat_mlpr = mlpr_cat.predict(s_features_train_cat)
s_target_test_cat_mlpr = mlpr_cat.predict(s_features_test_cat)
scores2('MLP Regressor', s_target_train, s_target_train_cat_mlpr)
gbr_cat_enc = GradientBoostingRegressor(max_depth=3, n_estimators=159)
gbr_cat_enc.fit(s_features_train_cat_enc, s_target_train)
s_target_train_cat_enc_gbr = gbr_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_gbr = gbr_cat_enc.predict(s_features_test_cat_enc)
scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_enc_gbr)
br_cat_enc = BaggingRegressor(n_estimators=159)
br_cat_enc.fit(s_features_train_cat_enc, s_target_train)
s_target_train_cat_enc_br = br_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_br = br_cat_enc.predict(s_features_test_cat_enc)
scores2('Bagging Regressor', s_target_train, s_target_train_cat_enc_br)
s_target_train_cat_enc_mlpr = mlpr_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_mlpr = mlpr_cat_enc.predict(s_features_test_cat_enc)
scores2('MLP Regressor', s_target_train, s_target_train_cat_enc_mlpr)
s_target_train_mlp = mlp_model.predict(s_features_train)
s_target_test_mlp = mlp_model.predict(s_features_test)
scores2('MLP', s_target_train, s_target_train_mlp)
s_target_train_cnn = cnn_model.predict(s_features_train.reshape(20899, 36, 1))
s_target_test_cnn = cnn_model.predict(s_features_test.reshape(7662, 36, 1))
scores2('CNN', s_target_train, s_target_train_cnn)
s_target_train_rnn = rnn_model.predict(s_features_train.reshape(20899, 1, 36))
s_target_test_rnn = rnn_model.predict(s_features_test.reshape(7662, 1, 36))
scores2('RNN', s_target_train, s_target_train_rnn)
s_target_train_cat_mlp = mlp_cat_model.predict(s_features_train_cat)
s_target_test_cat_mlp = mlp_cat_model.predict(s_features_test_cat)
scores2('MLP', s_target_train, s_target_train_cat_mlp)
s_target_train_cat_cnn = cnn_cat_model.predict(s_features_train_cat.reshape(20899, 44, 1))
s_target_test_cat_cnn = cnn_cat_model.predict(s_features_test_cat.reshape(7662, 44, 1))
scores2('CNN', s_target_train, s_target_train_cat_cnn)
s_target_train_cat_rnn = rnn_cat_model.predict(s_features_train_cat.reshape(20899, 1, 44))
s_target_test_cat_rnn = rnn_cat_model.predict(s_features_test_cat.reshape(7662, 1, 44))
scores2('RNN', s_target_train, s_target_train_cat_rnn)
s_target_train_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_train_cat_enc)
s_target_test_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_test_cat_enc)
scores2('MLP', s_target_train, s_target_train_cat_enc_mlp)
s_target_train_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(20899, 636, 1))
s_target_test_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(7662, 636, 1))
scores2('CNN', s_target_train, s_target_train_cat_enc_cnn)
s_target_train_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(20899, 1, 636))
s_target_test_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(7662, 1, 636))
scores2('RNN', s_target_train, s_target_train_cat_enc_rnn)
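# Optional summary (a sketch): collect the train R2 of the final numeric-feature fits in one
# frame instead of reading them off the printouts above; 'final_r2' is a hypothetical name.
final_r2 = pd.DataFrame({
    'model': ['GBR', 'BR', 'MLP', 'CNN', 'RNN'],
    'train R2 (numeric features)': [r2_score(s_target_train, p) for p in
                                    [s_target_train_gbr, s_target_train_br, s_target_train_mlp,
                                     s_target_train_cnn, s_target_train_rnn]]})
print(final_r2)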
# Rescale Predictions
target_train_gbr = target_scale.inverse_transform(s_target_train_gbr.reshape(-1,1))
target_test_gbr = target_scale.inverse_transform(s_target_test_gbr.reshape(-1,1))
target_train_br = target_scale.inverse_transform(s_target_train_br.reshape(-1,1))
target_test_br = target_scale.inverse_transform(s_target_test_br.reshape(-1,1))
target_train_mlpr = target_scale.inverse_transform(s_target_train_mlpr.reshape(-1,1))
target_test_mlpr = target_scale.inverse_transform(s_target_test_mlpr.reshape(-1,1))
target_train_mlp = target_scale.inverse_transform(s_target_train_mlp)
target_test_mlp = target_scale.inverse_transform(s_target_test_mlp)
target_train_cnn = target_scale.inverse_transform(s_target_train_cnn)
target_test_cnn = target_scale.inverse_transform(s_target_test_cnn)
target_train_rnn = target_scale.inverse_transform(s_target_train_rnn)
target_test_rnn = target_scale.inverse_transform(s_target_test_rnn)
plt.figure(figsize = (18, 6))
plt.plot(target_train[1:50], color = 'black', label='Real Data')
plt.plot(target_train_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_br[1:50], label='Bagging Regressor')
plt.plot(target_train_mlpr[1:50], label='MLP Regressor')
plt.plot(target_train_mlp[1:50], label='MLP')
plt.plot(target_train_cnn[1:50], label='CNN')
plt.plot(target_train_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric Features; Train Predictions vs Real Data");
plt.figure(figsize = (18, 6))
plt.plot(target_test_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_br[1:50], label='Bagging Regressor')
plt.plot(target_test_mlpr[1:50], label='MLP Regressor')
plt.plot(target_test_mlp[1:50], label='MLP')
plt.plot(target_test_cnn[1:50], label='CNN')
plt.plot(target_test_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric Features; Test Predictions");
# Rescale Predictions
target_train_cat_gbr = target_scale.inverse_transform(s_target_train_cat_gbr.reshape(-1,1))
target_test_cat_gbr = target_scale.inverse_transform(s_target_test_cat_gbr.reshape(-1,1))
target_train_cat_br = target_scale.inverse_transform(s_target_train_cat_br.reshape(-1,1))
target_test_cat_br = target_scale.inverse_transform(s_target_test_cat_br.reshape(-1,1))
target_train_cat_mlpr = target_scale.inverse_transform(s_target_train_cat_mlpr.reshape(-1,1))
target_test_cat_mlpr = target_scale.inverse_transform(s_target_test_cat_mlpr.reshape(-1,1))
target_train_cat_mlp = target_scale.inverse_transform(s_target_train_cat_mlp.reshape(-1,1))
target_test_cat_mlp = target_scale.inverse_transform(s_target_test_cat_mlp.reshape(-1,1))
target_train_cat_cnn = target_scale.inverse_transform(s_target_train_cat_cnn.reshape(-1,1))
target_test_cat_cnn = target_scale.inverse_transform(s_target_test_cat_cnn.reshape(-1,1))
target_train_cat_rnn = target_scale.inverse_transform(s_target_train_cat_rnn.reshape(-1,1))
target_test_cat_rnn = target_scale.inverse_transform(s_target_test_cat_rnn.reshape(-1,1))
plt.figure(figsize = (18, 6))
plt.plot(target_train[1:50], color = 'black', label='Real Data')
plt.plot(target_train_cat_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_cat_br[1:50], label='Bagging Regressor')
plt.plot(target_train_cat_mlpr[1:50], label='MLP Regressor')
plt.plot(target_train_cat_mlp[1:50], label='MLP')
plt.plot(target_train_cat_cnn[1:50], label='CNN')
plt.plot(target_train_cat_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric and Categorical Features; Train Predictions vs Real Data");
plt.figure(figsize = (18, 6))
plt.plot(target_test_cat_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_cat_br[1:50], label='Bagging Regressor')
plt.plot(target_test_cat_mlpr[1:50], label='MLP Regressor')
plt.plot(target_test_cat_mlp[1:50], label='MLP')
plt.plot(target_test_cat_cnn[1:50], label='CNN')
plt.plot(target_test_cat_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric and Categorical Features; Test Predictions");
# Rescale Predictions
target_train_cat_enc_gbr = target_scale.inverse_transform(s_target_train_cat_enc_gbr.reshape(-1,1))
target_test_cat_enc_gbr = target_scale.inverse_transform(s_target_test_cat_enc_gbr.reshape(-1,1))
target_train_cat_enc_br = target_scale.inverse_transform(s_target_train_cat_enc_br.reshape(-1,1))
target_test_cat_enc_br = target_scale.inverse_transform(s_target_test_cat_enc_br.reshape(-1,1))
target_train_cat_enc_mlpr = target_scale.inverse_transform(s_target_train_cat_enc_mlpr.reshape(-1,1))
target_test_cat_enc_mlpr = target_scale.inverse_transform(s_target_test_cat_enc_mlpr.reshape(-1,1))
target_train_cat_enc_mlp = target_scale.inverse_transform(s_target_train_cat_enc_mlp.reshape(-1,1))
target_test_cat_enc_mlp = target_scale.inverse_transform(s_target_test_cat_enc_mlp.reshape(-1,1))
target_train_cat_enc_cnn = target_scale.inverse_transform(s_target_train_cat_enc_cnn.reshape(-1,1))
target_test_cat_enc_cnn = target_scale.inverse_transform(s_target_test_cat_enc_cnn.reshape(-1,1))
target_train_cat_enc_rnn = target_scale.inverse_transform(s_target_train_cat_enc_rnn.reshape(-1,1))
target_test_cat_enc_rnn = target_scale.inverse_transform(s_target_test_cat_enc_rnn.reshape(-1,1))
plt.figure(figsize = (18, 6))
plt.plot(target_train[1:50], color = 'black', label='Real Data')
plt.plot(target_train_cat_enc_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(target_train_cat_enc_mlpr[1:50], label='MLP Regressor')
plt.plot(target_train_cat_enc_mlp[1:50], label='MLP')
plt.plot(target_train_cat_enc_cnn[1:50], label='CNN')
plt.plot(target_train_cat_enc_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Train Predictions vs Real Data");
plt.figure(figsize = (18, 6))
plt.plot(target_test_cat_enc_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(target_test_cat_enc_mlpr[1:50], label='MLP Regressor')
plt.plot(target_test_cat_enc_mlp[1:50], label='MLP')
plt.plot(target_test_cat_enc_cnn[1:50], label='CNN')
plt.plot(target_test_cat_enc_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Test Predictions");
target_gbr = ["{0:.2f}".format(x) for x in target_test_gbr.reshape(-1)]
submission_gbr = pd.DataFrame({"id": test['id'], "price_doc": target_gbr})
print(submission_gbr[0:20])
submission_gbr.to_csv('kaggle_sberbank_gbr.csv', index=False)
target_br = ["{0:.2f}".format(x) for x in target_test_br.reshape(-1)]
submission_br = pd.DataFrame({"id": test['id'], "price_doc": target_br})
print(submission_br[0:20])
submission_br.to_csv('kaggle_sberbank_br.csv', index=False)
target_mlpr = ["{0:.2f}".format(x) for x in target_test_mlpr.reshape(-1)]
submission_mlpr = pd.DataFrame({"id": test['id'], "price_doc": target_mlpr})
print(submission_mlpr[0:20])
submission_mlpr.to_csv('kaggle_sberbank_mlpr.csv', index=False)
target_gbr_cat = ["{0:.2f}".format(x) for x in target_test_cat_gbr.reshape(-1)]
submission_gbr_cat = pd.DataFrame({"id": test['id'], "price_doc": target_gbr_cat})
print(submission_gbr_cat[0:20])
submission_gbr_cat.to_csv('kaggle_sberbank_gbr_cat.csv', index=False)
target_br_cat = ["{0:.2f}".format(x) for x in target_test_cat_br.reshape(-1)]
submission_br_cat = pd.DataFrame({"id": test['id'], "price_doc": target_br_cat})
print(submission_br_cat[0:20])
submission_br_cat.to_csv('kaggle_sberbank_br_cat.csv', index=False)
target_mlpr_cat = ["{0:.2f}".format(x) for x in target_test_cat_mlpr.reshape(-1)]
submission_mlpr_cat = pd.DataFrame({"id": test['id'], "price_doc": target_mlpr_cat})
print(submission_mlpr_cat[0:20])
submission_mlpr_cat.to_csv('kaggle_sberbank_mlpr_cat.csv', index=False)
target_gbr_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_gbr.reshape(-1)]
submission_gbr_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_gbr_cat_enc})
print(submission_gbr_cat_enc[0:20])
submission_gbr_cat_enc.to_csv('kaggle_sberbank_gbr_cat_enc.csv', index=False)
target_br_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_br.reshape(-1)]
submission_br_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_br_cat_enc})
print(submission_br_cat_enc[0:20])
submission_br_cat_enc.to_csv('kaggle_sberbank_br_cat_enc.csv', index=False)
target_mlpr_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_mlpr.reshape(-1)]
submission_mlpr_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_mlpr_cat_enc})
print(submission_mlpr_cat_enc[0:20])
submission_mlpr_cat_enc.to_csv('kaggle_sberbank_mlpr_cat_enc.csv', index=False)
target_mlp = ["{0:.2f}".format(x) for x in target_test_mlp.reshape(-1)]
submission_mlp = pd.DataFrame({"id": test['id'], "price_doc": target_mlp})
print(submission_mlp[0:20])
submission_mlp.to_csv('kaggle_sberbank_mlp.csv', index=False)
target_cnn = ["{0:.2f}".format(x) for x in target_test_cnn.reshape(-1)]
submission_cnn = pd.DataFrame({"id": test['id'], "price_doc": target_cnn})
print(submission_cnn[0:20])
submission_cnn.to_csv('kaggle_sberbank_cnn.csv', index=False)
target_rnn = ["{0:.2f}".format(x) for x in target_test_rnn.reshape(-1)]
submission_rnn = pd.DataFrame({"id": test['id'], "price_doc": target_rnn})
print(submission_rnn[0:20])
submission_rnn.to_csv('kaggle_sberbank_rnn.csv', index=False)
target_mlp_cat = ["{0:.2f}".format(x) for x in target_test_cat_mlp.reshape(-1)]
submission_mlp_cat = pd.DataFrame({"id": test['id'], "price_doc": target_mlp_cat})
print(submission_mlp_cat[0:20])
submission_mlp_cat.to_csv('kaggle_sberbank_mlp_cat.csv', index=False)
target_cnn_cat = ["{0:.2f}".format(x) for x in target_test_cat_cnn.reshape(-1)]
submission_cnn_cat = pd.DataFrame({"id": test['id'], "price_doc": target_cnn_cat})
print(submission_cnn_cat[0:20])
submission_cnn_cat.to_csv('kaggle_sberbank_cnn_cat.csv', index=False)
target_rnn_cat = ["{0:.2f}".format(x) for x in target_test_cat_rnn.reshape(-1)]
submission_rnn_cat = pd.DataFrame({"id": test['id'], "price_doc": target_rnn_cat})
print(submission_rnn_cat[0:20])
submission_rnn_cat.to_csv('kaggle_sberbank_rnn_cat.csv', index=False)
target_mlp_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_mlp.reshape(-1)]
submission_mlp_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_mlp_cat_enc})
print(submission_mlp_cat_enc[0:20])
submission_mlp_cat_enc.to_csv('kaggle_sberbank_mlp_cat_enc.csv', index=False)
target_cnn_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_cnn.reshape(-1)]
submission_cnn_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_cnn_cat_enc})
print(submission_cnn_cat_enc[0:20])
submission_cnn_cat_enc.to_csv('kaggle_sberbank_cnn_cat_enc.csv', index=False)
target_rnn_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_rnn.reshape(-1)]
submission_rnn_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_rnn_cat_enc})
print(submission_rnn_cat_enc[0:20])
submission_rnn_cat_enc.to_csv('kaggle_sberbank_rnn_cat_enc.csv', index=False)
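# Final check (a sketch): every submission frame should contain one prediction per test id.
print('Rows per submission:', len(submission_gbr), '| rows in the raw test set:', len(test))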