from IPython.core.display import HTML
# `hide_code` is an empty-string marker: code cells that mention it are the
# ones the script below hides.
hide_code = ''
# Inject a script plus a button into the notebook page. On load, and on every
# button click, `code_display` toggles visibility: when hiding, it hides the
# first input cell and every input cell whose HTML contains 'hide_code' and
# makes the output prompts transparent; when showing, it shows all input
# cells and restores the prompts.
HTML('''
<script>
code_show = true;
function code_display() {
if (code_show) {
$('div.input').each(function(id) {if (id==0 || $(this).html().indexOf('hide_code')>-1) {$(this).hide();}});
$('div.output_prompt').css('opacity', 0);
} else {
$('div.input').each(function(id) {$(this).show();});
$('div.output_prompt').css('opacity', 1);
}
code_show = !code_show;
};
$(document).ready(code_display);
</script>
<form action="javascript: code_display()">
<input style="color: #228B22; background: ghostwhite; opacity: 0.9;"
type="submit" value="Click to display or hide code">
</form>
''')
hide_code
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import matplotlib.pylab as plt
from random import random
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.model_selection import KFold, ParameterGrid, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_error
from sklearn.metrics import r2_score, explained_variance_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV, BayesianRidge
from sklearn.linear_model import HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
import keras as ks
from keras.models import Sequential, load_model, Model
from keras.optimizers import SGD, RMSprop
from keras.layers import Dense, Dropout, LSTM
from keras.layers import Activation, Flatten, Input, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D
from keras.layers.embeddings import Embedding
from keras.wrappers.scikit_learn import KerasRegressor
hide_code
def regression(regressor, x_train, x_test, y_train):
    """Fit ``regressor`` on the training data and predict for both splits.

    Args:
        regressor: estimator exposing ``fit(x, y)`` and ``predict(x)``.
        x_train: features used for fitting and for the first prediction.
        x_test: features used for the second prediction.
        y_train: targets used for fitting.

    Returns:
        A tuple ``(train_predictions, test_predictions)``.
    """
    regressor.fit(x_train, y_train)
    predictions = [regressor.predict(split) for split in (x_train, x_test)]
    return predictions[0], predictions[1]
def loss_plot(fit_history):
    """Draw the training and validation loss curves of a fit history.

    ``fit_history.history`` is read at the keys 'loss' and 'val_loss'; the
    curves are labelled 'train' and 'test' respectively.
    """
    curves = (('loss', '#348ABD', 'train'),
              ('val_loss', '#228B22', 'test'))
    plt.figure(figsize=(18, 6))
    for key, colour, name in curves:
        plt.plot(fit_history.history[key], color=colour, label=name)
    plt.legend()
    plt.title('Loss Function')
def mae_plot(fit_history):
    """Draw the training and validation mean-absolute-error curves.

    The networks in this file are compiled with ``metrics=['mae']``. Older
    Keras releases store that metric in the history under
    'mean_absolute_error', newer ones under 'mae'; whichever of the two is
    present is used, together with its 'val_' counterpart.

    Args:
        fit_history: object whose ``history`` attribute maps metric names
            to per-epoch value sequences.

    Raises:
        KeyError: if neither spelling of the metric is in the history.
    """
    history = fit_history.history
    metric = 'mean_absolute_error' if 'mean_absolute_error' in history else 'mae'
    plt.figure(figsize=(18, 6))
    plt.plot(history[metric], color='#348ABD', label='train')
    plt.plot(history['val_' + metric], color='#228B22', label='test')
    plt.legend()
    plt.title('Mean Absolute Error')
def scores(regressor, y_train, y_test, y_train_reg, y_test_reg):
    """Print five regression metrics for the train and the test split.

    Args:
        regressor: label printed as the report header.
        y_train, y_test: true targets of the two splits.
        y_train_reg, y_test_reg: predictions for the two splits.
    """
    banner = "_______________________________________"
    metrics = (
        ("EV", explained_variance_score),
        ("R2", r2_score),
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("MdAE", median_absolute_error),
    )
    print(banner)
    print(regressor)
    print(banner)
    for position, (tag, metric) in enumerate(metrics):
        # A dashed rule separates metric groups; none before the first.
        if position:
            print("---------")
        print(tag + " score. Train: ", metric(y_train, y_train_reg))
        print(tag + " score. Test: ", metric(y_test, y_test_reg))
def scores2(regressor, target, target_predict):
    """Print five regression metrics for a single set of predictions.

    Args:
        regressor: label printed as the report header.
        target: true target values.
        target_predict: predicted values.
    """
    banner = "_______________________________________"
    metrics = (
        ("EV", explained_variance_score),
        ("R2", r2_score),
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("MdAE", median_absolute_error),
    )
    print(banner)
    print(regressor)
    print(banner)
    for position, (tag, metric) in enumerate(metrics):
        # A dashed rule separates the metrics; none before the first.
        if position:
            print("---------")
        print(tag + " score:", metric(target, target_predict))
In this capstone project proposal, prior to completing the following Capstone Project, we will leverage what we've learned throughout the Nanodegree program to author a proposal for solving a problem of our choice by applying machine learning algorithms and techniques. A project proposal encompasses seven key points:
Housing costs demand a significant investment from both consumers and developers. And when it comes to planning a budget — whether personal or corporate — the last thing anyone needs is uncertainty about one of their biggest expenses. Sberbank, Russia's oldest and largest bank, helps its customers by making predictions about realty prices so renters, developers, and lenders are more confident when they sign a lease or purchase a building.
Although the housing market is relatively stable in Russia, the country's volatile economy makes forecasting prices as a function of apartment characteristics a unique challenge. Complex interactions between housing features such as the number of bedrooms and the location are enough to make pricing predictions complicated. Adding an unstable economy to the mix means Sberbank and its customers need more than simple regression models in their arsenal.
Sberbank is challenging programmers to develop algorithms which use a broad spectrum of features to predict realty prices. Competitors will rely on a rich dataset that includes housing data and macroeconomic patterns. An accurate forecasting model will allow Sberbank to provide more certainty to its customers in an uncertain economy.
hide_code
# Embed the data dictionary text file in the notebook page. The `height` and
# `width` attributes are separated by a space so the markup is well-formed.
HTML('''<div id="data">
<p><iframe src="data_dictionary.txt" frameborder="0" height="300" width="97%"></iframe></p>
</div>''')
hide_code
# Load the three CSV files from the working directory into data frames.
macro = pd.read_csv('macro.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
hide_code
# Preview rows 100-106 of `macro`, transposed, keeping rows 1-9 of the
# transposed frame (i.e. the 2nd to 10th columns of the original).
macro[100:107].T[1:10]
hide_code
# Same kind of preview for rows 200-206 of `train`.
train[200:207].T[1:10]
hide_code
# Columns treated as numeric features (36 names). 'timestamp' is later
# replaced by an exchange rate and renamed.
X_list_num = ['timestamp',
'full_sq', 'num_room', 'area_m',
'kremlin_km', 'big_road2_km', 'big_road1_km',
'workplaces_km',
'stadium_km', 'swim_pool_km', 'fitness_km',
'detention_facility_km', 'cemetery_km',
'radiation_km', 'oil_chemistry_km',
'theater_km', 'exhibition_km', 'museum_km',
'park_km', 'public_healthcare_km',
'metro_min_walk','metro_km_avto',
'bus_terminal_avto_km', 'public_transport_station_min_walk',
'railroad_station_walk_min', 'railroad_station_avto_km',
'kindergarten_km', 'school_km', 'preschool_km',
'university_km', 'additional_education_km',
'shopping_centers_km', 'big_market_km',
'ekder_all', 'work_all', 'young_all']
# Columns treated as categorical features (8 names).
X_list_cat = ['sub_area', 'ID_metro',
'office_raion', 'sport_objects_raion',
'raion_popul', 'healthcare_centers_raion',
'school_education_centers_raion',
'preschool_education_centers_raion']
# Prediction target: the 'price_doc' column of the training table.
target_train = train['price_doc']
hide_code
# Two side-by-side histograms of the target: raw prices on the left and
# their natural logarithm on the right.
plt.style.use('seaborn-whitegrid')
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(18, 6))
sns.distplot(target_train, bins=200, color='#228B22', ax=ax1)
ax1.set_xlabel("Prices")
sns.distplot(np.log(target_train), bins=200, color='#228B22', ax=ax2)
ax2.set_xlabel("Logarithm of the variable 'Prices'")
plt.suptitle('Sberbank Russian Housing Data');
hide_code
# Summary statistics of the target and the size of the selected feature set.
print ("Sberbank Russian Housing Dataset Statistics: \n")
print ("Number of houses = ", len(target_train))
# Feature count = number of selected numeric plus categorical columns.
print ("Number of features = ", len(list(train[X_list_num+X_list_cat].keys())))
print ("Minimum house price = ", np.min(target_train))
print ("Maximum house price = ", np.max(target_train))
print ("Mean house price = ", "%.2f" % np.mean(target_train))
print ("Median house price = ", "%.2f" % np.median(target_train))
print ("Standard deviation of house prices =", "%.2f" % np.std(target_train))
hide_code
# Count the missing values per numeric feature in the training table.
train[X_list_num].isnull().sum()
hide_code
# NOTE(review): this cell repeats the previous one verbatim; it was presumably
# meant to inspect another selection (e.g. the test table or the categorical
# columns) — confirm and adjust.
train[X_list_num].isnull().sum()
hide_code
# Build four working frames: numeric-only and numeric+categorical column
# selections of the train and the test table.
df_train = pd.DataFrame(train, columns=X_list_num)
df_train_cat = pd.DataFrame(train, columns=X_list_num+X_list_cat)
df_test = pd.DataFrame(test, columns=X_list_num)
df_test_cat = pd.DataFrame(test, columns=X_list_num+X_list_cat)
# Attach the target to the training frames as the last column, 'prices'.
df_train['prices'] = target_train
df_train_cat['prices'] = target_train
# Drop training rows without a room count; the test frames are not filtered.
df_train = df_train.dropna(subset=['num_room'])
df_train_cat = df_train_cat.dropna(subset=['num_room'])
# Fill gaps in the two walking-time columns by linear interpolation between
# neighbouring rows, in all four frames.
df_train['metro_min_walk'] = \
df_train['metro_min_walk'].interpolate(method='linear')
df_train_cat['metro_min_walk'] = \
df_train_cat['metro_min_walk'].interpolate(method='linear')
df_train['railroad_station_walk_min'] = \
df_train['railroad_station_walk_min'].interpolate(method='linear')
df_train_cat['railroad_station_walk_min'] = \
df_train_cat['railroad_station_walk_min'].interpolate(method='linear')
df_test['metro_min_walk'] = \
df_test['metro_min_walk'].interpolate(method='linear')
df_test_cat['metro_min_walk'] = \
df_test_cat['metro_min_walk'].interpolate(method='linear')
df_test['railroad_station_walk_min'] = \
df_test['railroad_station_walk_min'].interpolate(method='linear')
df_test_cat['railroad_station_walk_min'] = \
df_test_cat['railroad_station_walk_min'].interpolate(method='linear')
# Number of training rows left after the filtering above.
len(df_train)
hide_code
# Lookup table from a timestamp to the 'usdrub' value recorded for it in
# the macro table.
usdrub_pairs = dict(zip(macro['timestamp'], macro['usdrub']))
# salary_pairs = dict(zip(list(macro['timestamp']), list(macro['salary'])))
for frame in (df_train, df_train_cat, df_test, df_test_cat):
    # Assign the replaced column back instead of calling
    # replace(inplace=True) on a column selection: the in-place form acts on
    # a temporary object and is not guaranteed to update the frame.
    frame['timestamp'] = frame['timestamp'].replace(usdrub_pairs)
    # The column now holds exchange rates, so rename it accordingly.
    frame.rename(columns={'timestamp': 'usdrub'}, inplace=True)
hide_code
# For the train and then the test frame, print every categorical feature's
# number of distinct values and the values themselves. Every statistic is
# read from the frame currently being iterated (`df`).
for df in [df_train_cat, df_test_cat]:
    print ("____________________________________________")
    for position, feature in enumerate(X_list_cat):
        categories = set(df[feature])
        # Headings are the column names with underscores shown as spaces;
        # all but the first are preceded by a blank line.
        title = feature.replace('_', ' ')
        print(title if position == 0 else '\n' + title)
        print('Number of categories:', len(categories))
        print(categories)
hide_code
# Report every categorical value that occurs in the test frame but not in
# the train frame, as "<feature> <value>" lines.
for feature in X_list_cat:
    # Build the train-side lookup once per feature instead of once per
    # test value.
    train_categories = set(df_train_cat[feature])
    for element in set(df_test_cat[feature]):
        if element not in train_categories:
            print (feature, element)
hide_code
# Encode the metro IDs of the training frame as integer codes.
# pd.factorize returns (codes, uniques) where uniques[i] is encoded as i.
ID_metro_cat = pd.factorize(df_train_cat['ID_metro'])
df_train_cat['ID_metro'] = ID_metro_cat[0]
# Pair each original ID with its position in `uniques`. Deriving the codes
# from set(codes) instead would depend on set iteration order and would be
# shifted by the -1 code that factorize assigns to missing values.
ID_metro_pairs = {value: code for code, value in enumerate(ID_metro_cat[1])}
# ID 224 gets the extra code 219 by hand (presumably because it occurs only
# in the test data — confirm against the unseen-category report).
ID_metro_pairs[224] = 219
# Apply the same encoding to the test frame; assign the result back rather
# than relying on an in-place replace of a column selection.
df_test_cat['ID_metro'] = df_test_cat['ID_metro'].replace(ID_metro_pairs)
hide_code
# Encode every categorical feature except 'ID_metro' as integer codes,
# learning the codes on the train frame and applying them to the test frame.
for feature in X_list_cat:
    if feature == 'ID_metro':
        continue
    # pd.factorize returns (codes, uniques) where uniques[i] is encoded as i.
    feature_cat = pd.factorize(df_train_cat[feature])
    df_train_cat[feature] = feature_cat[0]
    # Pair each original value with its position in `uniques`; building the
    # codes from set(codes) would depend on set iteration order.
    feature_pairs = {value: code for code, value in enumerate(feature_cat[1])}
    # Assign the result back rather than relying on an in-place replace of
    # a column selection. Values absent from the mapping are left as-is.
    df_test_cat[feature] = df_test_cat[feature].replace(feature_pairs)
hide_code
# For the train and then the test frame, print every categorical feature's
# number of distinct values and the values themselves. Every statistic is
# read from the frame currently being iterated (`df`).
for df in [df_train_cat, df_test_cat]:
    print ("____________________________________________")
    for position, feature in enumerate(X_list_cat):
        categories = set(df[feature])
        # Headings are the column names with underscores shown as spaces;
        # all but the first are preceded by a blank line.
        title = feature.replace('_', ' ')
        print(title if position == 0 else '\n' + title)
        print('Number of categories:', len(categories))
        print(categories)
hide_code
# One-hot encode the categorical columns of the training frame: each
# categorical column is replaced by one 0/1 column per category, appended
# at the end of the frame and named "<column>_<category>".
df_train_cat1 = df_train_cat
encode = OneHotEncoder(sparse=False)
for column in X_list_cat:
    encode.fit(df_train_cat[[column]])
    transform = encode.transform(df_train_cat[[column]])
    # The encoder emits its output columns in ascending order of the
    # category values, so the labels must follow the same sorted order
    # (value_counts() order is by frequency and would mislabel them).
    labels = [(column+"_"+str(i)) for i in np.sort(df_train_cat[column].unique())]
    transform = pd.DataFrame(transform, columns=labels)
    # Re-use the frame's row labels so concat aligns rows correctly.
    transform = transform.set_index(df_train_cat.index.values)
    df_train_cat1 = pd.concat([df_train_cat1, transform], axis=1)
    df_train_cat1 = df_train_cat1.drop(column, axis=1)
hide_code
# One-hot encode the categorical columns of the test frame: each
# categorical column is replaced by one 0/1 column per category, appended
# at the end of the frame and named "<column>_<category>".
df_test_cat1 = df_test_cat
encode = OneHotEncoder(sparse=False)
for column in X_list_cat:
    encode.fit(df_test_cat[[column]])
    transform = encode.transform(df_test_cat[[column]])
    # The encoder emits its output columns in ascending order of the
    # category values, so the labels must follow the same sorted order
    # (value_counts() order is by frequency and would mislabel them).
    labels = [(column+"_"+str(i)) for i in np.sort(df_test_cat[column].unique())]
    transform = pd.DataFrame(transform, columns=labels)
    # Re-use the frame's row labels so concat aligns rows correctly.
    transform = transform.set_index(df_test_cat.index.values)
    df_test_cat1 = pd.concat([df_test_cat1, transform], axis=1)
    df_test_cat1 = df_test_cat1.drop(column, axis=1)
hide_code
# First three rows of the encoded columns at positions 623-635, as a plain
# array (`.values` replaces `as_matrix()`, which newer pandas no longer has).
df_train_cat1.iloc[:, 623:636][:3].values
hide_code
# The same three rows of the integer-coded column, for comparison.
df_train_cat['preschool_education_centers_raion'][:3]
hide_code
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
hide_code
print("Features in the train data, but not in the test data:")
# Build the lookup once; iterating a frame yields its column labels.
test_columns = set(df_test_cat1)
for element in df_train_cat1:
    if element not in test_columns:
        print(element)
hide_code
print("Features in the test data, but not in the train data:")
train_columns = set(df_train_cat1)
for element in df_test_cat1:
    if element not in train_columns:
        print(element)
hide_code
# Add all-zero columns for encoded categories that one frame has and the
# other lacks. The names must match the encoded column names exactly, so
# they carry no leading spaces.
for column in ['sub_area_136', 'ID_metro_188', 'ID_metro_205', 'ID_metro_216', 'ID_metro_214',
               'ID_metro_183', 'ID_metro_179', 'ID_metro_153', 'ID_metro_217', 'raion_popul_136']:
    df_test_cat1[column] = 0
df_train_cat1['ID_metro_219'] = 0
# NOTE(review): the new columns are appended at the end of each frame, so
# the two frames may hold the same columns in a different order — confirm
# before feeding both to the same model.
print('Columns with zero values were added.\n')
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
hide_code
# Pearson correlation between every pair of columns of the numeric frame.
pearson = df_train.corr(method='pearson')
# The last row belongs to the last column ('prices'); drop its final entry,
# the correlation of 'prices' with itself. `.iloc` is the positional
# accessor that replaces the removed `.ix`.
corr_with_prices = pearson.iloc[-1][:-1]
# List the features from the strongest to the weakest absolute correlation,
# using explicit positional indexing on the string-labelled Series.
corr_with_prices.iloc[corr_with_prices.abs().values.argsort()[::-1]]
hide_code
# Convert the frames to plain arrays for the estimators. `.values` replaces
# `as_matrix()`, and `axis=1` is passed by keyword, because newer pandas
# releases dropped `as_matrix()` and the positional axis argument of `drop`.
target_train = df_train['prices'].values
# Numeric features only.
features_train = df_train.drop('prices', axis=1).values
features_test = df_test.values
# Numeric features plus integer-coded categorical features.
features_train_cat = df_train_cat.drop('prices', axis=1).values
features_test_cat = df_test_cat.values
# Numeric features plus one-hot encoded categorical features.
features_train_cat_enc = df_train_cat1.drop('prices', axis=1).values
features_test_cat_enc = df_test_cat1.values
hide_code
# Split each of the three training matrices 80/20 into a fitting part and a
# hold-out part. All three splits use the same target array and the same
# random_state.
print('Numeric Features')
X_train, X_test, y_train, y_test = \
train_test_split(features_train, target_train, test_size = 0.2, random_state = 1)
X_train.shape, X_test.shape
hide_code
print('Numeric and Categorical Features')
X_train_cat, X_test_cat, y_train_cat, y_test_cat = \
train_test_split(features_train_cat, target_train, test_size = 0.2, random_state = 1)
X_train_cat.shape, X_test_cat.shape
hide_code
print('Numeric and Encoded Categorical Features')
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc, y_test_cat_enc = \
train_test_split(features_train_cat_enc, target_train, test_size = 0.2, random_state = 1)
X_train_cat_enc.shape, X_test_cat_enc.shape
hide_code
# Scale features and targets of every split with RobustScaler. Each scaler
# is fitted on the fitting part only and then applied to the hold-out part.
# Targets are reshaped to a single column before scaling, so the scaled
# targets are two-dimensional (n, 1) arrays.
scale_X = RobustScaler()
X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)
scale_y = RobustScaler()
y_train = scale_y.fit_transform(y_train.reshape(-1,1))
y_test = scale_y.transform(y_test.reshape(-1,1))
# Numeric + integer-coded categorical features.
scale_X_cat = RobustScaler()
X_train_cat = scale_X_cat.fit_transform(X_train_cat)
X_test_cat = scale_X_cat.transform(X_test_cat)
scale_y_cat = RobustScaler()
y_train_cat = scale_y_cat.fit_transform(y_train_cat.reshape(-1,1))
y_test_cat = scale_y_cat.transform(y_test_cat.reshape(-1,1))
# Numeric + one-hot encoded categorical features.
scale_X_cat_enc = RobustScaler()
X_train_cat_enc = scale_X_cat_enc.fit_transform(X_train_cat_enc)
X_test_cat_enc = scale_X_cat_enc.transform(X_test_cat_enc)
scale_y_cat_enc = RobustScaler()
y_train_cat_enc = scale_y_cat_enc.fit_transform(y_train_cat_enc.reshape(-1,1))
y_test_cat_enc = scale_y_cat_enc.transform(y_test_cat_enc.reshape(-1,1))
hide_code
# Grid searches over the number of estimators (and, for gradient boosting,
# the tree depth) for each of the three feature sets. Every search is fitted
# on the fitting part of its split with 5 parallel jobs and the default
# cross-validation and scoring of GridSearchCV; the last line of each cell
# shows the best parameter combination.
print('Numeric Features')
print ('Gradient Boosting Regressor')
param_grid_gbr = {'max_depth': [3, 4, 5], 'n_estimators': range(36, 361, 36)}
gridsearch_gbr = GridSearchCV(GradientBoostingRegressor(),
param_grid_gbr, n_jobs=5).fit(X_train, y_train)
gridsearch_gbr.best_params_
hide_code
print ('Bagging Regressor')
param_grid_br = {'n_estimators': range(36, 361, 36)}
gridsearch_br = GridSearchCV(BaggingRegressor(),
param_grid_br, n_jobs=5).fit(X_train, y_train)
gridsearch_br.best_params_
hide_code
# Same searches with estimator counts in steps of 44.
print('Numeric and Categorical Features')
print ('Gradient Boosting Regressor')
param_grid_gbr_cat = {'max_depth': [3, 4, 5], 'n_estimators': range(44, 441, 44)}
gridsearch_gbr_cat = GridSearchCV(GradientBoostingRegressor(),
param_grid_gbr_cat, n_jobs=5).fit(X_train_cat, y_train_cat)
gridsearch_gbr_cat.best_params_
hide_code
print ('Bagging Regressor')
param_grid_br_cat = {'n_estimators': range(44, 441, 44)}
gridsearch_br_cat = GridSearchCV(BaggingRegressor(),
param_grid_br_cat, n_jobs=5).fit(X_train_cat, y_train_cat)
gridsearch_br_cat.best_params_
hide_code
# Same searches over three estimator counts only.
print('Numeric and Encoded Categorical Features')
print ('Gradient Boosting Regressor')
param_grid_gbr_cat_enc = {'max_depth': [3, 4, 5], 'n_estimators': [159, 318, 636]}
gridsearch_gbr_cat_enc = GridSearchCV(GradientBoostingRegressor(),
param_grid_gbr_cat_enc,
n_jobs=5).fit(X_train_cat_enc, y_train_cat_enc)
gridsearch_gbr_cat_enc.best_params_
hide_code
print ('Bagging Regressor')
param_grid_br_cat_enc = {'n_estimators': [159, 318, 636]}
gridsearch_br_cat_enc = GridSearchCV(BaggingRegressor(),
param_grid_br_cat_enc,
n_jobs=5).fit(X_train_cat_enc, y_train_cat_enc)
gridsearch_br_cat_enc.best_params_
hide_code
# Fit gradient boosting and bagging regressors with fixed hyper-parameters
# on each feature set, predict both parts of the split and print the metric
# report. NOTE(review): the hyper-parameters are hard-coded, presumably
# copied from the grid-search results — confirm they are still current.
print('Numeric Features')
y_train_gbr, y_test_gbr = regression(GradientBoostingRegressor(max_depth=4, n_estimators=324),
X_train, X_test, y_train)
y_train_br, y_test_br = regression(BaggingRegressor(n_estimators=252),
X_train, X_test, y_train)
scores('GradientBoostingRegressor', y_train, y_test, y_train_gbr, y_test_gbr)
scores('BaggingRegressor', y_train, y_test, y_train_br, y_test_br)
hide_code
print('Numeric and Categorical Features')
y_train_cat_gbr, y_test_cat_gbr = \
regression(GradientBoostingRegressor(max_depth=3, n_estimators=396), X_train_cat, X_test_cat, y_train_cat)
y_train_cat_br, y_test_cat_br = \
regression(BaggingRegressor(n_estimators=220), X_train_cat, X_test_cat, y_train_cat)
scores('GradientBoostingRegressor',
y_train_cat, y_test_cat, y_train_cat_gbr, y_test_cat_gbr)
scores('BaggingRegressor',
y_train_cat, y_test_cat, y_train_cat_br, y_test_cat_br)
hide_code
print('Numeric and Encoded Categorical Features')
y_train_cat_enc_gbr, y_test_cat_enc_gbr = \
regression(GradientBoostingRegressor(max_depth=3, n_estimators=159),
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)
y_train_cat_enc_br, y_test_cat_enc_br = \
regression(BaggingRegressor(n_estimators=159),
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)
scores('GradientBoostingRegressor',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_gbr, y_test_cat_enc_gbr)
scores('BaggingRegressor',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_br, y_test_cat_enc_br)
hide_code
# scikit-learn multi-layer perceptron with one hidden layer, fitted with the
# 'lbfgs' solver, for each of the three feature sets; only the hidden layer
# width differs (324, 396, 318). Each cell predicts both parts of its split
# and prints the metric report.
mlpr = MLPRegressor(hidden_layer_sizes=(324,), max_iter=200,
solver='lbfgs', alpha=0.01)
mlpr.fit(X_train, y_train)
y_train_mlpr = mlpr.predict(X_train)
y_test_mlpr = mlpr.predict(X_test)
scores('MLP Regressor; Numeric Features',
y_train, y_test, y_train_mlpr, y_test_mlpr)
hide_code
mlpr_cat = MLPRegressor(hidden_layer_sizes=(396,), max_iter=200,
solver='lbfgs', alpha=0.01)
mlpr_cat.fit(X_train_cat, y_train_cat)
y_train_cat_mlpr = mlpr_cat.predict(X_train_cat)
y_test_cat_mlpr = mlpr_cat.predict(X_test_cat)
scores('MLP Regressor; Numeric and Categorical Features',
y_train_cat, y_test_cat, y_train_cat_mlpr, y_test_cat_mlpr)
hide_code
mlpr_cat_enc = MLPRegressor(hidden_layer_sizes=(318,), max_iter=200,
solver='lbfgs', alpha=0.01)
mlpr_cat_enc.fit(X_train_cat_enc, y_train_cat_enc)
y_train_cat_enc_mlpr = mlpr_cat_enc.predict(X_train_cat_enc)
y_test_cat_enc_mlpr = mlpr_cat_enc.predict(X_test_cat_enc)
scores('MLP Regressor; Numeric and Encoded Categorical Features',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlpr, y_test_cat_enc_mlpr)
hide_code
# One figure per feature set comparing the scaled hold-out targets (black)
# with the predictions of the three scikit-learn regressors. The slice
# [1:50] plots the samples at positions 1-49; position 0 is not plotted.
plt.figure(figsize = (18, 6))
plt.plot(y_test[1:50], color = 'black', label='Real Data')
plt.plot(y_test_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_br[1:50], label='Bagging Regressor')
plt.plot(y_test_mlpr[1:50], label='MLP Regressor')
plt.legend()
plt.title("Numeric Features; Regressor Predictions vs Real Data");
hide_code
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_mlpr[1:50], label='MLP Regressor')
plt.legend()
plt.title("Numeric and Categorical Features; Regressor Predictions vs Real Data");
hide_code
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat_enc[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_enc_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_enc_mlpr[1:50], label='MLP Regressor')
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Regressor Predictions vs Real Data");
hide_code
def build_mlp_model():
    """Build and compile the dense network for the 36 numeric features.

    Layers: Dense 36-36, Dropout 0.1, Dense 144-144, Dropout 0.1,
    Dense 576-576, and a single linear output unit. Compiled with MSE loss,
    the 'rmsprop' optimizer and MAE as an additional metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(36, activation='relu', input_dim=36))
    model.add(Dense(36, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(144, activation='relu'))
    model.add(Dense(144, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(576, activation='relu'))
    model.add(Dense(576, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
    return model


# The builder has its own name so that `mlp_model` always refers to the
# model instance and this cell can be re-run.
mlp_model = build_mlp_model()
# `epochs` is the Keras 2 spelling, matching the other fit calls in this
# file; the hold-out part of the split is used as validation data.
mlp_history = mlp_model.fit(X_train, y_train, validation_data=(X_test, y_test),
                            epochs=20, batch_size=128, verbose=0)
hide_code
# Training curves of the dense network on the numeric features.
loss_plot(mlp_history)
mae_plot(mlp_history)
hide_code
# Predict both parts of the split and print the metric report.
y_train_mlp = mlp_model.predict(X_train)
y_test_mlp = mlp_model.predict(X_test)
scores('MLP Model; Numeric Features', y_train, y_test, y_train_mlp, y_test_mlp)
hide_code
# Persist the trained network to an HDF5 file in the working directory.
mlp_model.save('mlp_model_p6_v1.h5')
hide_code
def build_mlp_cat_model():
    """Build and compile the dense network for the 44 numeric+categorical features.

    Layers: Dense 44-44, Dropout 0.1, Dense 156-156, Dropout 0.1,
    Dense 624-624, and a single linear output unit. Compiled with MSE loss,
    the 'rmsprop' optimizer and MAE as an additional metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(44, activation='relu', input_dim=44))
    model.add(Dense(44, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(156, activation='relu'))
    model.add(Dense(156, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(624, activation='relu'))
    model.add(Dense(624, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
    return model


# The builder has its own name so that `mlp_cat_model` always refers to the
# model instance and this cell can be re-run.
mlp_cat_model = build_mlp_cat_model()
# `epochs` is the Keras 2 spelling, matching the other fit calls in this
# file; the hold-out part of the split is used as validation data.
mlp_cat_history = mlp_cat_model.fit(X_train_cat, y_train_cat,
                                    validation_data=(X_test_cat, y_test_cat),
                                    epochs=30, batch_size=128, verbose=0)
hide_code
# Training curves of the dense network on numeric + categorical features.
loss_plot(mlp_cat_history)
mae_plot(mlp_cat_history)
hide_code
# Predict both parts of the split and print the metric report.
y_train_cat_mlp = mlp_cat_model.predict(X_train_cat)
y_test_cat_mlp = mlp_cat_model.predict(X_test_cat)
scores('MLP Model; Numeric and Categorical Features',
y_train_cat, y_test_cat, y_train_cat_mlp, y_test_cat_mlp)
hide_code
# Persist the trained network to an HDF5 file in the working directory.
mlp_cat_model.save('mlp_cat_model_p6_v1.h5')
hide_code
def build_mlp_cat_enc_model():
    """Build and compile the dense network for the 636 encoded features.

    Layers: Dense 159-159, Dropout 0.1, Dense 318-318, Dropout 0.1,
    Dense 636-636, and a single linear output unit. Compiled with MSE loss,
    the 'rmsprop' optimizer and MAE as an additional metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(159, activation='relu', input_dim=636))
    model.add(Dense(159, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(318, activation='relu'))
    model.add(Dense(318, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(636, activation='relu'))
    model.add(Dense(636, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
    return model


# The builder has its own name so that `mlp_cat_enc_model` always refers to
# the model instance and this cell can be re-run.
mlp_cat_enc_model = build_mlp_cat_enc_model()
# `epochs` is the Keras 2 spelling, matching the other fit calls in this
# file; the hold-out part of the split is used as validation data.
mlp_cat_enc_history = mlp_cat_enc_model.fit(X_train_cat_enc, y_train_cat_enc,
                                            validation_data=(X_test_cat_enc, y_test_cat_enc),
                                            epochs=20, batch_size=128, verbose=0)
hide_code
# Training curves of the dense network on numeric + one-hot encoded features.
loss_plot(mlp_cat_enc_history)
mae_plot(mlp_cat_enc_history)
hide_code
# Predict both parts of the split and print the metric report.
y_train_cat_enc_mlp = mlp_cat_enc_model.predict(X_train_cat_enc)
y_test_cat_enc_mlp = mlp_cat_enc_model.predict(X_test_cat_enc)
scores('MLP Model; Numeric and Encoded Categorical Features',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlp, y_test_cat_enc_mlp)
hide_code
# Persist the trained network to an HDF5 file in the working directory.
mlp_cat_enc_model.save('mlp_cat_enc_model_p6_v1.h5')
hide_code
def build_cnn_model():
    """Build and compile the 1-D convolutional network for 36 numeric features.

    Each sample is fed as a (36, 1) sequence. Layers: Conv1D(36, 5),
    MaxPooling1D(2), Dropout 0.25, Conv1D(144, 3), MaxPooling1D(2),
    Dropout 0.25, Flatten, Dense 576, Dropout 0.5 and a single linear
    output unit. Compiled with MSE loss, the 'rmsprop' optimizer and MAE as
    an additional metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Conv1D(36, 5, padding='valid', activation='relu', input_shape=(36, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Conv1D(144, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(576, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
    return model


# The builder has its own name so that `cnn_model` always refers to the
# model instance and this cell can be re-run.
cnn_model = build_cnn_model()
# reshape(-1, 36, 1) infers the sample count from the array instead of
# hard-coding the size of the current split.
cnn_history = cnn_model.fit(X_train.reshape(-1, 36, 1), y_train,
                            epochs=20, batch_size=128, verbose=0,
                            validation_data=(X_test.reshape(-1, 36, 1), y_test))
hide_code
# Training curves of the convolutional network on the numeric features.
loss_plot(cnn_history)
mae_plot(cnn_history)
hide_code
# Predict both parts of the split; reshape(-1, 36, 1) infers the sample
# count instead of hard-coding the size of the current split.
y_train_cnn = cnn_model.predict(X_train.reshape(-1, 36, 1))
y_test_cnn = cnn_model.predict(X_test.reshape(-1, 36, 1))
scores('CNN Model; Numeric Features', y_train, y_test, y_train_cnn, y_test_cnn)
hide_code
# Persist the trained network to an HDF5 file in the working directory.
cnn_model.save('cnn_model_p6_v1.h5')
hide_code
def build_cnn_cat_model():
    """Build and compile the 1-D convolutional network for 44 features.

    Each sample is fed as a (44, 1) sequence. Layers: Conv1D(44, 5),
    MaxPooling1D(2), Dropout 0.25, Conv1D(156, 3), MaxPooling1D(2),
    Dropout 0.25, Flatten, Dense 624, Dropout 0.5 and a single linear
    output unit. Compiled with MSE loss, the 'rmsprop' optimizer and MAE as
    an additional metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Conv1D(44, 5, padding='valid', activation='relu', input_shape=(44, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Conv1D(156, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(624, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
    return model


# The builder has its own name so that `cnn_cat_model` always refers to the
# model instance and this cell can be re-run.
cnn_cat_model = build_cnn_cat_model()
# reshape(-1, 44, 1) infers the sample count from the array instead of
# hard-coding the size of the current split.
cnn_cat_history = cnn_cat_model.fit(X_train_cat.reshape(-1, 44, 1), y_train_cat,
                                    epochs=20, batch_size=128, verbose=0,
                                    validation_data=(X_test_cat.reshape(-1, 44, 1), y_test_cat))
hide_code
# Training curves of the convolutional network on numeric + categorical
# features.
loss_plot(cnn_cat_history)
mae_plot(cnn_cat_history)
hide_code
# Predict both parts of the split; reshape(-1, 44, 1) infers the sample
# count instead of hard-coding the size of the current split.
y_train_cat_cnn = cnn_cat_model.predict(X_train_cat.reshape(-1, 44, 1))
y_test_cat_cnn = cnn_cat_model.predict(X_test_cat.reshape(-1, 44, 1))
scores('CNN Model; Numeric and Categorical Features',
       y_train_cat, y_test_cat, y_train_cat_cnn, y_test_cat_cnn)
hide_code
# Persist the trained network to an HDF5 file in the working directory.
cnn_cat_model.save('cnn_cat_model_p6_v1.h5')
hide_code
def build_cnn_cat_enc_model():
    """Build and compile the 1-D convolutional network for 636 encoded features.

    Each sample is fed as a (636, 1) sequence. Layers: Conv1D(159, 5),
    MaxPooling1D(2), Dropout 0.25, Conv1D(318, 3), MaxPooling1D(2),
    Dropout 0.25, Flatten, Dense 636, Dropout 0.5 and a single linear
    output unit. Compiled with MSE loss, the 'rmsprop' optimizer and MAE as
    an additional metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Conv1D(159, 5, padding='valid', activation='relu', input_shape=(636, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Conv1D(318, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(636, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
    return model


# The builder has its own name so that `cnn_cat_enc_model` always refers to
# the model instance and this cell can be re-run.
cnn_cat_enc_model = build_cnn_cat_enc_model()
# reshape(-1, 636, 1) infers the sample count from the array instead of
# hard-coding the size of the current split.
cnn_cat_enc_history = \
    cnn_cat_enc_model.fit(X_train_cat_enc.reshape(-1, 636, 1), y_train_cat_enc,
                          epochs=10, batch_size=128, verbose=2,
                          validation_data=(X_test_cat_enc.reshape(-1, 636, 1), y_test_cat_enc))
hide_code
# Training curves of the convolutional network on numeric + one-hot encoded
# features.
loss_plot(cnn_cat_enc_history)
mae_plot(cnn_cat_enc_history)
hide_code
# Predict both parts of the split; reshape(-1, 636, 1) infers the sample
# count instead of hard-coding the size of the current split.
y_train_cat_enc_cnn = cnn_cat_enc_model.predict(X_train_cat_enc.reshape(-1, 636, 1))
y_test_cat_enc_cnn = cnn_cat_enc_model.predict(X_test_cat_enc.reshape(-1, 636, 1))
scores('CNN Model; Numeric and Encoded Categorical Features',
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_cnn, y_test_cat_enc_cnn)
hide_code
# Persist the trained network to an HDF5 file in the working directory.
cnn_cat_enc_model.save('cnn_cat_enc_model_p6_v1.h5')
hide_code
def build_rnn_model():
    """Build and compile the LSTM network for the 36 numeric features.

    Each sample is fed as a one-step sequence of shape (1, 36). Layers:
    LSTM(144) returning sequences, LSTM(576), and a single linear output
    unit. Compiled with MSE loss, the 'rmsprop' optimizer and MAE as an
    additional metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(LSTM(144, return_sequences=True, input_shape=(1, 36)))
    model.add(LSTM(576, return_sequences=False))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model


# The builder has its own name so that `rnn_model` always refers to the
# model instance and this cell can be re-run.
rnn_model = build_rnn_model()
# reshape(-1, ...) infers the sample count from the arrays instead of
# hard-coding the size of the current split; targets are flattened to 1-D.
rnn_history = rnn_model.fit(X_train.reshape(-1, 1, 36), y_train.reshape(-1),
                            epochs=5, verbose=2,
                            validation_data=(X_test.reshape(-1, 1, 36), y_test.reshape(-1)))
hide_code
# Training curves of the LSTM network on the numeric features.
loss_plot(rnn_history)
mae_plot(rnn_history)
hide_code
# Predict both parts of the split; reshape(-1, 1, 36) infers the sample
# count instead of hard-coding the size of the current split.
y_train_rnn = rnn_model.predict(X_train.reshape(-1, 1, 36))
y_test_rnn = rnn_model.predict(X_test.reshape(-1, 1, 36))
scores('RNN Model; Numeric Features', y_train, y_test, y_train_rnn, y_test_rnn)
hide_code
# Persist the trained network to an HDF5 file in the working directory.
rnn_model.save('rnn_model_p6_v1.h5')
hide_code
def build_rnn_cat_model():
    """Build and compile the LSTM network for the 44 numeric+categorical features.

    Each sample is fed as a one-step sequence of shape (1, 44). Layers:
    LSTM(156) returning sequences, LSTM(624), and a single linear output
    unit. Compiled with MSE loss, the 'rmsprop' optimizer and MAE as an
    additional metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(LSTM(156, return_sequences=True, input_shape=(1, 44)))
    model.add(LSTM(624, return_sequences=False))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model


# The builder has its own name so that `rnn_cat_model` always refers to the
# model instance and this cell can be re-run.
rnn_cat_model = build_rnn_cat_model()
# reshape(-1, ...) infers the sample count from the arrays instead of
# hard-coding the size of the current split; targets are flattened to 1-D.
rnn_cat_history = rnn_cat_model.fit(X_train_cat.reshape(-1, 1, 44), y_train_cat.reshape(-1),
                                    epochs=10, verbose=2,
                                    validation_data=(X_test_cat.reshape(-1, 1, 44), y_test_cat.reshape(-1)))
hide_code
# Training curves of the LSTM network on numeric + categorical features.
loss_plot(rnn_cat_history)
mae_plot(rnn_cat_history)
hide_code
# Predict both parts of the split; reshape(-1, 1, 44) infers the sample
# count instead of hard-coding the size of the current split.
y_train_cat_rnn = rnn_cat_model.predict(X_train_cat.reshape(-1, 1, 44))
y_test_cat_rnn = rnn_cat_model.predict(X_test_cat.reshape(-1, 1, 44))
scores('RNN Model; Numeric and Categorical Features',
       y_train_cat, y_test_cat, y_train_cat_rnn, y_test_cat_rnn)
hide_code
# Persist the trained network to an HDF5 file in the working directory.
rnn_cat_model.save('rnn_cat_model_p6_v1.h5')
hide_code
def rnn_cat_enc_model():
    """Build and compile a stacked-LSTM regressor for numeric + encoded categorical features.

    Returns:
        A compiled ``Sequential`` model that takes one-step sequences of
        636 features (input shape ``(1, 636)``) and outputs a single value.
    """
    model = Sequential()
    # The first LSTM returns the full sequence so the second one can consume it.
    model.add(LSTM(159, return_sequences=True, input_shape=(1, 636)))
    model.add(LSTM(636, return_sequences=False))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model


# NOTE: the name is rebound from the builder function to the built model;
# the rest of the notebook uses `rnn_cat_enc_model` as the model instance.
rnn_cat_enc_model = rnn_cat_enc_model()
# -1 lets NumPy infer the number of samples instead of hard-coding the
# sizes of the current train/test split.
rnn_cat_enc_history = rnn_cat_enc_model.fit(
    X_train_cat_enc.reshape(-1, 1, 636), y_train_cat_enc.reshape(-1),
    epochs=10, verbose=2,
    validation_data=(X_test_cat_enc.reshape(-1, 1, 636), y_test_cat_enc.reshape(-1)))
hide_code
# loss_plot / mae_plot are helpers defined outside this section; they are
# given the history object returned by fit().
loss_plot(rnn_cat_enc_history)
mae_plot(rnn_cat_enc_history)
hide_code
# Predict on both splits; the sample count is inferred (-1) rather than
# hard-coded, the (1, 636) per-sample shape matches the model input.
y_train_cat_enc_rnn = rnn_cat_enc_model.predict(X_train_cat_enc.reshape(-1, 1, 636))
y_test_cat_enc_rnn = rnn_cat_enc_model.predict(X_test_cat_enc.reshape(-1, 1, 636))
scores('RNN Model; Numeric and Encoded Categorical Features',
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_rnn, y_test_cat_enc_rnn)
hide_code
# Persist the trained network to an HDF5 file.
rnn_cat_enc_model.save('rnn_cat_enc_model_p6_v1.h5')
hide_code
# Overlay the neural-network test predictions on the real targets
# (elements 1..49 of each series).
plt.figure(figsize=(18, 6))
plt.plot(y_test[1:50], color='black', label='Real Data')
for _pred, _label in (
        (y_test_mlp, 'MLP'),
        (y_test_cnn, 'CNN'),
        (y_test_rnn, 'RNN'),
):
    plt.plot(_pred[1:50], label=_label)
plt.legend()
plt.title("Numeric Features; Neural Network Predictions vs Real Data");
hide_code
# Overlay the neural-network test predictions on the real targets
# (elements 1..49 of each series).
plt.figure(figsize=(18, 6))
plt.plot(y_test_cat[1:50], color='black', label='Real Data')
for _pred, _label in (
        (y_test_cat_mlp, 'MLP'),
        (y_test_cat_cnn, 'CNN'),
        (y_test_cat_rnn, 'RNN'),
):
    plt.plot(_pred[1:50], label=_label)
plt.legend()
plt.title("Numeric and Categorical Features; Neural Network Predictions vs Real Data");
hide_code
# Overlay the neural-network test predictions on the real targets
# (elements 1..49 of each series).
plt.figure(figsize=(18, 6))
# Use the target of the encoded split: these predictions were made from
# X_test_cat_enc and scored against y_test_cat_enc, not y_test_cat.
plt.plot(y_test_cat_enc[1:50], color='black', label='Real Data')
plt.plot(y_test_cat_enc_mlp[1:50], label='MLP')
plt.plot(y_test_cat_enc_cnn[1:50], label='CNN')
plt.plot(y_test_cat_enc_rnn[1:50], label='RNN')
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Neural Network Predictions vs Real Data");
hide_code
# Target: one RobustScaler fitted on the training target as a single column.
# It is kept so predictions can later be mapped back with inverse_transform.
target_scale = RobustScaler().fit(target_train.reshape(-1, 1))
s_target_train = target_scale.transform(target_train.reshape(-1, 1))

# Numeric features: fit on the train set, reuse the same scaler for test.
feature_scale = RobustScaler().fit(features_train)
s_features_train = feature_scale.transform(features_train)
s_features_test = feature_scale.transform(features_test)

# Numeric + categorical features.
feature_cat_scale = RobustScaler().fit(features_train_cat)
s_features_train_cat = feature_cat_scale.transform(features_train_cat)
s_features_test_cat = feature_cat_scale.transform(features_test_cat)

# Numeric + encoded categorical features.
feature_cat_enc_scale = RobustScaler().fit(features_train_cat_enc)
s_features_train_cat_enc = feature_cat_enc_scale.transform(features_train_cat_enc)
s_features_test_cat_enc = feature_cat_enc_scale.transform(features_test_cat_enc)
hide_code
# Gradient boosting fitted on the scaled numeric features and scaled target.
# Only the train predictions are scored; the test predictions are kept for
# later use.
gbr = GradientBoostingRegressor(n_estimators=324, max_depth=4).fit(
    s_features_train, s_target_train)
s_target_train_gbr = gbr.predict(s_features_train)
s_target_test_gbr = gbr.predict(s_features_test)
scores2('Gradient Boosting Regressor', s_target_train, s_target_train_gbr)
hide_code
# Bagging ensemble fitted on the same inputs.
br = BaggingRegressor(n_estimators=252).fit(s_features_train, s_target_train)
s_target_train_br = br.predict(s_features_train)
s_target_test_br = br.predict(s_features_test)
scores2('Bagging Regressor', s_target_train, s_target_train_br)
hide_code
# `mlpr` is not fitted in this section; presumably it comes from an earlier
# cell of the notebook.
s_target_train_mlpr, s_target_test_mlpr = map(
    mlpr.predict, (s_features_train, s_features_test))
scores2('MLP Regressor', s_target_train, s_target_train_mlpr)
hide_code
# Gradient boosting fitted on the scaled numeric + categorical features.
# Only the train predictions are scored; the test predictions are kept for
# later use.
gbr_cat = GradientBoostingRegressor(n_estimators=396, max_depth=3).fit(
    s_features_train_cat, s_target_train)
s_target_train_cat_gbr = gbr_cat.predict(s_features_train_cat)
s_target_test_cat_gbr = gbr_cat.predict(s_features_test_cat)
scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_gbr)
hide_code
# Bagging ensemble fitted on the same inputs.
br_cat = BaggingRegressor(n_estimators=220).fit(s_features_train_cat, s_target_train)
s_target_train_cat_br = br_cat.predict(s_features_train_cat)
s_target_test_cat_br = br_cat.predict(s_features_test_cat)
scores2('Bagging Regressor', s_target_train, s_target_train_cat_br)
hide_code
# `mlpr_cat` is not fitted in this section; presumably it comes from an
# earlier cell of the notebook.
s_target_train_cat_mlpr, s_target_test_cat_mlpr = map(
    mlpr_cat.predict, (s_features_train_cat, s_features_test_cat))
scores2('MLP Regressor', s_target_train, s_target_train_cat_mlpr)
hide_code
# Gradient boosting fitted on the scaled numeric + encoded categorical
# features. Only the train predictions are scored.
gbr_cat_enc = GradientBoostingRegressor(n_estimators=159, max_depth=3).fit(
    s_features_train_cat_enc, s_target_train)
s_target_train_cat_enc_gbr = gbr_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_gbr = gbr_cat_enc.predict(s_features_test_cat_enc)
scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_enc_gbr)
hide_code
# Bagging ensemble fitted on the scaled numeric + encoded categorical features.
br_cat_enc = BaggingRegressor(n_estimators=159)
br_cat_enc.fit(s_features_train_cat_enc, s_target_train)
# Predict with the estimator fitted just above. The previous code called
# br_cat, which was fitted on the non-encoded categorical feature set and
# therefore does not match these inputs.
s_target_train_cat_enc_br = br_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_br = br_cat_enc.predict(s_features_test_cat_enc)
scores2('Bagging Regressor', s_target_train, s_target_train_cat_enc_br)
hide_code
# `mlpr_cat_enc` is not fitted in this section; presumably it comes from an
# earlier cell of the notebook. Only the train predictions are scored.
s_target_train_cat_enc_mlpr, s_target_test_cat_enc_mlpr = map(
    mlpr_cat_enc.predict, (s_features_train_cat_enc, s_features_test_cat_enc))
scores2('MLP Regressor', s_target_train, s_target_train_cat_enc_mlpr)
hide_code
# Keras models applied to the scaled numeric features. Only the train
# predictions are scored; the test predictions are kept for later use.
s_target_train_mlp = mlp_model.predict(s_features_train)
s_target_test_mlp = mlp_model.predict(s_features_test)
scores2('MLP', s_target_train, s_target_train_mlp)
hide_code
# The CNN is fed (samples, 36, 1); -1 infers the sample count instead of
# hard-coding the sizes of the train and test sets.
s_target_train_cnn = cnn_model.predict(s_features_train.reshape(-1, 36, 1))
s_target_test_cnn = cnn_model.predict(s_features_test.reshape(-1, 36, 1))
scores2('CNN', s_target_train, s_target_train_cnn)
hide_code
# The RNN expects (samples, 1, 36); again the sample count is inferred.
s_target_train_rnn = rnn_model.predict(s_features_train.reshape(-1, 1, 36))
s_target_test_rnn = rnn_model.predict(s_features_test.reshape(-1, 1, 36))
scores2('RNN', s_target_train, s_target_train_rnn)
hide_code
# Keras models applied to the scaled numeric + categorical features. Only the
# train predictions are scored; the test predictions are kept for later use.
s_target_train_cat_mlp = mlp_cat_model.predict(s_features_train_cat)
s_target_test_cat_mlp = mlp_cat_model.predict(s_features_test_cat)
scores2('MLP', s_target_train, s_target_train_cat_mlp)
hide_code
# The CNN is fed (samples, 44, 1); -1 infers the sample count instead of
# hard-coding the sizes of the train and test sets.
s_target_train_cat_cnn = cnn_cat_model.predict(s_features_train_cat.reshape(-1, 44, 1))
s_target_test_cat_cnn = cnn_cat_model.predict(s_features_test_cat.reshape(-1, 44, 1))
scores2('CNN', s_target_train, s_target_train_cat_cnn)
hide_code
# The RNN expects (samples, 1, 44); again the sample count is inferred.
s_target_train_cat_rnn = rnn_cat_model.predict(s_features_train_cat.reshape(-1, 1, 44))
s_target_test_cat_rnn = rnn_cat_model.predict(s_features_test_cat.reshape(-1, 1, 44))
scores2('RNN', s_target_train, s_target_train_cat_rnn)
hide_code
# Keras models applied to the scaled numeric + encoded categorical features.
# Only the train predictions are scored; the test predictions are kept for
# later use.
s_target_train_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_train_cat_enc)
s_target_test_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_test_cat_enc)
scores2('MLP', s_target_train, s_target_train_cat_enc_mlp)
hide_code
# The CNN is fed (samples, 636, 1); -1 infers the sample count instead of
# hard-coding the sizes of the train and test sets.
s_target_train_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(-1, 636, 1))
s_target_test_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(-1, 636, 1))
scores2('CNN', s_target_train, s_target_train_cat_enc_cnn)
hide_code
# The RNN expects (samples, 1, 636); again the sample count is inferred.
s_target_train_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(-1, 1, 636))
s_target_test_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(-1, 1, 636))
scores2('RNN', s_target_train, s_target_train_cat_enc_rnn)
hide_code
# Rescale Predictions
def _unscale_target(scaled):
    """Map scaled predictions back to the original target units.

    The input is reshaped to a single column first, because the target
    scaler was fitted on a column vector. Applying the reshape to every
    model keeps this cell consistent with the categorical and
    encoded-categorical rescale cells.
    """
    return target_scale.inverse_transform(scaled.reshape(-1, 1))


target_train_gbr = _unscale_target(s_target_train_gbr)
target_test_gbr = _unscale_target(s_target_test_gbr)
target_train_br = _unscale_target(s_target_train_br)
target_test_br = _unscale_target(s_target_test_br)
target_train_mlpr = _unscale_target(s_target_train_mlpr)
target_test_mlpr = _unscale_target(s_target_test_mlpr)
target_train_mlp = _unscale_target(s_target_train_mlp)
target_test_mlp = _unscale_target(s_target_test_mlp)
target_train_cnn = _unscale_target(s_target_train_cnn)
target_test_cnn = _unscale_target(s_target_test_cnn)
target_train_rnn = _unscale_target(s_target_train_rnn)
target_test_rnn = _unscale_target(s_target_test_rnn)
hide_code
# Overlay every model's rescaled train predictions on the real target
# (elements 1..49 of each series).
plt.figure(figsize=(18, 6))
plt.plot(target_train[1:50], color='black', label='Real Data')
for _pred, _label in (
        (target_train_gbr, 'Gradient Boosting Regressor'),
        (target_train_br, 'Bagging Regressor'),
        (target_train_mlpr, 'MLP Regressor'),
        (target_train_mlp, 'MLP'),
        (target_train_cnn, 'CNN'),
        (target_train_rnn, 'RNN'),
):
    plt.plot(_pred[1:50], label=_label)
plt.legend()
plt.title("Numeric Features; Train Predictions vs Real Data");
hide_code
# Compare the models' rescaled test predictions with each other
# (elements 1..49); no real-data curve is drawn in this figure.
plt.figure(figsize=(18, 6))
for _pred, _label in (
        (target_test_gbr, 'Gradient Boosting Regressor'),
        (target_test_br, 'Bagging Regressor'),
        (target_test_mlpr, 'MLP Regressor'),
        (target_test_mlp, 'MLP'),
        (target_test_cnn, 'CNN'),
        (target_test_rnn, 'RNN'),
):
    plt.plot(_pred[1:50], label=_label)
plt.legend()
plt.title("Numeric Features; Test Predictions");
hide_code
# Rescale Predictions
def _unscale_target(scaled):
    """Map scaled predictions, reshaped to one column, back to target units."""
    return target_scale.inverse_transform(scaled.reshape(-1, 1))


target_train_cat_gbr = _unscale_target(s_target_train_cat_gbr)
target_test_cat_gbr = _unscale_target(s_target_test_cat_gbr)
target_train_cat_br = _unscale_target(s_target_train_cat_br)
target_test_cat_br = _unscale_target(s_target_test_cat_br)
target_train_cat_mlpr = _unscale_target(s_target_train_cat_mlpr)
target_test_cat_mlpr = _unscale_target(s_target_test_cat_mlpr)
target_train_cat_mlp = _unscale_target(s_target_train_cat_mlp)
target_test_cat_mlp = _unscale_target(s_target_test_cat_mlp)
target_train_cat_cnn = _unscale_target(s_target_train_cat_cnn)
target_test_cat_cnn = _unscale_target(s_target_test_cat_cnn)
target_train_cat_rnn = _unscale_target(s_target_train_cat_rnn)
target_test_cat_rnn = _unscale_target(s_target_test_cat_rnn)
hide_code
# Overlay every model's rescaled train predictions on the real target
# (elements 1..49 of each series).
plt.figure(figsize=(18, 6))
plt.plot(target_train[1:50], color='black', label='Real Data')
for _pred, _label in (
        (target_train_cat_gbr, 'Gradient Boosting Regressor'),
        (target_train_cat_br, 'Bagging Regressor'),
        (target_train_cat_mlpr, 'MLP Regressor'),
        (target_train_cat_mlp, 'MLP'),
        (target_train_cat_cnn, 'CNN'),
        (target_train_cat_rnn, 'RNN'),
):
    plt.plot(_pred[1:50], label=_label)
plt.legend()
plt.title("Numeric and Categorical Features; Train Predictions vs Real Data");
hide_code
# Compare the models' rescaled test predictions with each other
# (elements 1..49); no real-data curve is drawn in this figure.
plt.figure(figsize=(18, 6))
for _pred, _label in (
        (target_test_cat_gbr, 'Gradient Boosting Regressor'),
        (target_test_cat_br, 'Bagging Regressor'),
        (target_test_cat_mlpr, 'MLP Regressor'),
        (target_test_cat_mlp, 'MLP'),
        (target_test_cat_cnn, 'CNN'),
        (target_test_cat_rnn, 'RNN'),
):
    plt.plot(_pred[1:50], label=_label)
plt.legend()
plt.title("Numeric and Categorical Features; Test Predictions");
hide_code
# Rescale Predictions
def _unscale_target(scaled):
    """Map scaled predictions, reshaped to one column, back to target units."""
    return target_scale.inverse_transform(scaled.reshape(-1, 1))


target_train_cat_enc_gbr = _unscale_target(s_target_train_cat_enc_gbr)
target_test_cat_enc_gbr = _unscale_target(s_target_test_cat_enc_gbr)
target_train_cat_enc_br = _unscale_target(s_target_train_cat_enc_br)
target_test_cat_enc_br = _unscale_target(s_target_test_cat_enc_br)
target_train_cat_enc_mlpr = _unscale_target(s_target_train_cat_enc_mlpr)
target_test_cat_enc_mlpr = _unscale_target(s_target_test_cat_enc_mlpr)
target_train_cat_enc_mlp = _unscale_target(s_target_train_cat_enc_mlp)
target_test_cat_enc_mlp = _unscale_target(s_target_test_cat_enc_mlp)
target_train_cat_enc_cnn = _unscale_target(s_target_train_cat_enc_cnn)
target_test_cat_enc_cnn = _unscale_target(s_target_test_cat_enc_cnn)
target_train_cat_enc_rnn = _unscale_target(s_target_train_cat_enc_rnn)
target_test_cat_enc_rnn = _unscale_target(s_target_test_cat_enc_rnn)
hide_code
# Overlay every model's rescaled train predictions on the real target
# (elements 1..49 of each series).
plt.figure(figsize=(18, 6))
plt.plot(target_train[1:50], color='black', label='Real Data')
for _pred, _label in (
        (target_train_cat_enc_gbr, 'Gradient Boosting Regressor'),
        (target_train_cat_enc_br, 'Bagging Regressor'),
        (target_train_cat_enc_mlpr, 'MLP Regressor'),
        (target_train_cat_enc_mlp, 'MLP'),
        (target_train_cat_enc_cnn, 'CNN'),
        (target_train_cat_enc_rnn, 'RNN'),
):
    plt.plot(_pred[1:50], label=_label)
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Train Predictions vs Real Data");
hide_code
# Compare the models' rescaled test predictions with each other
# (elements 1..49); no real-data curve is drawn in this figure.
plt.figure(figsize=(18, 6))
for _pred, _label in (
        (target_test_cat_enc_gbr, 'Gradient Boosting Regressor'),
        (target_test_cat_enc_br, 'Bagging Regressor'),
        (target_test_cat_enc_mlpr, 'MLP Regressor'),
        (target_test_cat_enc_mlp, 'MLP'),
        (target_test_cat_enc_cnn, 'CNN'),
        (target_test_cat_enc_rnn, 'RNN'),
):
    plt.plot(_pred[1:50], label=_label)
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Test Predictions");
The project was built on the basis of the competition offered on the site https://www.kaggle.com.
The competition version of this notebook is available here: https://www.kaggle.com/olgabelitskaya/sberbank-russian-housing-market .
Several popular libraries (numpy, pandas, matplotlib, scikit-learn and keras) were used to build the regression models.
The most valuable part of this project is the study of real data and the attempt to bring the accuracy of the predictions on it up to the threshold of 70-80 percent.