%%html
<style>
@import url('https://fonts.googleapis.com/css?family=Orbitron|Roboto');
body {background-color: honeydew;}
a {color: #31c831; font-family: 'Roboto';}
h1 {color: forestgreen; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #ccc;}
h2, h3 {color: slategray; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #ccc;}
h4 {color: #31c831; font-family: 'Roboto';}
span {text-shadow: 4px 4px 4px #ccc;}
div.output_prompt, div.output_area pre {color: slategray;}
div.input_prompt, div.output_subarea {color: forestgreen;}
div.output_stderr pre {background-color: ghostwhite;}
div.output_stderr {background-color: slategrey;}
</style>
<script>
code_show = true;
function code_display() {
if (code_show) {
$('div.input').each(function(id) {
if (id == 0 || $(this).html().indexOf('hide_code') > -1) {$(this).hide();}
});
$('div.output_prompt').css('opacity', 0);
} else {
$('div.input').each(function(id) {$(this).show();});
$('div.output_prompt').css('opacity', 1);
};
code_show = !code_show;
}
$(document).ready(code_display);
</script>
<form action="javascript: code_display()">
<input style="color: forestgreen; background: honeydew; opacity: 0.8;"
type="submit" value="Click to display or hide code cells">
</form>
hide_code = ''
#########################################
### IMPORT LIBRARIES FOR THIS PROJECT ###
#########################################
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
from random import random
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML, SVG
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.model_selection import KFold, ParameterGrid, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_error
from sklearn.metrics import r2_score, explained_variance_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV, BayesianRidge
from sklearn.linear_model import HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
import keras as ks
from keras.models import Sequential, load_model, Model
from keras.optimizers import SGD, RMSprop
from keras.layers import Dense, Dropout, LSTM, GlobalAveragePooling1D
from keras.layers import Activation, Flatten, Input, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras.wrappers.scikit_learn import KerasRegressor
from keras.utils.vis_utils import model_to_dot
hide_code
# Fit the Regressor
def regression(regressor, x_train, x_test, y_train):
reg = regressor
reg.fit(x_train, y_train)
y_train_reg = reg.predict(x_train)
y_test_reg = reg.predict(x_test)
return y_train_reg, y_test_reg
# Plot the Neural network fitting history
def history_plot(fit_history):
plt.figure(figsize=(18, 12))
plt.subplot(211)
plt.plot(fit_history.history['loss'], color='#348ABD', label = 'train')
plt.plot(fit_history.history['val_loss'], color='#228B22', label = 'test')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.title('Loss Function');
plt.subplot(212)
plt.plot(fit_history.history['mean_absolute_error'], color='#348ABD', label = 'train')
plt.plot(fit_history.history['val_mean_absolute_error'], color='#228B22', label = 'test')
plt.xlabel("Epochs")
plt.ylabel("MAE")
plt.legend()
plt.title('Mean Absolute Error');
# Get values of the metrics
def scores(regressor, y_train, y_test, y_train_reg, y_test_reg):
separator1, separator2 = '<_>'*18, '-'*10
print(separator1, '\n', regressor, '\n'+separator1)
print("EV score. Train: ", explained_variance_score(y_train, y_train_reg))
print("EV score. Test: ", explained_variance_score(y_test, y_test_reg))
print(separator2)
print("R2 score. Train: ", r2_score(y_train, y_train_reg))
print("R2 score. Test: ", r2_score(y_test, y_test_reg))
print(separator2)
print("MSE score. Train: ", mean_squared_error(y_train, y_train_reg))
print("MSE score. Test: ", mean_squared_error(y_test, y_test_reg))
print(separator2)
print("MAE score. Train: ", mean_absolute_error(y_train, y_train_reg))
print("MAE score. Test: ", mean_absolute_error(y_test, y_test_reg))
print(separator2)
print("MdAE score. Train: ", median_absolute_error(y_train, y_train_reg))
print("MdAE score. Test: ", median_absolute_error(y_test, y_test_reg))
def scores2(regressor, target, target_predict):
separator1, separator2 = '<_>'*18, '-'*10
print(separator1, '\n', regressor, '\n'+separator1)
print("EV score:", explained_variance_score(target, target_predict))
print(separator2)
print("R2 score:", r2_score(target, target_predict))
print(separator2)
print("MSE score:", mean_squared_error(target, target_predict))
print(separator2)
print("MAE score:", mean_absolute_error(target, target_predict))
print(separator2)
print("MdAE score:", median_absolute_error(target, target_predict))
In this capstone project proposal, the goal is to leverage what we've learned throughout the Nanodegree program to author a proposal for solving a problem of our choice by applying machine learning algorithms and techniques. A project proposal encompasses seven key points.
The full project report on the results will be completed and published as well.
Housing costs demand a significant investment from both consumers and developers. And when it comes to planning a budget, whether personal or corporate, the last thing anyone needs is uncertainty about one of their biggest expenses. Sberbank, Russia's oldest and largest bank, helps their customers by making predictions about realty prices so renters, developers, and lenders are more confident when they sign a lease or purchase a building.
Although the housing market is relatively stable in Russia, the country's volatile economy makes forecasting prices as a function of apartment characteristics a unique challenge. Complex interactions between housing features, such as the number of bedrooms and the location, are enough to make pricing predictions complicated. Adding an unstable economy to the mix means Sberbank and their customers need more than simple regression models in their arsenal.
Sberbank is challenging programmers to develop algorithms which use a broad spectrum of features to predict realty prices. Algorithm applications rely on a rich dataset that includes housing data and macroeconomic patterns. An accurate forecasting model will allow Sberbank to provide more certainty to their customers in an uncertain economy.
My choice of the solution in this situation is to select the indicators most correlated with the target variable and apply ensemble algorithms, which have repeatedly shown successful results in studies of real estate price trends. Boosting and bagging methods combine several models at once in order to improve the prediction accuracy on learning problems with a numerical target variable.
Then I am going to explore different types of neural networks in the sphere of regression predictions and try to reach the level of model performance achieved by the ensemble methods.
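A minimal sketch of this plan (hypothetical names, assuming a pandas DataFrame df with numeric feature columns and a 'prices' target like the one built below):
hide_code
# Rank features by absolute correlation with the target and fit an ensemble on the top ten
corr = df.corr()['prices'].drop('prices')
top_features = corr.abs().sort_values(ascending=False)[:10].index
ensemble = GradientBoostingRegressor().fit(df[top_features], df['prices'])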
The basis for the investigation is a large number of economic indicators for pricing and the prices themselves (train.csv and test.csv). Macroeconomic variables are collected in a separate file keyed by transaction date (macro.csv). In addition, a detailed description of the variables is provided (data_dictionary.txt).
For practical reasons, I have not analyzed all the data and have chosen the following independent variables (listed in the code below).
All these economic indicators have a strong influence on price formation and can be used as a basic set for regression analysis. Examples of numerical variables: the distance to the metro, the distance to the school, the dollar rate at the transaction moment, the area of the living space. Examples of categorical variables: neighborhoods, the nearest metro station, the number of rooms.
The goal of the project is to predict the price of housing using the chosen set of numerical and categorical variables. The predicted target is not discrete, and all of its values are given for the training set, so it is necessary to apply supervised regression algorithms.
hide_code
# Display the description file
HTML('''<div id="data">
<p><iframe src="data_dictionary.txt" frameborder="3" height="300" width="99%"></iframe></p>
</div>''')
hide_code
# Load the dataset
macro = pd.read_csv('macro.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
hide_code
# Display the data tables
macro[100:110].T[1:15]
hide_code
# Display the data tables
train[200:208].T[1:15]
hide_code
# Create lists of the features
X_list_num = ['timestamp',
'full_sq', 'num_room', 'area_m',
'kremlin_km', 'big_road2_km', 'big_road1_km',
'workplaces_km',
'stadium_km', 'swim_pool_km', 'fitness_km',
'detention_facility_km', 'cemetery_km',
'radiation_km', 'oil_chemistry_km',
'theater_km', 'exhibition_km', 'museum_km',
'park_km', 'public_healthcare_km',
'metro_min_walk','metro_km_avto',
'bus_terminal_avto_km', 'public_transport_station_min_walk',
'railroad_station_walk_min', 'railroad_station_avto_km',
'kindergarten_km', 'school_km', 'preschool_km',
'university_km', 'additional_education_km',
'shopping_centers_km', 'big_market_km',
'ekder_all', 'work_all', 'young_all']
X_list_cat = ['sub_area', 'ID_metro',
'office_raion', 'sport_objects_raion',
'raion_popul', 'healthcare_centers_raion',
'school_education_centers_raion',
'preschool_education_centers_raion']
target_train = train['price_doc']
hide_code
# Create the distribution plot for the target
plt.style.use('seaborn-whitegrid')
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(18, 6))
sns.distplot(target_train, bins=200, color='#228B22', ax=ax1)
ax1.set_xlabel("Prices")
ax1.set_ylabel("Distribution")
sns.distplot(np.log(target_train), bins=200, color='#228B22', ax=ax2)
ax2.set_xlabel("Logarithm of the variable 'Prices'")
ax2.set_ylabel("Distribution")
plt.suptitle('Sberbank Russian Housing Data');
hide_code
# Create the table of descriptive statistics
print ("Sberbank Russian Housing Dataset Statistics: \n")
print ("Number of houses = ", len(target_train))
print ("Number of features = ", len(list(train[X_list_num+X_list_cat].keys())))
print ("Minimum house price = ", np.min(target_train))
print ("Maximum house price = ", np.max(target_train))
print ("Mean house price = ", "%.2f" % np.mean(target_train))
print ("Median house price = ", "%.2f" % np.median(target_train))
print ("Standard deviation of house prices =", "%.2f" % np.std(target_train))
hide_code
# Find out the number of missing values
train[X_list_num].isnull().sum()
hide_code
# Find out the number of missing values
test[X_list_num].isnull().sum()
hide_code
# Create dataframes for sets of features
df_train = pd.DataFrame(train, columns=X_list_num)
df_train_cat = pd.DataFrame(train, columns=X_list_num+X_list_cat)
df_test = pd.DataFrame(test, columns=X_list_num)
df_test_cat = pd.DataFrame(test, columns=X_list_num+X_list_cat)
df_train['prices'] = target_train
df_train_cat['prices'] = target_train
# Delete rows with a lot of missing values
df_train = df_train.dropna(subset=['num_room'])
df_train_cat = df_train_cat.dropna(subset=['num_room'])
# Fill in missing values by interpolation
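# (linear interpolation fills each gap from the neighboring rows in index order)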
df_train['metro_min_walk'] = \
df_train['metro_min_walk'].interpolate(method='linear')
df_train_cat['metro_min_walk'] = \
df_train_cat['metro_min_walk'].interpolate(method='linear')
df_train['railroad_station_walk_min'] = \
df_train['railroad_station_walk_min'].interpolate(method='linear')
df_train_cat['railroad_station_walk_min'] = \
df_train_cat['railroad_station_walk_min'].interpolate(method='linear')
df_test['metro_min_walk'] = \
df_test['metro_min_walk'].interpolate(method='linear')
df_test_cat['metro_min_walk'] = \
df_test_cat['metro_min_walk'].interpolate(method='linear')
df_test['railroad_station_walk_min'] = \
df_test['railroad_station_walk_min'].interpolate(method='linear')
df_test_cat['railroad_station_walk_min'] = \
df_test_cat['railroad_station_walk_min'].interpolate(method='linear')
# Display the number of rows in the final training set
len(df_train)
hide_code
# Create a dictionary 'Date => Currency rate'
usdrub_pairs = dict(zip(list(macro['timestamp']), list(macro['usdrub'])))
# salary_pairs = dict(zip(list(macro['timestamp']), list(macro['salary'])))
# Replace the data by currency rates in the training and testing sets
df_train['timestamp'].replace(usdrub_pairs,inplace=True)
df_train_cat['timestamp'].replace(usdrub_pairs,inplace=True)
df_test['timestamp'].replace(usdrub_pairs,inplace=True)
df_test_cat['timestamp'].replace(usdrub_pairs,inplace=True)
df_train.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_train_cat.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_test.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_test_cat.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
hide_code
# Display categorical features
separator = '<_>'*38
for df in [df_train_cat, df_test_cat]:
print ('\n', separator)
print('\nsub area')
print('Number of categories:', len(set(df['sub_area'])))
print(set(df['sub_area']))
print('\nID metro')
print('Number of categories:', len(set(df['ID_metro'])))
print(set(df['ID_metro']))
print('\noffice raion')
print('Number of categories:', len(set(df['office_raion'])))
print(set(df['office_raion']))
print('\nsport objects raion')
print('Number of categories:', len(set(df['sport_objects_raion'])))
print(set(df_train_cat['sport_objects_raion']))
print('\nraion popul')
print('Number of categories:', len(set(df['raion_popul'])))
print(set(df['raion_popul']))
print('\nhealthcare centers raion')
print('Number of categories:', len(set(df_train_cat['healthcare_centers_raion'])))
print(set(df['healthcare_centers_raion']))
print('\nschool education centers raion')
print('Number of categories:', len(set(df['school_education_centers_raion'])))
print(set(df['school_education_centers_raion']))
print('\npreschool education centers raion')
print('Number of categories:', len(set(df['preschool_education_centers_raion'])))
print(set(df['preschool_education_centers_raion']))
hide_code
# Find categories that are present in the testing set but missing from the training set
for feature in X_list_cat:
for element in list(set(df_test_cat[feature])):
if element not in list(set(df_train_cat[feature])):
print (feature, element)
hide_code
# Replace categorical values of 'ID_metro' with discrete numbers
ID_metro_cat = pd.factorize(df_train_cat['ID_metro'])
df_train_cat['ID_metro'] = ID_metro_cat[0]
ID_metro_pairs = dict(zip(list(ID_metro_cat[1]), list(set(ID_metro_cat[0]))))
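# The metro ID 224 occurs only in the testing set (see the check above),
# so map it to a new integer code (219) beyond the training categories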
ID_metro_pairs[224] = 219
df_test_cat['ID_metro'].replace(ID_metro_pairs,inplace=True)
hide_code
# Replace values of the other categorical features with discrete numbers
for feature in X_list_cat:
if feature !='ID_metro':
feature_cat = pd.factorize(df_train_cat[feature])
df_train_cat[feature] = feature_cat[0]
feature_pairs = dict(zip(list(feature_cat[1]), list(set(feature_cat[0]))))
df_test_cat[feature].replace(feature_pairs,inplace=True)
hide_code
# Display the result of preprocessing for categorical features
for df in [df_train_cat, df_test_cat]:
print ('\n', separator)
print('\nsub area')
print('Number of categories:', len(set(df['sub_area'])))
print(set(df['sub_area']))
print('\nID metro')
print('Number of categories:', len(set(df['ID_metro'])))
print(set(df['ID_metro']))
print('\noffice raion')
print('Number of categories:', len(set(df['office_raion'])))
print(set(df['office_raion']))
print('\nsport objects raion')
print('Number of categories:', len(set(df['sport_objects_raion'])))
print(set(df_train_cat['sport_objects_raion']))
print('\nraion popul')
print('Number of categories:', len(set(df['raion_popul'])))
print(set(df['raion_popul']))
print('\nhealthcare centers raion')
print('Number of categories:', len(set(df_train_cat['healthcare_centers_raion'])))
print(set(df['healthcare_centers_raion']))
print('\nschool education centers raion')
print('Number of categories:', len(set(df['school_education_centers_raion'])))
print(set(df['school_education_centers_raion']))
print('\npreschool education centers raion')
print('Number of categories:', len(set(df['preschool_education_centers_raion'])))
print(set(df['preschool_education_centers_raion']))
hide_code
# Apply one hot encoding for the training set
df_train_cat1 = df_train_cat
encode = OneHotEncoder(sparse=False)
for column in X_list_cat:
encode.fit(df_train_cat[[column]])
transform = encode.transform(df_train_cat[[column]])
transform = pd.DataFrame(transform,
columns=[(column+"_"+str(i)) for i in df_train_cat[column].value_counts().index])
transform = transform.set_index(df_train_cat.index.values)
df_train_cat1 = pd.concat([df_train_cat1, transform], axis=1)
df_train_cat1 = df_train_cat1.drop(column, axis=1)
hide_code
# Apply one hot encoding for the testing set
df_test_cat1 = df_test_cat
encode = OneHotEncoder(sparse=False)
for column in X_list_cat:
encode.fit(df_test_cat[[column]])
transform = encode.transform(df_test_cat[[column]])
transform = pd.DataFrame(transform,
columns=[(column+"_"+str(i)) for i in df_test_cat[column].value_counts().index])
transform = transform.set_index(df_test_cat.index.values)
df_test_cat1 = pd.concat([df_test_cat1, transform], axis=1)
df_test_cat1 = df_test_cat1.drop(column, axis=1)
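Fitting a separate encoder on each set produces column sets that do not fully match, and the cells below detect and repair this mismatch by hand. A common alternative (not used here; a sketch under the assumption of the same dataframes) is a single encoder fitted on the training data only:
hide_code
# Hypothetical alternative: one encoder shared by both sets;
# handle_unknown='ignore' zero-fills categories unseen during fit
shared_encode = OneHotEncoder(sparse=False, handle_unknown='ignore')
shared_encode.fit(df_train_cat[X_list_cat])
train_enc = shared_encode.transform(df_train_cat[X_list_cat])
test_enc = shared_encode.transform(df_test_cat[X_list_cat])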
hide_code
# Display the example of encoded values
df_train_cat1.iloc[:, 623:636][:3].values
hide_code
# Check these values without one hot encoding
df_train_cat['preschool_education_centers_raion'][:3]
hide_code
# Display the number of features in the training and testing datasets
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
hide_code
print("Features in the train data, but not in the test data:")
for element in list(df_train_cat1):
if element not in list(df_test_cat1):
print(element)
hide_code
print("Features in the test data, but not in the train data:")
for element in list(df_test_cat1):
if element not in list(df_train_cat1):
print(element)
hide_code
# Fill in by zeros the missing columns in the training and testing datasets
for column in ['sub_area_136', 'ID_metro_188', 'ID_metro_205', 'ID_metro_216', 'ID_metro_214',
'ID_metro_183', 'ID_metro_179', 'ID_metro_153', 'ID_metro_217', 'raion_popul_136']:
df_test_cat1[column] = 0
df_train_cat1['ID_metro_219'] = 0
print('Columns with zero values were added.\n')
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
hide_code
# Display the feature correlation with the target
pearson = df_train.corr(method='pearson')
corr_with_prices = pearson.iloc[-1][:-1]
corr_with_prices[abs(corr_with_prices).argsort()[::-1]]
hide_code
# Display the most correlated features
features_list2 = corr_with_prices[abs(corr_with_prices).argsort()[::-1]][:10].index.values.tolist()
print('The most correlated with prices:\n', features_list2)
hide_code
# Display the correlation matrix of features
plt.figure(figsize=(18, 12))
train_corr = df_train.corr()
sns.heatmap(train_corr, cmap=plt.cm.Greens,
xticklabels=train_corr.columns.values,
yticklabels=train_corr.columns.values)
plt.title("Correlation Matrix", fontsize=20);
hide_code
# Create the feature and target arrays
target_train = df_train['prices'].values
features_train = df_train.drop('prices', axis=1).values
features_test = df_test.values
features_train_cat = df_train_cat.drop('prices', axis=1).values
features_test_cat = df_test_cat.values
features_train_cat_enc = df_train_cat1.drop('prices', axis=1).values
features_test_cat_enc = df_test_cat1.values
hide_code
# Split the data
print(separator, '\n\nNumeric Features')
X_train, X_test, y_train, y_test = \
train_test_split(features_train, target_train, test_size = 0.2, random_state = 1)
X_train.shape, X_test.shape
hide_code
# Split the data
print(separator, '\n\nNumeric and Categorical Features')
X_train_cat, X_test_cat, y_train_cat, y_test_cat = \
train_test_split(features_train_cat, target_train, test_size = 0.2, random_state = 1)
X_train_cat.shape, X_test_cat.shape
hide_code
# Split the data
print(separator, '\n\nNumeric and Encoded Categorical Features')
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc, y_test_cat_enc = \
train_test_split(features_train_cat_enc, target_train, test_size = 0.2, random_state = 1)
X_train_cat_enc.shape, X_test_cat_enc.shape
hide_code
# Scale the data
scale_X = RobustScaler()
X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)
scale_y = RobustScaler()
y_train = scale_y.fit_transform(y_train.reshape(-1,1))
y_test = scale_y.transform(y_test.reshape(-1,1))
scale_X_cat = RobustScaler()
X_train_cat = scale_X_cat.fit_transform(X_train_cat)
X_test_cat = scale_X_cat.transform(X_test_cat)
scale_y_cat = RobustScaler()
y_train_cat = scale_y_cat.fit_transform(y_train_cat.reshape(-1,1))
y_test_cat = scale_y_cat.transform(y_test_cat.reshape(-1,1))
scale_X_cat_enc = RobustScaler()
X_train_cat_enc = scale_X_cat_enc.fit_transform(X_train_cat_enc)
X_test_cat_enc = scale_X_cat_enc.transform(X_test_cat_enc)
scale_y_cat_enc = RobustScaler()
y_train_cat_enc = scale_y_cat_enc.fit_transform(y_train_cat_enc.reshape(-1,1))
y_test_cat_enc = scale_y_cat_enc.transform(y_test_cat_enc.reshape(-1,1))
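RobustScaler centers each feature at its median and divides it by the interquartile range, x_scaled = (x - median(x)) / (Q3(x) - Q1(x)), so the heavy price outliers visible in the distribution plot above do not dominate the scale; the fitted scalers can later invert predictions back to rubles via inverse_transform.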
To compare the prediction quality, I chose the regression ensemble algorithms that are the most effective for financial indicators and several types of neural networks: multilayer perceptrons, convolutional and recurrent neural networks. In addition, I was curious what accuracy each of the presented algorithms would achieve and whether the price trends predicted by all of the techniques would coincide.
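One convenient way to view such a comparison (a hypothetical sketch; the prediction arrays are produced in the cells below) is to collect the test scores side by side:
hide_code
# Gather the test R2 scores of several models into one table
results = pd.DataFrame({'model': ['Gradient Boosting', 'Bagging', 'MLP Regressor'],
'r2_test': [r2_score(y_test, y_test_gbr),
r2_score(y_test, y_test_br),
r2_score(y_test, y_test_mlpr)]})
results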
hide_code
# Tuning parameters max_depth & n_estimators
print(separator, '\n\nNumeric Features', '\nGradient Boosting Regressor')
param_grid_gbr = {'max_depth': [3, 4, 5], 'n_estimators': range(36, 361, 36)}
gridsearch_gbr = GridSearchCV(GradientBoostingRegressor(), param_grid_gbr, n_jobs=5)\
.fit(X_train, y_train)
gridsearch_gbr.best_params_
hide_code
# Tuning parameters n_estimators
print ('Bagging Regressor')
param_grid_br = {'n_estimators': range(36, 361, 36)}
gridsearch_br = GridSearchCV(BaggingRegressor(), param_grid_br, n_jobs=5)\
.fit(X_train, y_train)
gridsearch_br.best_params_
hide_code
# Tuning parameters max_depth & n_estimators
print(separator, '\n\nNumeric and Categorical Features', '\nGradient Boosting Regressor')
param_grid_gbr_cat = {'max_depth': [3, 4, 5], 'n_estimators': range(44, 441, 44)}
gridsearch_gbr_cat = GridSearchCV(GradientBoostingRegressor(), param_grid_gbr_cat, n_jobs=5)\
.fit(X_train_cat, y_train_cat)
gridsearch_gbr_cat.best_params_
hide_code
# Tuning parameters n_estimators
print ('Bagging Regressor')
param_grid_br_cat = {'n_estimators': range(44, 441, 44)}
gridsearch_br_cat = GridSearchCV(BaggingRegressor(), param_grid_br_cat, n_jobs=5)\
.fit(X_train_cat, y_train_cat)
gridsearch_br_cat.best_params_
hide_code
# Tuning parameters max_depth & n_estimators
print(separator, '\n\nNumeric and Encoded Categorical Features', '\nGradient Boosting Regressor')
param_grid_gbr_cat_enc = {'max_depth': [3, 4, 5], 'n_estimators': [159, 318, 636]}
gridsearch_gbr_cat_enc = GridSearchCV(GradientBoostingRegressor(), param_grid_gbr_cat_enc, n_jobs=5)\
.fit(X_train_cat_enc, y_train_cat_enc)
gridsearch_gbr_cat_enc.best_params_
hide_code
# Tuning parameters n_estimators
print ('Bagging Regressor')
param_grid_br_cat_enc = {'n_estimators': [159, 318, 636]}
gridsearch_br_cat_enc = GridSearchCV(BaggingRegressor(), param_grid_br_cat_enc, n_jobs=5)\
.fit(X_train_cat_enc, y_train_cat_enc)
gridsearch_br_cat_enc.best_params_
hide_code
# Fit the initial Regressors and display the results
print(separator, '\nNumeric Features')
y_train_gbr, y_test_gbr = regression(GradientBoostingRegressor(),
X_train, X_test, y_train)
y_train_br, y_test_br = regression(BaggingRegressor(),
X_train, X_test, y_train)
scores('GradientBoostingRegressor', y_train, y_test, y_train_gbr, y_test_gbr)
scores('BaggingRegressor', y_train, y_test, y_train_br, y_test_br)
hide_code
# Fit the tuning Regressors and display the results
print(separator, '\nNumeric Features')
y_train_gbr, y_test_gbr = regression(GradientBoostingRegressor(max_depth=4, n_estimators=360),
X_train, X_test, y_train)
y_train_br, y_test_br = regression(BaggingRegressor(n_estimators=360),
X_train, X_test, y_train)
scores('GradientBoostingRegressor', y_train, y_test, y_train_gbr, y_test_gbr)
scores('BaggingRegressor', y_train, y_test, y_train_br, y_test_br)
hide_code
# Display parameters of the regressor
GradientBoostingRegressor(max_depth=4, n_estimators=360).get_params(deep=True)
hide_code
# Display the feature importance
importances = GradientBoostingRegressor(max_depth=4, n_estimators=360)\
.fit(X_train, y_train).feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize = (18, 4.5))
plt.bar(range(X_train.shape[1]), importances[indices],
color="forestgreen", align="center", alpha=0.5)
plt.xlabel("Feature Index")
plt.ylabel("Feature Importance")
plt.xticks(range(X_train.shape[1]), indices)
plt.title("Importance of the Features; Gradient Boosting Regressor");
hide_code
# Fit the initial Regressors and display the results
print(separator, '\nNumeric and Categorical Features')
y_train_cat_gbr, y_test_cat_gbr = \
regression(GradientBoostingRegressor(),
X_train_cat, X_test_cat, y_train_cat)
y_train_cat_br, y_test_cat_br = \
regression(BaggingRegressor(), X_train_cat, X_test_cat, y_train_cat)
scores('GradientBoostingRegressor',
y_train_cat, y_test_cat, y_train_cat_gbr, y_test_cat_gbr)
scores('BaggingRegressor',
y_train_cat, y_test_cat, y_train_cat_br, y_test_cat_br)
hide_code
# Fit the final Regressors and display the results
print(separator, '\nNumeric and Categorical Features')
y_train_cat_gbr, y_test_cat_gbr = \
regression(GradientBoostingRegressor(max_depth=3, n_estimators=396),
X_train_cat, X_test_cat, y_train_cat)
y_train_cat_br, y_test_cat_br = \
regression(BaggingRegressor(n_estimators=308), X_train_cat, X_test_cat, y_train_cat)
scores('GradientBoostingRegressor',
y_train_cat, y_test_cat, y_train_cat_gbr, y_test_cat_gbr)
scores('BaggingRegressor',
y_train_cat, y_test_cat, y_train_cat_br, y_test_cat_br)
hide_code
# Display the feature importance
importances_cat = GradientBoostingRegressor(max_depth=3, n_estimators=396)\
.fit(X_train_cat, y_train_cat).feature_importances_
indices_cat = np.argsort(importances_cat)[::-1]
plt.figure(figsize = (18, 4.5))
plt.bar(range(X_train_cat.shape[1]), importances_cat[indices_cat],
color="forestgreen", align="center", alpha=0.5)
plt.xlabel("Feature Index")
plt.ylabel("Feature Importance")
plt.xticks(range(X_train_cat.shape[1]), indices_cat)
plt.title("Importance of the Features; Gradient Boosting Regressor");
hide_code
# Fit the initial Regressors and display the results
print(separator, '\nNumeric and Encoded Categorical Features')
y_train_cat_enc_gbr, y_test_cat_enc_gbr = \
regression(GradientBoostingRegressor(),
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)
y_train_cat_enc_br, y_test_cat_enc_br = \
regression(BaggingRegressor(),
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)
scores('GradientBoostingRegressor',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_gbr, y_test_cat_enc_gbr)
scores('BaggingRegressor',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_br, y_test_cat_enc_br)
hide_code
# Fit the final Regressors and display the results
print(separator, '\nNumeric and Encoded Categorical Features')
y_train_cat_enc_gbr, y_test_cat_enc_gbr = \
regression(GradientBoostingRegressor(max_depth=4, n_estimators=318),
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)
y_train_cat_enc_br, y_test_cat_enc_br = \
regression(BaggingRegressor(n_estimators=159),
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)
scores('GradientBoostingRegressor',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_gbr, y_test_cat_enc_gbr)
scores('BaggingRegressor',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_br, y_test_cat_enc_br)
hide_code
# Display the feature importance
importances_cat_enc = GradientBoostingRegressor(max_depth=4, n_estimators=318)\
.fit(X_train_cat_enc, y_train_cat_enc).feature_importances_
indices_cat_enc = np.argsort(importances_cat_enc)[::-1][:50]
plt.figure(figsize = (18, 4.5))
plt.bar(range(50), importances_cat_enc[indices_cat_enc],
color="forestgreen", align="center", alpha=0.5)
plt.xlabel("Feature Index")
plt.ylabel("Feature Importance")
plt.xticks(range(50), indices_cat_enc)
plt.title("Importance of the Features; Gradient Boosting Regressor");
hide_code
# Fit the initial MLPRegressor and display the results
mlpr = MLPRegressor()
mlpr.fit(X_train, y_train)
y_train_mlpr = mlpr.predict(X_train)
y_test_mlpr = mlpr.predict(X_test)
print(separator, '\nNumeric Features')
scores('MLP Regressor', y_train, y_test, y_train_mlpr, y_test_mlpr)
hide_code
# Fit the final MLPRegressor and display the results
mlpr = MLPRegressor(hidden_layer_sizes=(360,), max_iter=300,
solver='adam', alpha=0.01)
mlpr.fit(X_train, y_train)
y_train_mlpr = mlpr.predict(X_train)
y_test_mlpr = mlpr.predict(X_test)
print(separator, '\nNumeric Features')
scores('MLP Regressor', y_train, y_test, y_train_mlpr, y_test_mlpr)
hide_code
# Fit the initial MLPRegressor and display the results
mlpr_cat = MLPRegressor()
mlpr_cat.fit(X_train_cat, y_train_cat)
y_train_cat_mlpr = mlpr_cat.predict(X_train_cat)
y_test_cat_mlpr = mlpr_cat.predict(X_test_cat)
print(separator, '\nNumeric and Categorical Features')
scores('MLP Regressor', y_train_cat, y_test_cat, y_train_cat_mlpr, y_test_cat_mlpr)
hide_code
# Fit the final MLPRegressor and display the results
mlpr_cat = MLPRegressor(hidden_layer_sizes=(396,), max_iter=300,
solver='adam', alpha=0.01)
mlpr_cat.fit(X_train_cat, y_train_cat)
y_train_cat_mlpr = mlpr_cat.predict(X_train_cat)
y_test_cat_mlpr = mlpr_cat.predict(X_test_cat)
print(separator, '\nNumeric and Categorical Features')
scores('MLP Regressor', y_train_cat, y_test_cat, y_train_cat_mlpr, y_test_cat_mlpr)
hide_code
# Fit the initial MLPRegressor and display the results
mlpr_cat_enc = MLPRegressor()
mlpr_cat_enc.fit(X_train_cat_enc, y_train_cat_enc)
y_train_cat_enc_mlpr = mlpr_cat_enc.predict(X_train_cat_enc)
y_test_cat_enc_mlpr = mlpr_cat_enc.predict(X_test_cat_enc)
print(separator, '\nNumeric and Encoded Categorical Features')
scores('MLP Regressor', y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlpr, y_test_cat_enc_mlpr)
hide_code
# Fit the final MLPRegressor and display the results
mlpr_cat_enc = MLPRegressor(hidden_layer_sizes=(318,), max_iter=150,
solver='lbfgs', alpha=0.01)
mlpr_cat_enc.fit(X_train_cat_enc, y_train_cat_enc)
y_train_cat_enc_mlpr = mlpr_cat_enc.predict(X_train_cat_enc)
y_test_cat_enc_mlpr = mlpr_cat_enc.predict(X_test_cat_enc)
print(separator, '\nNumeric and Encoded Categorical Features')
scores('MLP Regressor', y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlpr, y_test_cat_enc_mlpr)
hide_code
# Plot predictions of the regressors with real values
plt.figure(figsize = (18, 6))
plt.plot(y_test[1:50], color = 'black', label='Real Data')
plt.plot(y_test_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_br[1:50], label='Bagging Regressor')
plt.plot(y_test_mlpr[1:50], label='MLP Regressor')
plt.xlabel("Data Points")
plt.ylabel("Predicted and Real Target Values")
plt.legend()
plt.title("Numeric Features; Regressor Predictions vs Real Data");
hide_code
# Plot predictions of the regressors with real values
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_mlpr[1:50], label='MLP Regressor')
plt.xlabel("Data Points")
plt.ylabel("Predicted and Real Target Values")
plt.legend()
plt.title("Numeric and Categorical Features; Regressor Predictions vs Real Data");
hide_code
# Plot predictions of the regressors with real values
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat_enc[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_enc_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_enc_mlpr[1:50], label='MLP Regressor')
plt.xlabel("Data Points")
plt.ylabel("Predicted and Real Target Values")
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Regressor Predictions vs Real Data");
hide_code
# Create the initial sequential model
def mlp_model():
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=36))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
mlp_model = mlp_model()
# Fit the model
mlp_history = mlp_model.fit(X_train, y_train,
validation_data=(X_test, y_test),
epochs=10, batch_size=128, verbose=0)
hide_code
# Create predictions
y_train_mlp = mlp_model.predict(X_train)
y_test_mlp = mlp_model.predict(X_test)
# Display initial metrics
print(separator, '\nNumeric Features')
scores('MLP Initial Model', y_train, y_test, y_train_mlp, y_test_mlp)
hide_code
# Create the sequential model
def mlp_model():
model = Sequential()
model.add(Dense(1024, activation='relu', input_dim=36))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
return model
mlp_model = mlp_model()
# Create the checkpointer for saving the best results
mlp_checkpointer = ModelCheckpoint(filepath='weights.best.mlp.hdf5',
verbose=2, save_best_only=True)
# Fit the model
mlp_history = mlp_model.fit(X_train, y_train,
validation_data=(X_test, y_test),
epochs=10, batch_size=128, verbose=0,
callbacks=[mlp_checkpointer])
hide_code
# Plot the fitting history
history_plot(mlp_history)
hide_code
# Load the best model results
mlp_model.load_weights('weights.best.mlp.hdf5')
# Create predictions
y_train_mlp = mlp_model.predict(X_train)
y_test_mlp = mlp_model.predict(X_test)
# Save the model
mlp_model.save('mlp_model_p6.h5')
# Display metrics
print(separator, '\nNumeric Features')
scores('MLP Model', y_train, y_test, y_train_mlp, y_test_mlp)
hide_code
# Create the initial sequential model
def mlp_cat_model():
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=44))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
mlp_cat_model = mlp_cat_model()
# Fit the model
mlp_cat_history = mlp_cat_model.fit(X_train_cat, y_train_cat,
validation_data=(X_test_cat, y_test_cat),
epochs=10, batch_size=128, verbose=0)
hide_code
# Create predictions
y_train_cat_mlp = mlp_cat_model.predict(X_train_cat)
y_test_cat_mlp = mlp_cat_model.predict(X_test_cat)
# Display initial metrics
print(separator, '\nNumeric Features and Categorical Features')
scores('MLP Initial Model', y_train_cat, y_test_cat, y_train_cat_mlp, y_test_cat_mlp)
hide_code
# Create the sequential model
def mlp_cat_model():
model = Sequential()
model.add(Dense(1024, activation='relu', input_dim=44))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
return model
mlp_cat_model = mlp_cat_model()
# Create the checkpointer for saving the best results
mlp_cat_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_cat.hdf5',
verbose=2, save_best_only=True)
# Fit the model
mlp_cat_history = mlp_cat_model.fit(X_train_cat, y_train_cat,
validation_data=(X_test_cat, y_test_cat),
epochs=10, batch_size=128, verbose=0,
callbacks=[mlp_cat_checkpointer])
hide_code
# Plot the history
history_plot(mlp_cat_history)
hide_code
# Load the best model results
mlp_cat_model.load_weights('weights.best.mlp_cat.hdf5')
# Create predictions
y_train_cat_mlp = mlp_cat_model.predict(X_train_cat)
y_test_cat_mlp = mlp_cat_model.predict(X_test_cat)
# Save the model
mlp_cat_model.save('mlp_cat_model_p6.h5')
# Display metrics
print(separator, '\nNumeric and Categorical Features')
scores('MLP Model',
y_train_cat, y_test_cat, y_train_cat_mlp, y_test_cat_mlp)
hide_code
# Create the initial sequential model
def mlp_cat_enc_model():
model = Sequential()
model.add(Dense(1024, activation='relu', input_dim=636))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
mlp_cat_enc_model = mlp_cat_enc_model()
# Fit the model
mlp_cat_enc_history = mlp_cat_enc_model.fit(X_train_cat_enc, y_train_cat_enc,
validation_data=(X_test_cat_enc, y_test_cat_enc),
epochs=10, batch_size=128, verbose=0)
hide_code
# Create predictions
y_train_cat_enc_mlp = mlp_cat_enc_model.predict(X_train_cat_enc)
y_test_cat_enc_mlp = mlp_cat_enc_model.predict(X_test_cat_enc)
# Display initial metrics
print(separator, '\nNumeric Features and Encoded Categorical Features')
scores('MLP Initial Model', y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlp, y_test_cat_enc_mlp)
hide_code
# Create the sequential model
def mlp_cat_enc_model():
model = Sequential()
model.add(Dense(159, activation='relu', input_dim=636))
model.add(Dense(159, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(318, activation='relu'))
model.add(Dense(318, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(636, activation='relu'))
model.add(Dense(636, activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
mlp_cat_enc_model = mlp_cat_enc_model()
# Create the checkpointer for saving the best results
mlp_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_cat_enc.hdf5',
verbose=2, save_best_only=True)
# Fit the model
mlp_cat_enc_history = mlp_cat_enc_model.fit(X_train_cat_enc, y_train_cat_enc,
validation_data=(X_test_cat_enc, y_test_cat_enc),
epochs=10, batch_size=128, verbose=0,
callbacks=[mlp_cat_enc_checkpointer])
hide_code
# Plot the fitting history
history_plot(mlp_cat_enc_history)
hide_code
# Load the best model results
mlp_cat_enc_model.load_weights('weights.best.mlp_cat_enc.hdf5')
# Create predictions
y_train_cat_enc_mlp = mlp_cat_enc_model.predict(X_train_cat_enc)
y_test_cat_enc_mlp = mlp_cat_enc_model.predict(X_test_cat_enc)
# Save the model
mlp_cat_enc_model.save('mlp_cat_enc_model_p6.h5')
# Display metrics
print(separator, '\nNumeric and Encoded Categorical Features')
scores('MLP Model',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlp, y_test_cat_enc_mlp)
hide_code
# Create the initial sequential model
def cnn_model():
model = Sequential()
model.add(Conv1D(36, 3, padding='valid', activation='relu', input_shape=(36, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation='relu', kernel_initializer='normal',))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
cnn_model = cnn_model()
# Fit the model
cnn_history = cnn_model.fit(X_train.reshape(-1, 36, 1), y_train,
epochs=30, batch_size=128, verbose=0,
validation_data=(X_test.reshape(-1, 36, 1), y_test))
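Conv1D expects input of shape (samples, steps, channels), so each row of 36 scaled features is fed to the network as a 36-step sequence with a single channel:
hide_code
# Shape check for the Conv1D input: (samples, steps, channels)
print(X_train.shape, '->', X_train.reshape(-1, 36, 1).shape)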
hide_code
# Create predictions
y_train_cnn = cnn_model.predict(X_train.reshape(-1, 36, 1))
y_test_cnn = cnn_model.predict(X_test.reshape(-1, 36, 1))
# Display initial metrics
print(separator, '\nNumeric Features')
scores('CNN Initial Model', y_train, y_test, y_train_cnn, y_test_cnn)
hide_code
# Create the sequential model
def cnn_model():
model = Sequential()
model.add(Conv1D(36, 3, padding='valid', activation='relu', input_shape=(36, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(512, activation='relu', kernel_initializer='normal',))
model.add(Dropout(0.5))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
cnn_model = cnn_model()
# Create the checkpointer for saving the best results
cnn_checkpointer = ModelCheckpoint(filepath='weights.best.cnn.hdf5',
verbose=2, save_best_only=True)
# Fit the model
cnn_history = cnn_model.fit(X_train.reshape(-1, 36, 1), y_train,
epochs=30, batch_size=128, verbose=0, callbacks=[cnn_checkpointer],
validation_data=(X_test.reshape(-1, 36, 1), y_test))
hide_code
# Plot the fitting history
history_plot(cnn_history)
hide_code
# Load the best model results
cnn_model.load_weights('weights.best.cnn.hdf5')
# Create predictions
y_train_cnn = cnn_model.predict(X_train.reshape(-1, 36, 1))
y_test_cnn = cnn_model.predict(X_test.reshape(-1, 36, 1))
# Save the model
cnn_model.save('cnn_model_p6.h5')
# Display metrics
print(separator, '\nNumeric Features')
scores('CNN Model', y_train, y_test, y_train_cnn, y_test_cnn)
hide_code
# Display the model description
cnn_model.summary()
hide_code
# Display the example of the model architecture
SVG(model_to_dot(cnn_model).create(prog='dot', format='svg'))
hide_code
# Create the initial sequential model
def cnn_cat_model():
model = Sequential()
model.add(Conv1D(44, 3, padding='valid', activation='relu', input_shape=(44, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
cnn_cat_model = cnn_cat_model()
# Fit the model
cnn_cat_history = cnn_cat_model.fit(X_train_cat.reshape(-1, 44, 1), y_train_cat,
epochs=20, batch_size=128, verbose=0,
validation_data=(X_test_cat.reshape(-1, 44, 1), y_test_cat))
hide_code
# Create predictions
y_train_cat_cnn = cnn_cat_model.predict(X_train_cat.reshape(-1, 44, 1))
y_test_cat_cnn = cnn_cat_model.predict(X_test_cat.reshape(-1, 44, 1))
# Display initial metrics
print(separator, '\nNumeric and Categorical Features')
scores('CNN Initial Model', y_train_cat, y_test_cat, y_train_cat_cnn, y_test_cat_cnn)
hide_code
# Create the sequential model
def cnn_cat_model():
model = Sequential()
model.add(Conv1D(44, 3, padding='valid', activation='relu', input_shape=(44, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(256, activation='relu', kernel_initializer='normal',))
model.add(Dropout(0.5))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
cnn_cat_model = cnn_cat_model()
# Create the checkpointer for saving the best results
cnn_cat_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_cat.hdf5',
verbose=2, save_best_only=True)
# Fit the model
cnn_cat_history = cnn_cat_model.fit(X_train_cat.reshape(-1, 44, 1), y_train_cat,
epochs=20, batch_size=128, verbose=0, callbacks=[cnn_cat_checkpointer],
validation_data=(X_test_cat.reshape(-1, 44, 1), y_test_cat))
hide_code
# Plot the fitting history
history_plot(cnn_cat_history)
hide_code
# Load the best model results
cnn_cat_model.load_weights('weights.best.cnn_cat.hdf5')
# Create predictions
y_train_cat_cnn = cnn_cat_model.predict(X_train_cat.reshape(-1, 44, 1))
y_test_cat_cnn = cnn_cat_model.predict(X_test_cat.reshape(-1, 44, 1))
# Save the model
cnn_cat_model.save('cnn_cat_model_p6.h5')
# Display metrics
print(separator, '\nNumeric and Categorical Features')
scores('CNN Model',
y_train_cat, y_test_cat, y_train_cat_cnn, y_test_cat_cnn)
hide_code
# Create the initial sequential model
def cnn_cat_enc_model():
model = Sequential()
model.add(Conv1D(159, 3, padding='valid', activation='relu', input_shape=(636, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
cnn_cat_enc_model = cnn_cat_enc_model()
# Fit the model
cnn_cat_enc_history = \
cnn_cat_enc_model.fit(X_train_cat_enc.reshape(-1, 636, 1), y_train_cat_enc,
epochs=10, batch_size=128, verbose=2,
validation_data=(X_test_cat_enc.reshape(-1, 636, 1), y_test_cat_enc))
hide_code
# Create predictions
y_train_cat_enc_cnn = cnn_cat_enc_model.predict(X_train_cat_enc.reshape(-1, 636, 1))
y_test_cat_enc_cnn = cnn_cat_enc_model.predict(X_test_cat_enc.reshape(-1, 636, 1))
# Display initial metrics
print(separator, '\nNumeric and Encoded Categorical Features')
scores('CNN Initial Model', y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_cnn, y_test_cat_enc_cnn)
hide_code
# Create the sequential model
def cnn_cat_enc_model():
model = Sequential()
model.add(Conv1D(159, 3, padding='valid', activation='relu', input_shape=(636, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(512, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
return model
cnn_cat_enc_model = cnn_cat_enc_model()
# Create the checkpointer for saving the best results
cnn_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_cat_enc.hdf5',
verbose=2, save_best_only=True)
# Fit the model
cnn_cat_enc_history = \
cnn_cat_enc_model.fit(X_train_cat_enc.reshape(-1, 636, 1), y_train_cat_enc,
epochs=10, batch_size=128, verbose=0, callbacks=[cnn_cat_enc_checkpointer],
validation_data=(X_test_cat_enc.reshape(-1, 636, 1), y_test_cat_enc))
hide_code
# Plot the fitting history
history_plot(cnn_cat_enc_history)
hide_code
# Load the best model results
cnn_cat_enc_model.load_weights('weights.best.cnn_cat_enc.hdf5')
# Create predictions
y_train_cat_enc_cnn = cnn_cat_enc_model.predict(X_train_cat_enc.reshape(-1, 636, 1))
y_test_cat_enc_cnn = cnn_cat_enc_model.predict(X_test_cat_enc.reshape(-1, 636, 1))
# Save the model
cnn_cat_enc_model.save('cnn_cat_enc_model_p6.h5')
# Display metrics
print(separator, '\nNumeric and Encoded Categorical Features')
scores('CNN Model',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_cnn, y_test_cat_enc_cnn)
hide_code
# Create the initial sequential model
def rnn_model():
model = Sequential()
model.add(LSTM(36, return_sequences=False, input_shape=(1, 36)))
model.add(Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
return model
rnn_model = rnn_model()
# Fit the model
rnn_history = rnn_model.fit(X_train.reshape(-1, 1, 36), y_train.reshape(-1),
validation_data=(X_test.reshape(-1, 1, 36), y_test.reshape(-1)),
epochs=10, batch_size=128, verbose=0)
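The LSTM layers expect input of shape (samples, timesteps, features); here every observation is treated as a one-timestep sequence carrying all 36 features:
hide_code
# Shape check for the LSTM input: (samples, timesteps, features)
print(X_train.shape, '->', X_train.reshape(-1, 1, 36).shape)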
hide_code
# Create predictions
y_train_rnn = rnn_model.predict(X_train.reshape(-1, 1, 36))
y_test_rnn = rnn_model.predict(X_test.reshape(-1, 1, 36))
# Display initial metrics
print(separator, '\nNumeric Features')
scores('RNN Initial Model', y_train, y_test, y_train_rnn, y_test_rnn)
hide_code
# Create the sequential model
def rnn_model():
model = Sequential()
model.add(LSTM(144, return_sequences=True, input_shape=(1, 36)))
model.add(LSTM(144, return_sequences=True))
model.add(LSTM(144, return_sequences=False))
model.add(Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
return model
rnn_model = rnn_model()
# Create the checkpointer for saving the best results
rnn_checkpointer = ModelCheckpoint(filepath='weights.best.rnn.hdf5',
verbose=2, save_best_only=True)
# Fit the model
rnn_history = rnn_model.fit(X_train.reshape(-1, 1, 36), y_train.reshape(-1),
epochs=10, verbose=0, callbacks=[rnn_checkpointer],
validation_data=(X_test.reshape(-1, 1, 36), y_test.reshape(-1)))
hide_code
# Plot the fitting history
history_plot(rnn_history)
hide_code
# Load the best model results
rnn_model.load_weights('weights.best.rnn.hdf5')
# Create predictions
y_train_rnn = rnn_model.predict(X_train.reshape(-1, 1, 36))
y_test_rnn = rnn_model.predict(X_test.reshape(-1, 1, 36))
# Save the model
rnn_model.save('rnn_model_p6.h5')
# Display metrics
print(separator, '\nNumeric Features')
scores('RNN Model', y_train, y_test, y_train_rnn, y_test_rnn)
hide_code
# Create the initial sequential model
def rnn_cat_model():
model = Sequential()
model.add(LSTM(44, return_sequences=False, input_shape=(1, 44)))
model.add(Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
return model
rnn_cat_model = rnn_cat_model()
# Fit the model
rnn_cat_history = rnn_cat_model.fit(X_train_cat.reshape(-1, 1, 44), y_train_cat.reshape(-1),
validation_data=(X_test_cat.reshape(-1, 1, 44), y_test_cat.reshape(-1)),
epochs=10, batch_size=128, verbose=0)
hide_code
# Create predictions
y_train_cat_rnn = rnn_cat_model.predict(X_train_cat.reshape(-1, 1, 44))
y_test_cat_rnn = rnn_cat_model.predict(X_test_cat.reshape(-1, 1, 44))
# Display initial metrics
print(separator, '\nNumeric and Categorical Features')
scores('RNN Initial Model', y_train_cat, y_test_cat, y_train_cat_rnn, y_test_cat_rnn)
hide_code
# Create the sequential model
def rnn_cat_model():
model = Sequential()
model.add(LSTM(156, return_sequences=True, input_shape=(1, 44)))
model.add(LSTM(624, return_sequences=False))
model.add(Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
return model
rnn_cat_model = rnn_cat_model()
# Create the checkpointer for saving the best results
rnn_cat_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_cat.hdf5',
verbose=2, save_best_only=True)
# Fit the model
rnn_cat_history = rnn_cat_model.fit(X_train_cat.reshape(-1, 1, 44), y_train_cat.reshape(-1),
epochs=10, verbose=0, callbacks=[rnn_cat_checkpointer],
validation_data=(X_test_cat.reshape(-1, 1, 44), y_test_cat.reshape(-1)))
hide_code
# Plot the fitting history
history_plot(rnn_cat_history)
hide_code
# Load the best model results
rnn_cat_model.load_weights('weights.best.rnn_cat.hdf5')
# Create predictions
y_train_cat_rnn = rnn_cat_model.predict(X_train_cat.reshape(-1, 1, 44))
y_test_cat_rnn = rnn_cat_model.predict(X_test_cat.reshape(-1, 1, 44))
# Save the model
rnn_cat_model.save('rnn_cat_model_p6.h5')
# Display metrics
print(separator, '\nNumeric and Categorical Features')
scores('RNN Model',
y_train_cat, y_test_cat, y_train_cat_rnn, y_test_cat_rnn)
hide_code
# Create the initial sequential model
def rnn_cat_enc_model():
model = Sequential()
model.add(LSTM(636, return_sequences=False, input_shape=(1, 636)))
model.add(Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
return model
rnn_cat_enc_model = rnn_cat_enc_model()
# Fit the model
rnn_cat_enc_history = rnn_cat_enc_model.fit(X_train_cat_enc.reshape(-1, 1, 636),
y_train_cat_enc.reshape(-1),
validation_data=(X_test_cat_enc.reshape(-1, 1, 636),
y_test_cat_enc.reshape(-1)),
epochs=10, batch_size=128, verbose=0)
hide_code
# Create predictions
y_train_cat_enc_rnn = rnn_cat_enc_model.predict(X_train_cat_enc.reshape(-1, 1, 636))
y_test_cat_enc_rnn = rnn_cat_enc_model.predict(X_test_cat_enc.reshape(-1, 1, 636))
# Display initial metrics
print(separator, '\nNumeric and Encoded Categorical Features')
scores('RNN Initial Model', y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_rnn, y_test_cat_enc_rnn)
hide_code
# Create the sequential model
def rnn_cat_enc_model():
model = Sequential()
model.add(LSTM(159, return_sequences=True, input_shape=(1, 636)))
model.add(LSTM(636, return_sequences=False))
model.add(Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
return model
rnn_cat_enc_model = rnn_cat_enc_model()
# Create the checkpointer for saving the best results
rnn_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_cat_enc.hdf5',
verbose=2, save_best_only=True)
# Fit the model
rnn_cat_enc_history = \
rnn_cat_enc_model.fit(X_train_cat_enc.reshape(-1, 1, 636), y_train_cat_enc.reshape(-1),
epochs=10, verbose=0, callbacks=[rnn_cat_enc_checkpointer],
validation_data=(X_test_cat_enc.reshape(-1, 1, 636), y_test_cat_enc.reshape(-1)))
hide_code
# Plot the fitting history
history_plot(rnn_cat_enc_history)
hide_code
# Load the best model results
rnn_cat_enc_model.load_weights('weights.best.rnn_cat_enc.hdf5')
# Create predictions
y_train_cat_enc_rnn = rnn_cat_enc_model.predict(X_train_cat_enc.reshape(-1, 1, 636))
y_test_cat_enc_rnn = rnn_cat_enc_model.predict(X_test_cat_enc.reshape(-1, 1, 636))
# Save the model
rnn_cat_enc_model.save('rnn_cat_enc_model_p6.h5')
# Display metrics
print(separator, '\nNumeric and Encoded Categorical Features')
scores('RNN Model',
y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_rnn, y_test_cat_enc_rnn)
hide_code
# Plot predicted values and real data points
plt.figure(figsize = (18, 6))
plt.plot(y_test[1:50], color = 'black', label='Real Data')
plt.plot(y_test_mlp[1:50], label='MLP')
plt.plot(y_test_cnn[1:50], label='CNN')
plt.plot(y_test_rnn[1:50], label='RNN')
plt.xlabel("Data Points")
plt.ylabel("Predicted and Real Target Values")
plt.legend()
plt.title("Numeric Features; Neural Network Predictions vs Real Data");
hide_code
# Plot predicted values and real data points
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_mlp[1:50], label='MLP')
plt.plot(y_test_cat_cnn[1:50], label='CNN')
plt.plot(y_test_cat_rnn[1:50], label='RNN')
plt.xlabel("Data Points")
plt.ylabel("Predicted and Real Target Values")
plt.legend()
plt.title("Numeric and Categorical Features; Neural Network Predictions vs Real Data");
hide_code
# Plot predicted values and real data points
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat_enc[1:50], color = 'black', label='Real Data')
plt.plot(y_test_cat_enc_mlp[1:50], label='MLP')
plt.plot(y_test_cat_enc_cnn[1:50], label='CNN')
plt.plot(y_test_cat_enc_rnn[1:50], label='RNN')
plt.xlabel("Data Points")
plt.ylabel("Predicted and Real Target Values")
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Neural Network Predictions vs Real Data");
To evaluate the models, I used five metrics: the explained variance regression score, the coefficient of determination (R2), the mean squared error, the mean absolute error, and the median absolute error. These metrics capture different properties of the prediction performance: how well the model explains the variance of the target and how far the predictions are from the real values. Comparing several indicators at once allows us to choose the best algorithm.
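For reference, with true values $y_i$, predictions $\hat{y}_i$, target mean $\bar{y}$, and $n$ data points, these metrics are defined as
$EV = 1 - \frac{Var(y - \hat{y})}{Var(y)}$, $R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$, $MSE = \frac{1}{n}\sum_i (y_i - \hat{y}_i)^2$, $MAE = \frac{1}{n}\sum_i |y_i - \hat{y}_i|$, and $MdAE = \mathrm{median}_i\,|y_i - \hat{y}_i|$.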
hide_code
# Scale the whole dataset
target_scale = RobustScaler()
s_target_train = target_scale.fit_transform(target_train.reshape(-1,1))
######################################################################################
feature_scale = RobustScaler()
s_features_train = feature_scale.fit_transform(features_train)
s_features_test = feature_scale.transform(features_test)
######################################################################################
feature_cat_scale = RobustScaler()
s_features_train_cat = feature_cat_scale.fit_transform(features_train_cat)
s_features_test_cat = feature_cat_scale.transform(features_test_cat)
######################################################################################
feature_cat_enc_scale = RobustScaler()
s_features_train_cat_enc = feature_cat_enc_scale.fit_transform(features_train_cat_enc)
s_features_test_cat_enc = feature_cat_enc_scale.transform(features_test_cat_enc)
hide_code
# Fit the Regressors
gbr = GradientBoostingRegressor(max_depth=4, n_estimators=360)
gbr.fit(s_features_train, s_target_train)
br = BaggingRegressor(n_estimators=360)
br.fit(s_features_train, s_target_train)
# Create predictions
s_target_train_gbr = gbr.predict(s_features_train)
s_target_test_gbr = gbr.predict(s_features_test)
s_target_train_br = br.predict(s_features_train)
s_target_test_br = br.predict(s_features_test)
s_target_train_mlpr = mlpr.predict(s_features_train)
s_target_test_mlpr = mlpr.predict(s_features_test)
# Display metrics
scores2('Gradient Boosting Regressor', s_target_train, s_target_train_gbr)
scores2('Bagging Regressor', s_target_train, s_target_train_br)
scores2('MLP Regressor', s_target_train, s_target_train_mlpr)
hide_code
# Fit the Regressors
gbr_cat = GradientBoostingRegressor(max_depth=3, n_estimators=396)
gbr_cat.fit(s_features_train_cat, s_target_train)
br_cat = BaggingRegressor(n_estimators=308)
br_cat.fit(s_features_train_cat, s_target_train)
# Create predictions
s_target_train_cat_gbr = gbr_cat.predict(s_features_train_cat)
s_target_test_cat_gbr = gbr_cat.predict(s_features_test_cat)
s_target_train_cat_br = br_cat.predict(s_features_train_cat)
s_target_test_cat_br = br_cat.predict(s_features_test_cat)
s_target_train_cat_mlpr = mlpr_cat.predict(s_features_train_cat)
s_target_test_cat_mlpr = mlpr_cat.predict(s_features_test_cat)
# Display metrics
scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_gbr)
scores2('Bagging Regressor', s_target_train, s_target_train_cat_br)
scores2('MLP Regressor', s_target_train, s_target_train_cat_mlpr)
hide_code
# Fit the Regressors
gbr_cat_enc = GradientBoostingRegressor(max_depth=4, n_estimators=318)
gbr_cat_enc.fit(s_features_train_cat_enc, s_target_train)
br_cat_enc = BaggingRegressor(n_estimators=159)
br_cat_enc.fit(s_features_train_cat_enc, s_target_train)
# Create predictions
s_target_train_cat_enc_gbr = gbr_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_gbr = gbr_cat_enc.predict(s_features_test_cat_enc)
s_target_train_cat_enc_br = br_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_br = br_cat_enc.predict(s_features_test_cat_enc)
s_target_train_cat_enc_mlpr = mlpr_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_mlpr = mlpr_cat_enc.predict(s_features_test_cat_enc)
# Display metrics
scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_enc_gbr)
scores2('Bagging Regressor', s_target_train, s_target_train_cat_enc_br)
scores2('MLP Regressor', s_target_train, s_target_train_cat_enc_mlpr)
hide_code
# Create predictions
s_target_train_mlp = mlp_model.predict(s_features_train)
s_target_test_mlp = mlp_model.predict(s_features_test)
s_target_train_cnn = cnn_model.predict(s_features_train.reshape(-1, 36, 1))
s_target_test_cnn = cnn_model.predict(s_features_test.reshape(-1, 36, 1))
s_target_train_rnn = rnn_model.predict(s_features_train.reshape(-1, 1, 36))
s_target_test_rnn = rnn_model.predict(s_features_test.reshape(-1, 1, 36))
# Display metrics
scores2('MLP', s_target_train, s_target_train_mlp)
scores2('CNN', s_target_train, s_target_train_cnn)
scores2('RNN', s_target_train, s_target_train_rnn)
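The two reshapes above differ because Keras Conv1D layers expect input of shape (samples, steps, channels), while LSTM layers expect (samples, timesteps, features). A quick check for the 36 numeric features:
hide_code
# Shape check for the neural network inputs.
print(s_features_train.reshape(-1, 36, 1).shape)  # Conv1D: (samples, steps, channels)
print(s_features_train.reshape(-1, 1, 36).shape)  # LSTM: (samples, timesteps, features)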
hide_code
# Create predictions
s_target_train_cat_mlp = mlp_cat_model.predict(s_features_train_cat)
s_target_test_cat_mlp = mlp_cat_model.predict(s_features_test_cat)
s_target_train_cat_cnn = cnn_cat_model.predict(s_features_train_cat.reshape(-1, 44, 1))
s_target_test_cat_cnn = cnn_cat_model.predict(s_features_test_cat.reshape(-1, 44, 1))
s_target_train_cat_rnn = rnn_cat_model.predict(s_features_train_cat.reshape(-1, 1, 44))
s_target_test_cat_rnn = rnn_cat_model.predict(s_features_test_cat.reshape(-1, 1, 44))
# Display metrics
scores2('MLP', s_target_train, s_target_train_cat_mlp)
scores2('CNN', s_target_train, s_target_train_cat_cnn)
scores2('RNN', s_target_train, s_target_train_cat_rnn)
hide_code
# Create predictions
s_target_train_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_train_cat_enc)
s_target_test_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_test_cat_enc)
s_target_train_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(-1, 636, 1))
s_target_test_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(-1, 636, 1))
s_target_train_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(-1, 1, 636))
s_target_test_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(-1, 1, 636))
# Display metrics
scores2('MLP', s_target_train, s_target_train_cat_enc_mlp)
scores2('CNN', s_target_train, s_target_train_cat_enc_cnn)
scores2('RNN', s_target_train, s_target_train_cat_enc_rnn)
hide_code
# Rescale regressor predictions
target_train_gbr = target_scale.inverse_transform(s_target_train_gbr.reshape(-1,1))
target_test_gbr = target_scale.inverse_transform(s_target_test_gbr.reshape(-1,1))
target_train_br = target_scale.inverse_transform(s_target_train_br.reshape(-1,1))
target_test_br = target_scale.inverse_transform(s_target_test_br.reshape(-1,1))
target_train_mlpr = target_scale.inverse_transform(s_target_train_mlpr.reshape(-1,1))
target_test_mlpr = target_scale.inverse_transform(s_target_test_mlpr.reshape(-1,1))
# Rescale neural network predictions
target_train_mlp = target_scale.inverse_transform(s_target_train_mlp)
target_test_mlp = target_scale.inverse_transform(s_target_test_mlp)
target_train_cnn = target_scale.inverse_transform(s_target_train_cnn)
target_test_cnn = target_scale.inverse_transform(s_target_test_cnn)
target_train_rnn = target_scale.inverse_transform(s_target_train_rnn)
target_test_rnn = target_scale.inverse_transform(s_target_test_rnn)
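Since target_scale was fitted on the training targets, inverse_transform should map the scaled targets back to the original values exactly; a quick sanity check (assuming target_train is a numpy array):
hide_code
# Rescaling the scaled training targets should recover the originals.
print(np.allclose(target_scale.inverse_transform(s_target_train).ravel(),
                  np.asarray(target_train).ravel()))  # expected: True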
hide_code
# Plot predictions and real target values
plt.figure(figsize = (18, 6))
plt.plot(target_train[1:50], color = 'black', label='Real Data')
plt.plot(target_train_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_br[1:50], label='Bagging Regressor')
plt.plot(target_train_mlpr[1:50], label='MLP Regressor')
plt.plot(target_train_mlp[1:50], label='MLP')
plt.plot(target_train_cnn[1:50], label='CNN')
plt.plot(target_train_rnn[1:50], label='RNN')
plt.xlabel("Data Points")
plt.ylabel("Predicted and Real Target Values")
plt.legend()
plt.title("Numeric Features; Train Predictions vs Real Data");
hide_code
# Plot test predictions (no real test targets are available to overlay)
plt.figure(figsize = (18, 6))
plt.plot(target_test_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_br[1:50], label='Bagging Regressor')
plt.plot(target_test_mlpr[1:50], label='MLP Regressor')
plt.plot(target_test_mlp[1:50], label='MLP')
plt.plot(target_test_cnn[1:50], label='CNN')
plt.plot(target_test_rnn[1:50], label='RNN')
plt.xlabel("Data Points")
plt.ylabel("Predicted Target Values")
plt.legend()
plt.title("Numeric Features; Test Predictions");
hide_code
# Rescale regressor predictions
target_train_cat_gbr = target_scale.inverse_transform(s_target_train_cat_gbr.reshape(-1,1))
target_test_cat_gbr = target_scale.inverse_transform(s_target_test_cat_gbr.reshape(-1,1))
target_train_cat_br = target_scale.inverse_transform(s_target_train_cat_br.reshape(-1,1))
target_test_cat_br = target_scale.inverse_transform(s_target_test_cat_br.reshape(-1,1))
target_train_cat_mlpr = target_scale.inverse_transform(s_target_train_cat_mlpr.reshape(-1,1))
target_test_cat_mlpr = target_scale.inverse_transform(s_target_test_cat_mlpr.reshape(-1,1))
# Rescale neural network predictions
target_train_cat_mlp = target_scale.inverse_transform(s_target_train_cat_mlp.reshape(-1,1))
target_test_cat_mlp = target_scale.inverse_transform(s_target_test_cat_mlp.reshape(-1,1))
target_train_cat_cnn = target_scale.inverse_transform(s_target_train_cat_cnn.reshape(-1,1))
target_test_cat_cnn = target_scale.inverse_transform(s_target_test_cat_cnn.reshape(-1,1))
target_train_cat_rnn = target_scale.inverse_transform(s_target_train_cat_rnn.reshape(-1,1))
target_test_cat_rnn = target_scale.inverse_transform(s_target_test_cat_rnn.reshape(-1,1))
hide_code
# Plot predictions and real target values
plt.figure(figsize = (18, 6))
plt.plot(target_train[1:50], color = 'black', label='Real Data')
plt.plot(target_train_cat_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_cat_br[1:50], label='Bagging Regressor')
plt.plot(target_train_cat_mlpr[1:50], label='MLP Regressor')
plt.plot(target_train_cat_mlp[1:50], label='MLP')
plt.plot(target_train_cat_cnn[1:50], label='CNN')
plt.plot(target_train_cat_rnn[1:50], label='RNN')
plt.xlabel("Data Points")
plt.ylabel("Predicted and Real Target Values")
plt.legend()
plt.title("Numeric and Categorical Features; Train Predictions vs Real Data");
hide_code
# Plot test predictions
plt.figure(figsize = (18, 6))
plt.plot(target_test_cat_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_cat_br[1:50], label='Bagging Regressor')
plt.plot(target_test_cat_mlpr[1:50], label='MLP Regressor')
plt.plot(target_test_cat_mlp[1:50], label='MLP')
plt.plot(target_test_cat_cnn[1:50], label='CNN')
plt.plot(target_test_cat_rnn[1:50], label='RNN')
plt.xlabel("Data Points")
plt.ylabel("Predicted Target Values")
plt.legend()
plt.title("Numeric and Categorical Features; Test Predictions");
hide_code
# Rescale regressor predictions
target_train_cat_enc_gbr = target_scale.inverse_transform(s_target_train_cat_enc_gbr.reshape(-1,1))
target_test_cat_enc_gbr = target_scale.inverse_transform(s_target_test_cat_enc_gbr.reshape(-1,1))
target_train_cat_enc_br = target_scale.inverse_transform(s_target_train_cat_enc_br.reshape(-1,1))
target_test_cat_enc_br = target_scale.inverse_transform(s_target_test_cat_enc_br.reshape(-1,1))
target_train_cat_enc_mlpr = target_scale.inverse_transform(s_target_train_cat_enc_mlpr.reshape(-1,1))
target_test_cat_enc_mlpr = target_scale.inverse_transform(s_target_test_cat_enc_mlpr.reshape(-1,1))
# Rescale neural network predictions
target_train_cat_enc_mlp = target_scale.inverse_transform(s_target_train_cat_enc_mlp.reshape(-1,1))
target_test_cat_enc_mlp = target_scale.inverse_transform(s_target_test_cat_enc_mlp.reshape(-1,1))
target_train_cat_enc_cnn = target_scale.inverse_transform(s_target_train_cat_enc_cnn.reshape(-1,1))
target_test_cat_enc_cnn = target_scale.inverse_transform(s_target_test_cat_enc_cnn.reshape(-1,1))
target_train_cat_enc_rnn = target_scale.inverse_transform(s_target_train_cat_enc_rnn.reshape(-1,1))
target_test_cat_enc_rnn = target_scale.inverse_transform(s_target_test_cat_enc_rnn.reshape(-1,1))
hide_code
# Plot predictions and real target values
plt.figure(figsize = (18, 6))
plt.plot(target_train[1:50], color = 'black', label='Real Data')
plt.plot(target_train_cat_enc_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(target_train_cat_enc_mlpr[1:50], label='MLP Regressor')
plt.plot(target_train_cat_enc_mlp[1:50], label='MLP')
plt.plot(target_train_cat_enc_cnn[1:50], label='CNN')
plt.plot(target_train_cat_enc_rnn[1:50], label='RNN')
plt.xlabel("Data Points")
plt.ylabel("Predicted and Real Target Values")
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Train Predictions vs Real Data");
hide_code
# Plot test predictions
plt.figure(figsize = (18, 6))
plt.plot(target_test_cat_enc_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(target_test_cat_enc_mlpr[1:50], label='MLP Regressor')
plt.plot(target_test_cat_enc_mlp[1:50], label='MLP')
plt.plot(target_test_cat_enc_cnn[1:50], label='CNN')
plt.plot(target_test_cat_enc_rnn[1:50], label='RNN')
plt.xlabel("Data Points")
plt.ylabel("Predicted Target Values")
plt.legend()
plt.title("Numeric and Encoded Categorical Features; Test Predictions");
The project was built on the basis of a competition hosted on https://www.kaggle.com.
The competition version of this notebook is available here: https://www.kaggle.com/olgabelitskaya/sberbank-russian-housing-market.
Several popular libraries (numpy, pandas, matplotlib, scikit-learn, and keras) were used to build the regression models.
The most valuable side of this project is the investigation of real data and the attempt to push the predictions toward the 0.7-0.8 range for the coefficient of determination.