import warnings
warnings.filterwarnings('ignore')
%pylab inline
%matplotlib inline
%load_ext rpy2.ipython
%R library(devtools)
%R library(ggplot2)
%R library(corrplot)
%R library(repr)
import numpy as np
import pandas as pd
import pickle
import sys
import scipy
import matplotlib
import pylab
from IPython.display import HTML
matplotlib.style.reload_library()
import matplotlib.pyplot as plt
from functools import partial
from sklearn.preprocessing import Imputer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
# Marker variable: the JavaScript below hides every input cell whose HTML
# mentions the name `hide_code`, so cells opt in simply by evaluating it.
hide_code = ''
# Render a toggle button. code_display() either hides the first input cell plus
# every input cell containing 'hide_code' (and makes the output prompts
# transparent), or shows all input cells again; it flips code_show on each call
# and is run once when the document is ready.
HTML('''<script>
code_show = true;
function code_display() {
if (code_show) {
$('div.input').each(function(id) {
if (id == 0 || $(this).html().indexOf('hide_code') > -1) {
$(this).hide();
}
});
$('div.output_prompt').css('opacity', 0);
} else {
$('div.input').each(function(id) {
$(this).show();
});
$('div.output_prompt').css('opacity', 1);
}
code_show = !code_show
}
$( document ).ready(code_display);
</script>
<form action="javascript: code_display()"><input style="opacity: 100" type="submit"
value="Click to show or to hide code cells"></form>''')
To show or hide the code cells containing programs and functions, use the "Click to show or to hide code cells" button at the top.
hide_code
# function for reading a dictionary as a dataframe
def dict_to_dataframe(dictionary):
    """Turn a dict of per-person feature dicts into a DataFrame.

    Each outer key becomes one row; the outer keys themselves end up in the
    first column, which is renamed to 'staff_name'.

    Values are left exactly as stored (including the 'NaN' string markers).
    The original body called ``df.apply(pd.to_numeric, errors='ignore')`` but
    discarded the result, so it never converted anything; the call is dropped
    rather than assigned because the rest of the notebook filters on the
    literal 'NaN' strings.

    Args:
        dictionary: mapping of name -> {feature: value}.

    Returns:
        pandas.DataFrame with a 'staff_name' column followed by the features.
    """
    df = pd.DataFrame.from_dict(dictionary).transpose()
    # Move the former index (the outer keys) into the first column.
    df.reset_index(level=0, inplace=True)
    columns = list(df.columns)
    columns[0] = 'staff_name'
    df.columns = columns
    return df
hide_code
# function for counting 'NaN' values without replacing
def count_nan(column):
    """Count the 'NaN' string markers in *column*.

    Args:
        column: sized iterable of values (e.g. a list or a pandas Series).

    Returns:
        (k, p): the number of entries equal to 'NaN' and that number as a
        percentage of len(column). An empty column yields (0, 0.0) instead
        of raising ZeroDivisionError.
    """
    total = len(column)
    # len() of a filtered list is used instead of sum(): the notebook runs
    # `%pylab inline`, which may shadow the builtin sum with numpy's.
    k = len([value for value in column if value == 'NaN'])
    p = 100.0 * k / total if total else 0.0
    return k, p
hide_code
# function for cleaning 'NaN' values without replacing
def column_without_nan(column):
    """Return the entries of *column* as a list, leaving out 'NaN' strings."""
    return [entry for entry in column if not entry == 'NaN']
hide_code
# function for cleaning 'NaN' values with replacing
def column_with_npnan(column):
    """Return *column* as a numpy array with every 'NaN' string swapped for np.nan."""
    replaced = [np.nan if entry == 'NaN' else entry for entry in column]
    return np.array(replaced)
hide_code
# function for displaying 3 top values
def show_three_top(data, feature):
print "three largest", feature, ":"
sorted_list = sorted(column_without_nan(data[feature]), reverse=True)[0:3]
return sorted_list
You can see the set of useful functions from the Udacity course "INTRO TO MACHINE LEARNING" after clicking the button above.
hide_code
def featureFormat( dictionary, features, remove_NaN=True,
                   remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """Convert a dictionary of per-key feature dicts into a 2-D numpy array.

    Args:
        dictionary: mapping of key -> {feature name: value}.
        features: list of feature names; one output column per name, in order.
            If the first name is 'poi' it is treated as the label and is
            ignored by the zero-filtering below.
        remove_NaN: when True, the string "NaN" is replaced by 0.
        remove_all_zeroes: when True, drop rows whose (non-label) features
            are all 0.
        remove_any_zeroes: when True, drop rows with any 0 among the
            (non-label) features.
        sort_keys: True to process keys in sorted order; a string is taken as
            the path of a pickle file holding the key order; False keeps the
            dictionary's own order.

    Returns:
        numpy array with one row per kept key, or None (after printing an
        error) as soon as a requested key/feature lookup raises KeyError.
    """
    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        # NOTE(review): unpickling runs code from the file - only pass trusted paths.
        # The context manager closes the handle the original left open.
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    # exclude 'poi' class as criteria (guarded so an empty feature list
    # no longer raises IndexError).
    skip_label = len(features) > 0 and features[0] == 'poi'

    return_list = []
    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                # Single-string form prints the same text under Python 2 and 3.
                print("error: key  %s  not present" % (feature,))
                return
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append(float(value))

        if skip_label:
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list

        # Logic for deciding whether or not to add the data point.
        append = True
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = False
            for item in test_list:
                if item != 0 and item != "NaN":
                    append = True
                    break
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes:
            if 0 in test_list or "NaN" in test_list:
                append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append(np.array(tmp_list))

    return np.array(return_list)
hide_code
def targetFeatureSplit( data ):
    """Split rows into labels and feature vectors.

    The first element of every row is collected into the target list and the
    remainder of the row into the features list; both lists keep row order.
    """
    pairs = [(row[0], row[1:]) for row in data]
    target = [label for label, _rest in pairs]
    features = [rest for _label, rest in pairs]
    return target, features
hide_code
# Template for the metric line; `display_precision` sets the decimals shown.
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)
# Template for the raw confusion-matrix counts.
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)
# Pickle files used to hand the classifier, dataset and feature list around.
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"
hide_code
def test_classifier(clf, dataset, feature_list, folds = 1000):
data = featureFormat(dataset, feature_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
true_negatives = 0
false_negatives = 0
true_positives = 0
false_positives = 0
for train_idx, test_idx in cv:
features_train = []
features_test = []
labels_train = []
labels_test = []
for ii in train_idx:
features_train.append( features[ii] )
labels_train.append( labels[ii] )
for jj in test_idx:
features_test.append( features[jj] )
labels_test.append( labels[jj] )
### fit the classifier using training set, and test on test set
clf.fit(features_train, labels_train)
predictions = clf.predict(features_test)
for prediction, truth in zip(predictions, labels_test):
if prediction == 0 and truth == 0:
true_negatives += 1
elif prediction == 0 and truth == 1:
false_negatives += 1
elif prediction == 1 and truth == 0:
false_positives += 1
elif prediction == 1 and truth == 1:
true_positives += 1
else:
print "Warning: Found a predicted label not == 0 or 1."
print "All predictions should take value 0 or 1."
print "Evaluating performance for processed predictions:"
break
try:
total_predictions = true_negatives + false_negatives + false_positives + true_positives
accuracy = 1.0*(true_positives + true_negatives)/total_predictions
precision = 1.0*true_positives/(true_positives+false_positives)
recall = 1.0*true_positives/(true_positives+false_negatives)
f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
print clf
print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
print ""
except:
print "Got a divide by zero when trying out:", clf
print "Precision or recall may be undefined due to a lack of true positive predicitons."
hide_code
def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, dataset and feature list to their fixed filenames.

    Files are opened in binary mode, which the pickle documentation asks for
    and which keeps the output identical across platforms;
    load_classifier_and_data reads them back the same way.
    """
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)
hide_code
def load_classifier_and_data():
    """Load the classifier, dataset and feature list written by dump_classifier_and_data.

    Returns:
        (clf, dataset, feature_list) as unpickled from the three files.
    """
    # NOTE(review): unpickling executes code stored in the file - these files
    # must come from a trusted source.
    with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list
hide_code
def main():
    """Load the saved classifier, dataset and feature list, then run the tester on them."""
    ### load up student's classifier, dataset, and feature_list
    saved_state = load_classifier_and_data()
    ### Run testing script
    test_classifier(*saved_state)
final_project_dataset.pkl
enron61702insiderpay.pdf
https://jaycode.github.io/enron/identifying-fraud-from-enron-email.html
In 2000, Enron was one of the largest companies in the United States. By 2002, it had collapsed into bankruptcy due to widespread corporate fraud. In the resulting Federal investigation, a significant amount of typically confidential information entered into the public record, including tens of thousands of emails and detailed financial data for top executives. In this project, we will play detective, and put our new skills to use by building a person of interest identifier based on financial and email data made public as a result of the Enron scandal. To assist us in our detective work, the authors have combined this data with a hand-generated list of persons of interest in the fraud case, which means individuals who were indicted, reached a settlement or plea deal with the government, or testified in exchange for prosecution immunity.
The database has been loaded from the file "final_project_dataset.pkl".
hide_code
# Load the project dataset; the context manager closes the file handle that
# the bare open() call used to leave dangling.
# NOTE(review): pickle.load executes code from the file - trusted input only.
with open("final_project_dataset.pkl", "r") as dataset_file:
    enron_data = pickle.load(dataset_file)
hide_code
print "The lenght of the dataset: ", len(enron_data)
hide_code
print "The first element in the dictionary: ", next(enron_data.__iter__())
hide_code
print "The example of features in the dictionary: ", str(enron_data.itervalues().next())
hide_code
print "The length of each element: ", len(enron_data['METTS MARK'])
hide_code
print "The staff list: ", str(sorted(enron_data.keys()))
hide_code
# Get names and count the persons of interest
k_poi=0
poi = []
for i in range(len(enron_data.keys())):
person = enron_data.keys()[i]
if enron_data[person]['poi'] == True:
k_poi += 1
poi.append(person)
print "Persons of interest:", k_poi
print str(poi)
The dataset consists of 146 data points with 21 features. 18 records are labeled as persons of interest.
financial features: ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees'] (all units are in US dollars).
email features: ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi'] (units are generally number of emails messages; notable exception is ‘email_address’, which is a text string).
POI label: [‘poi’] (boolean, represented as integer).
(salary) Reflects items such as base salary, executive cash allowances, and benefits payments.
(bonus) Reflects annual cash incentives paid based upon company performance. Also may include other retention payments.
(long_term_incentive) Reflects long-term incentive cash payments from various long-term incentive programs designed to tie executive compensation to long-term success as measured against key performance drivers and business objectives over a multi-year period, generally 3 to 5 years.
(deferred_income) Reflects voluntary executive deferrals of salary, annual cash incentives, and long-term cash incentives as well as cash fees deferred by non-employee directors under a deferred compensation arrangement. May also reflect deferrals under a stock option or phantom stock unit in lieu of cash arrangement.
(deferral_payments) Reflects distributions from a deferred compensation arrangement due to termination of employment or due to in-service withdrawals as per plan provisions.
(loan_advances) Reflects total amount of loan advances, excluding repayments, provided by the Debtor in return for a promise of repayment. In certain instances, the terms of the promissory notes allow for the option to repay with stock of the company.
(other) Reflects items such as payments for severance, consulting services, relocation costs, tax advances and allowances for employees on international assignment (i.e. housing allowances, cost of living allowances, payments under Enron’s Tax Equalization Program, etc.). May also include payments provided with respect to employment agreements, as well as imputed income amounts for such things as use of corporate aircraft.
(expenses) Reflects reimbursements of business expenses. May include fees paid for consulting services.
(director_fees) Reflects cash payments and/or value of stock grants made in lieu of cash payments to non-employee directors.
(exercised_stock_options) Reflects amounts from exercised stock options which equal the market value in excess of the exercise price on the date the options were exercised either through cashless (same-day sale), stock swap or cash exercises. The reflected gain may differ from that realized by the insider due to fluctuations in the market price and the timing of any subsequent sale of the securities.
(restricted_stock) Reflects the gross fair market value of shares and accrued dividends (and/or phantom units and dividend equivalents) on the date of release due to lapse of vesting periods, regardless of whether deferred.
(restricted_stock_deferred) Reflects value of restricted stock voluntarily deferred prior to release under a deferred compensation arrangement.
(total_stock_value) In 1998, 1999 and 2000, Debtor and non-debtor affiliates were charged for options granted. The Black-Scholes method was used to determine the amount to be charged. Any amounts charged to Debtor and non-debtor affiliates associated with the options exercised related to these three years have not been subtracted from the share value amounts shown.
Reading the data and cleaning it.
Exploring and understanding the input data.
Analyzing how best to present the data to the algorithm.
Choosing the right model and algorithm.
Measuring the performance correctly.
Constructing a dataframe from the dictionary is useful for the analysis, so enron_df was created from enron_data. Here is an example of the rows in the dataset.
hide_code
# Build the working dataframe (one row per person) from the dictionary
enron_df = dict_to_dataframe(enron_data)
hide_code
# Show the first five records, transposed so each person is a column
first_records = enron_df.head(5)
first_records.T
import warnings
warnings.filterwarnings('ignore')
This dataset contains entries that hamper analysis: for example, the spreadsheet artifact 'TOTAL' and 'NaN' values in the rows. Visualizing the data as a scatter plot confirms this.
'TOTAL' was not used in the analysis and was therefore removed entirely.
Datasets with 'NaN' values are incompatible with scikit-learn estimators, which assume that all values in an array are numerical. A basic strategy for using such datasets is to discard entire rows and/or columns containing missing values. In our case, however, this would lose data that may be valuable (even though incomplete). A better strategy is to impute the missing values. The Imputer class provides basic strategies for imputing missing values, using the mean, the median or the most frequent value of the row or column in which the missing values are located.
Before deleting:
hide_code
# Keep name and salary for the rows whose salary is not the 'NaN' marker
has_salary = enron_df['salary'] != 'NaN'
salary_name = enron_df.loc[has_salary, ['staff_name', 'salary']]
hide_code
# Scatter the known salaries against their row index, coloured by value
plt.style.use('seaborn-notebook')
plt.figure(figsize=(10,4))
known_salaries = salary_name['salary']
x = list(known_salaries.index)
plt.scatter(x, known_salaries, c=known_salaries, cmap='jet')
hide_code
# Name attached to the largest salary (the outlier)
largest_salary_row = salary_name['salary'].idxmax()
salary_name['staff_name'][largest_salary_row]
hide_code
# Drop the 'TOTAL' record from both the dictionary and the dataframe
del enron_data['TOTAL']
not_total = enron_df['staff_name'] != 'TOTAL'
enron_df = enron_df[not_total]
After deleting:
hide_code
# Rebuild the name/salary table (without 'NaN' salaries) now that the outlier is gone
has_salary = enron_df['salary'] != 'NaN'
salary_name = enron_df.loc[has_salary, ['staff_name', 'salary']]
hide_code
# Same scatter plot as before, after deleting the outlier
plt.figure(figsize=(10,4))
known_salaries = salary_name['salary']
x = list(known_salaries.index)
plt.scatter(x, known_salaries, c=known_salaries, cmap='jet')
Finding the 2 rows with wrong values by creating the artificial variable total_check as the sum of the other financial variables:
hide_code
# Create dataframe with replaced NaN by zero
enron_df1 = pd.DataFrame(enron_df)
enron_df1 = enron_df1.convert_objects(convert_numeric=True)
enron_df1 = enron_df1.fillna(0)
# Create dataframe to check total payments
enron_df2 = pd.DataFrame()
enron_df2['staff_name'] = enron_df1['staff_name']
enron_df2['total_check'] = enron_df1['bonus'] + enron_df1['director_fees'] + enron_df1['deferral_payments'] + \
enron_df1['deferred_income'] + enron_df1['loan_advances'] + enron_df1['long_term_incentive'] + \
enron_df1['expenses'] + enron_df1['other'] + enron_df1['salary']
enron_df2['total_payments'] = enron_df1['total_payments']
enron_df2['stock_check'] = (enron_df1['restricted_stock'] + enron_df1['exercised_stock_options'] + \
enron_df1['restricted_stock_deferred'])
enron_df2['total_stock_value'] = enron_df1['total_stock_value']
enron_df2['same_total'] = (enron_df2['total_check'] == enron_df2['total_payments'])
enron_df2['same_stock'] = (enron_df2['stock_check'] == enron_df2['total_stock_value'])
enron_df2['poi'] = enron_df1['poi']
# Display results of comparing
print np.sum(enron_df2['same_total']), " are the same in total_check and total_payments from ", len(enron_df2)
print "Difference between total_check and total_payments: ", (np.sum(enron_df2['total_payments'])
- np.sum(enron_df2['total_check']))
print
print "Data points with difference"
print enron_df2[enron_df2['same_total'] == False]
print enron_df1[enron_df1['staff_name'].isin(enron_df2[enron_df2['same_total']
== False]['staff_name'].tolist())].transpose()
Replacing them in the dictionary and in the dataframe:
hide_code
# Corrected values for the 2 inconsistent records, applied to the dictionary
enron_data['BELFER ROBERT'].update({
    'deferred_income': -102500,
    'deferral_payments': 'NaN',
    'director_fees': 102500,
    'expenses': 3285,
    'total_payments': 3285,
    'exercised_stock_options': 'NaN',
    'restricted_stock': 44093,
    'restricted_stock_deferred': -44093,
    'total_stock_value': 'NaN',
})
enron_data['BHATNAGAR SANJAY'].update({
    'director_fees': 'NaN',
    'expenses': 137864,
    'other': 'NaN',
    'total_payments': 137864,
    'exercised_stock_options': 15456290,
    'restricted_stock': 2604490,
    'restricted_stock_deferred': -2604490,
    'total_stock_value': 15456290,
})
# Rebuild the dataframe so it carries the corrected rows as well
enron_df = dict_to_dataframe(enron_data)
hide_code
# Show the two corrected records side by side
corrected_names = ['BHATNAGAR SANJAY', 'BELFER ROBERT']
enron_df[enron_df['staff_name'].isin(corrected_names)].T
Note that the percentage of missing data is quite high.
hide_code
# Create a general list of features
feature_list = ['bonus', 'deferral_payments', 'deferred_income', 'director_fees', 'expenses',
'exercised_stock_options', 'loan_advances', 'long_term_incentive', 'other', 'restricted_stock',
'restricted_stock_deferred', 'salary', 'total_payments', 'total_stock_value',
'to_messages', 'email_address', 'from_poi_to_this_person',
'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
hide_code
print "Counting NaN values in the dataset."
for element in feature_list:
k, p = count_nan(enron_df[element])
print element, ":", k, "NaN, ", "%.2f" %p, "%"
Let's continue to discover suspicious values. The statistical description and the histograms of all finance features can help us.
hide_code
# Create a list of finance features
finance_feature_list = ['bonus', 'deferral_payments', 'deferred_income', 'director_fees', 'exercised_stock_options',
'expenses', 'loan_advances', 'long_term_incentive', 'other', 'restricted_stock',
'restricted_stock_deferred', 'salary', 'total_payments', 'total_stock_value']
# Display statistical decription for finance features
for element in finance_feature_list:
print element, ":", scipy.stats.describe(column_without_nan(enron_df[element]))
hide_code
# Plotting finance features
plt.style.use('seaborn-deep')
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(18, 22))
# One 20-bin histogram per payment feature, in subplot order; each title is
# the column name with underscores shown as spaces.
payment_panels = ['bonus', 'deferral_payments', 'director_fees', 'deferred_income',
                  'expenses', 'loan_advances', 'long_term_incentive', 'other',
                  'salary', 'total_payments']
for axis, column_name in zip(axes.flat, payment_panels):
    axis.hist(column_without_nan(enron_df[column_name]), 20)
    axis.set_title(column_name.replace('_', ' '))
plt.show()
hide_code
# Plotting finance features
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 6))
# One 20-bin histogram per stock feature; titles are the column names with
# underscores shown as spaces.
stock_panels = ['exercised_stock_options', 'restricted_stock',
                'restricted_stock_deferred', 'total_stock_value']
for axis, column_name in zip(axes.flat, stock_panels):
    axis.hist(column_without_nan(enron_df[column_name]), 20)
    axis.set_title(column_name.replace('_', ' '))
plt.show()
hide_code
# Create a list of email features
email_feature_list = ['to_messages', 'from_poi_to_this_person', 'from_messages',
'from_this_person_to_poi', 'shared_receipt_with_poi']
# Display statistical decription for email features
for element in email_feature_list:
print element, ":", scipy.stats.describe(column_without_nan(enron_df[element]))
hide_code
# Plotting email features
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
# 100-bin histograms of received and sent message counts
for axis, column_name in zip(axes.flat, ['to_messages', 'from_messages']):
    axis.hist(column_without_nan(enron_df[column_name]), 100)
    axis.set_title(column_name.replace('_', ' '))
plt.show()
hide_code
# Plotting email features
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 3))
# 100-bin histograms of the POI-related message counts
poi_email_panels = ['from_poi_to_this_person', 'from_this_person_to_poi',
                    'shared_receipt_with_poi']
for axis, column_name in zip(axes.flat, poi_email_panels):
    axis.hist(column_without_nan(enron_df[column_name]), 100)
    axis.set_title(column_name.replace('_', ' '))
plt.show()
Let us discover the influence of deleting outliers in the case of email features using as an example only one variable.
hide_code
# Scatter 'from_messages' (rows with the 'NaN' marker left out) against the row index
plt.figure(figsize=(10,4))
from_messages = enron_df['from_messages']
y = from_messages[from_messages != 'NaN']
x = list(y.index)
plt.scatter(x, y, c=y, cmap='jet')
hide_code
# Find the name of the outlier
print "Maximum values in each column."
print "from messages:", enron_df['staff_name'][enron_df['from_messages'].idxmax()]
print "to messages:", enron_df['staff_name'][enron_df['to_messages'].idxmax()]
print "from this person to poi:", enron_df['staff_name'][enron_df['from_this_person_to_poi'].idxmax()]
print "from poi to this person:", enron_df['staff_name'][enron_df['from_poi_to_this_person'].idxmax()]
print "shared receipt with poi:", enron_df['staff_name'][enron_df['shared_receipt_with_poi'].idxmax()]
hide_code
# The five people holding a maximum in one of the email features
email_outliers_list = ['KAMINSKI WINCENTY J', 'SHAPIRO RICHARD S', 'DELAINEY DAVID W',
                       'LAVORATO JOHN J', 'BELDEN TIMOTHY N']
email_outlier_columns = ['staff_name', 'to_messages', 'from_messages',
                         'from_poi_to_this_person', 'from_this_person_to_poi',
                         'shared_receipt_with_poi']
# Their rows, restricted to the name and the email counters
is_email_outlier = enron_df['staff_name'].isin(email_outliers_list)
email_outliers = enron_df[is_email_outlier][email_outlier_columns]
email_outliers
hide_code
print "Plot 'from_messages' without 1 outlier for 5 variables"
y_no_outliers = enron_df['from_messages']\
[(enron_df['staff_name'] != 'KAMINSKI WINCENTY J') & (enron_df['staff_name'] != 'SHAPIRO RICHARD S') &
(enron_df['staff_name'] != 'DELAINEY DAVID W') & (enron_df['staff_name'] != 'LAVORATO JOHN J') &
(enron_df['staff_name'] != 'BELDEN TIMOTHY N')]
plt.figure(figsize=(10,4))
x_no_outliers = list(y_no_outliers.index)
plt.scatter(x_no_outliers, y_no_outliers, c=y_no_outliers, cmap='jet')
hide_code
print "3 maximum values in each column."
print show_three_top(enron_df, 'from_messages')
print show_three_top(enron_df, 'to_messages')
print show_three_top(enron_df, 'from_this_person_to_poi')
print show_three_top(enron_df, 'from_poi_to_this_person')
print show_three_top(enron_df, 'shared_receipt_with_poi')
hide_code
print "Plot 'from_messages' without 3 outlier for 1 variable"
y3outliers = enron_df['from_messages'][(enron_df['from_messages'] < 4343) & (enron_df['from_messages'] != 'NaN')]
x3outliers = list(y3outliers.index)
plt.scatter(x3outliers, y3outliers, c=y3outliers, cmap='jet')
Removing rows with large values leads to the appearance of new outliers. I decided not to remove them in order to avoid the loss of valuable information.
hide_code
# Replace string NaN by np.nan
# column_with_npnan is applied to every column of enron_df, so the result
# holds np.nan wherever the source held the 'NaN' string.
enron_df_np = enron_df.apply(column_with_npnan)
hide_code
# Setup Imputer
# Missing entries are replaced by a mean; axis=1 selects imputation along rows
# per the scikit-learn Imputer API (the later loop passes each feature in as a
# single row) - NOTE(review): confirm against the installed sklearn version.
imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
# Setup Scaler
# MinMaxScaler is created with its default settings.
scaler = MinMaxScaler()
hide_code
# Feature list without the (non-numeric) email address
feature_list0 = [
    'bonus', 'deferral_payments', 'deferred_income', 'director_fees',
    'expenses', 'exercised_stock_options', 'loan_advances',
    'long_term_incentive', 'other', 'restricted_stock',
    'restricted_stock_deferred', 'salary', 'total_payments',
    'total_stock_value',
    'to_messages', 'from_poi_to_this_person', 'from_messages',
    'from_this_person_to_poi', 'shared_receipt_with_poi',
]
hide_code
# Placeholders, one [0] per feature, to be overwritten by the Imputer and Scaler results
feature_imp = [[0] for unused in range(len(feature_list0))]
feature_imp_scaled = [[0] for unused in range(len(feature_list0))]
hide_code
# Impute and scale every feature in feature_list0
for i, element in enumerate(feature_list0):
    # The Imputer is fitted on, and applied to, the feature as a single row
    single_row = [enron_df_np[element]]
    imputed_row = imp.fit(single_row).transform(single_row)[0]
    feature_imp[i] = imputed_row
    feature_imp_scaled[i] = scaler.fit_transform(imputed_row)
# One row per feature, labelled with the feature name
enron_df_imp_scaled = pd.DataFrame(feature_imp_scaled, index=feature_list0)
hide_code
# Transpose so that features become columns, then preview the first records
enron_df_imp_scaled = enron_df_imp_scaled.T
enron_df_imp_scaled.head().T
The histograms of the financial and e-mail features after imputation and scaling are shown below.
hide_code
# Histograms (20 bins each) of the scaled payment features on a 5x2 grid
plt.style.use('seaborn-deep')
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(18, 22))
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9 = axes.flat
payment_columns = ['bonus', 'deferral_payments', 'deferred_income', 'director_fees',
                   'expenses', 'loan_advances', 'long_term_incentive', 'other',
                   'salary', 'total_payments']
for axis, column in zip(axes.flat, payment_columns):
    axis.hist(enron_df_imp_scaled[column], 20)
    # Each title is the column name with underscores shown as spaces
    axis.set_title(column.replace('_', ' '))
plt.show()
hide_code
# Histograms (20 bins each) of the scaled stock features on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 6))
ax0, ax1, ax2, ax3 = axes.flat
stock_columns = ['exercised_stock_options', 'restricted_stock',
                 'restricted_stock_deferred', 'total_stock_value']
for axis, column in zip(axes.flat, stock_columns):
    axis.hist(enron_df_imp_scaled[column], 20)
    # Each title is the column name with underscores shown as spaces
    axis.set_title(column.replace('_', ' '))
plt.show()
hide_code
# Histograms (100 bins each) of the scaled total message counts, side by side
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
ax0, ax1 = axes.flat
for axis, column in zip(axes.flat, ['to_messages', 'from_messages']):
    axis.hist(enron_df_imp_scaled[column], 100)
    # Each title is the column name with underscores shown as spaces
    axis.set_title(column.replace('_', ' '))
plt.show()
hide_code
# Histograms (100 bins each) of the scaled POI-related message counts
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 3))
ax0, ax1, ax2 = axes.flat
poi_message_columns = ['from_poi_to_this_person', 'from_this_person_to_poi',
                       'shared_receipt_with_poi']
for axis, column in zip(axes.flat, poi_message_columns):
    axis.hist(enron_df_imp_scaled[column], 100)
    # Each title is the column name with underscores shown as spaces
    axis.set_title(column.replace('_', ' '))
plt.show()
hide_code
# Complete a scaled dataframe
df1 = enron_df['staff_name']
df2 = enron_df['email_address']
df3 = enron_df['poi']
scaled_enron_df = pd.concat([enron_df_imp_scaled, df1, df2, df3], axis=1)
scaled_enron_df.head().T
scaled_enron_data = scaled_enron_df.to_dict(orient="index")
hide_code
print "The example of features in the dictionary: ", str(scaled_enron_data.itervalues().next())
hide_code
correlation_enron_df = pd.DataFrame(scaled_enron_df)
hide_code
correlation_enron_ff = correlation_enron_df.drop(correlation_enron_df[['staff_name', 'email_address', 'poi',
'to_messages', 'from_poi_to_this_person',
'from_messages', 'from_this_person_to_poi',
'shared_receipt_with_poi']], axis=1)
print "correlation_enron_ff:", list(correlation_enron_ff.T.index)
correlation_enron_ef = correlation_enron_df[['to_messages', 'from_poi_to_this_person',
'from_messages', 'from_this_person_to_poi',
'shared_receipt_with_poi']]
print "correlation_enron_ef:", list(correlation_enron_ef.T.index)
enron_df.to_csv('enron.csv')
scaled_enron_df.to_csv('scaled_enron.csv')
correlation_enron_ff.to_csv('correlation_enron_ff.csv')
correlation_enron_ef.to_csv('correlation_enron_ef.csv')
A graphical display of the correlation matrix, which highlights the most correlated variables in a data table, helps us understand the relationships between features.
Financial features:
%%R -w1000 -h1000
# Load the financial-feature table exported by the Python cell
correlation_enron_ff <- read.csv("correlation_enron_ff.csv")
# Remove the X column (presumably the unnamed row index written by to_csv)
correlation_enron_ff[["X"]] <- NULL
# Mixed correlation plot: numbers below the diagonal, pies above it
ff_correlations <- cor(correlation_enron_ff)
plot <- corrplot.mixed(ff_correlations, lower="number", upper="pie",
                       order="original", tl.col="black", tl.cex=0.8)
Email features:
%%R -w800 -h800
# Load the e-mail-feature table exported by the Python cell
correlation_enron_ef <- read.csv("correlation_enron_ef.csv")
# Remove the X column (presumably the unnamed row index written by to_csv)
correlation_enron_ef[["X"]] <- NULL
# Mixed correlation plot: numbers below the diagonal, pies above it
ef_correlations <- cor(correlation_enron_ef)
plot2 <- corrplot.mixed(ef_correlations, lower="number", upper="pie",
                        order="original", tl.col="black", tl.cex=1.0)
Only a very small number of strong dependencies between variables is observed. This means we should use many features for building a model or construct some new ones.
We always think of models as simplified theoretical approximations of the reality. As such there is some inferiority involved, also called the approximation error. Before fitting the complex models we can try to research dependencies between variables as a part of understanding our dataset.
hide_code
# Scaled salary (x) and bonus (y) are the pair used for the polynomial fits
x1, y1 = scaled_enron_df['salary'], scaled_enron_df['bonus']
hide_code
# Degree-1 fit; full=True also returns residuals, rank, singular values and rcond
linear_fit11 = scipy.polyfit(x1, y1, 1, full=True)
fp11, residuals11, rank11, sv11, rcond11 = linear_fit11
print ("Model parameters for salary and bonus (1 degree): %s" % fp11)
The formula of the fitted line (degree 1): f(x) = 3.61613148 * x + 174475.168
hide_code
# Polyfit for 3,5,7 degree
fp12 = scipy.polyfit(x1, y1, 3)
fp13 = scipy.polyfit(x1, y1, 5)
fp14 = scipy.polyfit(x1, y1, 7)
hide_code
# Create lines with coefficients
f11 = scipy.poly1d(fp11)
f12 = scipy.poly1d(fp12)
f13 = scipy.poly1d(fp13)
f14 = scipy.poly1d(fp14)
hide_code
print "Polyfit for 1, 3, 5, 7 degrees for salary and bonus."
fx1 = scipy.linspace(0, 1.0, 100)
plt.plot(fx1, f11(fx1), linewidth=2)
plt.plot(fx1, f12(fx1), linewidth=2)
plt.plot(fx1, f13(fx1), linewidth=2)
plt.plot(fx1, f14(fx1), linewidth=2)
plt.scatter(x1, y1)
plt.legend(["d=%i" % f11.order, "d=%i" % f12.order, "d=%i" % f13.order, "d=%i" % f14.order], loc="upper left")
hide_code
# Setup features for polyfitting
x2 = scaled_enron_df['from_this_person_to_poi']
y2 = scaled_enron_df['from_messages']
hide_code
# Polyfit for 1 degree
fp21, residuals21, rank21, sv21, rcond21 = scipy.polyfit(x2, y2, 1, full=True)
print ("Model parameters for from_this_person_to_poi and from_messages (1 dergee): %s" % fp21)
hide_code
# Polyfit for 1, 3, 5 degree
fp22 = scipy.polyfit(x2, y2, 3)
fp23 = scipy.polyfit(x2, y2, 5)
hide_code
# Polyfit for 1, 3, 5 degree
f21 = scipy.poly1d(fp21)
f22 = scipy.poly1d(fp22)
f23 = scipy.poly1d(fp23)
hide_code
print "Polyfit for 1, 3, 5 degrees for from_this_person_to_poi and from_poi_to_this_person."
fx2 = scipy.linspace(0, 1, 100)
plt.plot(fx2, f21(fx2), linewidth=2)
plt.plot(fx2, f22(fx2), linewidth=2)
plt.plot(fx2, f23(fx2), linewidth=2)
plt.scatter(x2, y2)
plt.legend(["d=%i" % f21.order, "d=%i" % f22.order, "d=%i" % f23.order], loc="upper left")
I expected a stronger dependence in these pairs of variables.
Feature sets:
# Core financial set; 'poi' leads every list
features_list01 = ['poi', 'salary', 'bonus', 'exercised_stock_options', 'deferred_income']
# Core set extended with further financial features
features_list02 = features_list01 + ['expenses', 'long_term_incentive', 'restricted_stock']
features_list03 = features_list01 + ['long_term_incentive', 'expenses']
# E-mail features only
features_list04 = ['poi', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
# Mixed set: core financial features plus the POI-related e-mail counts
features_list05 = features_list01 + ['from_poi_to_this_person', 'from_this_person_to_poi',
                                     'shared_receipt_with_poi']
Setup the dataset:
# enron_data is defined earlier in the notebook; presumably the dictionary
# without imputation or scaling - verify against its definition.
my_dataset = enron_data
Classifiers:
I wanted to perform a large number of tests, so I chose a fairly wide range of classifiers.
# Decision stump: a tree limited to depth 1
clf01 = DecisionTreeClassifier(max_depth=1)
# AdaBoost with the library defaults
clf02 = AdaBoostClassifier()
# Random forest whose nodes need at least 50 samples to be split
clf03 = RandomForestClassifier(min_samples_split=50)
# Gaussian naive Bayes with the library defaults
clf04 = GaussianNB()
# k-nearest neighbours with the library defaults
clf05 = neighbors.KNeighborsClassifier()
# Quadratic discriminant analysis with the library defaults
clf06 = QuadraticDiscriminantAnalysis()
# NOTE(review): KMeans is an unsupervised clusterer; its cluster ids 0/1 are
# arbitrary with respect to the 'poi' label - confirm the evaluation accounts for this.
clf07 = KMeans(n_clusters=2)
# Logistic regression with the library defaults
clf08 = LogisticRegression()
Results:
For each set of features results are arranged in ascending order of accuracy.
hide_code
data01 = featureFormat(my_dataset, features_list01, sort_keys = True)
labels01, features01 = targetFeatureSplit(data01)
hide_code
features_train01, features_test01, labels_train01, labels_test01 = \
train_test_split(features01, labels01, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf02, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf01, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf06, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf03, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf04, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
data02 = featureFormat(my_dataset, features_list02, sort_keys = True)
labels02, features02 = targetFeatureSplit(data02)
hide_code
features_train02, features_test02, labels_train02, labels_test02 = \
train_test_split(features02, labels02, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf04, my_dataset, features_list02)
load_classifier_and_data()
print features_list02
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf01, my_dataset, features_list02)
load_classifier_and_data()
print features_list02
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf02, my_dataset, features_list02)
load_classifier_and_data()
print features_list02
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset, features_list02)
load_classifier_and_data()
print features_list02
hide_code
if __name__ == '__main__':
main()
hide_code
data03 = featureFormat(my_dataset, features_list03, sort_keys = True)
labels03, features03 = targetFeatureSplit(data03)
hide_code
features_train03, features_test03, labels_train03, labels_test03 = \
train_test_split(features03, labels03, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf07, my_dataset, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf01, my_dataset, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf06, my_dataset, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf04, my_dataset, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf03, my_dataset, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf02, my_dataset, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
data04 = featureFormat(my_dataset, features_list04, sort_keys = True)
labels04, features04 = targetFeatureSplit(data04)
hide_code
features_train04, features_test04, labels_train04, labels_test04 = \
train_test_split(features04, labels04, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf07, my_dataset, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf02, my_dataset, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf08, my_dataset, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
hide_code
data05 = featureFormat(my_dataset, features_list05, sort_keys = True)
labels05, features05 = targetFeatureSplit(data05)
hide_code
features_train05, features_test05, labels_train05, labels_test05 = \
train_test_split(features05, labels05, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf02, my_dataset, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf04, my_dataset, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf03, my_dataset, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
Setup the scaled dataset.
# Dictionary produced from the imputed and min-max-scaled dataframe
my_dataset2 = scaled_enron_data
Results:
hide_code
data21 = featureFormat(my_dataset2, features_list01, sort_keys = True)
labels21, features21 = targetFeatureSplit(data21)
hide_code
features_train21, features_test21, labels_train21, labels_test21 = \
train_test_split(features21, labels21, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf02, my_dataset2, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf03, my_dataset2, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf04, my_dataset2, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset2, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf06, my_dataset2, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf01, my_dataset2, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
data22 = featureFormat(my_dataset2, features_list02, sort_keys = True)
labels22, features22 = targetFeatureSplit(data22)
hide_code
features_train22, features_test22, labels_train22, labels_test22 = \
train_test_split(features22, labels22, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf02, my_dataset2, features_list02)
load_classifier_and_data()
print features_list02
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset2, features_list02)
load_classifier_and_data()
print features_list02
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf04, my_dataset2, features_list02)
load_classifier_and_data()
print features_list02
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf01, my_dataset2, features_list02)
load_classifier_and_data()
print features_list02
hide_code
if __name__ == '__main__':
main()
hide_code
data23 = featureFormat(my_dataset2, features_list03, sort_keys = True)
labels23, features23 = targetFeatureSplit(data23)
hide_code
features_train23, features_test23, labels_train23, labels_test23 = \
train_test_split(features23, labels23, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf07, my_dataset2, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf01, my_dataset2, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf02, my_dataset2, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset2, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf06, my_dataset2, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf04, my_dataset2, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf03, my_dataset2, features_list03)
load_classifier_and_data()
print features_list03
hide_code
if __name__ == '__main__':
main()
hide_code
data24 = featureFormat(my_dataset2, features_list04, sort_keys = True)
labels24, features24 = targetFeatureSplit(data24)
hide_code
features_train24, features_test24, labels_train24, labels_test24 = \
train_test_split(features24, labels24, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf07, my_dataset2, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf02, my_dataset2, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset2, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
hide_code
data25 = featureFormat(my_dataset2, features_list05, sort_keys = True)
labels25, features25 = targetFeatureSplit(data25)
hide_code
features_train25, features_test25, labels_train25, labels_test25 = \
train_test_split(features25, labels25, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf02, my_dataset2, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf04, my_dataset2, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf05, my_dataset2, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf03, my_dataset2, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
# Create a dataframe for engineering.
# pd.DataFrame(df) does not copy its input, so columns added to the result
# could show up in scaled_enron_df as well; .copy() yields an independent frame.
engineer_enron_df = scaled_enron_df.copy()
New features:
1) coefficient_bonus_salary: bonus compared to salary;
2) coefficient_from_poi_all: messages from poi to this person compared to all messages received by this person;
3) coefficient_to_poi_all: messages from this person to poi and shared with poi compared to all messages sent by this person;
4) coefficient_income_total: deferred income for this person compared to all total payments for this person.
These features were added to the scaled data frame and to the scaled dictionary.
hide_code
# Create new features
engineer_enron_df['coefficient_bonus_salary'] = 0.0
engineer_enron_df['coefficient_from_poi_all'] = 0.0
engineer_enron_df['coefficient_to_poi_all'] = 0.0
engineer_enron_df['coefficient_income_total'] = 0.0
for i in range(len(scaled_enron_df['salary'])):
if scaled_enron_df['salary'][i] > 0:
engineer_enron_df['coefficient_bonus_salary'][i] = \
1.0 * scaled_enron_df['bonus'][i] / scaled_enron_df['salary'][i]
for i in range(len(scaled_enron_df['to_messages'])):
if scaled_enron_df['to_messages'][i] > 0:
engineer_enron_df['coefficient_from_poi_all'][i] = \
1.0 * scaled_enron_df['from_poi_to_this_person'][i] / scaled_enron_df['to_messages'][i]
for i in range(len(scaled_enron_df['from_messages'])):
if scaled_enron_df['from_messages'][i] > 0:
engineer_enron_df['coefficient_to_poi_all'][i] = \
1.0 * (scaled_enron_df['from_this_person_to_poi'][i] + scaled_enron_df['shared_receipt_with_poi'][i]) \
/ scaled_enron_df['from_messages'][i]
for i in range(len(scaled_enron_df['total_payments'])):
if scaled_enron_df['total_payments'][i] > 0:
engineer_enron_df['coefficient_income_total'][i] = \
1.0 * scaled_enron_df['deferred_income'][i] / scaled_enron_df['total_payments'][i]
hide_code
# Display new features
engineer_enron_df.head().T
hide_code
# Reading the dataframe into a dictionary
engineer_enron_data = engineer_enron_df.to_dict(orient="index")
hide_code
print "The example of features in the dictionary: ", str(engineer_enron_data.itervalues().next())
Setup the dataset and new feature lists.
# Dictionary built from the dataframe that carries the engineered coefficient_* columns
my_dataset3 = engineer_enron_data
# 'poi', the four engineered ratio features and 'exercised_stock_options'
features_list06 = ['poi', 'coefficient_bonus_salary', 'coefficient_income_total',
'coefficient_from_poi_all', 'coefficient_to_poi_all',
'exercised_stock_options']
Results:
hide_code
data36 = featureFormat(my_dataset3, features_list06, sort_keys = True)
labels36, features36 = targetFeatureSplit(data36)
hide_code
features_train36, features_test36, labels_train36, labels_test36 = \
train_test_split(features36, labels36, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf04, my_dataset3, features_list06)
load_classifier_and_data()
print features_list06
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf02, my_dataset3, features_list06)
load_classifier_and_data()
print features_list06
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf06, my_dataset3, features_list06)
load_classifier_and_data()
print features_list06
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf03, my_dataset3, features_list06)
load_classifier_and_data()
print features_list06
hide_code
if __name__ == '__main__':
main()
hide_code
dump_classifier_and_data(clf01, my_dataset3, features_list06)
load_classifier_and_data()
print features_list06
hide_code
if __name__ == '__main__':
main()
hide_code
clf15 = neighbors.KNeighborsClassifier()
dump_classifier_and_data(clf15, my_dataset, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
clf25 = neighbors.KNeighborsClassifier(p=1)
dump_classifier_and_data(clf25, my_dataset, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
clf35 = neighbors.KNeighborsClassifier(n_neighbors=4)
dump_classifier_and_data(clf35, my_dataset, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
clf45 = neighbors.KNeighborsClassifier(weights='distance')
dump_classifier_and_data(clf45, my_dataset, features_list05)
load_classifier_and_data()
print features_list05
hide_code
if __name__ == '__main__':
main()
hide_code
clf15 = neighbors.KNeighborsClassifier()
dump_classifier_and_data(clf15, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
clf25 = neighbors.KNeighborsClassifier(p=1)
dump_classifier_and_data(clf25, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
clf35 = neighbors.KNeighborsClassifier(n_neighbors=3)
dump_classifier_and_data(clf35, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
clf45 = neighbors.KNeighborsClassifier(n_neighbors=4, weights='distance')
dump_classifier_and_data(clf45, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
hide_code
if __name__ == '__main__':
main()
hide_code
clf11 = DecisionTreeClassifier(max_depth=1)
dump_classifier_and_data(clf11, my_dataset3, features_list06)
load_classifier_and_data()
print features_list06
hide_code
if __name__ == '__main__':
main()
hide_code
clf21 = DecisionTreeClassifier(min_samples_split=15, max_depth=7)
dump_classifier_and_data(clf21, my_dataset3, features_list06)
load_classifier_and_data()
print features_list06
hide_code
if __name__ == '__main__':
main()
hide_code
clf18 = LogisticRegression()
dump_classifier_and_data(clf18, my_dataset, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
hide_code
clf28 = LogisticRegression(class_weight='balanced', solver='sag')
dump_classifier_and_data(clf28, my_dataset, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
hide_code
clf38 = LogisticRegression(C=100)
dump_classifier_and_data(clf38, my_dataset, features_list04)
load_classifier_and_data()
print features_list04
hide_code
if __name__ == '__main__':
main()
We verify the impact of new variables on the same algorithm. The list №06 includes all new variables, the list №07 - old variables from which they were created. For example, the variable 'coefficient_bonus_salary' is created from two: 'bonus' and 'salary'.
features_list = ['poi', 'bonus', 'deferral_payments', 'deferred_income', 'director_fees','expenses',
'exercised_stock_options', 'loan_advances', 'long_term_incentive', 'other',
'restricted_stock', 'restricted_stock_deferred', 'salary',
'total_payments', 'total_stock_value',
'to_messages', 'from_poi_to_this_person',
'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
clf01 = DecisionTreeClassifier(max_depth=1)
features_list06 = ['poi', 'coefficient_bonus_salary', 'coefficient_income_total',
'coefficient_from_poi_all', 'coefficient_to_poi_all',
'exercised_stock_options']
hide_code
dump_classifier_and_data(clf01, my_dataset3, features_list06)
load_classifier_and_data()
print features_list06
hide_code
if __name__ == '__main__':
main()
features_list07 = ['poi', 'bonus', 'deferred_income',
'exercised_stock_options', 'salary', 'total_payments',
'to_messages', 'from_poi_to_this_person',
'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
hide_code
data37 = featureFormat(my_dataset3, features_list07, sort_keys = True)
labels37, features37 = targetFeatureSplit(data37)
hide_code
features_train37, features_test37, labels_train37, labels_test37 = \
train_test_split(features37, labels37, test_size=0.3, random_state=42)
hide_code
dump_classifier_and_data(clf01, my_dataset3, features_list07)
load_classifier_and_data()
print features_list07
hide_code
if __name__ == '__main__':
main()
It is easy to see that the three measures (Accuracy: 0.86673 Precision: 0.50119 Recall: 0.10550) increased significantly (Accuracy: 0.89327 Precision: 0.91476 Recall: 0.22000) with the introduction of new features with the classifier DecisionTreeClassifier().
The highest result for the financial and mixed sets of features was shown by the KNeighborsClassifier() without imputing mean values instead of 'NaN' and without scaling the features:
1) ['poi', 'salary', 'bonus', 'exercised_stock_options', 'deferred_income', 'from_poi_to_this_person', 'from_this_person_to_poi', 'shared_receipt_with_poi'] - 89.186% accuracy, 81.806% precision, 31.250% recall;
2) ['poi', 'salary', 'bonus', 'exercised_stock_options', 'deferred_income'] - 88.293% accuracy, 78.361% precision, 30.600% recall.
The highest result for the email set of features was shown by the LogisticRegression(C=100):
['poi', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi'] - 86.556% accuracy, 36.000% precision, 27.000% recall.
The highest result for the created features was shown by the DecisionTreeClassifier(max_depth=1): ['poi', 'coefficient_bonus_salary', 'coefficient_income_total', 'coefficient_from_poi_all', 'coefficient_to_poi_all', 'exercised_stock_options'] - 89.327% accuracy, 91.476% precision, 22.000% recall.
Among all the sets of features and algorithms there is a really valuable find - KNeighborsClassifier() and ['poi', 'salary', 'bonus', 'exercised_stock_options', 'deferred_income', 'from_poi_to_this_person', 'from_this_person_to_poi', 'shared_receipt_with_poi'].
A combination of these options gives us a high level of accuracy (89.186%) and precision (81.806%) and a normal level of recall (31.250%)
It has great practical meaning: we can say with a lot of confidence that a flagged person is very likely to be a real POI and not a false alarm. This allows us to speed up the search process, to narrow down the suspects and to protect innocent people from suspicion.
Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those? [relevant rubric items: “data exploration”, “outlier investigation”]
The goal of this project is constructing a predictive model for identifying persons of interest using many feature sets and different types of classifiers. The base for this model is the dataset with real events.
During the research, incorrect values and outliers were found. Invalid values were corrected. High values that came from aggregated (summary) data were removed; the remaining outliers were kept in the dataset in order to avoid the loss of valuable information.
Detailed description of the database is presented in Section 2, the studying and the graphical representation of the data (including invalid values and outliers) are presented in Section 3.3.
What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values. [relevant rubric items: “create new features”, “properly scale features”, “intelligently select feature”]
I deliberately did not read about the real persons in this dataset. For me it was an interesting experiment: can I detect POIs using only the data points? So I looked only at the dependencies between variables and their values, and picked features based on weak dependencies and suspiciously large values.
Selection of features for investigation seemed quite obvious: 1) indicators of salaries and bonuses - the basic data when considering the payments to staff, they are required to include both as they demonstrate quite a weak proportionality; 2) all quantitative indicators of correspondence with persons of interest are required for including; 3) including 'exercised_stock_options', 'deferred_income' almost always led to an increase in all indicators of the accuracy of the predictions, so I added them in the final version too.
When thinking over the construction of features I decided that the ratio of these values can give even more information about the suspicious trends than the variables themselves.
The significant impact of the new variables on DecisionTreeClassifier(max_depth=1) was confirmed by experiments. With the old features: Accuracy: 0.86673 Precision: 0.50119 Recall: 0.10550 F1: 0.17431 F2: 0.12528. With the new features: Accuracy: 0.89327 Precision: 0.91476 Recall: 0.22000 F1: 0.35470 F2: 0.25940.
Feature scaling is not mandatory for prediction models. The main reasons for applying it are: 1) the range of values in the raw data is very wide in our case, and several machine learning algorithms will not work properly without normalization; 2) gradient descent converges much faster with feature scaling than without it.
Standardizing and imputing values of variables tends to make the training process better behaved by improving the numerical condition. The family of algorithms most likely to be scale-invariant is tree-based methods. Scaling matters in the cases of k-nearest neighbors with a Euclidean distance measure, k-means, logistic regression, SVM, linear discriminant analysis, PCA, etc. We can see it in the case of KMeans(n_clusters=2) and ['poi','from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']. Without scaling: Accuracy: 0.75711 Precision: 0.10832 Recall: 0.16400 F1: 0.13047 F2: 0.14871 With scaling: Accuracy: 0.72560 Precision: 0.15959 Recall: 0.24800 F1: 0.19421 F2: 0.22326
Scaling of cases should be approached with caution because it discards information. We can see it in the case of KNeighborsClassifier() and ['poi', 'salary', 'bonus', 'exercised_stock_options', 'deferred_income', 'from_poi_to_this_person', 'from_this_person_to_poi', 'shared_receipt_with_poi']. Without scaling: Accuracy: 0.89186 Precision: 0.81806 Recall: 0.31250 F1: 0.45224 F2: 0.35657 With scaling: Accuracy: 0.85253 Precision: 0.00467 Recall: 0.00050 F1: 0.00090 F2: 0.00061
The presented variables made it possible to make a fairly accurate prediction (accuracy about 89%) without creating new variables or scaling. But the process of scaling and the introduction of new variables - the ratios between the most important data parameters - have led to high performance in another area (precision).
The process of scaling and the creation of new variables is described and illustrated in Sections 3.3.6-3.3.9 and 3.5.5, 3.5.6, 3.5.8.
What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms? [relevant rubric item: “pick an algorithm”]
During the work I tried more than 10 different algorithms; in the final version of the project I kept only 8 of them to examine empirically the impact of the type of algorithm and its parameters on the quality of predictions.
The experimental results for all algorithms are presented in Section 3.5. For the project, I selected only the information about the experiments with high accuracy and precision while trying to improve the level of recall. The results of the experiments on all metrics (precision, recall, f1, f2) vary widely.
KNeighborsClassifier() and ['poi', 'salary', 'bonus', 'exercised_stock_options', 'deferred_income', 'from_poi_to_this_person', 'from_this_person_to_poi', 'shared_receipt_with_poi'] is the final version of all experiments.
It should be noted that the algorithm GaussianNB() gives good results in most cases.
https://docs.google.com/spreadsheets/d/16rvga-kKqTFWakYPBZulxpGbjT7ISN_rtPeWo6P-Ook/edit#gid=0
What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well? How did you tune the parameters of your particular algorithm? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier). [relevant rubric item: “tune the algorithm”]
Models can have many parameters, and the goal of algorithm tuning is to find the best combination of these parameters.
Tuning an algorithm is extremely important because different settings can have a profound effect on its performance.
The section 3.5.7 "Experiments with the highest accuracy and influence the parameters of the algorithms" shows very clearly how the classifier parameters can modify the final result.
What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis? [relevant rubric item: “validation strategy”]
Validation is a technique for checking how well our model generalizes to the remaining part of the dataset, which was not used for training. A common mistake in this case is overfitting, where the model performs well on the training set but has a substantially lower result on the test set.
In this project we have great starter code with a very useful function for validation - test_classifier(). I did not change it at all.
The parameter "folds" in this function is equal to 1000, which means the model runs 1000 times with different test sets based on the original data. StratifiedShuffleSplit was applied for splitting in this case.
Give at least 2 evaluation metrics and your average performance for each of them. Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]
Definition:
accuracy = number of people that are correctly predicted as POI or non-POI / number of all people in the dataset
recall = number of people that are predicted as POI and they are actually POI / number of people that are actually POI
precision = number of people that are predicted as POI and they are actually POI / number of people that are predicted as POI
In the final algorithm these metrics have the values: accuracy = 89.186%, precision = 81.806%, recall = 31.250%.
In practice this means that 89.186% of all my predictions (POI or non-POI) are correct, that a person predicted as a POI by this model really is a POI in 81.806% of cases, and that the model finds 31.250% of all the persons who are actually POIs.
It's my personal choice because from my point of view it's better to be confident in the precision.
In the project we can also see a result that many people will recognize as very promising for further research - DecisionTreeClassifier(max_depth=1): ['poi', 'coefficient_bonus_salary', 'coefficient_income_total', 'coefficient_from_poi_all', 'coefficient_to_poi_all', 'exercised_stock_options'] - 89.327% accuracy, 91.476% precision, 22.000% recall.
To run a large number of experiments, I split the starter code for the project in tester.py and poi_id.py into fragments with functions. For the convenience of the project review, I did not create separate code files. To start running this project and check all the results, the reader needs only the data file final_project_dataset.pkl and this notebook.
After this research I consider my project as a very basic study of the data: I feel that it is necessary to perform dozens of times more experiments to improve understanding of the relationship.
The dataset for these experiments is remarkably well chosen:
1) it corresponds to real events;
2) the available information allows us to estimate the effectiveness of the used methods;
3) the described situation is standard for this type of event, and therefore any good solutions found may be used in many other cases;
4) it provides a lot of material for other studies (not only the identification of persons of interest).
https://www.udacity.com/course/intro-to-machine-learning--ud120
https://www.packtpub.com/mapt/book/big_data_and_business_intelligence/9781783555130