In this project I worked with the "Titanic Data" dataset from the Udacity website.
It contains demographics and passenger information from 891 of the 2224 passengers and crew on board the Titanic.
The following link leads to the description of this dataset on the Kaggle website, where the data was obtained.
https://www.kaggle.com/c/titanic/data
http://matplotlib.org/examples/pylab_examples/
http://people.duke.edu/~ccc14/pcfb/numpympl/MatplotlibBarPlots.html
http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.linregress.html
https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
Online Statistics Education: An Interactive Multimedia Course of Study. Project Leader: David M. Lane, Rice University.
import pandas as pd
import numpy as np
import scipy
%pylab inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from scipy import stats
from pylab import plot,show
import seaborn as sns
from operator import truediv
def convert_list_to_int(x):
    """Return the first item of every element of ``x`` as a flat list.

    Despite its name the function performs no ``int`` conversion; it only
    unwraps one level of nesting, e.g. ``[[1], [2]]`` becomes ``[1, 2]``.

    Parameters:
        x: iterable whose elements support ``element[0]``.

    Returns:
        list holding ``element[0]`` for each element, in the original order.

    Raises:
        IndexError: if an element is an empty sequence.
    """
    return [element[0] for element in x]
def percentages_xy(x, y):
    """Return ``x`` expressed as a percentage of ``y``.

    The leading ``100.0`` forces floating-point arithmetic, so integer
    inputs are not truncated. Works element-wise when ``x`` and/or ``y`` are
    numpy arrays or pandas objects.

    Parameters:
        x: part (number or array-like supporting ``*`` and ``/``).
        y: whole (number or array-like).

    Returns:
        ``100.0 * x / y``.
    """
    return 100.0 * x / y
def pieplot(x, xlabel):
    """Draw a pie chart of ``x`` on a 5x5-inch figure and show it.

    Slices are labelled '1', '2', ... in the order given, and the second
    slice is pulled slightly out of the pie. The labels and the offsets are
    built from the number of slices, so the function is not limited to
    exactly three values.

    Parameters:
        x: sequence of slice sizes (anything ``np.array`` accepts).
        xlabel: text used as the chart title.
    """
    fracs = np.array(x)
    n_slices = len(fracs)
    labels = [str(i) for i in range(1, n_slices + 1)]
    # Only the slice at position 1 (the second one) is offset from the centre.
    explode = [0.05 if i == 1 else 0 for i in range(n_slices)]

    # Use the explicitly imported pyplot module rather than names that only
    # exist after the ``%pylab`` magic has populated the namespace.
    plt.figure(1, figsize=(5, 5))
    plt.axes([0.1, 0.1, 0.8, 0.8])
    plt.pie(fracs, explode=explode, labels=labels,
            autopct='%1.0f%%', shadow=True, startangle=0)
    plt.title(xlabel)
    plt.show()
def pearson_stat(x, y, ylabel, xlabel=None):
    """Fit a least-squares line to ``(x, y)``, print its statistics, plot it.

    Prints the slope, the correlation coefficient ``r`` and the standard
    error returned by ``scipy.stats.linregress``, then plots the fitted line
    (magenta) over the data points (circles) and shows the figure.

    Parameters:
        x: sequence of x values; the x-axis limits are set half a unit
            beyond its first and last elements.
        y: sequence of y values, same length as ``x``.
        ylabel: label for the y axis.
        xlabel: optional label for the x axis. When omitted, the label is
            chosen as before by comparing ``x`` with the module-level lists
            ``pclass_list`` and ``index0`` (so ``x`` must then be a plain
            list and those names must exist when they are consulted).
    """
    x1 = np.array(x)
    y1 = np.array(y)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x1, y1)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    # The third column is linregress' standard error, not a standard
    # deviation, so the heading says so.
    print(' slope r standard error ')
    print(' ')
    print('  %s   %s   %s' % (slope, r_value, std_err))

    line = slope * x1 + intercept
    plt.plot(x1, line, 'm-', x1, y1, 'o')
    plt.xlim([x1[0] - 0.5, x1[-1] + 0.5])

    if xlabel is None:
        # Backward-compatible fallback: infer the label from which of the
        # notebook's global x-value lists was passed in.
        if x == pclass_list:
            xlabel = 'Pclass'
        elif x == index0:
            xlabel = 'Group by fare'
        else:
            xlabel = 'Age category'

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()
# Load the passenger table from CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific -- adjust it before
# running this anywhere else.
titanic_df = pd.read_csv('/Users/olgabelitskaya/Downloads/titanic_data.csv')
# The bare expressions below have no effect in a plain script; they only
# display their value when evaluated as the last line of a notebook cell.
titanic_df.head()
# Number of rows in the table.
len(titanic_df)
# Sum of the 'Survived' column (presumably 0/1 flags, i.e. the number of
# survivors -- confirm against the dataset description).
titanic_df['Survived'].sum()
# Mean of 'Survived' scaled by 100 (a percentage if the column is 0/1).
100*titanic_df['Survived'].mean()
# Column names, non-null counts and dtypes.
titanic_df.info()
# Distinct passenger classes. A set has no guaranteed iteration order, so the
# values are sorted explicitly: pclass_list is later used as the x values that
# are paired with group-by results, which are ordered by class.
pclass = pd.Series(titanic_df['Pclass'])
pclass_list = sorted(set(pclass.values))
pclass_list

# Passengers per class, as counts and as a share of all passengers.
# Selecting the column before aggregating avoids counting every other column.
number_by_pclass = titanic_df.groupby('Pclass')['PassengerId'].count()
number_by_pclass_percent = percentages_xy(number_by_pclass, len(titanic_df))
number_by_pclass_df = pd.DataFrame(data={
    'Number by Pclass': number_by_pclass,
    'Number by Pclass in percentages': number_by_pclass_percent,
})
number_by_pclass_df
pieplot(number_by_pclass, 'Number by Pclass in percentages')
# Survivors per class. The grouped sum is computed once, on the 'Survived'
# column only, and reused instead of re-aggregating the whole table three
# times.
survived_by_pclass = titanic_df.groupby('Pclass')['Survived'].sum()
# Percentages I: each class's share of all survivors.
survived_by_pclass_percent1 = percentages_xy(survived_by_pclass,
                                             titanic_df['Survived'].sum())
# Percentages II: survivors as a share of the passengers in the same class.
survived_by_pclass_percent2 = percentages_xy(survived_by_pclass,
                                             number_by_pclass)
survived_by_pclass_df = pd.DataFrame(data={
    'Survived by Pclass': survived_by_pclass,
    'Survived by Pclass in percentages I': survived_by_pclass_percent1,
    'Survived by Pclass in percentages II': survived_by_pclass_percent2,
})
survived_by_pclass_df
pieplot(survived_by_pclass, 'Survived by Pclass in percentages I')
# Plot styling for the figures that follow.
plt.style.use('seaborn-pastel')
plt.rcParams['figure.figsize'] = (8, 4)

# Regression of the within-class survival percentage on passenger class.
pearson_stat(pclass_list,
             survived_by_pclass_df['Survived by Pclass in percentages II'],
             'Survived by Pclass in percentages II')

# Mean fare per class. The 'Fare' column is selected before averaging so that
# the mean is not attempted on the table's other (including non-numeric)
# columns.
fare_mean_by_class = titanic_df.groupby('Pclass')['Fare'].mean()
fare_mean_by_class
pearson_stat(pclass_list, fare_mean_by_class, 'Fare mean by Pclass')
# Summary of the fare column. The Series' own max/min/mean are used instead
# of the builtins and the pylab-injected ``mean``; single-argument print
# calls produce the same text on Python 2 and Python 3.
fare = pd.Series(titanic_df['Fare'])
print('max = %s' % fare.max())
print('min = %s' % fare.min())
print('mean = %s' % fare.mean())

# Histogram of fares.
plt.rcParams['figure.figsize'] = (6, 3)
titanic_df.Fare.hist()
plt.xlabel("Fare")
plt.ylabel("Number by fare")

# Number of passengers for every distinct fare value, drawn as a line.
number_by_fare = titanic_df.groupby('Fare')['PassengerId'].count()
plt.rcParams['figure.figsize'] = (8, 4)
number_by_fare.plot(color='steelblue', linewidth=1.5, linestyle="-")
plt.xlabel("Fare")
plt.ylabel("Number by fare")
# 52 equally spaced edges between the smallest and largest fare, i.e. 51
# equal-width fare groups.
bins1 = np.linspace(fare.min(), fare.max(), 52)
# pd.cut builds right-closed intervals and by default leaves the first edge
# out, which would silently drop every passenger paying exactly the minimum
# fare; include_lowest=True keeps them in the first group.
groups = fare.groupby(pd.cut(fare, bins1, include_lowest=True)).count()
groups1 = groups.tolist()
# Nominal group labels 0, 10, ..., 500 (one per group).
index1 = list(range(0, 510, 10))
groups_df = pd.DataFrame(data={'Number in group by fare': groups1}, index=index1)
groups_df.plot(color='darkred', linewidth=1.5, linestyle="-")
plt.xlabel("Fare")
plt.ylabel("Number in group by fare")

# Keep the first 27 groups for the regression; report how many passengers
# fall into the remaining groups.
groups0 = groups1[:27]
index0 = index1[:27]
number_in_rest = sum(groups1[27:])
print(number_in_rest)
pearson_stat(index0, groups0, 'Number in group by fare')

# Survivors per distinct fare, then summed within the same fare groups (same
# edges and same include_lowest setting as the passenger counts above so the
# two series describe identical groups).
survived_by_fare = titanic_df.groupby('Fare')['Survived'].sum()
survived_groups_df = pd.DataFrame(data={'Survived by fare': survived_by_fare})
survived_groups_df.reset_index(inplace=True)
survived_groups = survived_groups_df.groupby(
    pd.cut(survived_groups_df["Fare"], bins1, include_lowest=True)).sum()
# Groups without any fare value may come back as NaN; treat them as zero.
survived_groups1 = survived_groups.fillna(0)
survived_groups2 = survived_groups1['Survived by fare'][:27].tolist()
survived_groups2 = [int(x) for x in survived_groups2]
fare_groups_df = pd.DataFrame(data={'Number by group': groups0,
                                    'Survived by group': survived_groups2},
                              index=index0)
fare_groups_df.head()

# Survival percentage per fare group. These two assignments were commented
# out although the result is used by the call below, which made that call
# fail with a NameError. Empty groups give 0/0 = NaN, which is replaced by 0;
# the floating-point warnings for those divisions are silenced.
m = np.array(survived_groups2)
n = np.array(groups0)
with np.errstate(divide='ignore', invalid='ignore'):
    survived_groups_in_percentages0 = 100.0*np.true_divide(m, n)
survived_groups_in_percentages1 = pd.Series(survived_groups_in_percentages0).fillna(0)
pearson_stat(index0, survived_groups_in_percentages1, 'Survived in group by fare in percentages')
# Passengers and survivors per sex. Each grouped aggregate is computed once,
# on the needed column only, and then reused.
number_by_sex = titanic_df.groupby('Sex')['PassengerId'].count()
number_by_sex_in_percentages = percentages_xy(number_by_sex, len(titanic_df))
survived_by_sex = titanic_df.groupby('Sex')['Survived'].sum()
# Percentages I: each sex's share of all survivors.
survived_by_sex_in_percentages1 = percentages_xy(survived_by_sex,
                                                 titanic_df['Survived'].sum())
# Percentages II: survivors as a share of the passengers of the same sex.
survived_by_sex_in_percentages2 = percentages_xy(survived_by_sex,
                                                 number_by_sex)
survived_by_sex_df = pd.DataFrame(data={
    'Number by sex': number_by_sex,
    'Number by sex in percentages': number_by_sex_in_percentages,
    'Survived by sex': survived_by_sex,
    'Survived by sex in percentages I': survived_by_sex_in_percentages1,
    'Survived by sex in percentages II': survived_by_sex_in_percentages2,
})
survived_by_sex_df
# Grouped bar chart: for each sex, the share of passengers and the two
# survival percentages side by side.
plt.rcParams['figure.figsize'] = (8, 4)
fig = plt.figure()
ax = fig.add_subplot(111)
## Data
Number_by_Sex_in_percentages = pd.Series(survived_by_sex_df['Number by sex in percentages'])
Survived_by_Sex_in_percentages1 = pd.Series(survived_by_sex_df['Survived by sex in percentages I'])
Survived_by_Sex_in_percentages2 = pd.Series(survived_by_sex_df['Survived by sex in percentages II'])
## Necessary variables
ind = np.array([1,2])  # the x locations of the two sex groups
width = 0.2            # the width of the bars
## Bars
rects1 = ax.bar(ind, Number_by_Sex_in_percentages, width, color='red', alpha = 0.7)
rects2 = ax.bar(ind+width, Survived_by_Sex_in_percentages1, width, color='green', alpha = 0.7)
rects3 = ax.bar(ind+2*width, Survived_by_Sex_in_percentages2, width, color='blue', alpha = 0.7)
# Axes and labels
ax.set_xlim(-width+1,len(ind)+width+0.6)
ax.set_ylim(0,90)
ax.set_ylabel('Values by sex in percentages')
ax.set_title('Number of passengers and Survived passengers by sex in percentages')
# Tick labels are taken from the table's row index, so each label always
# matches the bars drawn from the same row rather than relying on a
# hand-written order.
xTickMarks = list(survived_by_sex_df.index)
ax.set_xticks(ind+width)
xtickNames = ax.set_xticklabels(xTickMarks)
plt.setp(xtickNames, rotation=10, fontsize=30)
## Legend
ax.legend((rects1[0], rects2[0], rects3[0]),
          ('Number by sex in percentages', 'Survived by sex in percentages I', 'Survived by sex in percentages II') )
plt.show()
# Summary of the age column. The builtin max()/min() compare element by
# element, and every comparison with NaN is False, so their result depends on
# where missing values happen to sit. The Series methods skip missing values,
# and single-argument print calls produce the same text on Python 2 and 3.
age = pd.Series(titanic_df['Age'])
print('max = %s' % age.max())
print('min = %s' % age.min())
print('mean = %s' % age.mean())

# Number of passengers for every distinct age value.
number_age = titanic_df.groupby('Age')['PassengerId'].count()

# Histogram of ages.
plt.rcParams['figure.figsize'] = (8, 4)
titanic_df.Age.hist()
plt.xlabel("Age")
plt.ylabel("Number by age")
# Bucket passengers into five age categories and compare survival across them.
age_labels = ['child', 'teenager', 'young','middle-aged','old']
age_edges = [0,13,19,35,60,85]
age = pd.cut(titanic_df['Age'], age_edges, labels=age_labels)
df_age = pd.DataFrame(data={'Age category': age,'Survived': titanic_df['Survived']})

# One grouping, two aggregates: rows per category and summed 'Survived'.
by_age_category = df_age.groupby('Age category')
number_by_age = by_age_category.count()
survived_by_age = by_age_category.sum()
survived_by_age_in_percentages = percentages_xy(survived_by_age,number_by_age)

# Each aggregate is a one-column table, so .values.tolist() yields
# one-element rows; convert_list_to_int unwraps them below.
number_by_age1 = number_by_age.values.tolist()
survived_by_age1 = survived_by_age.values.tolist()
survived_by_age_in_percentages1 = survived_by_age_in_percentages.values.tolist()

df_survived_by_age = pd.DataFrame(
    data={
        'Number by age category': convert_list_to_int(number_by_age1),
        'Survived by age category': convert_list_to_int(survived_by_age1),
        'Survived by age category in percentages':
            convert_list_to_int(survived_by_age_in_percentages1),
    },
    index=age_labels)
df_survived_by_age

# Regression of the survival percentage on the category's position 1..5.
pearson_stat([1, 2, 3, 4, 5],
             convert_list_to_int(survived_by_age_in_percentages1),
             'Survived by age category in percentages')