The RMS Titanic was a British passenger liner operated by the White Star Line that sank in the North Atlantic Ocean in the early morning of 15 April 1912, after striking an iceberg during her maiden voyage from Southampton to New York City. Of the estimated 2,224 passengers and crew aboard, more than 1,500 died, making the sinking one of the deadliest peacetime commercial maritime disasters in modern history.
Exploratory analysis
import numpy as np
import pandas as pd
df = pd.read_csv('train.csv')  # Kaggle Titanic training set
df.describe()
# Drop identifier columns that carry no predictive signal
del df['PassengerId']
del df['Name']
df.isnull().sum()
del df['Cabin']  # Cabin is mostly missing, so drop the column entirely
df['Embarked'].value_counts()
The most frequent value for Embarked is 'S', so we use it to fill in the missing values.
df.loc[df['Embarked'].isnull(), 'Embarked'] = 'S'
mean_age_train = df['Age'].mean()  # mean() skips NaN by default
df.loc[df['Age'].isnull(), 'Age'] = mean_age_train
mean_fare_train = df['Fare'].mean()  # Fare has no nulls in train.csv; the mean is kept in case the test set needs it
We've gotten rid of the null values; now let's see what comes next.
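A quick sanity check (a minimal sketch, assuming the steps above ran in order) confirms that no nulls remain:
# Every remaining column should report zero missing values
assert df.isnull().sum().sum() == 0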
Ordinal encoding
df_bkp = df.copy()  # keep a backup before encoding, for the feature-engineering section below
from sklearn.preprocessing import OrdinalEncoder
df['Sex'] = OrdinalEncoder().fit_transform(df['Sex'].values.reshape((-1, 1)))
df['Ticket'] = OrdinalEncoder().fit_transform(df['Ticket'].values.reshape((-1, 1)))
df['Embarked'] = OrdinalEncoder().fit_transform(df['Embarked'].values.reshape((-1, 1)))
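Note that OrdinalEncoder assigns integer codes in sorted category order, which imposes an arbitrary ordering on nominal columns like Ticket. A small sketch (a throwaway encoder, not part of the pipeline) shows how to inspect the mapping:
enc = OrdinalEncoder()
enc.fit(df_bkp['Embarked'].values.reshape((-1, 1)))
# categories_[0] lists the original labels; their position is the assigned code
print(enc.categories_[0])  # ['C' 'Q' 'S'] -> codes 0.0, 1.0, 2.0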
Visualizations
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
def show(txt):
    # This function is for printing Markdown in a Jupyter notebook
    display(Markdown(txt))
for i in range(1, 9):
    show(f'### {df.columns[i]}')
    f, (survived, not_survived) = plt.subplots(1, 2, sharey=True, figsize=(18, 8))
    survived.hist(df.iloc[np.where(df['Survived'] == 1)[0], i])
    survived.set_title('Survived')
    not_survived.hist(df.iloc[np.where(df['Survived'] == 0)[0], i])
    not_survived.set_title('Not Survived')
    plt.show()
Running machine learning on this data format
- Logistic regression
- Support vector machines
- Decision tree
- K-nearest neighbors
- Multi-layer perceptron
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
X = df.iloc[:, 1:].values  # all feature columns
y = df.iloc[:, 0].values   # Survived, the target
# Logistic Regression
lr = LogisticRegression()
lr_score = np.mean(cross_val_score(lr, X, y))
print(f'Logistic Regression: {lr_score}')
# Support Vector Machine
svc = SVC()
svc_score = np.mean(cross_val_score(svc, X, y))
print(f'Support Vector Machine: {svc_score}')
# Decision Tree
dtc = DecisionTreeClassifier()
dtc_score = np.mean(cross_val_score(dtc, X, y))
print(f'Decision Tree: {dtc_score}')
# K Nearest Neighbors
knc = KNeighborsClassifier()
knc_score = np.mean(cross_val_score(knc, X, y))
print(f'K Nearest Neighbors: {knc_score}')
# Multi-Layer Perceptron
mlpc = MLPClassifier()
mlpc_score = np.mean(cross_val_score(mlpc, X, y))
print(f'Multi-Layer Perceptron: {mlpc_score}')
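The five blocks above repeat the same fit-and-score pattern; an equivalent, more compact loop (a sketch using the same default hyperparameters) would be:
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'K Nearest Neighbors': KNeighborsClassifier(),
    'Multi-Layer Perceptron': MLPClassifier(),
}
for name, model in models.items():
    # cross_val_score uses 5-fold CV (in recent scikit-learn) and accuracy for classifiers
    print(f'{name}: {np.mean(cross_val_score(model, X, y))}')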
Feature engineering
df = df_bkp.copy()  # restore the backup taken before ordinal encoding
One-hot encoding
from sklearn.preprocessing import OneHotEncoder
# Pclass
# Note: scikit-learn >= 1.2 renames sparse=False to sparse_output=False
pclass_transf = OneHotEncoder(sparse=False, dtype=np.uint8, handle_unknown='ignore')
pclass_transf.fit(df['Pclass'].values.reshape((-1, 1)))
pclass = pclass_transf.transform(df['Pclass'].values.reshape((-1, 1)))
df['Pclass0'] = pclass[:, 0]
df['Pclass1'] = pclass[:, 1]
df['Pclass2'] = pclass[:, 2]
del df['Pclass']
# Sex
gender_transf = OneHotEncoder(sparse=False, dtype=np.uint8, handle_unknown='ignore')
gender_transf.fit(df['Sex'].values.reshape((-1, 1)))
gender = gender_transf.transform(df['Sex'].values.reshape((-1, 1)))
df['Male'] = gender[:, 0]
df['Female'] = gender[:, 1]
del df['Sex']
# Ticket
ticket_transf = OneHotEncoder(sparse=False, dtype=np.uint8, handle_unknown='ignore')
ticket_transf.fit(df['Ticket'].values.reshape((-1, 1)))
ticket = ticket_transf.transform(df['Ticket'].values.reshape((-1, 1)))
for i in range(ticket.shape[1]):
df[f'Ticket{i}'] = ticket[:, i]
del df['Ticket']
# Embarked
embarked_transf = OneHotEncoder(sparse=False, dtype=np.uint8, handle_unknown='ignore')
embarked_transf.fit(df['Embarked'].values.reshape((-1, 1)))
embarked = embarked_transf.transform(df['Embarked'].values.reshape((-1, 1)))
for i in range(embarked.shape[1]):
df[f'Embarked{i}'] = embarked[:, i]
del df['Embarked']
Scaling to the [0, 1] range
from sklearn.preprocessing import MinMaxScaler
age_transf = MinMaxScaler().fit(df['Age'].values.reshape(-1, 1))
df['Age'] = age_transf.transform(df['Age'].values.reshape(-1, 1))
sibsp_transf = MinMaxScaler().fit(df['SibSp'].values.reshape(-1, 1))
df['SibSp'] = sibsp_transf.transform(df['SibSp'].values.reshape(-1, 1))
parch_transf = MinMaxScaler().fit(df['Parch'].values.reshape(-1, 1))
df['Parch'] = parch_transf.transform(df['Parch'].values.reshape(-1, 1))
fare_transf = MinMaxScaler().fit(df['Fare'].values.reshape(-1, 1))
df['Fare'] = fare_transf.transform(df['Fare'].values.reshape(-1, 1))
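As an aside, the per-column encoders and scalers above can be bundled into a single transformer. A minimal sketch using scikit-learn's ColumnTransformer, applied to the backup frame, so it is an alternative to the manual steps rather than part of them:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# One-hot the categoricals and scale the numeric columns to [0, 1] in one pass
preprocess = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Pclass', 'Sex', 'Ticket', 'Embarked']),
    ('scale', MinMaxScaler(), ['Age', 'SibSp', 'Parch', 'Fare']),
])
X_alt = preprocess.fit_transform(df_bkp.drop(columns=['Survived']))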
Running machine learning on this new data format
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values
# Logistic Regression
lr = LogisticRegression()
lr_score = np.mean(cross_val_score(lr, X, y))
print(f'Logistic Regression: {lr_score}')
# Support Vector Machine
svc = SVC()
svc_score = np.mean(cross_val_score(svc, X, y))
print(f'Support Vector Machine: {svc_score}')
# Decision Tree
dtc = DecisionTreeClassifier()
dtc_score = np.mean(cross_val_score(dtc, X, y))
print(f'Decision Tree: {dtc_score}')
# K Nearest Neighbors
knc = KNeighborsClassifier()
knc_score = np.mean(cross_val_score(knc, X, y))
print(f'K Nearest Neighbors: {knc_score}')
# Multi-Layer Perceptron
mlpc = MLPClassifier()
mlpc_score = np.mean(cross_val_score(mlpc, X, y))
print(f'Multi-Layer Perceptron: {mlpc_score}')
Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
dtc = DecisionTreeClassifier()
params = {
'max_depth': list(range(2, 151)),
'min_samples_split': list(range(2, 15))
}
clf = GridSearchCV(dtc, params)
clf.fit(X, y)
print(f'Best params: {clf.best_params_}')
print(f'Best score: {clf.best_score_}')
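To double-check the tuned tree outside the grid search (a quick sketch), the winning parameters can be plugged back into cross_val_score:
# Rebuild the tree with the best parameters found and cross-validate it
best_dtc = DecisionTreeClassifier(**clf.best_params_)
print(f'Tuned Decision Tree: {np.mean(cross_val_score(best_dtc, X, y))}')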