This notebook is the work of Tomás Ertola, who finished in second place in the data science competition "Predicción De Precios De Apartamentos En Argentina y Colombia" (Apartment Price Prediction in Argentina and Colombia). The competitor published a GitHub repository, linked below, for further study and detail.
The goal of this notebook is to serve the data science community and to show the process the competitor followed to achieve this excellent result.
Publication of this notebook has been authorized by its author.
If you have any questions, please leave them in the comments and we will forward them to the author.
%%html
<a href="https://www.datasource.ai/es/home/data-science-competitions-for-startups/prediccion-de-precios-de-apartamentos-en-argentina-y-colombia" target="blank">Competición en Data Science</a>
%%html
<a href="https://www.datasource.ai/es/data-science-articles/entrevista-a-los-ganadores-de-la-competencia-de-data-science-prediccion-de-precios-de-apartamentos-en-argentina-y-colombia" target="blank">Entrevista Tomás Ertola y a otros de los ganadores</a>
%%html
<a href="https://github.com/rubzk/datasourceai-pice-competition" target="blank">Repositorio de la Competición</a>
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
train.shape
test.shape
X = train
X.head()
A quick look at the data shows that the features operation_type, property_type and currency add no useful information, so they can be dropped.
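As a quick sanity check, here is an illustrative sketch (it assumes these columns exist in train under exactly these names): columns with a single distinct value carry no signal and can safely be removed.
# Check how many distinct values each suspect column has
for col in ['operation_type', 'property_type', 'currency']:
    print(col, train[col].nunique(), train[col].unique()[:3])
# If they turn out to be constant (or nearly so), they can be dropped:
# train = train.drop(columns=['operation_type', 'property_type', 'currency'])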
num = train.select_dtypes(exclude='object')
num_corr = num.corr()
#Plot the correlations
f, ax = plt.subplots(figsize=(17,1))
sns.heatmap(num_corr.sort_values(by=['price'], ascending=False).head(1), cmap='Reds')
plt.title('Numerical Features Corr with price', weight='bold', fontsize=18)
plt.xticks(weight='bold')
plt.yticks(weight='bold', color='red', rotation=0)
plt.show()
It seems pretty clear that price is most strongly correlated with the total surface of the property, followed by the number of rooms, bedrooms and bathrooms.
num_corr = num_corr['price'].sort_values(ascending=False).head(5).to_frame()
num_corr
Let's dive into the categorical features
sns.set(style="darkgrid")
sns.countplot(x='pais', data=X)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
columns = ['Argentina','Colombia']
for column,axes in zip(columns, ax.flatten()):
plt.setp(axes.get_xticklabels(), rotation=90)
sns.countplot(x='provincia_departamento', data=X[X.pais == column],ax=axes)
plt.show()
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
columns = ['Argentina','Colombia']
for column,axes in zip(columns, ax.flatten()):
axes.title.set_text('Distplot Total Surface {}'.format(column))
sns.distplot(X[X.pais == column]['surface_total'],ax=axes)
plt.show()
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15,5))
columns = ['price','rooms', 'surface_total']
for column,axes in zip(columns, ax.flatten()):
sns.distplot(train[column],ax=axes)
plt.show()
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15,5))
for column,axes in zip(columns, ax.flatten()):
sns.boxplot(train[column],ax=axes)
plt.show()
Okay, so not only are these features positively skewed (long right tails), they also contain a lot of outliers
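To put a number on that, here is a minimal sketch using scipy's skew (the same function the notebook imports later for the transformation step):
from scipy.stats import skew
# Skewness per feature: values well above 0 indicate a long right tail
train[['price', 'rooms', 'surface_total']].apply(lambda x: skew(x.dropna()))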
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
columns = ['Argentina','Colombia']
color = ['red']
for column,axes in zip(columns, ax.flatten()):
axes.title.set_text('Price vs Surface Total {}'.format(column))
sns.scatterplot(y=train[train.pais == column]['surface_total'],x=train[train.pais == column]['price'],ax=axes, alpha=0.5)
plt.show()
Argentina seems to be the country with the most outliers
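A rough way to back that impression up is to count, per country, how many surface_total values fall outside the standard 1.5 * IQR boxplot whiskers. This is only an illustrative check using the conventional threshold, not something tuned by the author:
# Count boxplot-style outliers in surface_total for each country
for country in ['Argentina', 'Colombia']:
    s = train.loc[train.pais == country, 'surface_total'].dropna()
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    n_out = ((s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)).sum()
    print(country, n_out, 'outliers out of', len(s))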
from sklearn.linear_model import LinearRegression
le = LinearRegression()
le.fit(train['surface_total'].values.reshape(-1,1), train['price'].values)
pred = le.predict(train['surface_total'].values.reshape(-1,1))
plt.figure(figsize=(10,5))
sns.scatterplot(train['price'], train['surface_total'], alpha=0.5)
plt.plot(pred, train['surface_total'],color='black', linewidth=2)
plt.show()
y = X['price']
X = pd.concat([X.loc[:, X.columns != 'price'], test])
numeric_features = ['surface_total', 'rooms', 'bedrooms', 'bathrooms']
cat_features = ['pais','provincia_departamento', 'ciudad']
features = cat_features + numeric_features
from scipy.stats import skew
skewed_feats = X[numeric_features].apply(lambda x: skew(x))
#skewed_feats = skewed_feats[skewed_feats > 0.75]
#skewed_feats
skewed_feats = skewed_feats.index
X[skewed_feats] = np.log1p(X[skewed_feats])
y = np.log1p(y)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_features= ['pais', 'ciudad', 'provincia_departamento']
X['pais'] = le.fit_transform(X['pais'])
X['ciudad'] = le.fit_transform(X['ciudad'])
X['provincia_departamento'] = le.fit_transform(X['provincia_departamento'])
final_features = pd.get_dummies(X[cat_features + numeric_features]).reset_index(drop=True)
final_features
test = final_features[len(y):]
X_train = final_features[:len(y)]
Regularized linear regression
from sklearn.linear_model import Ridge, RidgeCV, ElasticNetCV, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
def rmse_cv(model):
rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring='neg_mean_squared_error', cv=5))
return rmse
m_ridge = make_pipeline(RobustScaler(),Ridge())
alphas_ridge = [0.05,0.1,0.3,1,3,5,10]
cv_ridge = [rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas_ridge]
cv_ridge = pd.Series(cv_ridge, index=alphas_ridge)
cv_ridge.plot(title='Validation')
plt.xlabel('alpha')
plt.ylabel('rmse')
from sklearn.model_selection import KFold
kfolds = KFold(n_splits=10,shuffle=True)
def cv_rmse(model, X_train):
rmse = np.sqrt(-cross_val_score(model, X_train, y,
scoring="neg_mean_squared_error",
cv=kfolds))
return (rmse)
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=int(1e7), alphas=[5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]))
score = cv_rmse(lasso, X_train)
print('Lasso Score {:.4f} ({:.4f})'.format(score.mean(),score.std()))
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=[14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]))
score = cv_rmse(ridge, X_train)
print('Ridge Score {:.4f} ({:.4f})'.format(score.mean(),score.std()))
elastic = make_pipeline(RobustScaler(), ElasticNetCV(alphas=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007],
l1_ratio=[0.8, 0.85, 0.9, 0.95, 0.99, 1]))
score = cv_rmse(elastic, X_train)
print('Elastic Score {:.4f} ({:.4f})'.format(score.mean(),score.std()))
from sklearn.svm import SVR
svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))
score = cv_rmse(svr, X_train)
print('SVR Score {:.4f} ({:.4f})'.format(score.mean(),score.std()))
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, loss='huber', random_state=42)
score = cv_rmse(gbr, X_train)
print('GBR Score {:.4f} ({:.4f})'.format(score.mean(),score.std()))
from lightgbm import LGBMRegressor
lgb = LGBMRegressor(objective='regression', learning_rate=0.08, n_estimators=4000, num_leaves=32,max_depth=7, reg_alpha=0.0001)
score = cv_rmse(lgb, X_train)
print('LGBM Score {:.4f} ({:.4f})'.format(score.mean(),score.std()))
from catboost import CatBoostRegressor
catboost = CatBoostRegressor(learning_rate=0.05, n_estimators=10000, max_depth=5, verbose=False, random_seed=42,
max_leaves=32, min_data_in_leaf=1)
score = cv_rmse(catboost, X_train)
print('Catboost Score {:.4f} ({:.4f})'.format(score.mean(),score.std()))
from xgboost import XGBRegressor
xgb = XGBRegressor(learning_rate=0.08, n_estimators=3000, max_depth=7, reg_alpha=0.0001, objective='reg:squarederror', max_leaves=10)
score = cv_rmse(xgb, X_train)
print('XGB Score {:.4f} ({:.4f})'.format(score.mean(),score.std()))
from sklearn.ensemble import RandomForestRegressor
rnd = RandomForestRegressor(n_estimators=4000, criterion='mse', max_depth=7, random_state=42, n_jobs=4, min_samples_leaf=10)
score = cv_rmse(rnd, X_train)
print('Random_forest Score {:.4f} ({:.4f})'.format(score.mean(),score.std()))
from sklearn.ensemble import StackingRegressor
est = [('lasso', lasso), ('ridge', ridge), ('elastic', elastic), ('randomforest', rnd), ('gradient', gbr), ('xgboost', xgb), ('lightgbm', lgb), ('cat', catboost)]
stack_gen = StackingRegressor(estimators=est,
final_estimator=gbr,
n_jobs=4)
stack_gen.fit(X_train, y)
lasso.fit(X_train,y)
ridge.fit(X_train,y)
elastic.fit(X_train,y)
svr.fit(X_train,y)
gbr.fit(X_train,y)
xgb.fit(X_train,y)
lgb.fit(X_train,y)
rnd.fit(X_train,y)
catboost.fit(X_train, y)
def blend_models_prediction(X):
return ((0.4 * gbr.predict(X)) + \
(0.15 * xgb.predict(X)) + \
(0.3 * catboost.predict(X)) + \
(0.15 * stack_gen.predict(X)))
from sklearn.metrics import mean_squared_error
def rmsle(y, y_pred):
return np.sqrt(mean_squared_error(y, y_pred))
score = rmsle(y,blend_models_prediction(X_train))
print('RMSLE score on train data: {}'.format(score))
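Because the target y was transformed with np.log1p earlier, the plain RMSE computed by rmsle above is in fact the RMSLE on the original prices. A tiny sanity check on made-up numbers (the values are purely illustrative):
# RMSE on log1p-transformed values equals RMSLE on the raw values
raw_true = np.array([100000.0, 250000.0])
raw_pred = np.array([120000.0, 240000.0])
print(np.sqrt(mean_squared_error(np.log1p(raw_true), np.log1p(raw_pred))))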
prediction = blend_models_prediction(test)
submission = pd.DataFrame({'id': test.index,
'price':np.expm1(prediction)})
submission.to_csv('blend.csv', index=False)