{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Final Project Data Preparation Pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Classification Project"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"\n",
"from feature_engine import encoding as cat_encode\n",
"from feature_engine import imputation as miss_data_imput\n",
"from sklearn import metrics\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" survived | \n",
" pclass | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked | \n",
" class | \n",
" who | \n",
" adult_male | \n",
" deck | \n",
" embark_town | \n",
" alive | \n",
" alone | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 3 | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" 7.2500 | \n",
" S | \n",
" Third | \n",
" man | \n",
" True | \n",
" NaN | \n",
" Southampton | \n",
" no | \n",
" False | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 1 | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" 71.2833 | \n",
" C | \n",
" First | \n",
" woman | \n",
" False | \n",
" C | \n",
" Cherbourg | \n",
" yes | \n",
" False | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 3 | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.9250 | \n",
" S | \n",
" Third | \n",
" woman | \n",
" False | \n",
" NaN | \n",
" Southampton | \n",
" yes | \n",
" True | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 1 | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 53.1000 | \n",
" S | \n",
" First | \n",
" woman | \n",
" False | \n",
" C | \n",
" Southampton | \n",
" yes | \n",
" False | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 3 | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 8.0500 | \n",
" S | \n",
" Third | \n",
" man | \n",
" True | \n",
" NaN | \n",
" Southampton | \n",
" no | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" survived pclass sex age sibsp parch fare embarked class \\\n",
"0 0 3 male 22.0 1 0 7.2500 S Third \n",
"1 1 1 female 38.0 1 0 71.2833 C First \n",
"2 1 3 female 26.0 0 0 7.9250 S Third \n",
"3 1 1 female 35.0 1 0 53.1000 S First \n",
"4 0 3 male 35.0 0 0 8.0500 S Third \n",
"\n",
" who adult_male deck embark_town alive alone \n",
"0 man True NaN Southampton no False \n",
"1 woman False C Cherbourg yes False \n",
"2 woman False NaN Southampton yes True \n",
"3 woman False C Southampton yes False \n",
"4 man True NaN Southampton no True "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fetch seaborn's example Titanic dataset and preview its first five rows.\n",
"titanic_data = sns.load_dataset(name='titanic')\n",
"\n",
"titanic_data.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Columns kept for modelling: seven predictors followed by the 'survived' target.\n",
"cols = [\n",
"    'pclass',\n",
"    'sex',\n",
"    'age',\n",
"    'sibsp',\n",
"    'parch',\n",
"    'fare',\n",
"    'embarked',\n",
"    'survived',\n",
"]\n",
"\n",
"# Rebinds titanic_data to the reduced frame; every other column is discarded.\n",
"titanic_data = titanic_data.loc[:, cols]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked | \n",
" survived | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3 | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" 7.2500 | \n",
" S | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" 71.2833 | \n",
" C | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.9250 | \n",
" S | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 53.1000 | \n",
" S | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 3 | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 8.0500 | \n",
" S | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass sex age sibsp parch fare embarked survived\n",
"0 3 male 22.0 1 0 7.2500 S 0\n",
"1 1 female 38.0 1 0 71.2833 C 1\n",
"2 3 female 26.0 0 0 7.9250 S 1\n",
"3 1 female 35.0 1 0 53.1000 S 1\n",
"4 3 male 35.0 0 0 8.0500 S 0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the reduced Titanic frame.\n",
"titanic_data.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pclass int64\n",
"sex object\n",
"age float64\n",
"sibsp int64\n",
"parch int64\n",
"fare float64\n",
"embarked object\n",
"survived int64\n",
"dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column dtypes: shows which variables are numeric and which are object (string) typed.\n",
"titanic_data.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pclass 0.000000\n",
"sex 0.000000\n",
"age 0.198653\n",
"sibsp 0.000000\n",
"parch 0.000000\n",
"fare 0.000000\n",
"embarked 0.002245\n",
"survived 0.000000\n",
"dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fraction of missing values in each column (mean of the boolean NA mask).\n",
"titanic_data.isna().mean()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((712, 7), (179, 7))"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Separate predictors from the 'survived' target, then hold out 20% of the rows for testing.\n",
"titanic_features = titanic_data.drop(columns='survived')\n",
"titanic_target = titanic_data['survived']\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    titanic_features,\n",
"    titanic_target,\n",
"    test_size=0.2,\n",
"    random_state=42,  # fixed seed so the split is reproducible\n",
")\n",
"\n",
"X_train.shape, X_test.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Chain imputation, categorical encoding and the classifier into a single estimator.\n",
"titanic_steps = [\n",
"    # Fill missing values in the listed numeric columns with the sentinel -1.\n",
"    (\n",
"        'numerical_imputation',\n",
"        miss_data_imput.ArbitraryNumberImputer(\n",
"            arbitrary_number=-1,\n",
"            variables=['age', 'fare'],\n",
"        ),\n",
"    ),\n",
"    # Impute 'embarked' using CategoricalImputer's default settings.\n",
"    (\n",
"        'categorical_imputation',\n",
"        miss_data_imput.CategoricalImputer(variables=['embarked']),\n",
"    ),\n",
"    # Map category labels to integers; the 'ordered' method uses the target to\n",
"    # rank the categories (see the feature_engine OrdinalEncoder documentation).\n",
"    (\n",
"        'categorical_encoder',\n",
"        cat_encode.OrdinalEncoder(\n",
"            encoding_method='ordered',\n",
"            variables=['sex', 'embarked'],\n",
"        ),\n",
"    ),\n",
"    ('rf', RandomForestClassifier(random_state=0)),\n",
"]\n",
"\n",
"titanic_data_pipe = Pipeline(steps=titanic_steps)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Fit the whole pipeline on the training split, then predict for both splits.\n",
"titanic_data_pipe.fit(X_train, y_train)\n",
"\n",
"pred_X_train, pred_X_test = (\n",
"    titanic_data_pipe.predict(split) for split in (X_train, X_test)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[90 15]\n",
" [19 55]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.83 0.86 0.84 105\n",
" 1 0.79 0.74 0.76 74\n",
"\n",
" accuracy 0.81 179\n",
" macro avg 0.81 0.80 0.80 179\n",
"weighted avg 0.81 0.81 0.81 179\n",
"\n",
"0.8100558659217877\n"
]
}
],
"source": [
"# Held-out performance of the Titanic classifier: confusion matrix,\n",
"# per-class precision/recall/F1 report, and overall accuracy.\n",
"# The metric functions are imported in the notebook's top import cell.\n",
"print(confusion_matrix(y_test, pred_X_test))\n",
"print(classification_report(y_test, pred_X_test))\n",
"print(accuracy_score(y_test, pred_X_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Regression Project"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" carat | \n",
" cut | \n",
" color | \n",
" clarity | \n",
" depth | \n",
" table | \n",
" price | \n",
" x | \n",
" y | \n",
" z | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.23 | \n",
" Ideal | \n",
" E | \n",
" SI2 | \n",
" 61.5 | \n",
" 55.0 | \n",
" 326 | \n",
" 3.95 | \n",
" 3.98 | \n",
" 2.43 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0.21 | \n",
" Premium | \n",
" E | \n",
" SI1 | \n",
" 59.8 | \n",
" 61.0 | \n",
" 326 | \n",
" 3.89 | \n",
" 3.84 | \n",
" 2.31 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.23 | \n",
" Good | \n",
" E | \n",
" VS1 | \n",
" 56.9 | \n",
" 65.0 | \n",
" 327 | \n",
" 4.05 | \n",
" 4.07 | \n",
" 2.31 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0.29 | \n",
" Premium | \n",
" I | \n",
" VS2 | \n",
" 62.4 | \n",
" 58.0 | \n",
" 334 | \n",
" 4.20 | \n",
" 4.23 | \n",
" 2.63 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0.31 | \n",
" Good | \n",
" J | \n",
" SI2 | \n",
" 63.3 | \n",
" 58.0 | \n",
" 335 | \n",
" 4.34 | \n",
" 4.35 | \n",
" 2.75 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
"1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n",
"2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n",
"3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n",
"4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fetch seaborn's example diamonds dataset and preview its first five rows.\n",
"diamond_data = sns.load_dataset(name='diamonds')\n",
"\n",
"diamond_data.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" carat | \n",
" depth | \n",
" table | \n",
" price | \n",
" x | \n",
" y | \n",
" z | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 53940.000000 | \n",
" 53940.000000 | \n",
" 53940.000000 | \n",
" 53940.000000 | \n",
" 53940.000000 | \n",
" 53940.000000 | \n",
" 53940.000000 | \n",
"
\n",
" \n",
" | mean | \n",
" 0.797940 | \n",
" 61.749405 | \n",
" 57.457184 | \n",
" 3932.799722 | \n",
" 5.731157 | \n",
" 5.734526 | \n",
" 3.538734 | \n",
"
\n",
" \n",
" | std | \n",
" 0.474011 | \n",
" 1.432621 | \n",
" 2.234491 | \n",
" 3989.439738 | \n",
" 1.121761 | \n",
" 1.142135 | \n",
" 0.705699 | \n",
"
\n",
" \n",
" | min | \n",
" 0.200000 | \n",
" 43.000000 | \n",
" 43.000000 | \n",
" 326.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 25% | \n",
" 0.400000 | \n",
" 61.000000 | \n",
" 56.000000 | \n",
" 950.000000 | \n",
" 4.710000 | \n",
" 4.720000 | \n",
" 2.910000 | \n",
"
\n",
" \n",
" | 50% | \n",
" 0.700000 | \n",
" 61.800000 | \n",
" 57.000000 | \n",
" 2401.000000 | \n",
" 5.700000 | \n",
" 5.710000 | \n",
" 3.530000 | \n",
"
\n",
" \n",
" | 75% | \n",
" 1.040000 | \n",
" 62.500000 | \n",
" 59.000000 | \n",
" 5324.250000 | \n",
" 6.540000 | \n",
" 6.540000 | \n",
" 4.040000 | \n",
"
\n",
" \n",
" | max | \n",
" 5.010000 | \n",
" 79.000000 | \n",
" 95.000000 | \n",
" 18823.000000 | \n",
" 10.740000 | \n",
" 58.900000 | \n",
" 31.800000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" carat depth table price x \\\n",
"count 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 \n",
"mean 0.797940 61.749405 57.457184 3932.799722 5.731157 \n",
"std 0.474011 1.432621 2.234491 3989.439738 1.121761 \n",
"min 0.200000 43.000000 43.000000 326.000000 0.000000 \n",
"25% 0.400000 61.000000 56.000000 950.000000 4.710000 \n",
"50% 0.700000 61.800000 57.000000 2401.000000 5.700000 \n",
"75% 1.040000 62.500000 59.000000 5324.250000 6.540000 \n",
"max 5.010000 79.000000 95.000000 18823.000000 10.740000 \n",
"\n",
" y z \n",
"count 53940.000000 53940.000000 \n",
"mean 5.734526 3.538734 \n",
"std 1.142135 0.705699 \n",
"min 0.000000 0.000000 \n",
"25% 4.720000 2.910000 \n",
"50% 5.710000 3.530000 \n",
"75% 6.540000 4.040000 \n",
"max 58.900000 31.800000 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.\n",
"diamond_data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"carat float64\n",
"cut category\n",
"color category\n",
"clarity category\n",
"depth float64\n",
"table float64\n",
"price int64\n",
"x float64\n",
"y float64\n",
"z float64\n",
"dtype: object"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column dtypes: shows which variables are numeric and which are categorical.\n",
"diamond_data.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"carat 0.0\n",
"cut 0.0\n",
"color 0.0\n",
"clarity 0.0\n",
"depth 0.0\n",
"table 0.0\n",
"price 0.0\n",
"x 0.0\n",
"y 0.0\n",
"z 0.0\n",
"dtype: float64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fraction of missing values in each column (mean of the boolean NA mask).\n",
"diamond_data.isna().mean()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((43152, 9), (10788, 9))"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Separate predictors from the 'price' target, then hold out 20% of the rows for testing.\n",
"# NOTE: X_train / X_test / y_train / y_test are the same names used for the Titanic\n",
"# split, so from this cell onward they refer to the diamonds data.\n",
"diamond_features = diamond_data.drop(columns='price')\n",
"diamond_target = diamond_data['price']\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    diamond_features,\n",
"    diamond_target,\n",
"    test_size=0.2,\n",
"    random_state=42,  # fixed seed so the split is reproducible\n",
")\n",
"\n",
"X_train.shape, X_test.shape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Encode the three categorical columns as integers, then pass everything to a\n",
"# random forest regressor; both steps live in one pipeline estimator.\n",
"diamond_encoder = cat_encode.OrdinalEncoder(\n",
"    encoding_method='ordered',\n",
"    variables=['cut', 'color', 'clarity'],\n",
")\n",
"diamond_forest = RandomForestRegressor(random_state=42)\n",
"\n",
"diamond_data_pipe = Pipeline(\n",
"    steps=[\n",
"        ('categorical_encoder', diamond_encoder),\n",
"        ('rf', diamond_forest),\n",
"    ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Fit the whole pipeline on the training split, then predict for both splits.\n",
"# pred_X_train / pred_X_test are rebound here to the diamond price predictions.\n",
"diamond_data_pipe.fit(X_train, y_train)\n",
"\n",
"pred_X_train, pred_X_test = (\n",
"    diamond_data_pipe.predict(split) for split in (X_train, X_test)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean Absolute Error: 271.5354506226789\n",
"Mean Squared Error: 308115.6650668038\n",
"Root Mean Squared Error: 555.0816742307422\n"
]
}
],
"source": [
"# Error metrics for the diamond price model on the held-out split.\n",
"# `metrics` is imported in the notebook's top import cell.\n",
"mae = metrics.mean_absolute_error(y_test, pred_X_test)\n",
"mse = metrics.mean_squared_error(y_test, pred_X_test)  # computed once, reused for RMSE\n",
"\n",
"print('Mean Absolute Error:', mae)\n",
"print('Mean Squared Error:', mse)\n",
"print('Root Mean Squared Error:', np.sqrt(mse))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}