{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Final Project Data Preparation Pipeline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Classification Project" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "\n", "from feature_engine import encoding as cat_encode\n", "from feature_engine import imputation as miss_data_imput\n", "from sklearn import metrics\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import Pipeline\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealone
003male22.0107.2500SThirdmanTrueNaNSouthamptonnoFalse
111female38.01071.2833CFirstwomanFalseCCherbourgyesFalse
213female26.0007.9250SThirdwomanFalseNaNSouthamptonyesTrue
311female35.01053.1000SFirstwomanFalseCSouthamptonyesFalse
403male35.0008.0500SThirdmanTrueNaNSouthamptonnoTrue
\n", "
" ], "text/plain": [ " survived pclass sex age sibsp parch fare embarked class \\\n", "0 0 3 male 22.0 1 0 7.2500 S Third \n", "1 1 1 female 38.0 1 0 71.2833 C First \n", "2 1 3 female 26.0 0 0 7.9250 S Third \n", "3 1 1 female 35.0 1 0 53.1000 S First \n", "4 0 3 male 35.0 0 0 8.0500 S Third \n", "\n", " who adult_male deck embark_town alive alone \n", "0 man True NaN Southampton no False \n", "1 woman False C Cherbourg yes False \n", "2 woman False NaN Southampton yes True \n", "3 woman False C Southampton yes False \n", "4 man True NaN Southampton no True " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load seaborn's 'titanic' example dataset and preview its first rows.\n", "titanic_data = sns.load_dataset('titanic')\n", "titanic_data.head(n=5)\n" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Restrict the frame to the model inputs plus the 'survived' target column.\n", "cols = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'survived']\n", "titanic_data = titanic_data.loc[:, cols]\n" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssexagesibspparchfareembarkedsurvived
03male22.0107.2500S0
11female38.01071.2833C1
23female26.0007.9250S1
31female35.01053.1000S1
43male35.0008.0500S0
\n", "
" ], "text/plain": [ " pclass sex age sibsp parch fare embarked survived\n", "0 3 male 22.0 1 0 7.2500 S 0\n", "1 1 female 38.0 1 0 71.2833 C 1\n", "2 3 female 26.0 0 0 7.9250 S 1\n", "3 1 female 35.0 1 0 53.1000 S 1\n", "4 3 male 35.0 0 0 8.0500 S 0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic_data.head()" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pclass int64\n", "sex object\n", "age float64\n", "sibsp int64\n", "parch int64\n", "fare float64\n", "embarked object\n", "survived int64\n", "dtype: object" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic_data.dtypes" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pclass 0.000000\n", "sex 0.000000\n", "age 0.198653\n", "sibsp 0.000000\n", "parch 0.000000\n", "fare 0.000000\n", "embarked 0.002245\n", "survived 0.000000\n", "dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic_data.isnull().mean()" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((712, 7), (179, 7))" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.\n", "# The split gets titanic-specific names so the regression section below cannot\n", "# overwrite it and silently change what the classification cells evaluate.\n", "X_train_titanic, X_test_titanic, y_train_titanic, y_test_titanic = train_test_split(\n", "    titanic_data.drop('survived', axis=1),\n", "    titanic_data['survived'],\n", "    test_size=0.2,\n", "    random_state=42)\n", "\n", "X_train_titanic.shape, X_test_titanic.shape\n" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Preprocessing and model live in one Pipeline, so every step is fitted on the\n", "# training split only when .fit() is called.\n", "titanic_data_pipe = Pipeline([\n", "    # Replace missing 'age'/'fare' values with the sentinel -1.\n", "    ('numerical_imputation',\n", "     miss_data_imput.ArbitraryNumberImputer(arbitrary_number=-1, variables=['age', 'fare'])),\n", "    # Fill missing 'embarked' values using the imputer's default strategy.\n", "    ('categorical_imputation',\n", "     miss_data_imput.CategoricalImputer(variables=['embarked'])),\n", "    # 'ordered' numbers the categories by their mean target value (see feature_engine docs).\n", "    ('categorical_encoder',\n", "     cat_encode.OrdinalEncoder(encoding_method='ordered', variables=['sex', 'embarked'])),\n", "    ('rf', RandomForestClassifier(random_state=0))\n", "])\n" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "titanic_data_pipe.fit(X_train_titanic, y_train_titanic)\n", "\n", "# Only the held-out predictions are evaluated below.\n", "pred_test_titanic = titanic_data_pipe.predict(X_test_titanic)\n" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[90 15]\n", " [19 55]]\n", " precision recall f1-score support\n", "\n", " 0 0.83 0.86 0.84 105\n", " 1 0.79 0.74 0.76 74\n", "\n", " accuracy 0.81 179\n", " macro avg 0.81 0.80 0.80 179\n", "weighted avg 0.81 0.81 0.81 179\n", "\n", "0.8100558659217877\n" ] } ], "source": [ "# Metric functions are imported in the notebook's top import cell.\n", "print(confusion_matrix(y_test_titanic, pred_test_titanic))\n", "print(classification_report(y_test_titanic, pred_test_titanic))\n", "print(accuracy_score(y_test_titanic, pred_test_titanic))\n" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Regression Project" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
20.23GoodEVS156.965.03274.054.072.31
30.29PremiumIVS262.458.03344.204.232.63
40.31GoodJSI263.358.03354.344.352.75
\n", "
" ], "text/plain": [ " carat cut color clarity depth table price x y z\n", "0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n", "1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n", "2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n", "3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n", "4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load seaborn's 'diamonds' example dataset and preview its first rows.\n", "diamond_data = sns.load_dataset('diamonds')\n", "diamond_data.head(n=5)\n" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
caratdepthtablepricexyz
count53940.00000053940.00000053940.00000053940.00000053940.00000053940.00000053940.000000
mean0.79794061.74940557.4571843932.7997225.7311575.7345263.538734
std0.4740111.4326212.2344913989.4397381.1217611.1421350.705699
min0.20000043.00000043.000000326.0000000.0000000.0000000.000000
25%0.40000061.00000056.000000950.0000004.7100004.7200002.910000
50%0.70000061.80000057.0000002401.0000005.7000005.7100003.530000
75%1.04000062.50000059.0000005324.2500006.5400006.5400004.040000
max5.01000079.00000095.00000018823.00000010.74000058.90000031.800000
\n", "
" ], "text/plain": [ " carat depth table price x \\\n", "count 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 \n", "mean 0.797940 61.749405 57.457184 3932.799722 5.731157 \n", "std 0.474011 1.432621 2.234491 3989.439738 1.121761 \n", "min 0.200000 43.000000 43.000000 326.000000 0.000000 \n", "25% 0.400000 61.000000 56.000000 950.000000 4.710000 \n", "50% 0.700000 61.800000 57.000000 2401.000000 5.700000 \n", "75% 1.040000 62.500000 59.000000 5324.250000 6.540000 \n", "max 5.010000 79.000000 95.000000 18823.000000 10.740000 \n", "\n", " y z \n", "count 53940.000000 53940.000000 \n", "mean 5.734526 3.538734 \n", "std 1.142135 0.705699 \n", "min 0.000000 0.000000 \n", "25% 4.720000 2.910000 \n", "50% 5.710000 3.530000 \n", "75% 6.540000 4.040000 \n", "max 58.900000 31.800000 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diamond_data.describe()" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "carat float64\n", "cut category\n", "color category\n", "clarity category\n", "depth float64\n", "table float64\n", "price int64\n", "x float64\n", "y float64\n", "z float64\n", "dtype: object" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diamond_data.dtypes" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "carat 0.0\n", "cut 0.0\n", "color 0.0\n", "clarity 0.0\n", "depth 0.0\n", "table 0.0\n", "price 0.0\n", "x 0.0\n", "y 0.0\n", "z 0.0\n", "dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diamond_data.isnull().mean()" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((43152, 9), (10788, 9))" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.\n", "# Diamond-specific names keep this split separate from the classification section's split.\n", "X_train_diamond, X_test_diamond, y_train_diamond, y_test_diamond = train_test_split(\n", "    diamond_data.drop('price', axis=1),\n", "    diamond_data['price'],\n", "    test_size=0.2,\n", "    random_state=42)\n", "\n", "X_train_diamond.shape, X_test_diamond.shape\n" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Encode the three categorical columns, then fit a random forest on the result.\n", "diamond_data_pipe = Pipeline([\n", "    # 'ordered' numbers the categories by their mean target value (see feature_engine docs).\n", "    ('categorical_encoder',\n", "     cat_encode.OrdinalEncoder(encoding_method='ordered',\n", "                               variables=['cut', 'color', 'clarity'])),\n", "    ('rf', RandomForestRegressor(random_state=42))\n", "])\n" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "diamond_data_pipe.fit(X_train_diamond, y_train_diamond)\n", "\n", "# Only the held-out predictions are evaluated below.\n", "pred_test_diamond = diamond_data_pipe.predict(X_test_diamond)\n" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean Absolute Error: 271.5354506226789\n", "Mean Squared Error: 308115.6650668038\n", "Root Mean Squared Error: 555.0816742307422\n" ] } ], "source": [ "# `metrics` is imported in the notebook's top import cell.\n", "# Compute the MSE once and reuse it for the RMSE.\n", "diamond_mse = metrics.mean_squared_error(y_test_diamond, pred_test_diamond)\n", "\n", "print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_diamond, pred_test_diamond))\n", "print('Mean Squared Error:', diamond_mse)\n", "print('Root Mean Squared Error:', np.sqrt(diamond_mse))\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 }