{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Final Project Data Preparation Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Classification Project"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "from feature_engine import imputation as miss_data_imput\n",
    "from feature_engine import encoding as cat_encode\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>survived</th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>embarked</th>\n",
       "      <th>class</th>\n",
       "      <th>who</th>\n",
       "      <th>adult_male</th>\n",
       "      <th>deck</th>\n",
       "      <th>embark_town</th>\n",
       "      <th>alive</th>\n",
       "      <th>alone</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "      <td>Third</td>\n",
       "      <td>man</td>\n",
       "      <td>True</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>no</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "      <td>First</td>\n",
       "      <td>woman</td>\n",
       "      <td>False</td>\n",
       "      <td>C</td>\n",
       "      <td>Cherbourg</td>\n",
       "      <td>yes</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "      <td>Third</td>\n",
       "      <td>woman</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>yes</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "      <td>First</td>\n",
       "      <td>woman</td>\n",
       "      <td>False</td>\n",
       "      <td>C</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>yes</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "      <td>Third</td>\n",
       "      <td>man</td>\n",
       "      <td>True</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>no</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \\\n",
       "0         0       3    male  22.0      1      0   7.2500        S  Third   \n",
       "1         1       1  female  38.0      1      0  71.2833        C  First   \n",
       "2         1       3  female  26.0      0      0   7.9250        S  Third   \n",
       "3         1       1  female  35.0      1      0  53.1000        S  First   \n",
       "4         0       3    male  35.0      0      0   8.0500        S  Third   \n",
       "\n",
       "     who  adult_male deck  embark_town alive  alone  \n",
       "0    man        True  NaN  Southampton    no  False  \n",
       "1  woman       False    C    Cherbourg   yes  False  \n",
       "2  woman       False  NaN  Southampton   yes   True  \n",
       "3  woman       False    C  Southampton   yes  False  \n",
       "4    man        True  NaN  Southampton    no   True  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic_data = sns.load_dataset('titanic')\n",
    "\n",
    "titanic_data.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = [\n",
    "'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare','embarked', 'survived']\n",
    "\n",
    "titanic_data = titanic_data[cols]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>embarked</th>\n",
       "      <th>survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pclass     sex   age  sibsp  parch     fare embarked  survived\n",
       "0       3    male  22.0      1      0   7.2500        S         0\n",
       "1       1  female  38.0      1      0  71.2833        C         1\n",
       "2       3  female  26.0      0      0   7.9250        S         1\n",
       "3       1  female  35.0      1      0  53.1000        S         1\n",
       "4       3    male  35.0      0      0   8.0500        S         0"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pclass        int64\n",
       "sex          object\n",
       "age         float64\n",
       "sibsp         int64\n",
       "parch         int64\n",
       "fare        float64\n",
       "embarked     object\n",
       "survived      int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic_data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pclass      0.000000\n",
       "sex         0.000000\n",
       "age         0.198653\n",
       "sibsp       0.000000\n",
       "parch       0.000000\n",
       "fare        0.000000\n",
       "embarked    0.002245\n",
       "survived    0.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic_data.isnull().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((712, 7), (179, 7))"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "  titanic_data.drop('survived', axis=1), \n",
    "  titanic_data['survived'], \n",
    "  test_size=0.2, \n",
    "  random_state=42) \n",
    "\n",
    "X_train.shape, X_test.shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "titanic_data_pipe = Pipeline([\n",
    "\n",
    "\n",
    "  ('numerical_imputation', miss_data_imput.ArbitraryNumberImputer(arbitrary_number=-1, variables=['age', 'fare'])),\n",
    "  ('categorical_imputation', miss_data_imput.CategoricalImputer(variables=['embarked'])),\n",
    "  ('categorical_encoder',cat_encode.OrdinalEncoder(encoding_method='ordered', variables=[ 'sex', 'embarked'])),\n",
    "  ('rf', RandomForestClassifier(random_state=0))\n",
    "\n",
    "])\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "titanic_data_pipe.fit(X_train, y_train)\n",
    "\n",
    "pred_X_train = titanic_data_pipe.predict(X_train)\n",
    "pred_X_test = titanic_data_pipe.predict(X_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[90 15]\n",
      " [19 55]]\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.83      0.86      0.84       105\n",
      "           1       0.79      0.74      0.76        74\n",
      "\n",
      "    accuracy                           0.81       179\n",
      "   macro avg       0.81      0.80      0.80       179\n",
      "weighted avg       0.81      0.81      0.81       179\n",
      "\n",
      "0.8100558659217877\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
    "\n",
    "print(confusion_matrix(y_test,pred_X_test))\n",
    "print(classification_report(y_test,pred_X_test))\n",
    "print(accuracy_score(y_test, pred_X_test))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Regression Project"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>carat</th>\n",
       "      <th>cut</th>\n",
       "      <th>color</th>\n",
       "      <th>clarity</th>\n",
       "      <th>depth</th>\n",
       "      <th>table</th>\n",
       "      <th>price</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.23</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>E</td>\n",
       "      <td>SI2</td>\n",
       "      <td>61.5</td>\n",
       "      <td>55.0</td>\n",
       "      <td>326</td>\n",
       "      <td>3.95</td>\n",
       "      <td>3.98</td>\n",
       "      <td>2.43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.21</td>\n",
       "      <td>Premium</td>\n",
       "      <td>E</td>\n",
       "      <td>SI1</td>\n",
       "      <td>59.8</td>\n",
       "      <td>61.0</td>\n",
       "      <td>326</td>\n",
       "      <td>3.89</td>\n",
       "      <td>3.84</td>\n",
       "      <td>2.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.23</td>\n",
       "      <td>Good</td>\n",
       "      <td>E</td>\n",
       "      <td>VS1</td>\n",
       "      <td>56.9</td>\n",
       "      <td>65.0</td>\n",
       "      <td>327</td>\n",
       "      <td>4.05</td>\n",
       "      <td>4.07</td>\n",
       "      <td>2.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.29</td>\n",
       "      <td>Premium</td>\n",
       "      <td>I</td>\n",
       "      <td>VS2</td>\n",
       "      <td>62.4</td>\n",
       "      <td>58.0</td>\n",
       "      <td>334</td>\n",
       "      <td>4.20</td>\n",
       "      <td>4.23</td>\n",
       "      <td>2.63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.31</td>\n",
       "      <td>Good</td>\n",
       "      <td>J</td>\n",
       "      <td>SI2</td>\n",
       "      <td>63.3</td>\n",
       "      <td>58.0</td>\n",
       "      <td>335</td>\n",
       "      <td>4.34</td>\n",
       "      <td>4.35</td>\n",
       "      <td>2.75</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   carat      cut color clarity  depth  table  price     x     y     z\n",
       "0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43\n",
       "1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31\n",
       "2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31\n",
       "3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63\n",
       "4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diamond_data = sns.load_dataset('diamonds')\n",
    "\n",
    "diamond_data.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>carat</th>\n",
       "      <th>depth</th>\n",
       "      <th>table</th>\n",
       "      <th>price</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "      <td>53940.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.797940</td>\n",
       "      <td>61.749405</td>\n",
       "      <td>57.457184</td>\n",
       "      <td>3932.799722</td>\n",
       "      <td>5.731157</td>\n",
       "      <td>5.734526</td>\n",
       "      <td>3.538734</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.474011</td>\n",
       "      <td>1.432621</td>\n",
       "      <td>2.234491</td>\n",
       "      <td>3989.439738</td>\n",
       "      <td>1.121761</td>\n",
       "      <td>1.142135</td>\n",
       "      <td>0.705699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.200000</td>\n",
       "      <td>43.000000</td>\n",
       "      <td>43.000000</td>\n",
       "      <td>326.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.400000</td>\n",
       "      <td>61.000000</td>\n",
       "      <td>56.000000</td>\n",
       "      <td>950.000000</td>\n",
       "      <td>4.710000</td>\n",
       "      <td>4.720000</td>\n",
       "      <td>2.910000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.700000</td>\n",
       "      <td>61.800000</td>\n",
       "      <td>57.000000</td>\n",
       "      <td>2401.000000</td>\n",
       "      <td>5.700000</td>\n",
       "      <td>5.710000</td>\n",
       "      <td>3.530000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1.040000</td>\n",
       "      <td>62.500000</td>\n",
       "      <td>59.000000</td>\n",
       "      <td>5324.250000</td>\n",
       "      <td>6.540000</td>\n",
       "      <td>6.540000</td>\n",
       "      <td>4.040000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>5.010000</td>\n",
       "      <td>79.000000</td>\n",
       "      <td>95.000000</td>\n",
       "      <td>18823.000000</td>\n",
       "      <td>10.740000</td>\n",
       "      <td>58.900000</td>\n",
       "      <td>31.800000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              carat         depth         table         price             x  \\\n",
       "count  53940.000000  53940.000000  53940.000000  53940.000000  53940.000000   \n",
       "mean       0.797940     61.749405     57.457184   3932.799722      5.731157   \n",
       "std        0.474011      1.432621      2.234491   3989.439738      1.121761   \n",
       "min        0.200000     43.000000     43.000000    326.000000      0.000000   \n",
       "25%        0.400000     61.000000     56.000000    950.000000      4.710000   \n",
       "50%        0.700000     61.800000     57.000000   2401.000000      5.700000   \n",
       "75%        1.040000     62.500000     59.000000   5324.250000      6.540000   \n",
       "max        5.010000     79.000000     95.000000  18823.000000     10.740000   \n",
       "\n",
       "                  y             z  \n",
       "count  53940.000000  53940.000000  \n",
       "mean       5.734526      3.538734  \n",
       "std        1.142135      0.705699  \n",
       "min        0.000000      0.000000  \n",
       "25%        4.720000      2.910000  \n",
       "50%        5.710000      3.530000  \n",
       "75%        6.540000      4.040000  \n",
       "max       58.900000     31.800000  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diamond_data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "carat       float64\n",
       "cut        category\n",
       "color      category\n",
       "clarity    category\n",
       "depth       float64\n",
       "table       float64\n",
       "price         int64\n",
       "x           float64\n",
       "y           float64\n",
       "z           float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diamond_data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "carat      0.0\n",
       "cut        0.0\n",
       "color      0.0\n",
       "clarity    0.0\n",
       "depth      0.0\n",
       "table      0.0\n",
       "price      0.0\n",
       "x          0.0\n",
       "y          0.0\n",
       "z          0.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diamond_data.isnull().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((43152, 9), (10788, 9))"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "  diamond_data.drop('price', axis=1), \n",
    "  diamond_data['price'], \n",
    "  test_size=0.2, \n",
    "  random_state=42) \n",
    "\n",
    "X_train.shape, X_test.shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "diamond_data_pipe = Pipeline([\n",
    "\n",
    "\n",
    "    ('categorical_encoder',\n",
    "     cat_encode.OrdinalEncoder(encoding_method='ordered',\n",
    "                                  variables=[ 'cut', 'color', 'clarity'])),\n",
    "\n",
    "    ('rf', RandomForestRegressor(random_state=42))\n",
    "])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "diamond_data_pipe.fit(X_train, y_train)\n",
    "\n",
    "pred_X_train = diamond_data_pipe.predict(X_train)\n",
    "pred_X_test = diamond_data_pipe.predict(X_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean Absolute Error: 271.5354506226789\n",
      "Mean Squared Error: 308115.6650668038\n",
      "Root Mean Squared Error: 555.0816742307422\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_X_test))\n",
    "print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_X_test))\n",
    "print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_X_test)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}