{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Chapter 4 - Categorical Data Encoding" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4.2. One hot Encoding" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealone
003male22.0107.2500SThirdmanTrueNaNSouthamptonnoFalse
111female38.01071.2833CFirstwomanFalseCCherbourgyesFalse
213female26.0007.9250SThirdwomanFalseNaNSouthamptonyesTrue
311female35.01053.1000SFirstwomanFalseCSouthamptonyesFalse
403male35.0008.0500SThirdmanTrueNaNSouthamptonnoTrue
\n", "
" ], "text/plain": [ " survived pclass sex age sibsp parch fare embarked class \\\n", "0 0 3 male 22.0 1 0 7.2500 S Third \n", "1 1 1 female 38.0 1 0 71.2833 C First \n", "2 1 3 female 26.0 0 0 7.9250 S Third \n", "3 1 1 female 35.0 1 0 53.1000 S First \n", "4 0 3 male 35.0 0 0 8.0500 S Third \n", "\n", " who adult_male deck embark_town alive alone \n", "0 man True NaN Southampton no False \n", "1 woman False C Cherbourg yes False \n", "2 woman False NaN Southampton yes True \n", "3 woman False C Southampton yes False \n", "4 man True NaN Southampton no True " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# Plot defaults (figure size and seaborn theme) shared by the rest of the notebook.\n", "plt.rcParams.update({\"figure.figsize\": [8, 6]})\n", "sns.set_style(\"darkgrid\")\n", "\n", "# Load seaborn's bundled Titanic dataset and preview its first rows.\n", "titanic_data = sns.load_dataset('titanic')\n", "titanic_data.head()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexclassembark_town
0maleThirdSouthampton
1femaleFirstCherbourg
2femaleThirdSouthampton
3femaleFirstSouthampton
4maleThirdSouthampton
\n", "
" ], "text/plain": [ " sex class embark_town\n", "0 male Third Southampton\n", "1 female First Cherbourg\n", "2 female Third Southampton\n", "3 female First Southampton\n", "4 male Third Southampton" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep only the three categorical columns. The explicit .copy() makes the subset an\n", "# independent frame, so the column assignments and row drops in later cells act on\n", "# this frame alone and never on a selection of the full dataset.\n", "titanic_data = titanic_data[[\"sex\", \"class\", \"embark_town\"]].copy()\n", "titanic_data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['male' 'female']\n", "[Third, First, Second]\n", "Categories (3, object): [Third, First, Second]\n", "['Southampton' 'Cherbourg' 'Queenstown' nan]\n" ] } ], "source": [ "# List the distinct labels of each column (embark_town also contains missing values).\n", "print(titanic_data['sex'].unique())\n", "print(titanic_data['class'].unique())\n", "print(titanic_data['embark_town'].unique())" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
femalemale
001
110
210
310
401
\n", "
" ], "text/plain": [ " female male\n", "0 0 1\n", "1 1 0\n", "2 1 0\n", "3 1 0\n", "4 0 1" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "# One-hot encode `sex`: get_dummies returns one 0/1 column per label (female, male).\n", "sex_labels = titanic_data['sex']\n", "temp = pd.get_dummies(sex_labels)\n", "temp.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexfemalemale
0male01
1female10
2female10
3female10
4male01
\n", "
" ], "text/plain": [ " sex female male\n", "0 male 0 1\n", "1 female 1 0\n", "2 female 1 0\n", "3 female 1 0\n", "4 male 0 1" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Place the original `sex` column beside its dummy columns for comparison.\n", "sex_dummies = pd.get_dummies(titanic_data['sex'])\n", "pd.concat([titanic_data['sex'], sex_dummies], axis=1).head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CherbourgQueenstownSouthampton
0001
1100
2001
3001
4001
\n", "
" ], "text/plain": [ " Cherbourg Queenstown Southampton\n", "0 0 0 1\n", "1 1 0 0\n", "2 0 0 1\n", "3 0 0 1\n", "4 0 0 1" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One-hot encode `embark_town`: one 0/1 column per town.\n", "# pandas is already imported as `pd` by an earlier cell, so it is not re-imported here.\n", "temp = pd.get_dummies(titanic_data['embark_town'])\n", "\n", "temp.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
QueenstownSouthampton
001
100
201
301
401
\n", "
" ], "text/plain": [ " Queenstown Southampton\n", "0 0 1\n", "1 0 0\n", "2 0 1\n", "3 0 1\n", "4 0 1" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop_first=True omits the first dummy column (Cherbourg), so Cherbourg rows are all zeros.\n", "# pandas is already imported as `pd` by an earlier cell, so it is not re-imported here.\n", "temp = pd.get_dummies(titanic_data['embark_town'], drop_first=True)\n", "\n", "temp.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
QueenstownSouthamptonNaN
0010
1000
2010
3010
4010
\n", "
" ], "text/plain": [ " Queenstown Southampton NaN\n", "0 0 1 0\n", "1 0 0 0\n", "2 0 1 0\n", "3 0 1 0\n", "4 0 1 0" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dummy_na=True adds an extra NaN indicator column for rows whose embark_town is missing;\n", "# drop_first=True still omits the first town column (Cherbourg).\n", "# pandas is already imported as `pd` by an earlier cell, so it is not re-imported here.\n", "temp = pd.get_dummies(titanic_data['embark_town'], dummy_na=True, drop_first=True)\n", "\n", "temp.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4.3. Label Encoding" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexclassembark_townle_class
0maleThirdSouthampton2
1femaleFirstCherbourg0
2femaleThirdSouthampton2
3femaleFirstSouthampton0
4maleThirdSouthampton2
\n", "
" ], "text/plain": [ " sex class embark_town le_class\n", "0 male Third Southampton 2\n", "1 female First Cherbourg 0\n", "2 female Third Southampton 2\n", "3 female First Southampton 0\n", "4 male Third Southampton 2" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Integer (label) encoding with scikit-learn.\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "le = LabelEncoder()\n", "\n", "# fit_transform learns the class labels and returns their integer codes in one step.\n", "titanic_data['le_class'] = le.fit_transform(titanic_data['class'])\n", "\n", "titanic_data.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['First', 'Second', 'Third'], dtype=object)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Learned labels; a label's position in this array is its integer code.\n", "le.classes_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4.4. Frequency Encoding" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Drop rows with missing values by rebinding the name rather than mutating in place.\n", "titanic_data = titanic_data.dropna()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'Southampton': 644, 'Cherbourg': 168, 'Queenstown': 77}\n" ] } ], "source": [ "# Number of rows carrying each embark_town label.\n", "value_counts = titanic_data['embark_town'].value_counts().to_dict()\n", "print(value_counts)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexclassembark_townle_class
0maleThird6442
1femaleFirst1680
2femaleThird6442
3femaleFirst6440
4maleThird6442
\n", "
" ], "text/plain": [ " sex class embark_town le_class\n", "0 male Third 644 2\n", "1 female First 168 0\n", "2 female Third 644 2\n", "3 female First 644 0\n", "4 male Third 644 2" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Replace each town name with the number of rows that carry it.\n", "titanic_data['embark_town'] = titanic_data['embark_town'].map(value_counts)\n", "titanic_data.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{644: 0.7244094488188977, 168: 0.1889763779527559, 77: 0.08661417322834646}\n" ] } ], "source": [ "# embark_town now holds counts, so derive each count's share of the rows directly from\n", "# value_counts. Re-counting the encoded column would merge towns that happen to share\n", "# the same count and report their combined share instead of each town's own.\n", "total_rows = len(titanic_data)\n", "frequency_count = {count: count / total_rows for count in value_counts.values()}\n", "print(frequency_count)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexclassembark_townle_class
0maleThird0.7244092
1femaleFirst0.1889760
2femaleThird0.7244092
3femaleFirst0.7244090
4maleThird0.7244092
\n", "
" ], "text/plain": [ " sex class embark_town le_class\n", "0 male Third 0.724409 2\n", "1 female First 0.188976 0\n", "2 female Third 0.724409 2\n", "3 female First 0.724409 0\n", "4 male Third 0.724409 2" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Swap each value in embark_town for its entry in frequency_count.\n", "titanic_data['embark_town'] = titanic_data['embark_town'].map(frequency_count)\n", "titanic_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4.5. Ordinal Encoding" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "class\n", "Third 0.242363\n", "Second 0.472826\n", "First 0.629630\n", "Name: survived, dtype: float64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reload the raw data, this time keeping the `survived` column as well.\n", "titanic_data = sns.load_dataset('titanic')[[\"sex\", \"class\", \"embark_town\", \"survived\"]]\n", "\n", "# Mean of `survived` per class, sorted from lowest to highest.\n", "titanic_data.groupby(['class'])['survived'].mean().sort_values()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Rank the classes by mean survival (lowest first) and use the rank as the ordinal code.\n", "survival_by_class = titanic_data.groupby(['class'])['survived'].mean()\n", "ordered_cats = survival_by_class.sort_values().index\n", "cat_map = {label: rank for rank, label in enumerate(ordered_cats)}\n", "titanic_data['class_ordered'] = titanic_data['class'].map(cat_map)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexclassembark_townsurvivedclass_ordered
0maleThirdSouthampton00
1femaleFirstCherbourg12
2femaleThirdSouthampton10
3femaleFirstSouthampton12
4maleThirdSouthampton00
\n", "
" ], "text/plain": [ " sex class embark_town survived class_ordered\n", "0 male Third Southampton 0 0\n", "1 female First Cherbourg 1 2\n", "2 female Third Southampton 1 0\n", "3 female First Southampton 1 2\n", "4 male Third Southampton 0 0" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4.6. Mean Encoding" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "class\n", "First 0.629630\n", "Second 0.472826\n", "Third 0.242363\n", "Name: survived, dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic_data.groupby(['class'])['survived'].mean()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexclassembark_townsurvivedclass_orderedclass_mean
0maleThirdSouthampton000.242363
1femaleFirstCherbourg120.629630
2femaleThirdSouthampton100.242363
3femaleFirstSouthampton120.629630
4maleThirdSouthampton000.242363
\n", "
" ], "text/plain": [ " sex class embark_town survived class_ordered class_mean\n", "0 male Third Southampton 0 0 0.242363\n", "1 female First Cherbourg 1 2 0.629630\n", "2 female Third Southampton 1 0 0.242363\n", "3 female First Southampton 1 2 0.629630\n", "4 male Third Southampton 0 0 0.242363" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mean_labels = titanic_data.groupby(['class'])['survived'].mean().to_dict()\n", "titanic_data['class_mean'] = titanic_data['class'].map(mean_labels)\n", "titanic_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Exercise 4.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Question 1\n", "\n", "Which encoding scheme generally leads to the highest number of columns in the encoded dataset?\n", "\n", "A. Mean Encoding \\\n", "B. Ordinal Encoding \\\n", "C. One Hot Encoding \\\n", "D. All of the Above\n", "\n", "Answer: C" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Question 2\n", "\n", "Which parameter is set to True to remove the first column from the one-hot encoded columns generated via the get_dummies() method?\n", "\n", "A. drop_first \\\n", "B. remove_first \\\n", "C. delete_first \\\n", "D. None of the above\n", "\n", "Answer: A" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Question 3\n", "\n", "What is the total number of integer labels in the frequency encoding?\n", "\n", "A. One less than the total number of unique labels in the original column \\\n", "B. Equal to the total number of unique labels in the original column \\\n", "C. 3 \\\n", "D. 
None of the above\n", "\n", "Answer: B" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Exercise 4.2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Apply frequency encoding to the class column of the Titanic Dataset:" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'Third': 491, 'First': 216, 'Second': 184}\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexclassembark_townclass_freq
0maleThirdSouthampton491
1femaleFirstCherbourg216
2femaleThirdSouthampton491
3femaleFirstSouthampton216
4maleThirdSouthampton491
\n", "
" ], "text/plain": [ " sex class embark_town class_freq\n", "0 male Third Southampton 491\n", "1 female First Cherbourg 216\n", "2 female Third Southampton 491\n", "3 female First Southampton 216\n", "4 male Third Southampton 491" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "plt.rcParams[\"figure.figsize\"] = [8,6]\n", "sns.set_style(\"darkgrid\")\n", "\n", "# Reload the data and keep an independent copy of the three categorical columns,\n", "# so adding class_freq below acts on this frame alone.\n", "titanic_data = sns.load_dataset('titanic')\n", "titanic_data = titanic_data[[\"sex\", \"class\", \"embark_town\"]].copy()\n", "\n", "# Number of rows carrying each class label.\n", "value_counts = titanic_data['class'].value_counts().to_dict()\n", "print(value_counts)\n", "\n", "# Frequency encoding: store each row's class count in a new column.\n", "titanic_data['class_freq'] = titanic_data['class'].map(value_counts)\n", "titanic_data.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }