{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Chapter 4 - Categorical Data Encoding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4.2. One hot Encoding"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" survived | \n",
" pclass | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked | \n",
" class | \n",
" who | \n",
" adult_male | \n",
" deck | \n",
" embark_town | \n",
" alive | \n",
" alone | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 3 | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" 7.2500 | \n",
" S | \n",
" Third | \n",
" man | \n",
" True | \n",
" NaN | \n",
" Southampton | \n",
" no | \n",
" False | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 1 | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" 71.2833 | \n",
" C | \n",
" First | \n",
" woman | \n",
" False | \n",
" C | \n",
" Cherbourg | \n",
" yes | \n",
" False | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 3 | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.9250 | \n",
" S | \n",
" Third | \n",
" woman | \n",
" False | \n",
" NaN | \n",
" Southampton | \n",
" yes | \n",
" True | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 1 | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 53.1000 | \n",
" S | \n",
" First | \n",
" woman | \n",
" False | \n",
" C | \n",
" Southampton | \n",
" yes | \n",
" False | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 3 | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 8.0500 | \n",
" S | \n",
" Third | \n",
" man | \n",
" True | \n",
" NaN | \n",
" Southampton | \n",
" no | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" survived pclass sex age sibsp parch fare embarked class \\\n",
"0 0 3 male 22.0 1 0 7.2500 S Third \n",
"1 1 1 female 38.0 1 0 71.2833 C First \n",
"2 1 3 female 26.0 0 0 7.9250 S Third \n",
"3 1 1 female 35.0 1 0 53.1000 S First \n",
"4 0 3 male 35.0 0 0 8.0500 S Third \n",
"\n",
" who adult_male deck embark_town alive alone \n",
"0 man True NaN Southampton no False \n",
"1 woman False C Cherbourg yes False \n",
"2 woman False NaN Southampton yes True \n",
"3 woman False C Southampton yes False \n",
"4 man True NaN Southampton no True "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"plt.rcParams[\"figure.figsize\"] = [8,6]\n",
"sns.set_style(\"darkgrid\")\n",
"\n",
"titanic_data = sns.load_dataset('titanic')\n",
"\n",
"titanic_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" class | \n",
" embark_town | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
"
\n",
" \n",
" | 1 | \n",
" female | \n",
" First | \n",
" Cherbourg | \n",
"
\n",
" \n",
" | 2 | \n",
" female | \n",
" Third | \n",
" Southampton | \n",
"
\n",
" \n",
" | 3 | \n",
" female | \n",
" First | \n",
" Southampton | \n",
"
\n",
" \n",
" | 4 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex class embark_town\n",
"0 male Third Southampton\n",
"1 female First Cherbourg\n",
"2 female Third Southampton\n",
"3 female First Southampton\n",
"4 male Third Southampton"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"titanic_data = titanic_data[[\"sex\", \"class\", \"embark_town\"]]\n",
"titanic_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['male' 'female']\n",
"[Third, First, Second]\n",
"Categories (3, object): [Third, First, Second]\n",
"['Southampton' 'Cherbourg' 'Queenstown' nan]\n"
]
}
],
"source": [
"print(titanic_data['sex'].unique())\n",
"print(titanic_data['class'].unique())\n",
"print(titanic_data['embark_town'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" female | \n",
" male | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" female male\n",
"0 0 1\n",
"1 1 0\n",
"2 1 0\n",
"3 1 0\n",
"4 0 1"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"temp = pd.get_dummies(titanic_data['sex'])\n",
"\n",
"temp.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" female | \n",
" male | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" male | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" female | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" female | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" female | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" male | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex female male\n",
"0 male 0 1\n",
"1 female 1 0\n",
"2 female 1 0\n",
"3 female 1 0\n",
"4 male 0 1"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat([titanic_data['sex'],\n",
" pd.get_dummies(titanic_data['sex'])], axis=1).head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Cherbourg | \n",
" Queenstown | \n",
" Southampton | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Cherbourg Queenstown Southampton\n",
"0 0 0 1\n",
"1 1 0 0\n",
"2 0 0 1\n",
"3 0 0 1\n",
"4 0 0 1"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"temp = pd.get_dummies(titanic_data['embark_town'])\n",
"\n",
"temp.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Queenstown | \n",
" Southampton | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Queenstown Southampton\n",
"0 0 1\n",
"1 0 0\n",
"2 0 1\n",
"3 0 1\n",
"4 0 1"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"temp = pd.get_dummies(titanic_data['embark_town'], drop_first = True)\n",
"\n",
"temp.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Queenstown | \n",
" Southampton | \n",
" NaN | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Queenstown Southampton NaN\n",
"0 0 1 0\n",
"1 0 0 0\n",
"2 0 1 0\n",
"3 0 1 0\n",
"4 0 1 0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"temp = pd.get_dummies(titanic_data['embark_town'], dummy_na = True ,drop_first = True)\n",
"\n",
"temp.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4.3. Label Encoding"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" class | \n",
" embark_town | \n",
" le_class | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
" 2 | \n",
"
\n",
" \n",
" | 1 | \n",
" female | \n",
" First | \n",
" Cherbourg | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" female | \n",
" Third | \n",
" Southampton | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" female | \n",
" First | \n",
" Southampton | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex class embark_town le_class\n",
"0 male Third Southampton 2\n",
"1 female First Cherbourg 0\n",
"2 female Third Southampton 2\n",
"3 female First Southampton 0\n",
"4 male Third Southampton 2"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# for integer encoding using sklearn\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"le = LabelEncoder()\n",
"\n",
"le.fit(titanic_data['class'])\n",
"\n",
"titanic_data['le_class'] = le.transform(titanic_data['class'])\n",
"\n",
"titanic_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['First', 'Second', 'Third'], dtype=object)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"le.classes_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4.4. Frequency Encoding"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"titanic_data.dropna(inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'Southampton': 644, 'Cherbourg': 168, 'Queenstown': 77}\n"
]
}
],
"source": [
"value_counts = titanic_data['embark_town'].value_counts().to_dict()\n",
"print(value_counts)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" class | \n",
" embark_town | \n",
" le_class | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" male | \n",
" Third | \n",
" 644 | \n",
" 2 | \n",
"
\n",
" \n",
" | 1 | \n",
" female | \n",
" First | \n",
" 168 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" female | \n",
" Third | \n",
" 644 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" female | \n",
" First | \n",
" 644 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" male | \n",
" Third | \n",
" 644 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex class embark_town le_class\n",
"0 male Third 644 2\n",
"1 female First 168 0\n",
"2 female Third 644 2\n",
"3 female First 644 0\n",
"4 male Third 644 2"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"titanic_data['embark_town'] = titanic_data['embark_town'].map(value_counts)\n",
"titanic_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{644: 0.7244094488188977, 168: 0.1889763779527559, 77: 0.08661417322834646}\n"
]
}
],
"source": [
"frequency_count = (titanic_data['embark_town'].value_counts() / len(titanic_data) ).to_dict()\n",
"print(frequency_count)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" class | \n",
" embark_town | \n",
" le_class | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" male | \n",
" Third | \n",
" 0.724409 | \n",
" 2 | \n",
"
\n",
" \n",
" | 1 | \n",
" female | \n",
" First | \n",
" 0.188976 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" female | \n",
" Third | \n",
" 0.724409 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" female | \n",
" First | \n",
" 0.724409 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" male | \n",
" Third | \n",
" 0.724409 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex class embark_town le_class\n",
"0 male Third 0.724409 2\n",
"1 female First 0.188976 0\n",
"2 female Third 0.724409 2\n",
"3 female First 0.724409 0\n",
"4 male Third 0.724409 2"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"titanic_data['embark_town'] = titanic_data['embark_town'].map(frequency_count)\n",
"titanic_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4.5. Ordinal Encoding"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"class\n",
"Third 0.242363\n",
"Second 0.472826\n",
"First 0.629630\n",
"Name: survived, dtype: float64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"titanic_data = sns.load_dataset('titanic')\n",
"titanic_data = titanic_data[[\"sex\", \"class\", \"embark_town\", \"survived\"]]\n",
"\n",
"\n",
"titanic_data.groupby(['class'])['survived'].mean().sort_values()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"ordered_cats = titanic_data.groupby(['class'])['survived'].mean().sort_values().index\n",
"cat_map= {k: i for i, k in enumerate(ordered_cats, 0)}\n",
"titanic_data['class_ordered'] = titanic_data['class'].map(cat_map)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" class | \n",
" embark_town | \n",
" survived | \n",
" class_ordered | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" female | \n",
" First | \n",
" Cherbourg | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | 2 | \n",
" female | \n",
" Third | \n",
" Southampton | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" female | \n",
" First | \n",
" Southampton | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | 4 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex class embark_town survived class_ordered\n",
"0 male Third Southampton 0 0\n",
"1 female First Cherbourg 1 2\n",
"2 female Third Southampton 1 0\n",
"3 female First Southampton 1 2\n",
"4 male Third Southampton 0 0"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"titanic_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4.6 Mean Encoding"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"class\n",
"First 0.629630\n",
"Second 0.472826\n",
"Third 0.242363\n",
"Name: survived, dtype: float64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"titanic_data.groupby(['class'])['survived'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" class | \n",
" embark_town | \n",
" survived | \n",
" class_ordered | \n",
" class_mean | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
" 0 | \n",
" 0 | \n",
" 0.242363 | \n",
"
\n",
" \n",
" | 1 | \n",
" female | \n",
" First | \n",
" Cherbourg | \n",
" 1 | \n",
" 2 | \n",
" 0.629630 | \n",
"
\n",
" \n",
" | 2 | \n",
" female | \n",
" Third | \n",
" Southampton | \n",
" 1 | \n",
" 0 | \n",
" 0.242363 | \n",
"
\n",
" \n",
" | 3 | \n",
" female | \n",
" First | \n",
" Southampton | \n",
" 1 | \n",
" 2 | \n",
" 0.629630 | \n",
"
\n",
" \n",
" | 4 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
" 0 | \n",
" 0 | \n",
" 0.242363 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex class embark_town survived class_ordered class_mean\n",
"0 male Third Southampton 0 0 0.242363\n",
"1 female First Cherbourg 1 2 0.629630\n",
"2 female Third Southampton 1 0 0.242363\n",
"3 female First Southampton 1 2 0.629630\n",
"4 male Third Southampton 0 0 0.242363"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_labels = titanic_data.groupby(['class'])['survived'].mean().to_dict()\n",
"titanic_data['class_mean'] = titanic_data['class'].map(mean_labels)\n",
"titanic_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exercise 4.1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Question 1\n",
"\n",
"Which encoding scheme generally leads to highest number of columns in the encoded dataset?\n",
"\n",
"A. Mean Encoding \\\n",
"B. Ordinal Encoding \\\n",
"C. One Hot Encoding \\\n",
"D. All of the Above\n",
"\n",
"Answer: C"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Question 2\n",
"\n",
"Which attribute is set to True to remove the first column from the one-hot encoded columns generated via get_dummies() method?\n",
"\n",
"A. drop_first \\\n",
"B. remove_first \\\n",
"C. delete_first \\\n",
"D. None of the above\n",
"\n",
"Answer: A"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Question 3\n",
"\n",
"What is the total number of integer labels in the frequency encoding?\n",
"\n",
"A. One less than total number of unique labels in the original column \\\n",
"B. Equal to the total number of unique labels in the original column \\\n",
"C. 3 \\\n",
"D. None of the above\n",
"\n",
"Answer: B"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exercise 4.2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Apply frequency encoding to the class column of the Titanic Dataset:"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'Third': 491, 'First': 216, 'Second': 184}\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" class | \n",
" embark_town | \n",
" class_freq | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
" 491 | \n",
"
\n",
" \n",
" | 1 | \n",
" female | \n",
" First | \n",
" Cherbourg | \n",
" 216 | \n",
"
\n",
" \n",
" | 2 | \n",
" female | \n",
" Third | \n",
" Southampton | \n",
" 491 | \n",
"
\n",
" \n",
" | 3 | \n",
" female | \n",
" First | \n",
" Southampton | \n",
" 216 | \n",
"
\n",
" \n",
" | 4 | \n",
" male | \n",
" Third | \n",
" Southampton | \n",
" 491 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex class embark_town class_freq\n",
"0 male Third Southampton 491\n",
"1 female First Cherbourg 216\n",
"2 female Third Southampton 491\n",
"3 female First Southampton 216\n",
"4 male Third Southampton 491"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"plt.rcParams[\"figure.figsize\"] = [8,6]\n",
"sns.set_style(\"darkgrid\")\n",
"\n",
"titanic_data = sns.load_dataset('titanic')\n",
"\n",
"titanic_data.head()\n",
"\n",
"titanic_data = titanic_data[[\"sex\", \"class\", \"embark_town\"]]\n",
"titanic_data.head()\n",
"\n",
"value_counts = titanic_data['class'].value_counts().to_dict()\n",
"print(value_counts)\n",
"\n",
"titanic_data['class_freq'] = titanic_data['class'].map(value_counts)\n",
"titanic_data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}