diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb
index 847d487..1819747 100644
--- a/lab-hyper-tuning.ipynb
+++ b/lab-hyper-tuning.ipynb
@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@@ -47,7 +47,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 35,
"metadata": {},
"outputs": [
{
@@ -200,7 +200,7 @@
"4 True "
]
},
- "execution_count": 2,
+ "execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
@@ -219,13 +219,1695 @@
"- Feature Selection\n"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Check the shape of your data**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(8693, 14)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "spaceship.shape #8693 rows and 14 columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Check for data types**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PassengerId object\n",
+ "HomePlanet object\n",
+ "CryoSleep object\n",
+ "Cabin object\n",
+ "Destination object\n",
+ "Age float64\n",
+ "VIP object\n",
+ "RoomService float64\n",
+ "FoodCourt float64\n",
+ "ShoppingMall float64\n",
+ "Spa float64\n",
+ "VRDeck float64\n",
+ "Name object\n",
+ "Transported bool\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "spaceship.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Check for missing values**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PassengerId 0\n",
+ "HomePlanet 201\n",
+ "CryoSleep 217\n",
+ "Cabin 199\n",
+ "Destination 182\n",
+ "Age 179\n",
+ "VIP 203\n",
+ "RoomService 181\n",
+ "FoodCourt 183\n",
+ "ShoppingMall 208\n",
+ "Spa 183\n",
+ "VRDeck 188\n",
+ "Name 200\n",
+ "Transported 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "spaceship.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "There are multiple strategies to handle missing data\n",
+ "\n",
+ "- Removing all rows or all columns containing missing data.\n",
+ "- Filling all missing values with a value (for example, the mean for continuous features or the mode for categorical ones).\n",
+ "- Filling all missing values with an algorithm.\n",
+ "\n",
+ "For this exercise, because each column has only a small share of null values (roughly 2%), we will drop rows containing any missing value. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " HomePlanet | \n",
+ " CryoSleep | \n",
+ " Cabin | \n",
+ " Destination | \n",
+ " Age | \n",
+ " VIP | \n",
+ " RoomService | \n",
+ " FoodCourt | \n",
+ " ShoppingMall | \n",
+ " Spa | \n",
+ " VRDeck | \n",
+ " Name | \n",
+ " Transported | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 7 | \n",
+ " 0006_02 | \n",
+ " Earth | \n",
+ " True | \n",
+ " G/0/S | \n",
+ " TRAPPIST-1e | \n",
+ " 28.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " Candra Jacostaffey | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 0008_02 | \n",
+ " Europa | \n",
+ " True | \n",
+ " B/1/P | \n",
+ " TRAPPIST-1e | \n",
+ " 34.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Altardr Flatic | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 0012_01 | \n",
+ " Earth | \n",
+ " False | \n",
+ " NaN | \n",
+ " TRAPPIST-1e | \n",
+ " 31.0 | \n",
+ " False | \n",
+ " 32.0 | \n",
+ " 0.0 | \n",
+ " 876.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Justie Pooles | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 0014_01 | \n",
+ " Mars | \n",
+ " False | \n",
+ " F/3/P | \n",
+ " 55 Cancri e | \n",
+ " 27.0 | \n",
+ " False | \n",
+ " 1286.0 | \n",
+ " 122.0 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Flats Eccle | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 0020_03 | \n",
+ " Earth | \n",
+ " True | \n",
+ " E/0/S | \n",
+ " 55 Cancri e | \n",
+ " 29.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Mollen Mcfaddennon | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 8667 | \n",
+ " 9250_01 | \n",
+ " Europa | \n",
+ " False | \n",
+ " E/597/P | \n",
+ " TRAPPIST-1e | \n",
+ " 29.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 2972.0 | \n",
+ " NaN | \n",
+ " 28.0 | \n",
+ " 188.0 | \n",
+ " Chain Reedectied | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 8674 | \n",
+ " 9257_01 | \n",
+ " NaN | \n",
+ " False | \n",
+ " F/1892/P | \n",
+ " TRAPPIST-1e | \n",
+ " 13.0 | \n",
+ " False | \n",
+ " 39.0 | \n",
+ " 0.0 | \n",
+ " 1085.0 | \n",
+ " 24.0 | \n",
+ " 0.0 | \n",
+ " Ties Apple | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 8675 | \n",
+ " 9259_01 | \n",
+ " Earth | \n",
+ " NaN | \n",
+ " F/1893/P | \n",
+ " TRAPPIST-1e | \n",
+ " 44.0 | \n",
+ " False | \n",
+ " 1030.0 | \n",
+ " 1015.0 | \n",
+ " 0.0 | \n",
+ " 11.0 | \n",
+ " NaN | \n",
+ " Annah Gilleyons | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 8684 | \n",
+ " 9274_01 | \n",
+ " NaN | \n",
+ " True | \n",
+ " G/1508/P | \n",
+ " TRAPPIST-1e | \n",
+ " 23.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Chelsa Bullisey | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 8687 | \n",
+ " 9275_03 | \n",
+ " Europa | \n",
+ " NaN | \n",
+ " A/97/P | \n",
+ " TRAPPIST-1e | \n",
+ " 30.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 3208.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 330.0 | \n",
+ " Atlasym Conable | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2087 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId HomePlanet CryoSleep Cabin Destination Age VIP \\\n",
+ "7 0006_02 Earth True G/0/S TRAPPIST-1e 28.0 False \n",
+ "10 0008_02 Europa True B/1/P TRAPPIST-1e 34.0 False \n",
+ "15 0012_01 Earth False NaN TRAPPIST-1e 31.0 False \n",
+ "16 0014_01 Mars False F/3/P 55 Cancri e 27.0 False \n",
+ "23 0020_03 Earth True E/0/S 55 Cancri e 29.0 False \n",
+ "... ... ... ... ... ... ... ... \n",
+ "8667 9250_01 Europa False E/597/P TRAPPIST-1e 29.0 False \n",
+ "8674 9257_01 NaN False F/1892/P TRAPPIST-1e 13.0 False \n",
+ "8675 9259_01 Earth NaN F/1893/P TRAPPIST-1e 44.0 False \n",
+ "8684 9274_01 NaN True G/1508/P TRAPPIST-1e 23.0 False \n",
+ "8687 9275_03 Europa NaN A/97/P TRAPPIST-1e 30.0 False \n",
+ "\n",
+ " RoomService FoodCourt ShoppingMall Spa VRDeck Name \\\n",
+ "7 0.0 0.0 0.0 0.0 NaN Candra Jacostaffey \n",
+ "10 0.0 0.0 NaN 0.0 0.0 Altardr Flatic \n",
+ "15 32.0 0.0 876.0 0.0 0.0 Justie Pooles \n",
+ "16 1286.0 122.0 NaN 0.0 0.0 Flats Eccle \n",
+ "23 0.0 0.0 NaN 0.0 0.0 Mollen Mcfaddennon \n",
+ "... ... ... ... ... ... ... \n",
+ "8667 0.0 2972.0 NaN 28.0 188.0 Chain Reedectied \n",
+ "8674 39.0 0.0 1085.0 24.0 0.0 Ties Apple \n",
+ "8675 1030.0 1015.0 0.0 11.0 NaN Annah Gilleyons \n",
+ "8684 0.0 0.0 0.0 0.0 0.0 Chelsa Bullisey \n",
+ "8687 0.0 3208.0 0.0 2.0 330.0 Atlasym Conable \n",
+ "\n",
+ " Transported \n",
+ "7 True \n",
+ "10 True \n",
+ "15 False \n",
+ "16 False \n",
+ "23 False \n",
+ "... ... \n",
+ "8667 True \n",
+ "8674 False \n",
+ "8675 True \n",
+ "8684 True \n",
+ "8687 True \n",
+ "\n",
+ "[2087 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "spaceship[spaceship.isnull().any(axis=1)] # 2087 of the 8693 rows have at least one missing value, i.e. about 24% of the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spaceship_cleaned = spaceship.dropna().copy()  # explicit copy: the Cabin column is reassigned on this frame in a later cell, which otherwise raises SettingWithCopyWarning\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(6606, 14)"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "spaceship_cleaned.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- **Cabin** is too granular - transform it in order to obtain {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\Anna\\AppData\\Local\\Temp\\ipykernel_19160\\2530471862.py:1: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " spaceship_cleaned['Cabin'] = spaceship_cleaned['Cabin'].str[0]\n"
+ ]
+ }
+ ],
+ "source": [
+ "spaceship_cleaned['Cabin'] = spaceship_cleaned['Cabin'].str[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " HomePlanet | \n",
+ " CryoSleep | \n",
+ " Cabin | \n",
+ " Destination | \n",
+ " Age | \n",
+ " VIP | \n",
+ " RoomService | \n",
+ " FoodCourt | \n",
+ " ShoppingMall | \n",
+ " Spa | \n",
+ " VRDeck | \n",
+ " Name | \n",
+ " Transported | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0001_01 | \n",
+ " Europa | \n",
+ " False | \n",
+ " B | \n",
+ " TRAPPIST-1e | \n",
+ " 39.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Maham Ofracculy | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0002_01 | \n",
+ " Earth | \n",
+ " False | \n",
+ " F | \n",
+ " TRAPPIST-1e | \n",
+ " 24.0 | \n",
+ " False | \n",
+ " 109.0 | \n",
+ " 9.0 | \n",
+ " 25.0 | \n",
+ " 549.0 | \n",
+ " 44.0 | \n",
+ " Juanna Vines | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0003_01 | \n",
+ " Europa | \n",
+ " False | \n",
+ " A | \n",
+ " TRAPPIST-1e | \n",
+ " 58.0 | \n",
+ " True | \n",
+ " 43.0 | \n",
+ " 3576.0 | \n",
+ " 0.0 | \n",
+ " 6715.0 | \n",
+ " 49.0 | \n",
+ " Altark Susent | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0003_02 | \n",
+ " Europa | \n",
+ " False | \n",
+ " A | \n",
+ " TRAPPIST-1e | \n",
+ " 33.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 1283.0 | \n",
+ " 371.0 | \n",
+ " 3329.0 | \n",
+ " 193.0 | \n",
+ " Solam Susent | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0004_01 | \n",
+ " Earth | \n",
+ " False | \n",
+ " F | \n",
+ " TRAPPIST-1e | \n",
+ " 16.0 | \n",
+ " False | \n",
+ " 303.0 | \n",
+ " 70.0 | \n",
+ " 151.0 | \n",
+ " 565.0 | \n",
+ " 2.0 | \n",
+ " Willy Santantines | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId HomePlanet CryoSleep Cabin Destination Age VIP \\\n",
+ "0 0001_01 Europa False B TRAPPIST-1e 39.0 False \n",
+ "1 0002_01 Earth False F TRAPPIST-1e 24.0 False \n",
+ "2 0003_01 Europa False A TRAPPIST-1e 58.0 True \n",
+ "3 0003_02 Europa False A TRAPPIST-1e 33.0 False \n",
+ "4 0004_01 Earth False F TRAPPIST-1e 16.0 False \n",
+ "\n",
+ " RoomService FoodCourt ShoppingMall Spa VRDeck Name \\\n",
+ "0 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy \n",
+ "1 109.0 9.0 25.0 549.0 44.0 Juanna Vines \n",
+ "2 43.0 3576.0 0.0 6715.0 49.0 Altark Susent \n",
+ "3 0.0 1283.0 371.0 3329.0 193.0 Solam Susent \n",
+ "4 303.0 70.0 151.0 565.0 2.0 Willy Santantines \n",
+ "\n",
+ " Transported \n",
+ "0 False \n",
+ "1 True \n",
+ "2 False \n",
+ "3 False \n",
+ "4 True "
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "spaceship_cleaned.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- Drop PassengerId and Name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spaceship_final = spaceship_cleaned.drop(['PassengerId', 'Name'], axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " HomePlanet | \n",
+ " CryoSleep | \n",
+ " Cabin | \n",
+ " Destination | \n",
+ " Age | \n",
+ " VIP | \n",
+ " RoomService | \n",
+ " FoodCourt | \n",
+ " ShoppingMall | \n",
+ " Spa | \n",
+ " VRDeck | \n",
+ " Transported | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Europa | \n",
+ " False | \n",
+ " B | \n",
+ " TRAPPIST-1e | \n",
+ " 39.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Earth | \n",
+ " False | \n",
+ " F | \n",
+ " TRAPPIST-1e | \n",
+ " 24.0 | \n",
+ " False | \n",
+ " 109.0 | \n",
+ " 9.0 | \n",
+ " 25.0 | \n",
+ " 549.0 | \n",
+ " 44.0 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Europa | \n",
+ " False | \n",
+ " A | \n",
+ " TRAPPIST-1e | \n",
+ " 58.0 | \n",
+ " True | \n",
+ " 43.0 | \n",
+ " 3576.0 | \n",
+ " 0.0 | \n",
+ " 6715.0 | \n",
+ " 49.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Europa | \n",
+ " False | \n",
+ " A | \n",
+ " TRAPPIST-1e | \n",
+ " 33.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 1283.0 | \n",
+ " 371.0 | \n",
+ " 3329.0 | \n",
+ " 193.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Earth | \n",
+ " False | \n",
+ " F | \n",
+ " TRAPPIST-1e | \n",
+ " 16.0 | \n",
+ " False | \n",
+ " 303.0 | \n",
+ " 70.0 | \n",
+ " 151.0 | \n",
+ " 565.0 | \n",
+ " 2.0 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " HomePlanet CryoSleep Cabin Destination Age VIP RoomService \\\n",
+ "0 Europa False B TRAPPIST-1e 39.0 False 0.0 \n",
+ "1 Earth False F TRAPPIST-1e 24.0 False 109.0 \n",
+ "2 Europa False A TRAPPIST-1e 58.0 True 43.0 \n",
+ "3 Europa False A TRAPPIST-1e 33.0 False 0.0 \n",
+ "4 Earth False F TRAPPIST-1e 16.0 False 303.0 \n",
+ "\n",
+ " FoodCourt ShoppingMall Spa VRDeck Transported \n",
+ "0 0.0 0.0 0.0 0.0 False \n",
+ "1 9.0 25.0 549.0 44.0 True \n",
+ "2 3576.0 0.0 6715.0 49.0 False \n",
+ "3 1283.0 371.0 3329.0 193.0 False \n",
+ "4 70.0 151.0 565.0 2.0 True "
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "spaceship_final.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### X - y split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X = spaceship_final.drop(columns=['Transported'])\n",
+ "y = spaceship_final['Transported']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(6606, 11)"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Perform Train Test Split**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # fixed seed so the split and every score computed from it are reproducible"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5284, 11)"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- For non-numerical columns, create dummy variables (one-hot encoding)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#separating nominal columns from numerical\n",
+ "\n",
+ "nominal_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']\n",
+ "\n",
+ "X_train_nominal = X_train[nominal_cols]\n",
+ "X_train_numerical = X_train.drop(columns=nominal_cols)\n",
+ "\n",
+ "X_test_nominal = X_test[nominal_cols]\n",
+ "X_test_numerical = X_test.drop(columns=nominal_cols)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5284, 5)"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train_nominal.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import OneHotEncoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# encoding nominal features\n",
+ "\n",
+ "ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')\n",
+ "\n",
+ "ohe.fit(X_train_nominal) #fitting only on train data to avoid data leakage\n",
+ "\n",
+ "X_train_nominal_encoded = ohe.transform(X_train_nominal)\n",
+ "X_test_nominal_encoded = ohe.transform(X_test_nominal)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5284, 17)"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train_nominal_encoded.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# let's get the column names for encoded features\n",
+ "\n",
+ "encoded_cols = ohe.get_feature_names_out()\n",
+ "\n",
+ "X_train_nominal_df = pd.DataFrame(X_train_nominal_encoded, columns=encoded_cols, index=X_train.index)\n",
+ "X_test_nominal_df = pd.DataFrame(X_test_nominal_encoded, columns=encoded_cols, index=X_test.index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Let's concatenate to create one dataframe for modeling\n",
+ "\n",
+ "X_train_final = pd.concat([X_train_nominal_df, X_train_numerical], axis=1)\n",
+ "X_test_final = pd.concat([X_test_nominal_df, X_test_numerical], axis=1)\n",
+ "\n",
+ "X_train = X_train_final\n",
+ "X_test = X_test_final"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " HomePlanet_Earth | \n",
+ " HomePlanet_Europa | \n",
+ " HomePlanet_Mars | \n",
+ " CryoSleep_False | \n",
+ " CryoSleep_True | \n",
+ " Cabin_A | \n",
+ " Cabin_B | \n",
+ " Cabin_C | \n",
+ " Cabin_D | \n",
+ " Cabin_E | \n",
+ " ... | \n",
+ " Destination_PSO J318.5-22 | \n",
+ " Destination_TRAPPIST-1e | \n",
+ " VIP_False | \n",
+ " VIP_True | \n",
+ " Age | \n",
+ " RoomService | \n",
+ " FoodCourt | \n",
+ " ShoppingMall | \n",
+ " Spa | \n",
+ " VRDeck | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 8496 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 27.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 2347 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 48.0 | \n",
+ " 639.0 | \n",
+ " 412.0 | \n",
+ " 110.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1654 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 26.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 944 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 21.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 791.0 | \n",
+ "
\n",
+ " \n",
+ " | 593 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 23 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " HomePlanet_Earth HomePlanet_Europa HomePlanet_Mars CryoSleep_False \\\n",
+ "8496 1.0 0.0 0.0 0.0 \n",
+ "2347 0.0 0.0 1.0 1.0 \n",
+ "1654 1.0 0.0 0.0 0.0 \n",
+ "944 1.0 0.0 0.0 1.0 \n",
+ "593 1.0 0.0 0.0 0.0 \n",
+ "\n",
+ " CryoSleep_True Cabin_A Cabin_B Cabin_C Cabin_D Cabin_E ... \\\n",
+ "8496 1.0 0.0 0.0 0.0 0.0 0.0 ... \n",
+ "2347 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
+ "1654 1.0 0.0 0.0 0.0 0.0 0.0 ... \n",
+ "944 0.0 0.0 0.0 0.0 0.0 1.0 ... \n",
+ "593 1.0 0.0 0.0 0.0 0.0 0.0 ... \n",
+ "\n",
+ " Destination_PSO J318.5-22 Destination_TRAPPIST-1e VIP_False VIP_True \\\n",
+ "8496 0.0 1.0 1.0 0.0 \n",
+ "2347 0.0 1.0 1.0 0.0 \n",
+ "1654 0.0 1.0 1.0 0.0 \n",
+ "944 0.0 1.0 1.0 0.0 \n",
+ "593 0.0 0.0 1.0 0.0 \n",
+ "\n",
+ " Age RoomService FoodCourt ShoppingMall Spa VRDeck \n",
+ "8496 27.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2347 48.0 639.0 412.0 110.0 0.0 0.0 \n",
+ "1654 26.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "944 21.0 0.0 0.0 0.0 0.0 791.0 \n",
+ "593 3.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ "[5 rows x 23 columns]"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Model Selection** - now you will try to apply different ensemble methods in order to get a better model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- Bagging and Pasting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n",
+ "from sklearn.metrics import accuracy_score, classification_report"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Normalize the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import MinMaxScaler, StandardScaler"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "MinMaxScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "MinMaxScaler()"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "normalizer = MinMaxScaler()\n",
+ "\n",
+ "normalizer.fit(X_train)"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
- "#your code here"
+ "X_train_norm = normalizer.transform(X_train)\n",
+ "\n",
+ "X_test_norm = normalizer.transform(X_test)"
]
},
{
@@ -237,11 +1919,454 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
- "#your code here"
+ "bagging_clf = BaggingClassifier(DecisionTreeClassifier(max_depth=20),\n",
+ "                                n_estimators=100,\n",
+ "                                max_samples=1000, random_state=42)  # seeded so the bootstrap samples and the baseline score are reproducible"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=20),\n",
+ " max_samples=1000, n_estimators=100)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=20),\n",
+ " max_samples=1000, n_estimators=100)"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bagging_clf.fit(X_train_norm, y_train)"
]
},
{
@@ -253,11 +2378,30 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 64,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.7965204236006052\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 0.76 0.83 0.80 634\n",
+ " True 0.83 0.76 0.80 688\n",
+ "\n",
+ " accuracy 0.80 1322\n",
+ " macro avg 0.80 0.80 0.80 1322\n",
+ "weighted avg 0.80 0.80 0.80 1322\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
- "#your code here"
+ "pred = bagging_clf.predict(X_test_norm)\n",
+ "print(\"Accuracy:\", accuracy_score(y_test, pred))\n",
+ "print(classification_report(y_test, pred))"
]
},
{
@@ -283,11 +2427,16 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
- "#your code here"
+ "param_grid = {\n",
+ " 'n_estimators': [50, 100, 200],\n",
+ " 'max_samples': [500, 1000, 2000],\n",
+ " 'estimator__max_depth': [5, 10, 20],\n",
+ " 'estimator__min_samples_split': [2, 5, 10]\n",
+ "}"
]
},
{
@@ -299,10 +2448,36 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 68,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fitting 5 folds for each of 81 candidates, totalling 405 fits\n",
+ "Best parameters: {'estimator__max_depth': 5, 'estimator__min_samples_split': 5, 'max_samples': 2000, 'n_estimators': 50}\n",
+ "Best CV accuracy: 0.7959854576990338\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "grid_search = GridSearchCV(\n",
+ "    estimator=BaggingClassifier(DecisionTreeClassifier(), random_state=42),  # seeded so the selected hyper-parameters are reproducible\n",
+ "    param_grid=param_grid,\n",
+ "    cv=5,\n",
+ "    scoring='accuracy',\n",
+ "    n_jobs=-1,\n",
+ "    verbose=2\n",
+ ")\n",
+ "\n",
+ "grid_search.fit(X_train_norm, y_train)\n",
+ "\n",
+ "print(\"Best parameters:\", grid_search.best_params_)\n",
+ "print(\"Best CV accuracy:\", grid_search.best_score_)"
+ ]
},
{
"cell_type": "markdown",
@@ -313,15 +2488,48 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 70,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "=== Tuned BaggingClassifier ===\n",
+ "Accuracy: 0.7965204236006052\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 0.78 0.80 0.79 634\n",
+ " True 0.81 0.80 0.80 688\n",
+ "\n",
+ " accuracy 0.80 1322\n",
+ " macro avg 0.80 0.80 0.80 1322\n",
+ "weighted avg 0.80 0.80 0.80 1322\n",
+ "\n",
+ "\n",
+ "=== Final Comparison ===\n",
+ "Baseline BaggingClassifier: 0.7965\n",
+ "Tuned BaggingClassifier: 0.7965204236006052\n"
+ ]
+ }
+ ],
+ "source": [
+ "best_model = grid_search.best_estimator_\n",
+ "pred_best = best_model.predict(X_test_norm)\n",
+ "\n",
+ "print(\"=== Tuned BaggingClassifier ===\")\n",
+ "print(\"Accuracy:\", accuracy_score(y_test, pred_best))\n",
+ "print(classification_report(y_test, pred_best))\n",
+ "\n",
+ "print(\"\\n=== Final Comparison ===\")\n",
+ "print(f\"Baseline BaggingClassifier: {accuracy_score(y_test, pred):.4f}\")  # computed from the untuned model's predictions (pred) instead of a pasted number\n",
+ "print(\"Tuned BaggingClassifier: \", accuracy_score(y_test, pred_best))"
+ ]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "base",
"language": "python",
"name": "python3"
},
@@ -335,7 +2543,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.13.5"
}
},
"nbformat": 4,