diff --git a/smoker-models.ipynb b/smoker-models.ipynb index d3902e6..4c99698 100644 --- a/smoker-models.ipynb +++ b/smoker-models.ipynb @@ -20,9 +20,7 @@ "from sklearn.model_selection import train_test_split, GridSearchCV\n", "from sklearn.tree import DecisionTreeClassifier, plot_tree\n", "from sklearn.svm import SVC, LinearSVC\n", - "\n", - "df_train = pd.read_csv(\"data/smoker_train.csv\")\n", - "df_train = df_train.drop_duplicates()" + "from sklearn.model_selection import KFold, cross_val_score\n" ] }, { @@ -40,9 +38,16 @@ "metadata": {}, "outputs": [], "source": [ + "# Load the dataset\n", + "df = pd.read_csv(\"data/smoker_train.csv\")\n", + "df = df.drop_duplicates()\n", + "\n", "# store test error and train error for each model\n", "# [model, test_mse, train_mse, test_accuracy, train_accuracy, test_f1, train_f1]\n", - "model_errors = []" + "model_errors = []\n", + "\n", + "# Split the data into test and train sets\n", + "df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)" ] }, { @@ -61,20 +66,31 @@ "outputs": [], "source": [ "# Modell, das immer \"Non-Smoker\" (0) vorhersagt\n", - "y_true = df_train['smoking']\n", - "y_pred_naive = np.zeros_like(y_true)\n", + "y = df_train['smoking']\n", + "y_pred_naive = np.zeros_like(y)\n", "\n", "# Fehlerwerte berechnen\n", - "accuracy = accuracy_score(y_true, y_pred_naive)\n", - "f1 = f1_score(y_true, y_pred_naive)\n", - "mse = mean_squared_error(y_true, y_pred_naive)\n", + "accuracy = accuracy_score(y, y_pred_naive)\n", + "f1 = f1_score(y, y_pred_naive)\n", + "mse = mean_squared_error(y, y_pred_naive)\n", "\n", "print(f\"Accuracy (immer Non-Smoker): {accuracy:.4f}\")\n", "print(f\"F1-Score (immer Non-Smoker): {f1:.4f}\")\n", "print(f\"Mean Squared Error (immer Non-Smoker): {mse:.4f}\")\n", "\n", + "# Fehlerwerte für testset berechnen\n", + "y_test = df_test['smoking']\n", + "y_test_pred_naive = np.zeros_like(y_test)\n", + "accuracy_test = accuracy_score(y_test, y_test_pred_naive)\n", + "f1_test = 
f1_score(y_test, y_test_pred_naive)\n", + "mse_test = mean_squared_error(y_test, y_test_pred_naive)\n", + "\n", + "print(f\"Test Accuracy (immer Non-Smoker): {accuracy_test:.4f}\")\n", + "print(f\"Test F1-Score (immer Non-Smoker): {f1_test:.4f}\")\n", + "print(f\"Test Mean Squared Error (immer Non-Smoker): {mse_test:.4f}\")\n", + "\n", "\n", - "model_errors.append(['Naive Model', mse, mse, accuracy, accuracy, f1, f1])" + "model_errors.append(['Naive Model', mse_test, mse, accuracy_test, accuracy, f1_test, f1])" ] }, { @@ -82,7 +98,8 @@ "id": "68aff011", "metadata": {}, "source": [ - "## Linear Regression" + "## Linear Regression\n", + "Not relevant to smoker prediction." ] }, { @@ -113,6 +130,120 @@ "print(f\"Test Mean Squared Error: {mse}\")" ] }, + { + "cell_type": "markdown", + "id": "1d00ddb7", + "metadata": {}, + "source": [ + "## Decision Tree limited depth of 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78711fd5", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Input Variablen\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", + "\n", + "# Output Variable\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "dtc = DecisionTreeClassifier(random_state=0, max_depth=1).fit(X, y)\n", + "\n", + "# Modellanwendung\n", + "y_pred_train = dtc.predict(X)\n", + "y_pred = dtc.predict(X_test)\n", + "\n", + "# Calc error values\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"Decision Tree Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Decision Tree Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Decision Tree Test F1-Score: {f1_test:.4f}\")\n", + 
"print(f\"Decision Tree Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['Decision Tree max depth of 1', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a200ebd1", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(20, 10))\n", + "plot_tree(dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "6e92b579", + "metadata": {}, + "source": [ + "## Decision Tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48c88bb2", + "metadata": {}, + "outputs": [], + "source": [ + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "# Training der Modelle\n", + "dtc = DecisionTreeClassifier(random_state=0).fit(X, y)\n", + "\n", + "y_pred_train = dtc.predict(X)\n", + "y_pred = dtc.predict(X_test)\n", + "\n", + "# Calc error values\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"Decision Tree Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Decision Tree Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Decision Tree Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Decision Tree Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", 
+ "# How many leafs does the tree have?\n", + "print(f\"Number of leafs in the tree: {dtc.get_n_leaves()}\")\n", + "\n", + "# How deep is the tree?\n", + "print(f\"Depth of the tree: {dtc.get_depth()}\")\n", + "\n", + "model_errors.append(['Decision Tree', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, { "cell_type": "markdown", "id": "093b1869", @@ -121,6 +252,65 @@ "## Decision Tree limited leaves" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "41ce9948", + "metadata": {}, + "outputs": [], + "source": [ + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "# Train the model\n", + "dtc = DecisionTreeClassifier(random_state=0, max_leaf_nodes=7).fit(X, y)\n", + "\n", + "y_pred_train = dtc.predict(X)\n", + "y_pred = dtc.predict(X_test)\n", + "\n", + "# Calc error values\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"Decision Tree Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Decision Tree Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Decision Tree Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Decision Tree Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['Decision Tree max 7 leaf nodes', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0328d7b7", + "metadata": {}, + "outputs": [], + "source": [ + "# Entscheidungsbaum des besten Modells
visualisieren\n", + "plt.figure(figsize=(20, 10))\n", + "plot_tree(dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "c21c2869", + "metadata": {}, + "source": [ + "## Decision Tree limited leaves with cross validation for hyperparameter tuning (max leaves)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -128,24 +318,51 @@ "metadata": {}, "outputs": [], "source": [ - "X = df_train[['height(cm)', 'waist(cm)', 'hemoglobin']]\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", "\n", - "# Split data into training and testing sets (using only training set for comparison)\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_train, y_train, test_size=0.2, random_state=42)\n", + "# Training der Modelle mit K-Fold Cross-Validation\n", + "dtc = DecisionTreeClassifier(random_state=0)\n", "\n", - "# Training der Modelle\n", - "dtc = DecisionTreeClassifier(random_state=0, max_leaf_nodes=7).fit(X_train_1, y_train_1)\n", + "grid_search = GridSearchCV(\n", + " dtc, \n", + " param_grid={\n", + " 'max_leaf_nodes': [2, 5, 8, 9, 10, 11, 12, 17, 20]\n", + " }, \n", + " cv=5,\n", + " scoring='accuracy'\n", + ")\n", + "\n", + "grid_search.fit(X, y)\n", + "\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Training der Modelle mit den besten Parametern\n", + "dtc = grid_search.best_estimator_\n", "\n", - "y_pred_train = dtc.predict(X_train)\n", + "y_pred_train = dtc.predict(X)\n", "y_pred = dtc.predict(X_test)\n", "\n", + "# Calc error values\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", "\n", - "print('Accuracy of Decision Tree-Train: ', 
accuracy_score(y_pred_train, y_train))\n", - "print('Accuracy of Decision Tree-Test: ', accuracy_score(y_pred, y_test))\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", "\n", - "model_errors.append(['Decision Tree', 0, 0, accuracy_score(y_pred, y_test), accuracy_score(y_pred_train, y_train), f1_score(y_test, y_pred), f1_score(y_train, y_pred_train)])" + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"Decision Tree Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Decision Tree Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Decision Tree Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Decision Tree Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['Decision Tree cv for max leaf', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { @@ -155,66 +372,93 @@ "metadata": {}, "outputs": [], "source": [ - "plot_tree(dtc)" + "# Entscheidungsbaum des besten Modells visualisieren\n", + "plt.figure(figsize=(20, 10))\n", + "plot_tree(dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", + "plt.show()" ] }, { "cell_type": "markdown", - "id": "1d00ddb7", + "id": "76808385", "metadata": {}, "source": [ - "## Decision Tree limited depth" + "## Decision Tree limited depth with cross validation for hyperparameter tuning (max depth)" ] }, { "cell_type": "code", "execution_count": null, - "id": "78711fd5", + "id": "2b54c8cf", "metadata": {}, "outputs": [], "source": [ - "\n", - "# Input Variablen\n", "X = df_train.drop('smoking', axis=1)\n", - "# Output Variable\n", + "X_test = df_test.drop('smoking', axis=1)\n", "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", "\n", - "# Test und Trainingssplit\n", - "X_train, 
X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "# array100 = list(range(1, 20))\n", - "# for depth in array100:\n", - "# # Decision Tree Classifier\n", - "# dtc = DecisionTreeClassifier(random_state=0, max_depth=depth).fit(X_train, y_train)\n", - " \n", - "# # Modellanwendung\n", - "# y_pred_train = dtc.predict(X_train)\n", - "# y_pred = dtc.predict(X_test)\n", - " \n", - "# print(f'Depth: {depth}')\n", - "# print('Accuracy of Decision Tree-Train: ', accuracy_score(y_pred_train, y_train))\n", - "# print('Accuracy of Decision Tree-Test: ', accuracy_score(y_pred, y_test))\n", - " # Decision Tree Classifier\n", - "dtc = DecisionTreeClassifier(random_state=0, max_depth=4).fit(X_train, y_train)\n", + "# Training der Modelle mit K-Fold Cross-Validation\n", + "dtc = DecisionTreeClassifier(random_state=0)\n", "\n", - "# Modellanwendung\n", - "y_pred_train = dtc.predict(X_train)\n", + "grid_search = GridSearchCV(\n", + " dtc, \n", + " param_grid={\n", + " 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n", + " }, \n", + " cv=5,\n", + " scoring='accuracy'\n", + ")\n", + "\n", + "grid_search.fit(X, y)\n", + "\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Training der Modelle mit den besten Parametern\n", + "dtc = grid_search.best_estimator_\n", + "\n", + "y_pred_train = dtc.predict(X)\n", "y_pred = dtc.predict(X_test)\n", "\n", - "print('Accuracy of Decision Tree-Train: ', accuracy_score(y_pred_train, y_train))\n", - "print('Accuracy of Decision Tree-Test: ', accuracy_score(y_pred, y_test))\n", + "# Calc error values\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", "\n", - "model_errors.append(['Decision Tree (max_depth=4)', 0, 0, 
accuracy_score(y_pred, y_test), accuracy_score(y_pred_train, y_train), f1_score(y_test, y_pred), f1_score(y_train, y_pred_train)])\n" + "print(f\"Decision Tree Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Decision Tree Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Decision Tree Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Decision Tree Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['Decision Tree cv for max depth', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { "cell_type": "code", "execution_count": null, - "id": "a200ebd1", + "id": "24ac9239", "metadata": {}, "outputs": [], "source": [ - "plot_tree(dtc)" + "# Entscheidungsbaum des besten Modells visualisieren\n", + "plt.figure(figsize=(20, 10))\n", + "plot_tree(dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f5efebe5", + "metadata": {}, + "source": [ + "## Decision Tree cost-complexity pruning with cross validation for hyperparameter tuning (ccp_alpha)" ] }, { @@ -224,43 +468,64 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "# Input Variablen\n", "X = df_train.drop('smoking', axis=1)\n", - "# Output Variable\n", + "X_test = df_test.drop('smoking', axis=1)\n", "y = df_train['smoking']\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "y_test = df_test['smoking']\n", "\n", - "# Ermittlung aller ccp_alpha-Werte durch den Pruning-Pfad\n", - "path = dtc.cost_complexity_pruning_path(X_train, y_train)\n", + "# Training der Modelle mit K-Fold Cross-Validation\n", + "dtc = DecisionTreeClassifier(random_state=0, max_depth=7).fit(X, y)\n", + "\n", + "\n", + "path = dtc.cost_complexity_pruning_path(X, y)\n", "ccp_alphas = path.ccp_alphas\n", "\n", 
- "# Verwendung von GridSearchCV zur Bestimmung des besten ccp_alpha-Werts\n", - "param_grid = {'ccp_alpha': ccp_alphas}\n", - "grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)\n", - "grid_search.fit(X_train, y_train)\n", - "\n", - "# Das beste ccp_alpha ermitteln\n", - "best_ccp_alpha = grid_search.best_params_['ccp_alpha']\n", - "print(f\"Bestes ccp_alpha durch Cross-Validation: {best_ccp_alpha}\")\n", - "\n", - "# Modell mit dem besten ccp_alpha trainieren\n", - "best_dtc = DecisionTreeClassifier(random_state=42, ccp_alpha=best_ccp_alpha)\n", - "best_dtc.fit(X_train, y_train)\n", - "\n", - "# Vorhersagen und Genauigkeit auf den Testdaten\n", - "y_pred = best_dtc.predict(X_test)\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "f1 = f1_score(y_test, y_pred)\n", - "print(f\"Genauigkeit des besten Modells auf dem Testdatensatz: {accuracy:.4f}\")\n", - "print(f\"Genauigkeit des besten Modells auf dem Testdatensatz (f1): {f1:.4f}\")\n", - "print(f\"Mean Squared Error des besten Modells auf dem Testdatensatz: {mean_squared_error(y_test, y_pred):.4f}\")\n", - "# Entscheidungsbaum des besten Modells visualisieren\n", - "plt.figure(figsize=(20, 10))\n", - "plot_tree(best_dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", - "plt.show()\n", + "grid_search = GridSearchCV(\n", + " dtc, \n", + " param_grid={\n", + " 'ccp_alpha': ccp_alphas\n", + " }, \n", + " cv=5,\n", + " scoring='accuracy'\n", + " \n", + ")\n", "\n", - "model_errors.append(['Decision Tree (best ccp_alpha)', 0, 0, accuracy, accuracy_score(y_train, best_dtc.predict(X_train)), f1, f1_score(y_train, best_dtc.predict(X_train))])" + "grid_search.fit(X, y)\n", + "\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Training der Modelle mit den besten Parametern\n", + "dtc = grid_search.best_estimator_\n", + "\n", + "y_pred_train = dtc.predict(X)\n", + "y_pred = dtc.predict(X_test)\n", + "\n", + "# Calc 
error values\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"Decision Tree Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Decision Tree Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Decision Tree Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Decision Tree Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['Decision Tree cv for ccp_alpha', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "463161d9", + "metadata": {}, + "source": [ + "# SVM" ] }, { @@ -268,7 +533,216 @@ "id": "2c9329e5", "metadata": {}, "source": [ - "## SVC Modelle" + "## SVM Modell with cross validation for hyperparameter tuning (C, kernel, gamma)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f80066d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Daten vorbereiten\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "# SVM-Modell\n", + "svm = SVC(random_state=0)\n", + "\n", + "# Grid mit Parametern – einfache Auswahl für Demo-Zwecke\n", + "param_grid = {\n", + " 'C': [0.1, 1, 10],\n", + " 'kernel': ['linear', 'rbf'],\n", + " 'gamma': ['scale', 'auto']\n", + "}\n", + "\n", + "# Cross-Validation mit 5 Folds\n", + "grid_search = GridSearchCV(\n", + " svm,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring='accuracy',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "# Training mit CV\n", + 
"grid_search.fit(X, y)\n", + "\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Modell mit besten Parametern\n", + "svm_best = grid_search.best_estimator_\n", + "\n", + "# Vorhersagen\n", + "y_pred_train = svm_best.predict(X)\n", + "y_pred = svm_best.predict(X_test)\n", + "\n", + "# Metriken\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Fehlerliste ergänzen\n", + "model_errors.append(['SVM cv for C/kernel', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "1c9e5492", + "metadata": {}, + "source": [ + "### Output Parameter tuning\n", + "Best parameters found: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}\n", + "SVM Test Accuracy: 0.7269\n", + "SVM Train Accuracy: 0.7493\n", + "SVM Test F1-Score: 0.6024\n", + "SVM Train F1-Score: 0.6291\n", + "SVM Test Mean Squared Error: 0.2731\n", + "SVM Train Mean Squared Error: 0.2507" + ] + }, + { + "cell_type": "markdown", + "id": "11decbc2", + "metadata": {}, + "source": [ + "## SVM Modell with cross validation for hyperparameter tuning (C)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b639d89", + "metadata": {}, + "outputs": [], + "source": [ + "# Daten vorbereiten\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', 
axis=1)\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "# SVM model\n", + "svm = SVC(random_state=0)\n", + "\n", + "# Parameter grid - a simple selection for demo purposes\n", + "param_grid = {\n", + " 'C': [5, 10, 15, 20],\n", + "}\n", + "\n", + "# Cross-validation with 5 folds\n", + "grid_search = GridSearchCV(\n", + " svm,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring='accuracy',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "# Fit with CV\n", + "grid_search.fit(X, y)\n", + "\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Model with the best parameters\n", + "svm_best = grid_search.best_estimator_\n", + "\n", + "# Predictions\n", + "y_pred_train = svm_best.predict(X)\n", + "y_pred = svm_best.predict(X_test)\n", + "\n", + "# Metrics\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Append to the error list (unique name: the plots group rows by model name)\n", + "model_errors.append(['SVM cv for C', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "c9d03ad9", + "metadata": {}, + "source": [ + "## SVM Modell Hyperparameter tuned with limited Columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ead21ae2", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the data\n", + "X = 
df_train[['height(cm)', 'hemoglobin', 'Gtp', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol']]\n", + "X_test = df_test[['height(cm)', 'hemoglobin', 'Gtp', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol']]\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "\n", + "# Train the SVC model\n", + "svc_model = SVC(random_state=42, kernel='rbf', C=10, gamma='scale')\n", + "svc_model.fit(X, y)\n", + "\n", + "# Make predictions\n", + "y_pred_train = svc_model.predict(X)\n", + "y_pred = svc_model.predict(X_test)\n", + "\n", + "# Metrics\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['SVM tuned limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "aac5f7b2", + "metadata": {}, + "source": [ + "## SVM Modell" ] }, { @@ -278,28 +752,47 @@ "metadata": {}, "outputs": [], "source": [ - "# Define features and target variable\n", + "# Prepare the data\n", "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", "\n", - "# Split the data into training and testing sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", "# Train the SVC model\n", "svc_model 
= SVC(random_state=42)\n", - "svc_model.fit(X_train, y_train)\n", + "svc_model.fit(X, y)\n", "\n", "# Make predictions\n", + "y_pred_train = svc_model.predict(X)\n", "y_pred = svc_model.predict(X_test)\n", "\n", - "# Evaluate the model\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "f1 = f1_score(y_test, y_pred)\n", - "print(f\"Genauigkeit des besten Modells auf dem Testdatensatz: {accuracy:.4f}\")\n", - "print(f\"Genauigkeit des besten Modells auf dem Testdatensatz (f1): {f1:.4f}\")\n", + "# Metriken\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", "\n", - "model_errors.append(['SVC', 0, 0, accuracy, accuracy_score(y_train, svc_model.predict(X_train)), f1, f1_score(y_train, svc_model.predict(X_train))])" + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['SVM', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "d73eba52", + "metadata": {}, + "source": [ + "## SVM Linear Kernel Model" ] }, { @@ -309,28 +802,139 @@ "metadata": {}, "outputs": [], "source": [ - "# Define features and target variable\n", + "# Daten vorbereiten\n", "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", "\n", - "# Split the data into training and testing sets\n", - "X_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", "# Train the SVC model\n", "svc_model = LinearSVC(random_state=42)\n", - "svc_model.fit(X_train, y_train)\n", + "svc_model.fit(X, y)\n", "\n", "# Make predictions\n", + "y_pred_train = svc_model.predict(X)\n", "y_pred = svc_model.predict(X_test)\n", "\n", - "# Evaluate the model\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "f1 = f1_score(y_test, y_pred)\n", - "print(f\"Genauigkeit des besten Modells auf dem Testdatensatz: {accuracy:.4f}\")\n", - "print(f\"Genauigkeit des besten Modells auf dem Testdatensatz (f1): {f1:.4f}\")\n", + "# Metrics\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", - "model_errors.append(['Linear SVC', 0, 0, accuracy, accuracy_score(y_train, svc_model.predict(X_train)), f1, f1_score(y_train, svc_model.predict(X_train))])" + "model_errors.append(['Linear SVC', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "02e71868", + "metadata": {}, + "source": [ + "## SVM Modell limited columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52db280c", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the data\n", + "X = df_train[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "X_test = df_test[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "y = 
df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "\n", + "# Train the SVC model\n", + "svc_model = SVC(random_state=42)\n", + "svc_model.fit(X, y)\n", + "\n", + "# Make predictions\n", + "y_pred_train = svc_model.predict(X)\n", + "y_pred = svc_model.predict(X_test)\n", + "\n", + "# Metriken\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['SVM limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "371b1933", + "metadata": {}, + "source": [ + "## SVM Linear Kernel Modell limited columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fe14bef", + "metadata": {}, + "outputs": [], + "source": [ + "# Daten vorbereiten\n", + "X = df_train[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "X_test = df_test[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "\n", + "# Train the SVC model\n", + "svc_model = LinearSVC(random_state=42)\n", + "svc_model.fit(X, y)\n", + "\n", + "# Make predictions\n", + "y_pred_train = svc_model.predict(X)\n", + "y_pred = svc_model.predict(X_test)\n", + "\n", + "# Metriken\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train 
= f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['SVM Linear Kernel limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { @@ -392,6 +996,44 @@ "id": "7c225dfb", "metadata": {}, "outputs": [], + "source": [ + "# Visualize model errors (MSE); legend labels name the metric actually plotted\n", + "model_errors_df = pd.DataFrame(model_errors, columns=['Model', 'Test MSE', 'Train MSE', 'Test Accuracy', 'Train Accuracy', 'Test F1', 'Train F1'])\n", + "plt.figure(figsize=(12, 8))\n", + "sns.lineplot(x='Model', y='Test MSE', data=model_errors_df, color='blue', label='Test MSE')\n", + "sns.lineplot(x='Model', y='Train MSE', data=model_errors_df, color='orange', label='Train MSE', marker='o')\n", + "plt.xticks(rotation=45)\n", + "plt.title('Model Performance Comparison')\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a635d0b7", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize model errors (F1); legend labels name the metric actually plotted\n", + "model_errors_df = pd.DataFrame(model_errors, columns=['Model', 'Test MSE', 'Train MSE', 'Test Accuracy', 'Train Accuracy', 'Test F1', 'Train F1'])\n", + "plt.figure(figsize=(12, 8))\n", + "sns.lineplot(x='Model', y='Test F1', data=model_errors_df, color='blue', label='Test F1')\n", + "sns.lineplot(x='Model', y='Train F1', data=model_errors_df, color='orange', label='Train F1', marker='o')\n", + "plt.xticks(rotation=45)\n", + "plt.title('Model
Performance Comparison')\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0031537d", + "metadata": {}, + "outputs": [], "source": [ "# Visualize model errors\n", "model_errors_df = pd.DataFrame(model_errors, columns=['Model', 'Test MSE', 'Train MSE', 'Test Accuracy', 'Train Accuracy', 'Test F1', 'Train F1'])\n", @@ -422,7 +1064,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.3" + "version": "3.12.1" } }, "nbformat": 4,