diff --git a/smoker-models.ipynb b/smoker-models.ipynb index 4c99698..0af9d4a 100644 --- a/smoker-models.ipynb +++ b/smoker-models.ipynb @@ -12,15 +12,19 @@ "from matplotlib.pyplot import subplots\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", + "import graphviz\n", "\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import mean_squared_error, r2_score,accuracy_score, confusion_matrix, classification_report, accuracy_score, f1_score\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.model_selection import train_test_split, GridSearchCV\n", - "from sklearn.tree import DecisionTreeClassifier, plot_tree\n", + "from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz\n", "from sklearn.svm import SVC, LinearSVC\n", - "from sklearn.model_selection import KFold, cross_val_score\n" + "from sklearn.model_selection import KFold, cross_val_score\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from IPython.display import Image\n", + "\n" ] }, { @@ -45,6 +49,9 @@ "# store test error and train error for each model\n", "# [model, test_mse, train_mse, test_accuracy, train_accuracy, test_f1, train_f1]\n", "model_errors = []\n", + "DTC_model_errors = []\n", + "RFC_model_errors = []\n", + "SVM_model_errors = []\n", "\n", "# Split the data into test and train sets\n", "df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)" @@ -90,7 +97,10 @@ "print(f\"Test Mean Squared Error (immer Non-Smoker): {mse_test:.4f}\")\n", "\n", "\n", - "model_errors.append(['Naive Model', mse_test, mse, accuracy_test, accuracy, f1_test, f1])" + "model_errors.append(['Naive Model', mse_test, mse, accuracy_test, accuracy, f1_test, f1])\n", + "DTC_model_errors.append(['Naive Model', mse_test, mse, accuracy_test, accuracy, f1_test, f1])\n", + "RFC_model_errors.append(['Naive Model', mse_test, mse, accuracy_test, accuracy, 
f1_test, f1])\n", + "SVM_model_errors.append(['Naive Model', mse_test, mse, accuracy_test, accuracy, f1_test, f1])" ] }, { @@ -130,6 +140,14 @@ "print(f\"Test Mean Squared Error: {mse}\")" ] }, + { + "cell_type": "markdown", + "id": "7a3e89f2", + "metadata": {}, + "source": [ + "# Decision Trees" + ] + }, { "cell_type": "markdown", "id": "1d00ddb7", @@ -177,7 +195,8 @@ "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", - "model_errors.append(['Decision Tree max depth of 1', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "model_errors.append(['Decision Tree max depth of 1', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "DTC_model_errors.append(['Decision Tree max depth of 1', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { @@ -187,8 +206,10 @@ "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize=(20, 10))\n", + "\n", + "plt.figure(figsize=(4, 3), dpi=200)\n", "plot_tree(dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", + "plt.title(\"Decision Tree Depth=1\")\n", "plt.show()" ] }, @@ -241,7 +262,21 @@ "# How deep is the tree?\n", "print(f\"Depth of the tree: {dtc.get_depth()}\")\n", "\n", - "model_errors.append(['Decision Tree', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "model_errors.append(['Decision Tree', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "DTC_model_errors.append(['Decision Tree', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "634d512b", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(20, 10))\n", + "plot_tree(dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", + "plt.title(\"Decision Tree no 
Limitations\")\n", + "plt.show()" ] }, { @@ -287,7 +322,8 @@ "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", - "model_errors.append(['Decision Tree', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "model_errors.append(['Decision Tree limited leaves', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "DTC_model_errors.append(['Decision Tree limited leaves', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { @@ -362,7 +398,8 @@ "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", - "model_errors.append(['Decision Tree cv for max leaf', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "model_errors.append(['Decision Tree cv for max leaf', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "DTC_model_errors.append(['Decision Tree cv for max leaf', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { @@ -375,6 +412,7 @@ "# Entscheidungsbaum des besten Modells visualisieren\n", "plt.figure(figsize=(20, 10))\n", "plot_tree(dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", + "plt.title(\"Decision Tree limited Leaves\")\n", "plt.show()" ] }, @@ -437,7 +475,8 @@ "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", - "model_errors.append(['Decision Tree cv for max depth', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "model_errors.append(['Decision Tree cv for max depth', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "DTC_model_errors.append(['Decision Tree cv for max depth', mse_test, mse_train, accuracy_test, 
accuracy_train, f1_test, f1_train])" ] }, { @@ -450,6 +489,7 @@ "# Entscheidungsbaum des besten Modells visualisieren\n", "plt.figure(figsize=(20, 10))\n", "plot_tree(dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", + "plt.title(\"Decision Tree limited Depth\")\n", "plt.show()" ] }, @@ -474,7 +514,7 @@ "y_test = df_test['smoking']\n", "\n", "# Training der Modelle mit K-Fold Cross-Validation\n", - "dtc = DecisionTreeClassifier(random_state=0, max_depth=7).fit(X, y)\n", + "dtc = DecisionTreeClassifier(random_state=0).fit(X, y)\n", "\n", "\n", "path = dtc.cost_complexity_pruning_path(X, y)\n", @@ -517,29 +557,130 @@ "print(f\"Decision Tree Test Mean Squared Error: {mse_test:.4f}\")\n", "print(f\"Decision Tree Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", - "model_errors.append(['Decision Tree cv for ccp_alpha', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "model_errors.append(['Decision Tree cv for ccp_alpha', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "DTC_model_errors.append(['Decision Tree cv for ccp_alpha', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31a76e71", + "metadata": {}, + "outputs": [], + "source": [ + "# Entscheidungsbaum des besten Modells visualisieren\n", + "plt.figure(figsize=(20, 10))\n", + "plot_tree(dtc, filled=True, feature_names=X.columns, class_names=[\"Non Smoker\", \"Smoker\"])\n", + "plt.title(\"Decision Tree Cost Complexity Pruning\")\n", + "plt.show()" ] }, { "cell_type": "markdown", - "id": "463161d9", + "id": "b548e9bb", "metadata": {}, "source": [ - "# SVM" + "## Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8557ad7", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize model errors\n", + "model_errors_df = pd.DataFrame(DTC_model_errors, columns=['Model', 'Test MSE', 'Train 
MSE', 'Test Accuracy', 'Train Accuracy', 'Test F1', 'Train F1'])\n", + "plt.figure(figsize=(12, 8))\n", + "sns.lineplot(x='Model', y='Test MSE', data=model_errors_df, color='blue', label='Test MSE')\n", + "sns.lineplot(x='Model', y='Train MSE', data=model_errors_df, color='orange', label='Train MSE', marker='o')\n", + "plt.xticks(rotation=45)\n", + "plt.title('Model Performance Comparison MSE')\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "152befb9", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize model errors\n", + "model_errors_df = pd.DataFrame(DTC_model_errors, columns=['Model', 'Test MSE', 'Train MSE', 'Test Accuracy', 'Train Accuracy', 'Test F1', 'Train F1'])\n", + "plt.figure(figsize=(12, 8))\n", + "sns.lineplot(x='Model', y='Test F1', data=model_errors_df, color='blue', label='Test F1')\n", + "sns.lineplot(x='Model', y='Train F1', data=model_errors_df, color='orange', label='Train F1', marker='o')\n", + "plt.xticks(rotation=45)\n", + "plt.title('Model Performance Comparison F1')\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7916fec1", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize model errors\n", + "model_errors_df = pd.DataFrame(DTC_model_errors, columns=['Model', 'Test MSE', 'Train MSE', 'Test Accuracy', 'Train Accuracy', 'Test F1', 'Train F1'])\n", + "plt.figure(figsize=(12, 8))\n", + "sns.lineplot(x='Model', y='Test Accuracy', data=model_errors_df, color='blue', label='Test Accuracy')\n", + "sns.lineplot(x='Model', y='Train Accuracy', data=model_errors_df, color='orange', label='Train Accuracy', marker='o')\n", + "plt.xticks(rotation=45)\n", + "plt.title('Model Performance Comparison Accuracy')\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "d8f5b0e1", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize model errors\n", + "model_errors_df = pd.DataFrame(DTC_model_errors, columns=['Model', 'Test MSE', 'Train MSE', 'Test Accuracy', 'Train Accuracy', 'Test F1', 'Train F1'])\n", + "plt.figure(figsize=(12, 8))\n", + "sns.lineplot(x='Model', y='Test Accuracy', data=model_errors_df, color='blue', label='Test Accuracy', marker='o')\n", + "sns.lineplot(x='Model', y='Train Accuracy', data=model_errors_df, color='orange', label='Train Accuracy', marker='o')\n", + "sns.lineplot(x='Model', y='Test F1', data=model_errors_df, color='purple', label='Test F1', marker='o')\n", + "sns.lineplot(x='Model', y='Train F1', data=model_errors_df, color='red', label='Train F1', marker='o')\n", + "plt.xticks(rotation=45)\n", + "plt.title('Model Performance Comparison Accuracy')\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" ] }, { "cell_type": "markdown", - "id": "2c9329e5", + "id": "c7240019", "metadata": {}, "source": [ - "## SVM Modell with cross validation for hyperparameter tuning (C, kernel, gamma)" + "# Random Forest" + ] + }, + { + "cell_type": "markdown", + "id": "2a617eac", + "metadata": {}, + "source": [ + "## Random forest no tuning" ] }, { "cell_type": "code", "execution_count": null, - "id": "f80066d5", + "id": "5ec872f2", "metadata": {}, "outputs": [], "source": [ @@ -549,38 +690,15 @@ "y = df_train['smoking']\n", "y_test = df_test['smoking']\n", "\n", - "# SVM-Modell\n", - "svm = SVC(random_state=0)\n", - "\n", - "# Grid mit Parametern – einfache Auswahl für Demo-Zwecke\n", - "param_grid = {\n", - " 'C': [0.1, 1, 10],\n", - " 'kernel': ['linear', 'rbf'],\n", - " 'gamma': ['scale', 'auto']\n", - "}\n", - "\n", - "# Cross-Validation mit 5 Folds\n", - "grid_search = GridSearchCV(\n", - " svm,\n", - " param_grid=param_grid,\n", - " cv=5,\n", - " scoring='accuracy',\n", - " n_jobs=-1\n", - ")\n", - "\n", - "# Training mit CV\n", - "grid_search.fit(X, y)\n", - "\n", - "print(\"Best 
parameters found: \", grid_search.best_params_)\n", - "\n", - "# Modell mit besten Parametern\n", - "svm_best = grid_search.best_estimator_\n", + "# Random Forest Modell trainieren\n", + "rfc = RandomForestClassifier(random_state=0)\n", + "rfc.fit(X, y)\n", "\n", - "# Vorhersagen\n", - "y_pred_train = svm_best.predict(X)\n", - "y_pred = svm_best.predict(X_test)\n", + "# Vorhersagen berechnen\n", + "y_pred_train = rfc.predict(X)\n", + "y_pred = rfc.predict(X_test)\n", "\n", - "# Metriken\n", + "# Metriken berechnen\n", "accuracy_train = accuracy_score(y, y_pred_train)\n", "accuracy_test = accuracy_score(y_test, y_pred)\n", "\n", @@ -590,44 +708,55 @@ "mse_train = mean_squared_error(y, y_pred_train)\n", "mse_test = mean_squared_error(y_test, y_pred)\n", "\n", - "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", - "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", - "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", - "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", - "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", - "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", - "\n", - "# Fehlerliste ergänzen\n", - "model_errors.append(['SVM cv for C/kernel', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "# Ergebnisse ausgeben\n", + "print(f\"Random Forest Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Random Forest Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Random Forest Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Random Forest Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Random Forest Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Random Forest Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Anzahl der genutzten Bäume\n", + "print(f\"Number of trees in the forest: {len(rfc.estimators_)}\")\n", + "\n", + "# Durchschnittliche Tiefe der Bäume ausgeben\n", + "tree_depths = [estimator.tree_.max_depth for estimator in rfc.estimators_]\n", + "avg_depth = 
sum(tree_depths) / len(tree_depths)\n", + "print(f\"Average depth of the trees: {avg_depth:.2f}\")\n", + "\n", + "# Modellfehlerliste ergänzen\n", + "model_errors.append(['Random Forest', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "RFC_model_errors.append(['Random Forest', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { - "cell_type": "markdown", - "id": "1c9e5492", + "cell_type": "code", + "execution_count": null, + "id": "b765c6ce", "metadata": {}, + "outputs": [], "source": [ - "### Output Parameter tuning\n", - "Best parameters found: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}\n", - "SVM Test Accuracy: 0.7269\n", - "SVM Train Accuracy: 0.7493\n", - "SVM Test F1-Score: 0.6024\n", - "SVM Train F1-Score: 0.6291\n", - "SVM Test Mean Squared Error: 0.2731\n", - "SVM Train Mean Squared Error: 0.2507" + "# Feature Importances\n", + "feat_importances = pd.Series(rfc.feature_importances_, index=X.columns)\n", + "feat_importances.sort_values(ascending=True).plot(kind='barh', figsize=(8,6))\n", + "plt.title(\"Feature Importances im Random Forest Modell\")\n", + "plt.xlabel(\"Wichtigkeit\")\n", + "plt.tight_layout()\n", + "plt.show()" ] }, { "cell_type": "markdown", - "id": "11decbc2", + "id": "80d9d25c", "metadata": {}, "source": [ - "## SVM Modell with cross validation for hyperparameter tuning (C)" + "## Random Forest cross validation hyperparametertuning n_estimators" ] }, { "cell_type": "code", "execution_count": null, - "id": "9b639d89", + "id": "8d30b2b7", "metadata": {}, "outputs": [], "source": [ @@ -637,17 +766,17 @@ "y = df_train['smoking']\n", "y_test = df_test['smoking']\n", "\n", - "# SVM-Modell\n", - "svm = SVC(random_state=0)\n", + "# Random Forest Modell definieren\n", + "rfc = RandomForestClassifier(random_state=0)\n", "\n", - "# Grid mit Parametern – einfache Auswahl für Demo-Zwecke\n", + "# Hyperparameter Grid definieren\n", "param_grid = {\n", - " 'C': [5, 10, 15, 20],\n", + " 
'n_estimators': [50, 100, 150, 200]\n", "}\n", "\n", - "# Cross-Validation mit 5 Folds\n", + "# GridSearch mit Cross-Validation\n", "grid_search = GridSearchCV(\n", - " svm,\n", + " estimator=rfc,\n", " param_grid=param_grid,\n", " cv=5,\n", " scoring='accuracy',\n", @@ -657,16 +786,17 @@ "# Training mit CV\n", "grid_search.fit(X, y)\n", "\n", + "# Beste Parameter anzeigen\n", "print(\"Best parameters found: \", grid_search.best_params_)\n", "\n", - "# Modell mit besten Parametern\n", - "svm_best = grid_search.best_estimator_\n", + "# Bestes Modell verwenden\n", + "rfc_best = grid_search.best_estimator_\n", "\n", - "# Vorhersagen\n", - "y_pred_train = svm_best.predict(X)\n", - "y_pred = svm_best.predict(X_test)\n", + "# Vorhersagen berechnen\n", + "y_pred_train = rfc_best.predict(X)\n", + "y_pred = rfc_best.predict(X_test)\n", "\n", - "# Metriken\n", + "# Metriken berechnen\n", "accuracy_train = accuracy_score(y, y_pred_train)\n", "accuracy_test = accuracy_score(y_test, y_pred)\n", "\n", @@ -676,48 +806,79 @@ "mse_train = mean_squared_error(y, y_pred_train)\n", "mse_test = mean_squared_error(y_test, y_pred)\n", "\n", - "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", - "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", - "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", - "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", - "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", - "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", - "\n", - "# Fehlerliste ergänzen\n", - "model_errors.append(['SVM cv for C/kernel', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "# Ergebnisse ausgeben\n", + "print(f\"Random Forest Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Random Forest Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Random Forest Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Random Forest Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Random Forest Test Mean Squared Error: 
{mse_test:.4f}\")\n", + "print(f\"Random Forest Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Anzahl Bäume im besten Modell\n", + "print(f\"Number of trees in the best model: {rfc_best.n_estimators}\")\n", + "\n", + "# Durchschnittliche Tiefe der Bäume ausgeben\n", + "tree_depths = [estimator.tree_.max_depth for estimator in rfc_best.estimators_]\n", + "avg_depth = sum(tree_depths) / len(tree_depths)\n", + "print(f\"Average depth of the trees: {avg_depth:.2f}\")\n", + "\n", + "# Modellfehlerliste ergänzen\n", + "model_errors.append(['Random Forest cv for n_estimators', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "RFC_model_errors.append(['Random Forest cv for n_estimators', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { "cell_type": "markdown", - "id": "c9d03ad9", + "id": "ed5acbf8", "metadata": {}, "source": [ - "## SVM Modell Hyperparamter tuned with limited Columns" + "## Random Forest cross validation hyperparametertuning max depth" ] }, { "cell_type": "code", "execution_count": null, - "id": "ead21ae2", + "id": "483f05a9", "metadata": {}, "outputs": [], "source": [ "# Daten vorbereiten\n", - "X = df_train[['height(cm)', 'hemoglobin', 'Gtp', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol']]\n", - "X_test = df_test[['height(cm)', 'hemoglobin', 'Gtp', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol']]\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", "y = df_train['smoking']\n", "y_test = df_test['smoking']\n", "\n", + "# Random Forest Modell definieren\n", + "rfc = RandomForestClassifier(random_state=0, n_estimators=200)\n", "\n", - "# Train the SVC model\n", - "svc_model = SVC(random_state=42, kernel='rbf', C=10, gamma='scale')\n", - "svc_model.fit(X, y)\n", + "# Hyperparameter Grid definieren\n", + "param_grid = {\n", + " 'max_depth': [10, 20, 30, 40, 50]\n", + "}\n", "\n", - "# Make predictions\n", - 
"y_pred_train = svc_model.predict(X)\n", - "y_pred = svc_model.predict(X_test)\n", + "# GridSearch mit Cross-Validation\n", + "grid_search = GridSearchCV(\n", + " estimator=rfc,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring='accuracy',\n", + " n_jobs=-1\n", + ")\n", "\n", - "# Metriken\n", + "# Training mit CV\n", + "grid_search.fit(X, y)\n", + "\n", + "# Beste Parameter anzeigen\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Bestes Modell verwenden\n", + "rfc_best = grid_search.best_estimator_\n", + "\n", + "# Vorhersagen berechnen\n", + "y_pred_train = rfc_best.predict(X)\n", + "y_pred = rfc_best.predict(X_test)\n", + "\n", + "# Metriken berechnen\n", "accuracy_train = accuracy_score(y, y_pred_train)\n", "accuracy_test = accuracy_score(y_test, y_pred)\n", "\n", @@ -727,28 +888,39 @@ "mse_train = mean_squared_error(y, y_pred_train)\n", "mse_test = mean_squared_error(y_test, y_pred)\n", "\n", - "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", - "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", - "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", - "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", - "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", - "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", - "\n", - "model_errors.append(['SVM', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "# Ergebnisse ausgeben\n", + "print(f\"Random Forest Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Random Forest Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Random Forest Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Random Forest Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Random Forest Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Random Forest Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Anzahl Bäume im besten Modell\n", + "print(f\"Number of trees in the best model: {rfc_best.n_estimators}\")\n", + "\n", + 
"# Durchschnittliche Tiefe der Bäume ausgeben\n", + "tree_depths = [estimator.tree_.max_depth for estimator in rfc_best.estimators_]\n", + "avg_depth = sum(tree_depths) / len(tree_depths)\n", + "print(f\"Average depth of the trees: {avg_depth:.2f}\")\n", + "\n", + "# Modellfehlerliste ergänzen\n", + "model_errors.append(['Random Forest cv for max_depth', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "RFC_model_errors.append(['Random Forest cv for max_depth', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { "cell_type": "markdown", - "id": "aac5f7b2", + "id": "b1209577", "metadata": {}, "source": [ - "## SVM Modell" + "## Random Forest cross validation hyperparametertuning max features" ] }, { "cell_type": "code", "execution_count": null, - "id": "6cfaacf6", + "id": "ffb5830c", "metadata": {}, "outputs": [], "source": [ @@ -758,16 +930,37 @@ "y = df_train['smoking']\n", "y_test = df_test['smoking']\n", "\n", + "# Random Forest Modell definieren\n", + "rfc = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=20)\n", "\n", - "# Train the SVC model\n", - "svc_model = SVC(random_state=42)\n", - "svc_model.fit(X, y)\n", + "# Hyperparameter Grid definieren\n", + "param_grid = {\n", + " 'max_features': ['sqrt', 3, 5, 10]\n", + "}\n", "\n", - "# Make predictions\n", - "y_pred_train = svc_model.predict(X)\n", - "y_pred = svc_model.predict(X_test)\n", + "# GridSearch mit Cross-Validation\n", + "grid_search = GridSearchCV(\n", + " estimator=rfc,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring='accuracy',\n", + " n_jobs=-1\n", + ")\n", "\n", - "# Metriken\n", + "# Training mit CV\n", + "grid_search.fit(X, y)\n", + "\n", + "# Beste Parameter anzeigen\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Bestes Modell verwenden\n", + "rfc_best = grid_search.best_estimator_\n", + "\n", + "# Vorhersagen berechnen\n", + "y_pred_train = rfc_best.predict(X)\n", + 
"y_pred = rfc_best.predict(X_test)\n", + "\n", + "# Metriken berechnen\n", "accuracy_train = accuracy_score(y, y_pred_train)\n", "accuracy_test = accuracy_score(y_test, y_pred)\n", "\n", @@ -777,28 +970,39 @@ "mse_train = mean_squared_error(y, y_pred_train)\n", "mse_test = mean_squared_error(y_test, y_pred)\n", "\n", - "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", - "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", - "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", - "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", - "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", - "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", - "\n", - "model_errors.append(['SVM', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "# Ergebnisse ausgeben\n", + "print(f\"Random Forest Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Random Forest Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Random Forest Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Random Forest Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"Random Forest Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Random Forest Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Anzahl Bäume im besten Modell\n", + "print(f\"Number of trees in the best model: {rfc_best.n_estimators}\")\n", + "\n", + "# Durchschnittliche Tiefe der Bäume ausgeben\n", + "tree_depths = [estimator.tree_.max_depth for estimator in rfc_best.estimators_]\n", + "avg_depth = sum(tree_depths) / len(tree_depths)\n", + "print(f\"Average depth of the trees: {avg_depth:.2f}\")\n", + "\n", + "# Modellfehlerliste ergänzen\n", + "model_errors.append(['Random Forest cv for max_features', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "RFC_model_errors.append(['Random Forest cv for max_features', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { "cell_type": "markdown", - "id": "d73eba52", + "id": 
"b30b689d", "metadata": {}, "source": [ - "## SVM Linear Kernel Model" + "## Random Forest cross validation hyperparametertuning min samples leaf" ] }, { "cell_type": "code", "execution_count": null, - "id": "fa36707a", + "id": "ec84d6e1", "metadata": {}, "outputs": [], "source": [ @@ -808,10 +1012,379 @@ "y = df_train['smoking']\n", "y_test = df_test['smoking']\n", "\n", + "# Random Forest Modell definieren\n", + "rfc = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=20, max_features='sqrt')\n", "\n", - "# Train the SVC model\n", - "svc_model = LinearSVC(random_state=42)\n", - "svc_model.fit(X, y)\n", + "\n", + "# Parameter Grid für min_samples_leaf\n", + "param_grid = {\n", + " 'min_samples_leaf': [1, 2, 5, 10, 20, 50]\n", + "}\n", + "\n", + "# GridSearchCV Setup\n", + "grid_search = GridSearchCV(\n", + " estimator=rfc,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring='accuracy',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "# Training mit Cross-Validation\n", + "grid_search.fit(X, y)\n", + "\n", + "# Beste Parameter anzeigen\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Bestes Modell verwenden\n", + "rfc_best = grid_search.best_estimator_\n", + "\n", + "# Vorhersagen berechnen\n", + "y_pred_train = rfc_best.predict(X)\n", + "y_pred = rfc_best.predict(X_test)\n", + "\n", + "# Metriken berechnen\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "# Ergebnisse ausgeben\n", + "print(f\"Random Forest Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"Random Forest Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"Random Forest Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"Random Forest Train F1-Score: 
{f1_train:.4f}\")\n", + "print(f\"Random Forest Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"Random Forest Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Anzahl Bäume im besten Modell\n", + "print(f\"Number of trees in the best model: {rfc_best.n_estimators}\")\n", + "\n", + "# Durchschnittliche Tiefe der Bäume ausgeben\n", + "tree_depths = [estimator.tree_.max_depth for estimator in rfc_best.estimators_]\n", + "avg_depth = sum(tree_depths) / len(tree_depths)\n", + "print(f\"Average depth of the trees: {avg_depth:.2f}\")\n", + "\n", + "# Modellfehlerliste ergänzen\n", + "model_errors.append(['Random Forest cv for min_samples_leaf', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "RFC_model_errors.append(['Random Forest cv for min_samples_leaf', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "a645c746", + "metadata": {}, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35ce9065", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize model errors\n", + "model_errors_df = pd.DataFrame(RFC_model_errors, columns=['Model', 'Test MSE', 'Train MSE', 'Test Accuracy', 'Train Accuracy', 'Test F1', 'Train F1'])\n", + "plt.figure(figsize=(12, 8))\n", + "sns.lineplot(x='Model', y='Test Accuracy', data=model_errors_df, color='blue', label='Test Accuracy', marker='o')\n", + "sns.lineplot(x='Model', y='Train Accuracy', data=model_errors_df, color='orange', label='Train Accuracy', marker='o')\n", + "sns.lineplot(x='Model', y='Test F1', data=model_errors_df, color='purple', label='Test F1', marker='o')\n", + "sns.lineplot(x='Model', y='Train F1', data=model_errors_df, color='red', label='Train F1', marker='o')\n", + "plt.xticks(rotation=45)\n", + "plt.title('Model Performance Comparison Accuracy')\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": 
"markdown", + "id": "463161d9", + "metadata": {}, + "source": [ + "# SVM" + ] + }, + { + "cell_type": "markdown", + "id": "2c9329e5", + "metadata": {}, + "source": [ + "## SVM Modell with cross validation for hyperparameter tuning (C, kernel, gamma)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f80066d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Daten vorbereiten\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "# SVM-Modell\n", + "svm = SVC(random_state=0)\n", + "\n", + "# Grid mit Parametern – einfache Auswahl für Demo-Zwecke\n", + "param_grid = {\n", + " 'C': [0.1, 1, 10],\n", + " 'kernel': ['linear', 'rbf'],\n", + " 'gamma': ['scale', 'auto']\n", + "}\n", + "\n", + "# Cross-Validation mit 5 Folds\n", + "grid_search = GridSearchCV(\n", + " svm,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring='accuracy',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "# Training mit CV\n", + "grid_search.fit(X, y)\n", + "\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Modell mit besten Parametern\n", + "svm_best = grid_search.best_estimator_\n", + "\n", + "# Vorhersagen\n", + "y_pred_train = svm_best.predict(X)\n", + "y_pred = svm_best.predict(X_test)\n", + "\n", + "# Metriken\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: 
{mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Fehlerliste ergänzen\n", + "model_errors.append(['SVM cv for C/kernel/gamma', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "SVM_model_errors.append(['SVM cv for C/kernel/gamma', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "1c9e5492", + "metadata": {}, + "source": [ + "### Output Parameter tuning\n", + "Best parameters found: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}\n", + "SVM Test Accuracy: 0.7269\n", + "SVM Train Accuracy: 0.7493\n", + "SVM Test F1-Score: 0.6024\n", + "SVM Train F1-Score: 0.6291\n", + "SVM Test Mean Squared Error: 0.2731\n", + "SVM Train Mean Squared Error: 0.2507" + ] + }, + { + "cell_type": "markdown", + "id": "11decbc2", + "metadata": {}, + "source": [ + "## SVM Modell with cross validation for hyperparameter tuning (C)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b639d89", + "metadata": {}, + "outputs": [], + "source": [ + "# Daten vorbereiten\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "# SVM-Modell\n", + "svm = SVC(random_state=0, gamma='scale', kernel='rbf')\n", + "\n", + "# Grid mit Parametern – einfache Auswahl für Demo-Zwecke\n", + "param_grid = {\n", + " 'C': [5, 10, 15, 20],\n", + "}\n", + "\n", + "# Cross-Validation mit 5 Folds\n", + "grid_search = GridSearchCV(\n", + " svm,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring='accuracy',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "# Training mit CV\n", + "grid_search.fit(X, y)\n", + "\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Modell mit besten Parametern\n", + "svm_best = grid_search.best_estimator_\n", + "\n", + "# Vorhersagen\n", + "y_pred_train = 
svm_best.predict(X)\n", + "y_pred = svm_best.predict(X_test)\n", + "\n", + "# Metriken\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Fehlerliste ergänzen\n", + "model_errors.append(['SVM cv for C<20', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "SVM_model_errors.append(['SVM cv for C<20', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n" + ] + }, + { + "cell_type": "markdown", + "id": "1054ebde", + "metadata": {}, + "source": [ + "### Output parameter tuning\n", + "\n", + "Best parameters found: {'C': 20}\n", + "SVM Test Accuracy: 0.7248\n", + "SVM Train Accuracy: 0.7519\n", + "SVM Test F1-Score: 0.6020\n", + "SVM Train F1-Score: 0.6364\n", + "SVM Test Mean Squared Error: 0.2752\n", + "SVM Train Mean Squared Error: 0.2481" + ] + }, + { + "cell_type": "markdown", + "id": "732aad07", + "metadata": {}, + "source": [ + "## SVM Modell with cross validation for hyperparameter tuning (C = 20 to 30)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e77b75f", + "metadata": {}, + "outputs": [], + "source": [ + "# Daten vorbereiten\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "# SVM-Modell\n", + "svm = SVC(random_state=0, gamma='scale', 
kernel='rbf')\n", + "\n", + "# Grid mit Parametern – einfache Auswahl für Demo-Zwecke\n", + "param_grid = {\n", + " 'C': [20, 25, 30],\n", + "}\n", + "\n", + "# Cross-Validation mit 5 Folds\n", + "grid_search = GridSearchCV(\n", + " svm,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring='accuracy',\n", + " n_jobs=-1\n", + ")\n", + "\n", + "# Training mit CV\n", + "grid_search.fit(X, y)\n", + "\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "\n", + "# Modell mit besten Parametern\n", + "svm_best = grid_search.best_estimator_\n", + "\n", + "# Vorhersagen\n", + "y_pred_train = svm_best.predict(X)\n", + "y_pred = svm_best.predict(X_test)\n", + "\n", + "# Metriken\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# Fehlerliste ergänzen\n", + "model_errors.append(['SVM cv for C>20', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "SVM_model_errors.append(['SVM cv for C>20', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "c9d03ad9", + "metadata": {}, + "source": [ + "## SVM Modell Hyperparameter tuned with limited Columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ead21ae2", + "metadata": {}, + "outputs": [], + "source": [ + "# Daten vorbereiten\n", + "X = 
df_train[['height(cm)', 'hemoglobin', 'Gtp', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol']]\n", + "X_test = df_test[['height(cm)', 'hemoglobin', 'Gtp', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol']]\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", + "\n", + "\n", + "# Train the SVC model\n", + "svc_model = SVC(random_state=42, kernel='rbf', C=10, gamma='scale')\n", + "svc_model.fit(X, y)\n", "\n", "# Make predictions\n", "y_pred_train = svc_model.predict(X)\n", @@ -834,27 +1407,28 @@ "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", - "model_errors.append(['SVM', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "model_errors.append(['SVM cv for limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "SVM_model_errors.append(['SVM cv for limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n" ] }, { "cell_type": "markdown", - "id": "02e71868", + "id": "aac5f7b2", "metadata": {}, "source": [ - "## SVM Modell limited columns" + "## SVM Modell" ] }, { "cell_type": "code", "execution_count": null, - "id": "52db280c", + "id": "6cfaacf6", "metadata": {}, "outputs": [], "source": [ "# Daten vorbereiten\n", - "X = df_train[['height(cm)', 'hemoglobin', 'Gtp']]\n", - "X_test = df_test[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", "y = df_train['smoking']\n", "y_test = df_test['smoking']\n", "\n", @@ -884,27 +1458,28 @@ "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", - "model_errors.append(['SVM limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "model_errors.append(['SVM', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, 
f1_train])\n", + "SVM_model_errors.append(['SVM', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { "cell_type": "markdown", - "id": "371b1933", + "id": "d73eba52", "metadata": {}, "source": [ - "## SVM Linear Kernel Modell limited columns" + "## SVM Linear Kernel Model" ] }, { "cell_type": "code", "execution_count": null, - "id": "2fe14bef", + "id": "fa36707a", "metadata": {}, "outputs": [], "source": [ "# Daten vorbereiten\n", - "X = df_train[['height(cm)', 'hemoglobin', 'Gtp']]\n", - "X_test = df_test[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "X = df_train.drop('smoking', axis=1)\n", + "X_test = df_test.drop('smoking', axis=1)\n", "y = df_train['smoking']\n", "y_test = df_test['smoking']\n", "\n", @@ -934,52 +1509,110 @@ "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", - "model_errors.append(['SVM Linear Kernel limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + "model_errors.append(['SVM Linear', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "SVM_model_errors.append(['SVM Linear', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "02e71868", + "metadata": {}, + "source": [ + "## SVM Modell limited columns" ] }, { "cell_type": "code", "execution_count": null, - "id": "c963bb58", + "id": "52db280c", "metadata": {}, "outputs": [], "source": [ - "# # Define features and target variable\n", - "# X = df_train.drop('smoking', axis=1)\n", - "# y = df_train['smoking']\n", + "# Daten vorbereiten\n", + "X = df_train[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "X_test = df_test[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "y = df_train['smoking']\n", + "y_test = df_test['smoking']\n", "\n", - "# # Split the data into training and testing sets\n", - "# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
random_state=42)\n", "\n", - "# svc=SVC() \n", + "# Train the SVC model\n", + "svc_model = SVC(random_state=42)\n", + "svc_model.fit(X, y)\n", "\n", + "# Make predictions\n", + "y_pred_train = svc_model.predict(X)\n", + "y_pred = svc_model.predict(X_test)\n", "\n", + "# Metriken\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", "\n", - "# # declare parameters for hyperparameter tuning\n", - "# parameters = [ {'C':[1, 10, 100, 1000], 'kernel':['linear']},\n", - "# {'C':[1, 10, 100, 1000], 'kernel':['rbf'], 'gamma':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},\n", - "# {'C':[1, 10, 100, 1000], 'kernel':['poly'], 'degree': [2,3,4] ,'gamma':[0.01,0.02,0.03,0.04,0.05]} \n", - "# ]\n", + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", "\n", + "model_errors.append(['SVM limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "SVM_model_errors.append(['SVM limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "371b1933", + "metadata": {}, + "source": [ + "## SVM Linear Kernel Modell limited columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fe14bef", + "metadata": {}, + "outputs": [], + "source": [ + "# Daten vorbereiten\n", + "X = df_train[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "X_test = df_test[['height(cm)', 'hemoglobin', 'Gtp']]\n", + "y = 
df_train['smoking']\n", + "y_test = df_test['smoking']\n", "\n", - "# grid_search = GridSearchCV(estimator = svc, \n", - "# param_grid = parameters,\n", - "# scoring = 'accuracy',\n", - "# cv = 5,\n", - "# verbose=0)\n", "\n", + "# Train the SVC model\n", + "svc_model = LinearSVC(random_state=42)\n", + "svc_model.fit(X, y)\n", "\n", - "# grid_search.fit(X_train, y_train)\n", + "# Make predictions\n", + "y_pred_train = svc_model.predict(X)\n", + "y_pred = svc_model.predict(X_test)\n", "\n", - "# y_pred = grid_search.predict(X_test)\n", - "# accuracy = accuracy_score(y_test, y_pred)\n", - "# f1 = f1_score(y_test, y_pred)\n", - "# print(f\"Genauigkeit des besten Modells auf dem Testdatensatz: {accuracy:.4f}\")\n", - "# print(f\"Genauigkeit des besten Modells auf dem Testdatensatz (f1): {f1:.4f}\")\n", + "# Metriken\n", + "accuracy_train = accuracy_score(y, y_pred_train)\n", + "accuracy_test = accuracy_score(y_test, y_pred)\n", "\n", - " " + "f1_train = f1_score(y, y_pred_train)\n", + "f1_test = f1_score(y_test, y_pred)\n", + "\n", + "mse_train = mean_squared_error(y, y_pred_train)\n", + "mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "model_errors.append(['SVM Linear Kernel limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])\n", + "SVM_model_errors.append(['SVM Linear Kernel limited Columns', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" ] }, { @@ -987,7 +1620,7 @@ "id": "8f7eef7f", "metadata": {}, "source": [ - "## Model Evaluation" + "# Model Evaluation" ] }, { @@ -1046,6 +1679,167 @@ "plt.tight_layout()\n", "plt.show()" ] + }, + { + 
"cell_type": "markdown", + "id": "aec6d18a", + "metadata": {}, + "source": [ + "# Unnecessary" + ] + }, + { + "cell_type": "markdown", + "id": "0bffc63f", + "metadata": {}, + "source": [ + "## SVM Model with set C (C=15)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6219bd2", + "metadata": {}, + "outputs": [], + "source": [ + "# # Daten vorbereiten\n", + "# X = df_train.drop('smoking', axis=1)\n", + "# X_test = df_test.drop('smoking', axis=1)\n", + "# y = df_train['smoking']\n", + "# y_test = df_test['smoking']\n", + "\n", + "\n", + "# # Train the SVC model\n", + "# svc_model = SVC(random_state=0, kernel='rbf', C=15, gamma='scale')\n", + "# svc_model.fit(X, y)\n", + "\n", + "# # Make predictions\n", + "# y_pred_train = svc_model.predict(X)\n", + "# y_pred = svc_model.predict(X_test)\n", + "\n", + "# # Metriken\n", + "# accuracy_train = accuracy_score(y, y_pred_train)\n", + "# accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "# f1_train = f1_score(y, y_pred_train)\n", + "# f1_test = f1_score(y_test, y_pred)\n", + "\n", + "# mse_train = mean_squared_error(y, y_pred_train)\n", + "# mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "# print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "# print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "# print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "# print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "# print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "# print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# model_errors.append(['SVM', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "626b6c33", + "metadata": {}, + "source": [ + "## SVM Model with set C (C=8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d4d48ae", + "metadata": {}, + "outputs": [], + "source": [ + "# # Daten vorbereiten\n", + "# X = df_train.drop('smoking', 
axis=1)\n", + "# X_test = df_test.drop('smoking', axis=1)\n", + "# y = df_train['smoking']\n", + "# y_test = df_test['smoking']\n", + "\n", + "\n", + "# # Train the SVC model\n", + "# svc_model = SVC(random_state=0, kernel='rbf', C=8, gamma='scale')\n", + "# svc_model.fit(X, y)\n", + "\n", + "# # Make predictions\n", + "# y_pred_train = svc_model.predict(X)\n", + "# y_pred = svc_model.predict(X_test)\n", + "\n", + "# # Metriken\n", + "# accuracy_train = accuracy_score(y, y_pred_train)\n", + "# accuracy_test = accuracy_score(y_test, y_pred)\n", + "\n", + "# f1_train = f1_score(y, y_pred_train)\n", + "# f1_test = f1_score(y_test, y_pred)\n", + "\n", + "# mse_train = mean_squared_error(y, y_pred_train)\n", + "# mse_test = mean_squared_error(y_test, y_pred)\n", + "\n", + "# print(f\"SVM Test Accuracy: {accuracy_test:.4f}\")\n", + "# print(f\"SVM Train Accuracy: {accuracy_train:.4f}\")\n", + "# print(f\"SVM Test F1-Score: {f1_test:.4f}\")\n", + "# print(f\"SVM Train F1-Score: {f1_train:.4f}\")\n", + "# print(f\"SVM Test Mean Squared Error: {mse_test:.4f}\")\n", + "# print(f\"SVM Train Mean Squared Error: {mse_train:.4f}\")\n", + "\n", + "# model_errors.append(['SVM', mse_test, mse_train, accuracy_test, accuracy_train, f1_test, f1_train])" + ] + }, + { + "cell_type": "markdown", + "id": "3a656dad", + "metadata": {}, + "source": [ + "## SVM Model with too many Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c963bb58", + "metadata": {}, + "outputs": [], + "source": [ + "# # Define features and target variable\n", + "# X = df_train.drop('smoking', axis=1)\n", + "# y = df_train['smoking']\n", + "\n", + "# # Split the data into training and testing sets\n", + "# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# svc=SVC() \n", + "\n", + "\n", + "\n", + "# # declare parameters for hyperparameter tuning\n", + "# parameters = [ {'C':[1, 10, 100, 1000], 'kernel':['linear']},\n", + "# 
{'C':[1, 10, 100, 1000], 'kernel':['rbf'], 'gamma':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},\n", + "# {'C':[1, 10, 100, 1000], 'kernel':['poly'], 'degree': [2,3,4] ,'gamma':[0.01,0.02,0.03,0.04,0.05]} \n", + "# ]\n", + "\n", + "\n", + "\n", + "\n", + "# grid_search = GridSearchCV(estimator = svc, \n", + "# param_grid = parameters,\n", + "# scoring = 'accuracy',\n", + "# cv = 5,\n", + "# verbose=0)\n", + "\n", + "\n", + "# grid_search.fit(X_train, y_train)\n", + "\n", + "# y_pred = grid_search.predict(X_test)\n", + "# accuracy = accuracy_score(y_test, y_pred)\n", + "# f1 = f1_score(y_test, y_pred)\n", + "# print(f\"Genauigkeit des besten Modells auf dem Testdatensatz: {accuracy:.4f}\")\n", + "# print(f\"Genauigkeit des besten Modells auf dem Testdatensatz (f1): {f1:.4f}\")\n", + "\n", + " " + ] } ], "metadata": {