diff --git a/data-preparation.ipynb b/data-preparation.ipynb new file mode 100644 index 0000000..70c6ced --- /dev/null +++ b/data-preparation.ipynb @@ -0,0 +1,92 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "eda9759b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "original = pd.read_csv(\"data/smoker_train.csv\")\n", + "\n", + "# Number of Rows\n", + "# Original: 38984\n", + "# Ohne Duplikate: 33467\n", + "# Ohne Nullwerte: 33467\n", + "# Real. Blutdruck: 33467 \n", + "# Cholesterin: 33344\n", + "# BMI: 33340\n", + "# Anderes: 33302" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94a8e555", + "metadata": {}, + "outputs": [], + "source": [ + "def clean_up(df):\n", + " # Duplikate Löschen\n", + " x = df.drop_duplicates()\n", + " \n", + " # Delete rows with null values\n", + " x = x.dropna()\n", + " \n", + " # Delete rows with impossible blood pressure values\n", + " x = x[x[\"systolic\"] > x[\"relaxation\"]]\n", + " \n", + " # Cholesterin\n", + " x = x[abs(x[\"Cholesterol\"] - x[\"HDL\"] - x[\"LDL\"] - x[\"triglyceride\"] / 5) < 30]\n", + " \n", + " # BMI > 16\n", + " x = x[(x[\"weight(kg)\"] / ((x[\"height(cm)\"] / 100) ** 2)) >= 15]\n", + " \n", + " # Anderes -> 10-facher Normbereich\n", + " x = x[x[\"AST\"] < 500] # 8 Werte\n", + " x = x[x[\"ALT\"] < 500] # 4 Werte\n", + " x = x[x[\"Gtp\"] < 660] # 26 Werte\n", + " x = x[x[\"hemoglobin\"] < 180] # 0 Werte\n", + " x = x[x[\"serum creatinine\"] < 12] # 0 Werte\n", + " x = x[x[\"fasting blood sugar\"] < 1000] # 0 Werte\n", + " \n", + " return x \n", + "\n", + "print(len(clean_up(original)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19704c5b", + "metadata": {}, + "outputs": [], + "source": [ + "y = original[\"fasting blood sugar\"].sort_values()\n", + "y.tail(30)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}