From 561ba54f0bc721db3e96232bcce0f6b24afb62a3 Mon Sep 17 00:00:00 2001 From: I569996 Date: Thu, 26 Jun 2025 09:41:45 +0200 Subject: [PATCH 1/2] added clean_up function --- data-preparation.ipynb | 141 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 data-preparation.ipynb diff --git a/data-preparation.ipynb b/data-preparation.ipynb new file mode 100644 index 0000000..01f54a8 --- /dev/null +++ b/data-preparation.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "eda9759b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "original = pd.read_csv(\"data/smoker_train.csv\")\n", + "\n", + "# Number of Rows\n", + "# Original: 38984\n", + "# Ohne Duplikate: 33467\n", + "# Ohne Nullwerte: 33467\n", + "# Real. Blutdruck: 33467 \n", + "# Cholesterin: 33344\n", + "# BMI: 33340\n", + "# Anderes: 33302" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94a8e555", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "33302\n" + ] + } + ], + "source": [ + "def clean_up(df):\n", + " # Duplikate Löschen\n", + " x = df.drop_duplicates()\n", + " \n", + " # Delete rows with null values\n", + " x = x.dropna()\n", + " \n", + " # Delete rows with impossible blood pressure values\n", + " x = x[x[\"systolic\"] > x[\"relaxation\"]]\n", + " \n", + " # Cholesterin\n", + " x = x[abs(x[\"Cholesterol\"] - x[\"HDL\"] - x[\"LDL\"] - x[\"triglyceride\"] / 5) < 30]\n", + " \n", + " # BMI >= 15\n", + " x = x[(x[\"weight(kg)\"] / ((x[\"height(cm)\"] / 100) ** 2)) >= 15]\n", + " \n", + " # Anderes -> 10-facher Normbereich\n", + " x = x[x[\"AST\"] < 500] # 8 Werte\n", + " x = x[x[\"ALT\"] < 500] # 4 Werte\n", + " x = x[x[\"Gtp\"] < 660] # 26 Werte\n", + " x = x[x[\"hemoglobin\"] < 180] # 0 Werte\n", + " x = x[x[\"serum creatinine\"] < 12] # 0 Werte\n", + " x = x[x[\"fasting blood sugar\"] < 1000] # 0 Werte\n", + " 
\n", + " return x \n", + "\n", + "print(len(clean_up(original)))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "19704c5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "18248 302\n", + "22530 303\n", + "23521 303\n", + "25010 305\n", + "11078 307\n", + "11190 308\n", + "15180 309\n", + "34033 310\n", + "13660 313\n", + "19433 313\n", + "15530 314\n", + "31343 315\n", + "8358 315\n", + "37194 318\n", + "34982 329\n", + "7532 330\n", + "35606 341\n", + "37408 342\n", + "32310 342\n", + "29230 349\n", + "15263 365\n", + "5479 369\n", + "24060 369\n", + "27782 369\n", + "25007 375\n", + "28494 375\n", + "31056 386\n", + "4977 391\n", + "31123 398\n", + "38530 423\n", + "Name: fasting blood sugar, dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = original[\"fasting blood sugar\"].sort_values()\n", + "y.tail(30)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ddae472effcad1552ac64dca8f5e57a45e79bfab Mon Sep 17 00:00:00 2001 From: I569996 Date: Thu, 26 Jun 2025 09:43:42 +0200 Subject: [PATCH 2/2] jona rumberg --- data-preparation.ipynb | 55 +++--------------------------------------- 1 file changed, 3 insertions(+), 52 deletions(-) diff --git a/data-preparation.ipynb b/data-preparation.ipynb index 01f54a8..70c6ced 100644 --- a/data-preparation.ipynb +++ b/data-preparation.ipynb @@ -25,15 +25,7 @@ "execution_count": null, "id": "94a8e555", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "33302\n" - ] - } - ], + "outputs": 
[], "source": [ "def clean_up(df):\n", " # Duplikate Löschen\n", @@ -66,51 +58,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "19704c5b", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "18248 302\n", - "22530 303\n", - "23521 303\n", - "25010 305\n", - "11078 307\n", - "11190 308\n", - "15180 309\n", - "34033 310\n", - "13660 313\n", - "19433 313\n", - "15530 314\n", - "31343 315\n", - "8358 315\n", - "37194 318\n", - "34982 329\n", - "7532 330\n", - "35606 341\n", - "37408 342\n", - "32310 342\n", - "29230 349\n", - "15263 365\n", - "5479 369\n", - "24060 369\n", - "27782 369\n", - "25007 375\n", - "28494 375\n", - "31056 386\n", - "4977 391\n", - "31123 398\n", - "38530 423\n", - "Name: fasting blood sugar, dtype: int64" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "y = original[\"fasting blood sugar\"].sort_values()\n", "y.tail(30)"