diff --git a/notebooks/extsream.ipynb b/notebooks/extsream_1.ipynb similarity index 87% rename from notebooks/extsream.ipynb rename to notebooks/extsream_1.ipynb index 6e8f202..7d56b25 100644 --- a/notebooks/extsream.ipynb +++ b/notebooks/extsream_1.ipynb @@ -9,11 +9,12 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Import librairies.\n", + "from typing import List, Tuple, Dict, Set, Union\n", "import glob\n", "import numpy as np\n", "import pandas as pd\n", @@ -30,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -45,7 +46,7 @@ " '../data/custom_no_streaming_8/folder_1\\\\labels.csv']" ] }, - "execution_count": 2, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -59,18 +60,21 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Concatenated DataFrame (except labels.csv):\n", - "(272935, 10)\n", + "Concatenated DataFrame (all but 'labels.csv'):\n", + " (272935, 10)\n", "\n", "Labels DataFrame:\n", - "(29, 7)\n" + " (29, 7)\n", + "\n", + "Total no. 
missing values:\n", + " 0\n" ] } ], @@ -85,7 +89,6 @@ " df.insert(1, column=\"trace_id\", value=filename)\n", " df.rename({\"Unnamed: 0\": \"time\"}, axis=1, inplace=True)\n", " dfs.append(df)\n", - "\n", "# Concatenate all dataframes except labels.csv\n", "anomaly_df = pd.concat(dfs)\n", "\n", @@ -93,19 +96,10 @@ "labels_df = pd.read_csv(files[-1], index_col=0)\n", "\n", "# Print the resulting dataframes\n", - "print(\"Concatenated DataFrame (except labels.csv):\")\n", - "print(anomaly_df.shape)\n", + "print(\"Concatenated DataFrame (all but 'labels.csv'):\\n\", anomaly_df.shape)\n", + "print(\"\\nLabels DataFrame:\\n\", labels_df.shape)\n", + "\n", "\n", - "print(\"\\nLabels DataFrame:\")\n", - "print(labels_df.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ "# Rename values in the \"anomaly_type\" column based on the specified mapping\n", "mapping = {\n", " \"1_1\": \"bursty_input\",\n", @@ -115,44 +109,15 @@ " \"3_1\": \"cpu_contention\",\n", " \"3_2\": \"cpu_contention\"\n", "}\n", - "anomaly_df.insert(2, column=\"anomaly_type\", value=anomaly_df['trace_id'].replace(mapping))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "time 0\n", - "trace_id 0\n", - "anomaly_type 0\n", - "driver_BlockManager_memory_memUsed_MB_value 0\n", - "driver_jvm_heap_used_value 0\n", - "avg_jvm_heap_used_value 0\n", - "avg_executor_filesystem_hdfs_write_ops_value_1_diff 0\n", - "avg_executor_cpuTime_count_1_diff 0\n", - "avg_executor_runTime_count_1_diff 0\n", - "avg_executor_shuffleRecordsRead_count_1_diff 0\n", - "avg_executor_shuffleRecordsWritten_count_1_diff 0\n", - "dtype: int64" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ + "anomaly_df.insert(2, column=\"anomaly_type\", value=anomaly_df['trace_id'].replace(mapping))\n", + "\n", "# Missing values.\n", - 
"anomaly_df.isna().sum()" + "print(\"\\nTotal no. missing values:\\n\", anomaly_df.isna().sum().sum())" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 56, "metadata": {}, "outputs": [ { @@ -178,15 +143,6 @@ " \n", " time\n", " trace_id\n", - " anomaly_type\n", - " driver_BlockManager_memory_memUsed_MB_value\n", - " driver_jvm_heap_used_value\n", - " avg_jvm_heap_used_value\n", - " avg_executor_filesystem_hdfs_write_ops_value_1_diff\n", - " avg_executor_cpuTime_count_1_diff\n", - " avg_executor_runTime_count_1_diff\n", - " avg_executor_shuffleRecordsRead_count_1_diff\n", - " avg_executor_shuffleRecordsWritten_count_1_diff\n", " \n", " \n", " \n", @@ -194,268 +150,91 @@ " 0\n", " 0\n", " 1_1\n", - " bursty_input\n", - " 2100.0\n", - " 1.095197e+09\n", - " 2.665584e+09\n", - " 0.000000\n", - " 0.000000e+00\n", - " 0.0000\n", - " 0.000\n", - " 0.000\n", " \n", " \n", " 1\n", " 1\n", " 1_1\n", - " bursty_input\n", - " 2134.0\n", - " 1.100890e+09\n", - " 2.690258e+09\n", - " 0.000000\n", - " 0.000000e+00\n", - " 0.0000\n", - " 0.000\n", - " 0.000\n", " \n", " \n", " 2\n", " 2\n", " 1_1\n", - " bursty_input\n", - " 2200.0\n", - " 1.106092e+09\n", - " 2.731373e+09\n", - " 0.000000\n", - " 0.000000e+00\n", - " 0.0000\n", - " 0.000\n", - " 0.000\n", " \n", " \n", " 3\n", " 3\n", " 1_1\n", - " bursty_input\n", - " 2265.0\n", - " 1.128138e+09\n", - " 2.116565e+09\n", - " 0.000000\n", - " 4.837354e+08\n", - " 486.0000\n", - " 0.000\n", - " 8302.000\n", " \n", " \n", " 4\n", " 4\n", " 1_1\n", - " bursty_input\n", - " 2299.0\n", - " 1.165589e+09\n", - " 2.580707e+09\n", - " 40.000000\n", - " 1.146078e+09\n", - " 1529.0000\n", - " 25885.000\n", - " 9968.000\n", " \n", " \n", " ...\n", " ...\n", " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", " \n", " \n", " 43094\n", " 43094\n", " 3_2\n", - " cpu_contention\n", - " 12893.0\n", - " 3.736669e+08\n", - " 1.166263e+10\n", - " 0.000000\n", 
- " 2.516878e+09\n", - " 2639.6667\n", - " 0.000\n", - " 37973.668\n", " \n", " \n", " 43095\n", " 43095\n", " 3_2\n", - " cpu_contention\n", - " 13095.0\n", - " 3.857830e+08\n", - " 9.822457e+09\n", - " 0.000000\n", - " 1.470157e+09\n", - " 1509.0000\n", - " 0.000\n", - " 21917.334\n", " \n", " \n", " 43096\n", " 43096\n", " 3_2\n", - " cpu_contention\n", - " 13319.0\n", - " 3.999292e+08\n", - " 9.272556e+09\n", - " 0.000000\n", - " 1.807193e+09\n", - " 2253.0000\n", - " 0.000\n", - " 25897.666\n", " \n", " \n", " 43097\n", " 43097\n", " 3_2\n", - " cpu_contention\n", - " 13598.0\n", - " 4.153047e+08\n", - " 8.529498e+09\n", - " 6.000000\n", - " 1.821398e+09\n", - " 1899.6666\n", - " 28229.666\n", - " 28906.666\n", " \n", " \n", " 43098\n", " 43098\n", " 3_2\n", - " cpu_contention\n", - " 13802.0\n", - " 4.516347e+08\n", - " 8.042026e+09\n", - " 20.666666\n", - " 5.880769e+08\n", - " 833.6667\n", - " 114175.000\n", - " 0.000\n", " \n", " \n", "\n", - "

272935 rows × 11 columns

\n", + "

272935 rows × 2 columns

\n", "" ], "text/plain": [ - " time trace_id anomaly_type \\\n", - "0 0 1_1 bursty_input \n", - "1 1 1_1 bursty_input \n", - "2 2 1_1 bursty_input \n", - "3 3 1_1 bursty_input \n", - "4 4 1_1 bursty_input \n", - "... ... ... ... \n", - "43094 43094 3_2 cpu_contention \n", - "43095 43095 3_2 cpu_contention \n", - "43096 43096 3_2 cpu_contention \n", - "43097 43097 3_2 cpu_contention \n", - "43098 43098 3_2 cpu_contention \n", - "\n", - " driver_BlockManager_memory_memUsed_MB_value \\\n", - "0 2100.0 \n", - "1 2134.0 \n", - "2 2200.0 \n", - "3 2265.0 \n", - "4 2299.0 \n", - "... ... \n", - "43094 12893.0 \n", - "43095 13095.0 \n", - "43096 13319.0 \n", - "43097 13598.0 \n", - "43098 13802.0 \n", + " time trace_id\n", + "0 0 1_1\n", + "1 1 1_1\n", + "2 2 1_1\n", + "3 3 1_1\n", + "4 4 1_1\n", + "... ... ...\n", + "43094 43094 3_2\n", + "43095 43095 3_2\n", + "43096 43096 3_2\n", + "43097 43097 3_2\n", + "43098 43098 3_2\n", "\n", - " driver_jvm_heap_used_value avg_jvm_heap_used_value \\\n", - "0 1.095197e+09 2.665584e+09 \n", - "1 1.100890e+09 2.690258e+09 \n", - "2 1.106092e+09 2.731373e+09 \n", - "3 1.128138e+09 2.116565e+09 \n", - "4 1.165589e+09 2.580707e+09 \n", - "... ... ... \n", - "43094 3.736669e+08 1.166263e+10 \n", - "43095 3.857830e+08 9.822457e+09 \n", - "43096 3.999292e+08 9.272556e+09 \n", - "43097 4.153047e+08 8.529498e+09 \n", - "43098 4.516347e+08 8.042026e+09 \n", - "\n", - " avg_executor_filesystem_hdfs_write_ops_value_1_diff \\\n", - "0 0.000000 \n", - "1 0.000000 \n", - "2 0.000000 \n", - "3 0.000000 \n", - "4 40.000000 \n", - "... ... \n", - "43094 0.000000 \n", - "43095 0.000000 \n", - "43096 0.000000 \n", - "43097 6.000000 \n", - "43098 20.666666 \n", - "\n", - " avg_executor_cpuTime_count_1_diff avg_executor_runTime_count_1_diff \\\n", - "0 0.000000e+00 0.0000 \n", - "1 0.000000e+00 0.0000 \n", - "2 0.000000e+00 0.0000 \n", - "3 4.837354e+08 486.0000 \n", - "4 1.146078e+09 1529.0000 \n", - "... ... ... 
# Preview only the time index and the trace identifier of the concatenated
# frame. The stray trailing "." after the column selection was a SyntaxError
# and prevented the cell from running on a fresh kernel.
anomaly_df[["time", "trace_id"]]
\n", - " 1\n", + " 25488\n", + " 4248\n", " 1_1\n", - " 1\n", - " 11425\n", - " 13393\n", - " 13393\n", - " 14377\n", " bursty_input\n", - " \n", - " \n", - " 2\n", - " 1_1\n", - " 2\n", - " 18659\n", - " 20593\n", - " 20593\n", - " 21560\n", + " 2255.0\n", + " 635267260.0\n", + " 4.005025e+09\n", + " 0.000000\n", + " 4.942382e+08\n", + " 495.0000\n", + " 0.000\n", + " 14532.5000\n", + " 0\n", + " 4247\n", + " 6193\n", + " 6193\n", + " 7166\n", " bursty_input\n", + " both\n", " \n", " \n", - " 3\n", + " 25494\n", + " 4249\n", " 1_1\n", - " 3\n", - " 25849\n", - " 27793\n", - " 27793\n", - " 28765\n", " bursty_input\n", - " \n", - " \n", - " 4\n", - " 1_1\n", - " 4\n", - " 33049\n", - " 34993\n", - " 34993\n", - " 35965\n", + " 2288.0\n", + " 735523600.0\n", + " 4.224833e+09\n", + " 34.000000\n", + " 8.029332e+08\n", + " 1147.0000\n", + " 34207.500\n", + " 10806.5000\n", + " 0\n", + " 4247\n", + " 6193\n", + " 6193\n", + " 7166\n", " bursty_input\n", + " both\n", " \n", " \n", - " 5\n", + " 25500\n", + " 4250\n", " 1_1\n", - " 5\n", - " 40257\n", - " 42193\n", - " 42193\n", - " 43161\n", " bursty_input\n", - " \n", - " \n", - " 6\n", - " 1_2\n", + " 2142.0\n", + " 760737500.0\n", + " 2.231513e+09\n", + " 6.000000\n", + " 6.237813e+07\n", + " 490.0000\n", + " 7250.500\n", + " 0.0000\n", " 0\n", - " 5055\n", - " 6989\n", - " 6989\n", - " 7956\n", - " bursty_input\n", - " \n", - " \n", - " 7\n", - " 1_2\n", - " 1\n", - " 13025\n", - " 14989\n", - " 14989\n", - " 15971\n", - " bursty_input\n", - " \n", - " \n", - " 8\n", - " 1_2\n", - " 2\n", - " 21043\n", - " 22989\n", - " 22989\n", - " 23962\n", - " bursty_input\n", - " \n", - " \n", - " 9\n", - " 1_2\n", - " 3\n", - " 29033\n", - " 30989\n", - " 30989\n", - " 31967\n", + " 4247\n", + " 6193\n", + " 6193\n", + " 7166\n", " bursty_input\n", + " both\n", " \n", " \n", - " 10\n", - " 1_2\n", - " 4\n", - " 37045\n", - " 38989\n", - " 38989\n", - " 39961\n", + " 25506\n", + " 4251\n", + " 1_1\n", " bursty_input\n", - " 
\n", - " \n", - " 11\n", - " 2_1\n", - " 0\n", - " 6954\n", - " 8908\n", - " 8908\n", - " 9885\n", - " stalled_input\n", - " \n", - " \n", - " 12\n", - " 2_1\n", - " 1\n", - " 16856\n", - " 18808\n", - " 18808\n", - " 19784\n", - " stalled_input\n", - " \n", - " \n", - " 13\n", - " 2_1\n", - " 2\n", - " 26756\n", - " 28708\n", - " 28708\n", - " 29684\n", - " stalled_input\n", - " \n", - " \n", - " 14\n", - " 2_1\n", - " 3\n", - " 36659\n", - " 38609\n", - " 38609\n", - " 39584\n", - " stalled_input\n", - " \n", - " \n", - " 15\n", - " 2_2\n", + " 2125.0\n", + " 770470140.0\n", + " 2.237279e+09\n", + " 0.000000\n", + " 0.000000e+00\n", + " 0.0000\n", + " 0.000\n", + " 0.0000\n", " 0\n", - " 8771\n", - " 10699\n", - " 10699\n", - " 11663\n", - " stalled_input\n", - " \n", - " \n", - " 16\n", - " 2_2\n", - " 1\n", - " 20451\n", - " 22399\n", - " 22399\n", - " 23373\n", - " stalled_input\n", - " \n", - " \n", - " 17\n", - " 2_2\n", - " 2\n", - " 32154\n", - " 34100\n", - " 34100\n", - " 35073\n", - " stalled_input\n", - " \n", - " \n", - " 18\n", - " 2_2\n", - " 3\n", - " 44024\n", - " 45800\n", - " 45800\n", - " 46688\n", - " stalled_input\n", + " 4247\n", + " 6193\n", + " 6193\n", + " 7166\n", + " bursty_input\n", + " both\n", " \n", " \n", - " 19\n", - " 3_1\n", - " 0\n", - " 3035\n", - " 3965\n", - " 3965\n", - " 4430\n", - " cpu_contention\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 20\n", - " 3_1\n", - " 1\n", - " 5205\n", - " 7125\n", - " 7125\n", - " 8085\n", + " 1286118\n", + " 36724\n", + " 3_2\n", " cpu_contention\n", - " \n", - " \n", - " 21\n", - " 3_1\n", - " 2\n", - " 11442\n", - " 12548\n", - " 12548\n", - " 13101\n", + " 12790.0\n", + " 522976540.0\n", + " 9.267732e+09\n", + " 0.000000\n", + " 1.557150e+09\n", + " 1589.0000\n", + " 0.000\n", + " 
29848.6660\n", + " 4\n", + " 31802\n", + " 35086\n", + " 35086\n", + " 36728\n", " cpu_contention\n", + " both\n", " \n", " \n", - " 22\n", - " 3_1\n", - " 3\n", - " 17332\n", - " 19980\n", - " 19980\n", - " 21304\n", + " 1286123\n", + " 36725\n", + " 3_2\n", " cpu_contention\n", - " \n", - " \n", - " 23\n", - " 3_1\n", + " 13045.0\n", + " 534622100.0\n", + " 8.986438e+09\n", + " 0.000000\n", + " 1.908520e+09\n", + " 2291.0000\n", + " 0.000\n", + " 30756.3340\n", " 4\n", - " 31947\n", - " 35035\n", - " 35035\n", - " 36579\n", + " 31802\n", + " 35086\n", + " 35086\n", + " 36728\n", " cpu_contention\n", + " both\n", " \n", " \n", - " 24\n", + " 1286128\n", + " 36726\n", " 3_2\n", - " 0\n", - " 3050\n", - " 4016\n", - " 4016\n", - " 4499\n", " cpu_contention\n", - " \n", - " \n", - " 25\n", - " 3_2\n", - " 1\n", - " 5288\n", - " 7176\n", - " 7176\n", - " 8120\n", + " 13209.0\n", + " 549762800.0\n", + " 8.434711e+09\n", + " 0.000000\n", + " 2.480250e+09\n", + " 2544.3333\n", + " 0.000\n", + " 38853.0000\n", + " 4\n", + " 31802\n", + " 35086\n", + " 35086\n", + " 36728\n", " cpu_contention\n", + " both\n", " \n", " \n", - " 26\n", + " 1286133\n", + " 36727\n", " 3_2\n", - " 2\n", - " 17455\n", - " 20031\n", - " 20031\n", - " 21319\n", " cpu_contention\n", - " \n", - " \n", - " 27\n", - " 3_2\n", - " 3\n", - " 26522\n", - " 28060\n", - " 28060\n", - " 28829\n", + " 13486.0\n", + " 574724700.0\n", + " 9.416742e+09\n", + " 22.666666\n", + " 6.127938e+08\n", + " 721.0000\n", + " 117025.000\n", + " 1511.3334\n", + " 4\n", + " 31802\n", + " 35086\n", + " 35086\n", + " 36728\n", " cpu_contention\n", + " both\n", " \n", " \n", - " 28\n", + " 1286138\n", + " 36728\n", " 3_2\n", + " cpu_contention\n", + " 13784.0\n", + " 606572860.0\n", + " 8.501003e+09\n", + " 4.000000\n", + " 1.040760e+08\n", + " 134.0000\n", + " 20870.666\n", + " 0.0000\n", " 4\n", " 31802\n", " 35086\n", " 35086\n", " 36728\n", " cpu_contention\n", + " both\n", " \n", " \n", "\n", + "

85177 rows × 18 columns

\n", "" ], "text/plain": [ - " trace_id ano_id ref_start ref_end ano_start ano_end ano_type\n", - "0 1_1 0 4247 6193 6193 7166 bursty_input\n", - "1 1_1 1 11425 13393 13393 14377 bursty_input\n", - "2 1_1 2 18659 20593 20593 21560 bursty_input\n", - "3 1_1 3 25849 27793 27793 28765 bursty_input\n", - "4 1_1 4 33049 34993 34993 35965 bursty_input\n", - "5 1_1 5 40257 42193 42193 43161 bursty_input\n", - "6 1_2 0 5055 6989 6989 7956 bursty_input\n", - "7 1_2 1 13025 14989 14989 15971 bursty_input\n", - "8 1_2 2 21043 22989 22989 23962 bursty_input\n", - "9 1_2 3 29033 30989 30989 31967 bursty_input\n", - "10 1_2 4 37045 38989 38989 39961 bursty_input\n", - "11 2_1 0 6954 8908 8908 9885 stalled_input\n", - "12 2_1 1 16856 18808 18808 19784 stalled_input\n", - "13 2_1 2 26756 28708 28708 29684 stalled_input\n", - "14 2_1 3 36659 38609 38609 39584 stalled_input\n", - "15 2_2 0 8771 10699 10699 11663 stalled_input\n", - "16 2_2 1 20451 22399 22399 23373 stalled_input\n", - "17 2_2 2 32154 34100 34100 35073 stalled_input\n", - "18 2_2 3 44024 45800 45800 46688 stalled_input\n", - "19 3_1 0 3035 3965 3965 4430 cpu_contention\n", - "20 3_1 1 5205 7125 7125 8085 cpu_contention\n", - "21 3_1 2 11442 12548 12548 13101 cpu_contention\n", - "22 3_1 3 17332 19980 19980 21304 cpu_contention\n", - "23 3_1 4 31947 35035 35035 36579 cpu_contention\n", - "24 3_2 0 3050 4016 4016 4499 cpu_contention\n", - "25 3_2 1 5288 7176 7176 8120 cpu_contention\n", - "26 3_2 2 17455 20031 20031 21319 cpu_contention\n", - "27 3_2 3 26522 28060 28060 28829 cpu_contention\n", - "28 3_2 4 31802 35086 35086 36728 cpu_contention" + " time trace_id anomaly_type \\\n", + "25482 4247 1_1 bursty_input \n", + "25488 4248 1_1 bursty_input \n", + "25494 4249 1_1 bursty_input \n", + "25500 4250 1_1 bursty_input \n", + "25506 4251 1_1 bursty_input \n", + "... ... ... ... 
\n", + "1286118 36724 3_2 cpu_contention \n", + "1286123 36725 3_2 cpu_contention \n", + "1286128 36726 3_2 cpu_contention \n", + "1286133 36727 3_2 cpu_contention \n", + "1286138 36728 3_2 cpu_contention \n", + "\n", + " driver_BlockManager_memory_memUsed_MB_value \\\n", + "25482 2189.0 \n", + "25488 2255.0 \n", + "25494 2288.0 \n", + "25500 2142.0 \n", + "25506 2125.0 \n", + "... ... \n", + "1286118 12790.0 \n", + "1286123 13045.0 \n", + "1286128 13209.0 \n", + "1286133 13486.0 \n", + "1286138 13784.0 \n", + "\n", + " driver_jvm_heap_used_value avg_jvm_heap_used_value \\\n", + "25482 367144260.0 3.028880e+09 \n", + "25488 635267260.0 4.005025e+09 \n", + "25494 735523600.0 4.224833e+09 \n", + "25500 760737500.0 2.231513e+09 \n", + "25506 770470140.0 2.237279e+09 \n", + "... ... ... \n", + "1286118 522976540.0 9.267732e+09 \n", + "1286123 534622100.0 8.986438e+09 \n", + "1286128 549762800.0 8.434711e+09 \n", + "1286133 574724700.0 9.416742e+09 \n", + "1286138 606572860.0 8.501003e+09 \n", + "\n", + " avg_executor_filesystem_hdfs_write_ops_value_1_diff \\\n", + "25482 0.000000 \n", + "25488 0.000000 \n", + "25494 34.000000 \n", + "25500 6.000000 \n", + "25506 0.000000 \n", + "... ... \n", + "1286118 0.000000 \n", + "1286123 0.000000 \n", + "1286128 0.000000 \n", + "1286133 22.666666 \n", + "1286138 4.000000 \n", + "\n", + " avg_executor_cpuTime_count_1_diff avg_executor_runTime_count_1_diff \\\n", + "25482 0.000000e+00 0.0000 \n", + "25488 4.942382e+08 495.0000 \n", + "25494 8.029332e+08 1147.0000 \n", + "25500 6.237813e+07 490.0000 \n", + "25506 0.000000e+00 0.0000 \n", + "... ... ... \n", + "1286118 1.557150e+09 1589.0000 \n", + "1286123 1.908520e+09 2291.0000 \n", + "1286128 2.480250e+09 2544.3333 \n", + "1286133 6.127938e+08 721.0000 \n", + "1286138 1.040760e+08 134.0000 \n", + "\n", + " avg_executor_shuffleRecordsRead_count_1_diff \\\n", + "25482 0.000 \n", + "25488 0.000 \n", + "25494 34207.500 \n", + "25500 7250.500 \n", + "25506 0.000 \n", + "... ... 
\n", + "1286118 0.000 \n", + "1286123 0.000 \n", + "1286128 0.000 \n", + "1286133 117025.000 \n", + "1286138 20870.666 \n", + "\n", + " avg_executor_shuffleRecordsWritten_count_1_diff ano_id ref_start \\\n", + "25482 0.0000 0 4247 \n", + "25488 14532.5000 0 4247 \n", + "25494 10806.5000 0 4247 \n", + "25500 0.0000 0 4247 \n", + "25506 0.0000 0 4247 \n", + "... ... ... ... \n", + "1286118 29848.6660 4 31802 \n", + "1286123 30756.3340 4 31802 \n", + "1286128 38853.0000 4 31802 \n", + "1286133 1511.3334 4 31802 \n", + "1286138 0.0000 4 31802 \n", + "\n", + " ref_end ano_start ano_end ano_type _merge \n", + "25482 6193 6193 7166 bursty_input both \n", + "25488 6193 6193 7166 bursty_input both \n", + "25494 6193 6193 7166 bursty_input both \n", + "25500 6193 6193 7166 bursty_input both \n", + "25506 6193 6193 7166 bursty_input both \n", + "... ... ... ... ... ... \n", + "1286118 35086 35086 36728 cpu_contention both \n", + "1286123 35086 35086 36728 cpu_contention both \n", + "1286128 35086 35086 36728 cpu_contention both \n", + "1286133 35086 35086 36728 cpu_contention both \n", + "1286138 35086 35086 36728 cpu_contention both \n", + "\n", + "[85177 rows x 18 columns]" ] }, - "execution_count": 11, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "labels_df" + "pd.merge(\n", + " anomaly_df,\n", + " labels_df,\n", + " how='inner',\n", + " on='trace_id',\n", + " suffixes=('_anomaly', '_labels'),\n", + " indicator=True\n", + ").loc[lambda x: x['time'].between(x['ref_start'], x['ref_end']) | x['time'].between(x['ano_start'], x['ano_end'])]\n" ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "bursty_input1_df = anomaly_df[anomaly_df[\"trace_id\"] == \"1_1\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 21, + "execution_count": 59, "metadata": {}, "outputs": [ { @@ -860,286 +665,104 @@ " \n", " \n", " \n", - " time\n", " trace_id\n", - " anomaly_type\n", - " 
driver_BlockManager_memory_memUsed_MB_value\n", - " driver_jvm_heap_used_value\n", - " avg_jvm_heap_used_value\n", - " avg_executor_filesystem_hdfs_write_ops_value_1_diff\n", - " avg_executor_cpuTime_count_1_diff\n", - " avg_executor_runTime_count_1_diff\n", - " avg_executor_shuffleRecordsRead_count_1_diff\n", - " avg_executor_shuffleRecordsWritten_count_1_diff\n", + " ano_id\n", + " ref_start\n", + " ref_end\n", + " ano_start\n", + " ano_end\n", " \n", " \n", " \n", " \n", " 0\n", - " 0\n", " 1_1\n", - " bursty_input\n", - " 2100.0\n", - " 1.095197e+09\n", - " 2.665584e+09\n", - " 0.0\n", - " 0.000000e+00\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " 4247\n", + " 6193\n", + " 6193\n", + " 7166\n", " \n", " \n", " 1\n", - " 1\n", " 1_1\n", - " bursty_input\n", - " 2134.0\n", - " 1.100890e+09\n", - " 2.690258e+09\n", - " 0.0\n", - " 0.000000e+00\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " 1\n", + " 11425\n", + " 13393\n", + " 13393\n", + " 14377\n", " \n", " \n", " 2\n", - " 2\n", " 1_1\n", - " bursty_input\n", - " 2200.0\n", - " 1.106092e+09\n", - " 2.731373e+09\n", - " 0.0\n", - " 0.000000e+00\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " 2\n", + " 18659\n", + " 20593\n", + " 20593\n", + " 21560\n", " \n", " \n", " 3\n", - " 3\n", " 1_1\n", - " bursty_input\n", - " 2265.0\n", - " 1.128138e+09\n", - " 2.116565e+09\n", - " 0.0\n", - " 4.837354e+08\n", - " 486.0\n", - " 0.0\n", - " 8302.0\n", + " 3\n", + " 25849\n", + " 27793\n", + " 27793\n", + " 28765\n", " \n", " \n", " 4\n", - " 4\n", " 1_1\n", - " bursty_input\n", - " 2299.0\n", - " 1.165589e+09\n", - " 2.580707e+09\n", - " 40.0\n", - " 1.146078e+09\n", - " 1529.0\n", - " 25885.0\n", - " 9968.0\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 46686\n", - " 46686\n", - " 1_1\n", - " bursty_input\n", - " 2095.0\n", - " 4.809436e+08\n", - " 9.870312e+09\n", - " 0.0\n", - " 
0.000000e+00\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 46687\n", - " 46687\n", - " 1_1\n", - " bursty_input\n", - " 2143.0\n", - " 4.904456e+08\n", - " 9.910990e+09\n", - " 0.0\n", - " 0.000000e+00\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 46688\n", - " 46688\n", - " 1_1\n", - " bursty_input\n", - " 2208.0\n", - " 5.183927e+08\n", - " 8.766686e+09\n", - " 0.0\n", - " 4.356684e+08\n", - " 451.5\n", - " 0.0\n", - " 7001.5\n", - " \n", - " \n", - " 46689\n", - " 46689\n", - " 1_1\n", - " bursty_input\n", - " 2258.0\n", - " 5.530916e+08\n", - " 8.766686e+09\n", - " 0.0\n", - " 0.000000e+00\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 46690\n", - " 46690\n", - " 1_1\n", - " bursty_input\n", - " 2258.0\n", - " 5.830545e+08\n", - " 8.766686e+09\n", - " 0.0\n", - " 0.000000e+00\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " 4\n", + " 33049\n", + " 34993\n", + " 34993\n", + " 35965\n", " \n", " \n", "\n", - "

46691 rows × 11 columns

\n", "" ], "text/plain": [ - " time trace_id anomaly_type \\\n", - "0 0 1_1 bursty_input \n", - "1 1 1_1 bursty_input \n", - "2 2 1_1 bursty_input \n", - "3 3 1_1 bursty_input \n", - "4 4 1_1 bursty_input \n", - "... ... ... ... \n", - "46686 46686 1_1 bursty_input \n", - "46687 46687 1_1 bursty_input \n", - "46688 46688 1_1 bursty_input \n", - "46689 46689 1_1 bursty_input \n", - "46690 46690 1_1 bursty_input \n", - "\n", - " driver_BlockManager_memory_memUsed_MB_value \\\n", - "0 2100.0 \n", - "1 2134.0 \n", - "2 2200.0 \n", - "3 2265.0 \n", - "4 2299.0 \n", - "... ... \n", - "46686 2095.0 \n", - "46687 2143.0 \n", - "46688 2208.0 \n", - "46689 2258.0 \n", - "46690 2258.0 \n", - "\n", - " driver_jvm_heap_used_value avg_jvm_heap_used_value \\\n", - "0 1.095197e+09 2.665584e+09 \n", - "1 1.100890e+09 2.690258e+09 \n", - "2 1.106092e+09 2.731373e+09 \n", - "3 1.128138e+09 2.116565e+09 \n", - "4 1.165589e+09 2.580707e+09 \n", - "... ... ... \n", - "46686 4.809436e+08 9.870312e+09 \n", - "46687 4.904456e+08 9.910990e+09 \n", - "46688 5.183927e+08 8.766686e+09 \n", - "46689 5.530916e+08 8.766686e+09 \n", - "46690 5.830545e+08 8.766686e+09 \n", - "\n", - " avg_executor_filesystem_hdfs_write_ops_value_1_diff \\\n", - "0 0.0 \n", - "1 0.0 \n", - "2 0.0 \n", - "3 0.0 \n", - "4 40.0 \n", - "... ... \n", - "46686 0.0 \n", - "46687 0.0 \n", - "46688 0.0 \n", - "46689 0.0 \n", - "46690 0.0 \n", - "\n", - " avg_executor_cpuTime_count_1_diff avg_executor_runTime_count_1_diff \\\n", - "0 0.000000e+00 0.0 \n", - "1 0.000000e+00 0.0 \n", - "2 0.000000e+00 0.0 \n", - "3 4.837354e+08 486.0 \n", - "4 1.146078e+09 1529.0 \n", - "... ... ... \n", - "46686 0.000000e+00 0.0 \n", - "46687 0.000000e+00 0.0 \n", - "46688 4.356684e+08 451.5 \n", - "46689 0.000000e+00 0.0 \n", - "46690 0.000000e+00 0.0 \n", - "\n", - " avg_executor_shuffleRecordsRead_count_1_diff \\\n", - "0 0.0 \n", - "1 0.0 \n", - "2 0.0 \n", - "3 0.0 \n", - "4 25885.0 \n", - "... ... 
\n", - "46686 0.0 \n", - "46687 0.0 \n", - "46688 0.0 \n", - "46689 0.0 \n", - "46690 0.0 \n", - "\n", - " avg_executor_shuffleRecordsWritten_count_1_diff \n", - "0 0.0 \n", - "1 0.0 \n", - "2 0.0 \n", - "3 8302.0 \n", - "4 9968.0 \n", - "... ... \n", - "46686 0.0 \n", - "46687 0.0 \n", - "46688 7001.5 \n", - "46689 0.0 \n", - "46690 0.0 \n", - "\n", - "[46691 rows x 11 columns]" + " trace_id ano_id ref_start ref_end ano_start ano_end\n", + "0 1_1 0 4247 6193 6193 7166\n", + "1 1_1 1 11425 13393 13393 14377\n", + "2 1_1 2 18659 20593 20593 21560\n", + "3 1_1 3 25849 27793 27793 28765\n", + "4 1_1 4 33049 34993 34993 35965" ] }, - "execution_count": 21, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "bursty_input1_df" + "labels_df.iloc[:, :-1].head()" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['1_1']\n" + ] + } + ], + "source": [ + "# Take one example\n", + "bursty_input1_df = anomaly_df[anomaly_df[\"trace_id\"] == \"1_1\"]\n", + "print(bursty_input1_df[\"trace_id\"].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1201,11 +824,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "#---CODE HERE----" + "# #---CODE HERE----\n", + "# def sufficient_features_space(\n", + "# data: pd.DataFrame | np.ndarray,\n", + "# time_feature: str | int,\n", + "# ref_start: int, \n", + "# ref_end: int, \n", + "# ano_start: int, \n", + "# ano_end: int) -> List[str] | List[int]:\n", + " \n", + "# if isinstance(data, pd.DataFrame):\n", + "# mask_ref = data[time_feature].between(ref_start, ref_end)\n", + "# mask_ano = data[time_feature].between(ano_start, ano_end)\n", + "# mask_intervals = mask_ref and mask_ano\n", + "# relevant_data = data[mask_intervals]\n", + "# return [col for 
def filter_feature(data: pd.DataFrame, time: str, feature: str, ref_start: int, ref_end: int, ano_start: int, ano_end: int):
    """
    Extract one feature over the anomalous and the reference time windows.

    Parameters:
    - data: Frame holding at least the `time` and `feature` columns.
    - time: Name of the column carrying the time stamps.
    - feature: Name of the feature column to extract.
    - ref_start, ref_end: Bounds of the reference window (both inclusive).
    - ano_start, ano_end: Bounds of the anomalous window (both inclusive).

    Returns:
    - (TSA, TSR): Two single-column frames indexed by `time`, the anomalous
      window first and the reference window second.

    Note: both bounds are inclusive (the default of `Series.between`), so a
    row whose time equals both `ref_end` and `ano_start` ends up in both frames.
    """
    def _window(start, end):
        # Rows whose time stamp lies in [start, end], reduced to the feature
        # column and re-indexed by time.
        inside = data[time].between(start, end)
        return data.loc[inside, [time, feature]].set_index(time)

    return _window(ano_start, ano_end), _window(ref_start, ref_end)
def class_entropy(TSA, TSR):
    """
    Calculate the class entropy between two time series.

    With pA = |TSA| / (|TSA| + |TSR|) and pR = |TSR| / (|TSA| + |TSR|), the
    class entropy is -pA * log(pA) - pR * log(pR), using the natural logarithm.

    Parameters:
    - TSA: Time series belonging to the abnormal class (anything with a length).
    - TSR: Time series belonging to the reference class (anything with a length).

    Returns:
    - HClass: Class entropy. It is 0.0 when one of the two series is empty,
      following the convention 0 * log(0) = 0 (previously this case produced
      `nan` and a RuntimeWarning).

    Raises:
    - ZeroDivisionError: If both series are empty.
    """
    n_abnormal = len(TSA)
    n_reference = len(TSR)
    total = n_abnormal + n_reference

    entropy = 0.0
    for count in (n_abnormal, n_reference):
        # Plain Python division, so two empty series still raise
        # ZeroDivisionError exactly as before.
        probability = count / total
        # An empty class contributes nothing; skipping it avoids evaluating
        # 0 * log(0), which numpy computes as nan.
        if probability > 0:
            entropy -= probability * np.log(probability)
    return entropy
A segment is a (contiguous) sequence of point with the same label (anomaly or not anomaly)\n", + "* Sort the feature time series values in increasing order.\n", + "* Tag each feature value depending on whether it appears during the abnormal interval only (YELLOW), the reference interval only (RED), or in both intervals (BLUE).\n", + "* The segments. A segment is a (contiguous) sequence of points with the same label (anomaly or not anomaly).\n", "* The segmentation entropy of the feature $f$:\n", "$$\n", " H_{Segmentation}(f) = \\sum_{i=1}^{n} p_i \\cdot \\log\\left(\\frac{1}{p_i}\\right)\n", @@ -1283,7 +1022,9 @@ "metadata": {}, "outputs": [], "source": [ - "#---CODE HERE----" + "#---CODE HERE----\n", + "def segmentation_entropy():\n", + " pass" ] }, { @@ -1307,7 +1048,16 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def reorder_mixed_segment():\n", + " pass\n", + "\n", + "def segment_penalty(): # ???\n", + " pass\n", + "\n", + "def penalized_segmentation_entropy():\n", + " pass" + ] }, { "cell_type": "markdown", @@ -1316,12 +1066,31 @@ "### Normalization by feature size" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$\n", + " D(f) = \\dfrac{H_{Class}(f)}{H^+_{Segmentation}(f)}\n", + "$$" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "def single_reward_function(feature):\n", + " return class_entropy(feature) / penalized_segmentation_entropy(feature)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Testing `single_reward_function`" + ] }, { "cell_type": "code",