feifeibear · huangzl19 · Oct 10, 2024 · Oct 26, 2024
diff --git a/data/oasst_prompts.json b/data/oasst_prompts.json
diff --git a/data/understand_json.ipynb b/data/understand_json.ipynb
@@ -0,0 +1,185 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/huangzl/anaconda3/envs/specexec/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "from transformers import AutoTokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "540\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the prompt file. JSON text is UTF-8, so the encoding is given explicitly\n",
+    "# instead of relying on the locale default.\n",
+    "file_path = \"./oasst_prompts.json\"\n",
+    "with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    raw_records = json.load(f)\n",
+    "\n",
+    "# Element 1 of each record is kept as the prompt; the rest of the record is dropped.\n",
+    "dataset = [record[1] for record in raw_records]\n",
+    "print(len(dataset))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[INST] <<SYS>>\n",
+      "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n",
+      "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n",
+      "<</SYS>\n",
+      "\n",
+      "\n",
+      "What is a Dyson Sphere? [\\INST]\n",
+      "\n",
+      "564\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Show the first prompt followed by its length.\n",
+    "first_prompt = dataset[0]\n",
+    "print(first_prompt, len(first_prompt), sep=\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load by Hub repo id, pinned to a commit, rather than through a machine-specific\n",
+    "# absolute path into one user's local cache.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\n",
+    "    \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n",
+    "    revision=\"fe8a4ea1ffedaf415f4da2f062534de366a451e6\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    }
+   ],
+   "source": [
+    "def get_inputs(tokenizer, dataset_name, prefill_len, n_test, device=\"cuda:0\"):\n",
+    "    \"\"\"Collect `n_test` prompts truncated to their first `prefill_len` tokens.\n",
+    "\n",
+    "    Prompts are read in file order from ``oasst_prompts.json`` (relative to the\n",
+    "    working directory). Each one is tokenized; prompts with fewer than\n",
+    "    `prefill_len` tokens are skipped, the others are cut to `prefill_len`.\n",
+    "\n",
+    "    Args:\n",
+    "        tokenizer: Callable used as ``tokenizer(text, return_tensors='pt')``; its\n",
+    "            result must expose a 2-D ``input_ids`` tensor.\n",
+    "        dataset_name: Dataset to read; only ``\"oasst\"`` is supported.\n",
+    "        prefill_len: Number of leading tokens kept from each prompt.\n",
+    "        n_test: Number of inputs to return.\n",
+    "        device: Device the truncated ids are moved to (default ``\"cuda:0\"``).\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of `n_test` tensors, each ``input_ids[:, :prefill_len]`` on `device`.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If `dataset_name` is not supported, or if the file holds fewer\n",
+    "            than `n_test` prompts of at least `prefill_len` tokens.\n",
+    "    \"\"\"\n",
+    "    if dataset_name != \"oasst\":\n",
+    "        raise ValueError(f\"dataset {dataset_name} not supported\")\n",
+    "\n",
+    "    with open(\"oasst_prompts.json\", \"r\", encoding=\"utf-8\") as f:\n",
+    "        records = json.load(f)\n",
+    "\n",
+    "    dataset_list = []\n",
+    "    for record in records:\n",
+    "        # Checked before tokenizing so that n_test <= 0 yields an empty list.\n",
+    "        if len(dataset_list) >= n_test:\n",
+    "            break\n",
+    "        # Element 1 of each record is the prompt text.\n",
+    "        tokens = tokenizer(record[1], return_tensors='pt')\n",
+    "        if tokens.input_ids.shape[1] >= prefill_len:\n",
+    "            dataset_list.append(tokens.input_ids[:, :prefill_len].to(device))\n",
+    "\n",
+    "    # The file is finite: fail with a clear message instead of indexing past its end.\n",
+    "    if len(dataset_list) < n_test:\n",
+    "        raise ValueError(\n",
+    "            f\"only {len(dataset_list)} of the requested {n_test} prompts have \"\n",
+    "            f\"at least {prefill_len} tokens\"\n",
+    "        )\n",
+    "    return dataset_list\n",
+    "\n",
+    "# Collect 10 prompts, each cut to its first 128 tokens.\n",
+    "dataset_list = get_inputs(\n",
+    "    tokenizer,\n",
+    "    dataset_name=\"oasst\",\n",
+    "    prefill_len=128,\n",
+    "    n_test=10,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[    1,   518, 25580, 29962,  3532, 14816, 29903,  6778,    13,  3492,\n",
+       "           526,   263,  8444, 29892,  3390,  1319,   322, 15993, 20255, 29889,\n",
+       "         29849,  1234,   408,  1371,  3730,   408,  1950, 29892,  1550,  1641,\n",
+       "          9109, 29889,  3575,  6089,   881,   451,  3160,   738, 10311,  1319,\n",
+       "         29892,   443,   621,   936, 29892, 11021,   391, 29892,  7916,   391,\n",
+       "         29892,   304, 27375, 29892, 18215, 29892,   470, 27302,  2793, 29889,\n",
+       "          3529,  9801,   393,   596, 20890,   526,  5374,   635,   443,  5365,\n",
+       "          1463,   322,  6374,   297,  5469, 29889,    13,  3644,   263,  1139,\n",
+       "           947,   451,  1207,   738,  4060, 29892,   470,   338,   451,  2114,\n",
+       "          1474, 16165,   261,   296, 29892,  5649,  2020,  2012,   310, 22862,\n",
+       "          1554,   451,  1959, 29889,   960,   366,  1016, 29915, 29873,  1073,\n",
+       "           278,  1234,   304,   263,  1139, 29892,  3113,  1016, 29915, 29873,\n",
+       "          6232,  2089,  2472, 29889,    13, 29966,   829, 14816]],\n",
+       "       device='cuda:0')"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Display the first collected input tensor.\n",
+    "first_prefill_ids = dataset_list[0]\n",
+    "first_prefill_ids"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "specexec",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}