Skip to content
Open

Topk #37

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,162 changes: 2,162 additions & 0 deletions data/oasst_prompts.json

Large diffs are not rendered by default.

185 changes: 185 additions & 0 deletions data/understand_json.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/huangzl/anaconda3/envs/specexec/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import json\n",
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"540\n"
]
}
],
"source": [
"file_path = f\"./oasst_prompts.json\"\n",
"with open(file_path, \"r\") as f:\n",
" dataset = json.load(f)\n",
"dataset = [x[1] for x in dataset]\n",
"print(len(dataset))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[INST] <<SYS>>\n",
"You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n",
"If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n",
"<</SYS>\n",
"\n",
"\n",
"What is a Dyson Sphere? [\\INST]\n",
"\n",
"564\n"
]
}
],
"source": [
"print(dataset[0])\n",
"print(len(dataset[0]))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(\"/home/huangzl/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
}
],
"source": [
"def get_inputs(tokenizer, dataset_name, prefill_len, n_test):\n",
" dataset_list = []\n",
"\n",
" if dataset_name == \"oasst\":\n",
" file_path = f\"oasst_prompts.json\"\n",
" with open(file_path, \"r\") as f:\n",
" dataset = json.load(f)\n",
" dataset = [x[1] for x in dataset]\n",
" count = 0\n",
" idx = 0\n",
" \n",
" while True:\n",
" tokens = tokenizer(dataset[idx], return_tensors='pt')\n",
" if tokens.input_ids.shape[1] >= prefill_len:\n",
" dataset_list.append(tokens.input_ids[:, :prefill_len].to('cuda:0'))\n",
" count += 1\n",
" idx += 1\n",
" if count >= n_test:\n",
" break\n",
" else:\n",
" raise ValueError(f\"dataset {dataset_name} not supported\")\n",
" \n",
" return dataset_list\n",
"\n",
"dataset_list = get_inputs(tokenizer, \"oasst\", 128, 10)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,\n",
" 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889,\n",
" 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641,\n",
" 9109, 29889, 3575, 6089, 881, 451, 3160, 738, 10311, 1319,\n",
" 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391,\n",
" 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889,\n",
" 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443, 5365,\n",
" 1463, 322, 6374, 297, 5469, 29889, 13, 3644, 263, 1139,\n",
" 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114,\n",
" 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862,\n",
" 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073,\n",
" 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873,\n",
" 6232, 2089, 2472, 29889, 13, 29966, 829, 14816]],\n",
" device='cuda:0')"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_list[0]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "specexec",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading