diff --git "a/novel-translation/07_tune-lf-py3.11.ipynb" "b/novel-translation/07_tune-lf-py3.11.ipynb"
deleted file mode 100644--- "a/novel-translation/07_tune-lf-py3.11.ipynb"
+++ /dev/null
@@ -1 +0,0 @@
-{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"HNcg8cZt4-Vy"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"id":"yUsdJRoa-Ucr"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"6d394937-6c99-4a7c-9d32-7600a280032f","showTitle":false,"title":""},"id":"VNEzdbzv4-Vz","outputId":"f0f8498e-f8d6-47a9-c2da-d0c6a59cba09"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /home/inflaton/code/projects/courses/llm-finetuning\n"]}],"source":["import os\n","import sys\n","from pathlib import Path\n","\n","workding_dir = str(Path.cwd().parent)\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"id":"xnfep6kU4-V0","outputId":"6ac1cf39-8815-4105-bd81-650366d892f5"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /home/inflaton/code/projects/courses/llm-finetuning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"f1597656-8042-4878-9d3b-9ebfb8dd86dc","showTitle":false,"title":""},"id":"7OFhWcvF4-V0","outputId":"fafa314f-3368-4d52-d7bd-8c0cd5ba75a1"},"outputs":[{"data":{"text/plain":["('unsloth/Qwen2-0.5B-Instruct-bnb-4bit',\n"," True,\n"," None,\n"," None,\n"," 2048,\n"," 10,\n"," None,\n"," 'datasets/mac/mac.tsv',\n"," 'results/mac-results_lf.csv')"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["import os\n","\n","model_name = os.getenv(\"MODEL_NAME\")\n","token = os.getenv(\"HF_TOKEN\") or None\n","load_in_4bit = os.getenv(\"LOAD_IN_4BIT\") == \"true\"\n","local_model = os.getenv(\"LOCAL_MODEL\")\n","hub_model = os.getenv(\"HUB_MODEL\")\n","num_train_epochs = int(os.getenv(\"NUM_TRAIN_EPOCHS\") or 0)\n","data_path = os.getenv(\"DATA_PATH\")\n","results_path = os.getenv(\"RESULTS_PATH\")\n","\n","max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n","dtype = (\n"," None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",")\n","\n","model_name, load_in_4bit, local_model, hub_model, max_seq_length, num_train_epochs, dtype, data_path, results_path"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"l-onwasK4-V1","outputId":"10ecb411-ae5a-49d2-d564-e4d76c9e5eb4"},"outputs":[{"name":"stdout","output_type":"stream","text":["Sat Jun 29 17:26:00 2024 \n","+---------------------------------------------------------------------------------------+\n","| NVIDIA-SMI 545.23.07 Driver Version: 546.12 CUDA Version: 12.3 |\n","|-----------------------------------------+----------------------+----------------------+\n","| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. |\n","|=========================================+======================+======================|\n","| 0 NVIDIA GeForce RTX 4080 ... On | 00000000:01:00.0 Off | N/A |\n","| N/A 50C P8 4W / 150W | 129MiB / 12282MiB | 0% Default |\n","| | | N/A |\n","+-----------------------------------------+----------------------+----------------------+\n"," \n","+---------------------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=======================================================================================|\n","| No running processes found |\n","+---------------------------------------------------------------------------------------+\n"]}],"source":["!nvidia-smi"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"NwuXzx414-V1","outputId":"489df2da-c43e-4c5f-bdce-82177b21a0ce","colab":{"referenced_widgets":["fabc731ff8e5499a9c842ef6833f3e98","2e186baa65dc4dd1956fa2db0d83b4a1"]}},"outputs":[{"name":"stdout","output_type":"stream","text":["๐ฆฅ Unsloth: Will patch your computer to enable 2x faster free finetuning.\n"]},{"name":"stderr","output_type":"stream","text":["[nltk_data] Downloading package wordnet to /home/inflaton/nltk_data...\n","[nltk_data] Package wordnet is already up-to-date!\n","[nltk_data] Downloading package punkt to /home/inflaton/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n","[nltk_data] Downloading package omw-1.4 to /home/inflaton/nltk_data...\n","[nltk_data] Package omw-1.4 is already up-to-date!\n"]},{"name":"stdout","output_type":"stream","text":["loading /home/inflaton/code/projects/courses/llm-finetuning/llm_toolkit/translation_engine.py\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"fabc731ff8e5499a9c842ef6833f3e98","version_major":2,"version_minor":0},"text/plain":["Generating train split: 0 examples [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"2e186baa65dc4dd1956fa2db0d83b4a1","version_major":2,"version_minor":0},"text/plain":["Generating test split: 0 examples [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['chinese', 'english'],\n"," num_rows: 4528\n"," })\n"," test: Dataset({\n"," features: ['chinese', 'english'],\n"," num_rows: 1133\n"," })\n","})\n"]}],"source":["from llm_toolkit.translation_engine import load_translation_dataset\n","\n","dataset = load_translation_dataset(data_path)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"otrAE5qq4-V2"},"outputs":[],"source":["df = dataset[\"train\"].to_pandas()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"aEbWlaG94-V2"},"outputs":[],"source":["import pandas as pd\n","\n","df_alpaca = pd.DataFrame({\"instruction\": [\"Please translate the following Chinese text into English and provide only the translated content, nothing else.\"]*len(df)})"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"I4BMMR_X4-V3","outputId":"601dc146-e058-4512-d45f-88e7945ec873"},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n"," \n"," | \n"," instruction | \n"," input | \n"," output | \n","
\n"," \n"," \n"," \n"," 0 | \n"," Please translate the following Chinese text in... | \n"," ๅ
จไป็็ไปๆญๆใ | \n"," Because I was protected by a fox fairy. | \n","
\n"," \n"," 1 | \n"," Please translate the following Chinese text in... | \n"," ่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ | \n"," He was the director, the cousin later told the... | \n","
\n"," \n"," 2 | \n"," Please translate the following Chinese text in... | \n"," ่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ | \n"," Xi-feng suddenly seemed to remember something,... | \n","
\n"," \n"," 3 | \n"," Please translate the following Chinese text in... | \n"," ไธไธช่็บขๅซๅ
ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... | \n"," The three old Red Guards stood in front of Ye ... | \n","
\n"," \n"," 4 | \n"," Please translate the following Chinese text in... | \n"," ็จๅ
็็
งๅๅ
จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ | \n"," Mr. Cheng accepted their toast with equanimity... | \n","
\n"," \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n","
\n"," \n"," 4523 | \n"," Please translate the following Chinese text in... | \n"," ๅค่พนๆไธคๅผ ่
ฟๆญช้ข่ฃ็ๅ
ซไปๆกๅญ๏ผๆกๆ่กไนฑๆก็ๅ ๆก็ญ็ช็ๆจๅณใ | \n"," Two rickety tables with scarred tops and a few... | \n","
\n"," \n"," 4524 | \n"," Please translate the following Chinese text in... | \n"," ่ดพ็ๅฌไบ๏ผๅ็ๆ่ณๆ ่
ฎใ | \n"," At this last remark Jia Rui positively scratch... | \n","
\n"," \n"," 4525 | \n"," Please translate the following Chinese text in... | \n"," ๅฌไบ่ฟๆ ท็่ฏไปท๏ผๆไปฌๅฟๆ
ๆฟๅจ๏ผๅๅคงๅฎถไธ่ตทๆฏ่้ซๅผ๏ผๆๅ็ไบ๏ผ | \n"," Hearing comments like this, our emotions were ... | \n","
\n"," \n"," 4526 | \n"," Please translate the following Chinese text in... | \n"," ๆตท่ๅ
ฌ้๏ผโ่ฎฐไฝไบๅ๏ผโ | \n"," 'Can you remember that?' | \n","
\n"," \n"," 4527 | \n"," Please translate the following Chinese text in... | \n"," ไธ้ข่ฏด๏ผ่ฟๆ ทๅ็ผบๅฐ็ป่ใ | \n"," This time the opinions from above said it need... | \n","
\n"," \n","
\n","
4528 rows ร 3 columns
\n","
"],"text/plain":[" instruction \\\n","0 Please translate the following Chinese text in... \n","1 Please translate the following Chinese text in... \n","2 Please translate the following Chinese text in... \n","3 Please translate the following Chinese text in... \n","4 Please translate the following Chinese text in... \n","... ... \n","4523 Please translate the following Chinese text in... \n","4524 Please translate the following Chinese text in... \n","4525 Please translate the following Chinese text in... \n","4526 Please translate the following Chinese text in... \n","4527 Please translate the following Chinese text in... \n","\n"," input \\\n","0 ๅ
จไป็็ไปๆญๆใ \n","1 ่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ \n","2 ่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ \n","3 ไธไธช่็บขๅซๅ
ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... \n","4 ็จๅ
็็
งๅๅ
จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ \n","... ... \n","4523 ๅค่พนๆไธคๅผ ่
ฟๆญช้ข่ฃ็ๅ
ซไปๆกๅญ๏ผๆกๆ่กไนฑๆก็ๅ ๆก็ญ็ช็ๆจๅณใ \n","4524 ่ดพ็ๅฌไบ๏ผๅ็ๆ่ณๆ ่
ฎใ \n","4525 ๅฌไบ่ฟๆ ท็่ฏไปท๏ผๆไปฌๅฟๆ
ๆฟๅจ๏ผๅๅคงๅฎถไธ่ตทๆฏ่้ซๅผ๏ผๆๅ็ไบ๏ผ \n","4526 ๆตท่ๅ
ฌ้๏ผโ่ฎฐไฝไบๅ๏ผโ \n","4527 ไธ้ข่ฏด๏ผ่ฟๆ ทๅ็ผบๅฐ็ป่ใ \n","\n"," output \n","0 Because I was protected by a fox fairy. \n","1 He was the director, the cousin later told the... \n","2 Xi-feng suddenly seemed to remember something,... \n","3 The three old Red Guards stood in front of Ye ... \n","4 Mr. Cheng accepted their toast with equanimity... \n","... ... \n","4523 Two rickety tables with scarred tops and a few... \n","4524 At this last remark Jia Rui positively scratch... \n","4525 Hearing comments like this, our emotions were ... \n","4526 'Can you remember that?' \n","4527 This time the opinions from above said it need... \n","\n","[4528 rows x 3 columns]"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["df_alpaca[\"input\"] = df[\"chinese\"]\n","df_alpaca[\"output\"] = df[\"english\"]\n","df_alpaca"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-PVlRPV_4-V3"},"outputs":[],"source":["df_alpaca.to_json(\n"," \"llama-factory/data/alpaca_mac.json\", orient=\"records\", lines=False, indent=2\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"SBqxzhjL4-V3"},"outputs":[],"source":["df = pd.read_json(\"llama-factory/data/alpaca_mac.json\", orient=\"records\", lines=False)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"AqXaqcI24-V3","outputId":"b1dd9059-f346-4b9e-81ae-fa12b75bf3f7"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," instruction | \n"," input | \n"," output | \n","
\n"," \n"," \n"," \n"," 0 | \n"," Please translate the following Chinese text in... | \n"," ๅ
จไป็็ไปๆญๆใ | \n"," Because I was protected by a fox fairy. | \n","
\n"," \n"," 1 | \n"," Please translate the following Chinese text in... | \n"," ่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ | \n"," He was the director, the cousin later told the... | \n","
\n"," \n"," 2 | \n"," Please translate the following Chinese text in... | \n"," ่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ | \n"," Xi-feng suddenly seemed to remember something,... | \n","
\n"," \n"," 3 | \n"," Please translate the following Chinese text in... | \n"," ไธไธช่็บขๅซๅ
ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... | \n"," The three old Red Guards stood in front of Ye ... | \n","
\n"," \n"," 4 | \n"," Please translate the following Chinese text in... | \n"," ็จๅ
็็
งๅๅ
จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ | \n"," Mr. Cheng accepted their toast with equanimity... | \n","
\n"," \n","
\n","
"],"text/plain":[" instruction \\\n","0 Please translate the following Chinese text in... \n","1 Please translate the following Chinese text in... \n","2 Please translate the following Chinese text in... \n","3 Please translate the following Chinese text in... \n","4 Please translate the following Chinese text in... \n","\n"," input \\\n","0 ๅ
จไป็็ไปๆญๆใ \n","1 ่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ \n","2 ่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ \n","3 ไธไธช่็บขๅซๅ
ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... \n","4 ็จๅ
็็
งๅๅ
จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ \n","\n"," output \n","0 Because I was protected by a fox fairy. \n","1 He was the director, the cousin later told the... \n","2 Xi-feng suddenly seemed to remember something,... \n","3 The three old Red Guards stood in front of Ye ... \n","4 Mr. Cheng accepted their toast with equanimity... "]},"execution_count":34,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"yRkyZ7sZ4-V4","outputId":"856990af-bbfd-4235-b745-fc51e24ecbe0"},"outputs":[{"name":"stdout","output_type":"stream","text":["Python 3.11.9\n","\u001b[33mWARNING: Package(s) not found: flash-attn\u001b[0m\u001b[33m\n","\u001b[0mCPU times: user 23.2 ms, sys: 3.38 ms, total: 26.6 ms\n","Wall time: 518 ms\n"]}],"source":["%%time\n","!python --version\n","!pip show flash-attn"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ywlJE3CS4-V4","outputId":"0f4d751b-e772-4314-9f12-82bf92a61bda"},"outputs":[{"name":"stdout","output_type":"stream","text":["Current Directory:\n","/home/inflaton/code/projects/courses/llm-finetuning/llama-factory\n","06/29/2024 21:58:18 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n","[INFO|tokenization_utils_base.py:2161] 2024-06-29 21:58:18,444 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/vocab.json\n","[INFO|tokenization_utils_base.py:2161] 2024-06-29 21:58:18,444 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/merges.txt\n","[INFO|tokenization_utils_base.py:2161] 2024-06-29 21:58:18,444 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/tokenizer.json\n","[INFO|tokenization_utils_base.py:2161] 2024-06-29 21:58:18,444 >> loading file added_tokens.json from cache at None\n","[INFO|tokenization_utils_base.py:2161] 2024-06-29 21:58:18,444 >> loading file special_tokens_map.json from cache at None\n","[INFO|tokenization_utils_base.py:2161] 2024-06-29 21:58:18,444 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/tokenizer_config.json\n","[WARNING|logging.py:313] 2024-06-29 21:58:18,572 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n","06/29/2024 21:58:18 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n","06/29/2024 21:58:18 - INFO - llamafactory.data.template - Add <|im_start|> to stop words.\n","06/29/2024 21:58:18 - INFO - llamafactory.data.loader - Loading dataset alpaca_mac.json...\n","Converting format of dataset (num_proc=16): 100%|โ| 4528/4528 [00:00<00:00, 1613\n","Running tokenizer on dataset (num_proc=16): 100%|โ| 4528/4528 [00:01<00:00, 3159\n","input_ids:\n","[151644, 872, 198, 5501, 14683, 279, 2701, 8453, 1467, 1119, 6364, 323, 3410, 1172, 279, 24531, 2213, 11, 4302, 770, 624, 35987, 102895, 99164, 100324, 100717, 100095, 99509, 1773, 151645, 198, 151644, 77091, 198, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n","inputs:\n","<|im_start|>user\n","Please translate the following Chinese text into English and provide only the translated content, nothing else.\n","ๅ
จไป็็ไปๆญๆใ<|im_end|>\n","<|im_start|>assistant\n","Because I was protected by a fox fairy.<|im_end|>\n","label_ids:\n","[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n","labels:\n","Because I was protected by a fox fairy.<|im_end|>\n","[INFO|configuration_utils.py:733] 2024-06-29 21:58:21,872 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 21:58:21,873 >> Model config Qwen2Config {\n"," \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|modeling_utils.py:3556] 2024-06-29 21:58:21,942 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/model.safetensors\n","[INFO|modeling_utils.py:1531] 2024-06-29 21:58:24,477 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n","[INFO|configuration_utils.py:1000] 2024-06-29 21:58:24,480 >> Generate config GenerationConfig {\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645\n","}\n","\n","[INFO|modeling_utils.py:4364] 2024-06-29 21:58:59,030 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n","\n","[INFO|modeling_utils.py:4372] 2024-06-29 21:58:59,030 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at Qwen/Qwen2-0.5B-Instruct.\n","If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n","[INFO|configuration_utils.py:955] 2024-06-29 21:58:59,317 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/generation_config.json\n","[INFO|configuration_utils.py:1000] 2024-06-29 21:58:59,317 >> Generate config GenerationConfig {\n"," \"bos_token_id\": 151643,\n"," \"do_sample\": true,\n"," \"eos_token_id\": [\n"," 151645,\n"," 151643\n"," ],\n"," \"pad_token_id\": 151643,\n"," \"repetition_penalty\": 1.1,\n"," \"temperature\": 0.7,\n"," \"top_k\": 20,\n"," \"top_p\": 0.8\n","}\n","\n","06/29/2024 21:58:59 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n","06/29/2024 21:58:59 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.\n","06/29/2024 21:58:59 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n","06/29/2024 21:58:59 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n","06/29/2024 21:58:59 - INFO - llamafactory.model.model_utils.misc - Found linear modules: q_proj,up_proj,k_proj,v_proj,gate_proj,down_proj,o_proj\n","06/29/2024 21:58:59 - INFO - llamafactory.model.loader - trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826\n","[INFO|trainer.py:642] 2024-06-29 21:58:59,830 >> Using auto half precision backend\n","06/29/2024 21:58:59 - WARNING - llamafactory.train.callbacks - Previous trainer log in this folder will be deleted.\n","[INFO|trainer.py:2128] 2024-06-29 21:58:59,963 >> ***** Running training *****\n","[INFO|trainer.py:2129] 2024-06-29 21:58:59,963 >> Num examples = 4,482\n","[INFO|trainer.py:2130] 2024-06-29 21:58:59,963 >> Num Epochs = 10\n","[INFO|trainer.py:2131] 2024-06-29 21:58:59,963 >> Instantaneous batch size per device = 1\n","[INFO|trainer.py:2134] 2024-06-29 21:58:59,963 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n","[INFO|trainer.py:2135] 2024-06-29 21:58:59,963 >> Gradient Accumulation steps = 8\n","[INFO|trainer.py:2136] 2024-06-29 21:58:59,963 >> Total optimization steps = 5,600\n","[INFO|trainer.py:2137] 2024-06-29 21:58:59,964 >> Number of trainable parameters = 4,399,104\n","{'loss': 2.5824, 'grad_norm': 3.00181245803833, 'learning_rate': 1.7857142857142857e-06, 'epoch': 0.02}\n","{'loss': 2.7043, 'grad_norm': 3.7918665409088135, 'learning_rate': 3.5714285714285714e-06, 'epoch': 0.04}\n","{'loss': 2.5845, 'grad_norm': 2.4548499584198, 'learning_rate': 5.357142857142857e-06, 'epoch': 0.05}\n","{'loss': 2.5238, 'grad_norm': 5.136275291442871, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.07}\n","{'loss': 2.7407, 'grad_norm': 2.911478281021118, 'learning_rate': 8.92857142857143e-06, 'epoch': 0.09}\n","{'loss': 2.4438, 'grad_norm': 2.7009449005126953, 'learning_rate': 1.0714285714285714e-05, 'epoch': 0.11}\n","{'loss': 2.619, 'grad_norm': 2.6438188552856445, 'learning_rate': 1.25e-05, 'epoch': 0.12}\n","{'loss': 2.3602, 'grad_norm': 2.3748607635498047, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.14}\n","{'loss': 2.5023, 'grad_norm': 2.8664743900299072, 'learning_rate': 1.6071428571428572e-05, 'epoch': 0.16}\n","{'loss': 2.3225, 'grad_norm': 2.3505067825317383, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.18}\n","{'loss': 2.3869, 'grad_norm': 3.261944532394409, 'learning_rate': 1.9642857142857145e-05, 'epoch': 0.2}\n","{'loss': 2.3922, 'grad_norm': 2.6836485862731934, 'learning_rate': 2.1428571428571428e-05, 'epoch': 0.21}\n","{'loss': 2.3024, 'grad_norm': 2.848069667816162, 'learning_rate': 2.3214285714285715e-05, 'epoch': 0.23}\n","{'loss': 2.3501, 'grad_norm': 3.22798752784729, 'learning_rate': 2.5e-05, 'epoch': 0.25}\n","{'loss': 2.2154, 'grad_norm': 2.441416025161743, 'learning_rate': 2.6785714285714288e-05, 'epoch': 0.27}\n","{'loss': 2.2651, 'grad_norm': 2.3891408443450928, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.29}\n","{'loss': 2.333, 'grad_norm': 2.3359410762786865, 'learning_rate': 3.0357142857142857e-05, 'epoch': 0.3}\n","{'loss': 2.1135, 'grad_norm': 2.6461141109466553, 'learning_rate': 3.2142857142857144e-05, 'epoch': 0.32}\n","{'loss': 2.2379, 'grad_norm': 3.4454798698425293, 'learning_rate': 3.392857142857143e-05, 'epoch': 0.34}\n","{'loss': 2.4006, 'grad_norm': 2.9662983417510986, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.36}\n","{'loss': 2.3065, 'grad_norm': 2.796970844268799, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.37}\n","{'loss': 2.2302, 'grad_norm': 3.6208152770996094, 'learning_rate': 3.928571428571429e-05, 'epoch': 0.39}\n","{'loss': 2.1966, 'grad_norm': 3.335953950881958, 'learning_rate': 4.107142857142857e-05, 'epoch': 0.41}\n","{'loss': 2.3829, 'grad_norm': 4.235249042510986, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.43}\n","{'loss': 2.2592, 'grad_norm': 3.228585720062256, 'learning_rate': 4.464285714285715e-05, 'epoch': 0.45}\n","{'loss': 2.2236, 'grad_norm': 3.2165491580963135, 'learning_rate': 4.642857142857143e-05, 'epoch': 0.46}\n","{'loss': 2.2113, 'grad_norm': 4.193121433258057, 'learning_rate': 4.8214285714285716e-05, 'epoch': 0.48}\n","{'loss': 2.3292, 'grad_norm': 4.554675579071045, 'learning_rate': 5e-05, 'epoch': 0.5}\n","{'loss': 2.1239, 'grad_norm': 2.7911994457244873, 'learning_rate': 5.1785714285714296e-05, 'epoch': 0.52}\n","{'loss': 2.2483, 'grad_norm': 3.6781301498413086, 'learning_rate': 5.3571428571428575e-05, 'epoch': 0.54}\n","{'loss': 2.2574, 'grad_norm': 4.210690021514893, 'learning_rate': 5.535714285714286e-05, 'epoch': 0.55}\n","{'loss': 2.0374, 'grad_norm': 6.651491165161133, 'learning_rate': 5.714285714285714e-05, 'epoch': 0.57}\n","{'loss': 2.1021, 'grad_norm': 5.034158706665039, 'learning_rate': 5.8928571428571435e-05, 'epoch': 0.59}\n","{'loss': 2.1575, 'grad_norm': 4.4245381355285645, 'learning_rate': 6.0714285714285715e-05, 'epoch': 0.61}\n","{'loss': 2.1584, 'grad_norm': 4.884017467498779, 'learning_rate': 6.25e-05, 'epoch': 0.62}\n","{'loss': 2.0592, 'grad_norm': 3.4757015705108643, 'learning_rate': 6.428571428571429e-05, 'epoch': 0.64}\n","{'loss': 2.2959, 'grad_norm': 4.756143093109131, 'learning_rate': 6.607142857142857e-05, 'epoch': 0.66}\n","{'loss': 2.2236, 'grad_norm': 3.61995005607605, 'learning_rate': 6.785714285714286e-05, 'epoch': 0.68}\n","{'loss': 1.9521, 'grad_norm': 3.775660991668701, 'learning_rate': 6.964285714285715e-05, 'epoch': 0.7}\n","{'loss': 2.1048, 'grad_norm': 3.84194016456604, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.71}\n","{'loss': 2.2049, 'grad_norm': 3.697145462036133, 'learning_rate': 7.321428571428571e-05, 'epoch': 0.73}\n","{'loss': 2.2091, 'grad_norm': 3.071280002593994, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.75}\n","{'loss': 2.1879, 'grad_norm': 3.8867111206054688, 'learning_rate': 7.67857142857143e-05, 'epoch': 0.77}\n","{'loss': 2.0959, 'grad_norm': 4.871102333068848, 'learning_rate': 7.857142857142858e-05, 'epoch': 0.79}\n","{'loss': 2.0237, 'grad_norm': 2.9602854251861572, 'learning_rate': 8.035714285714287e-05, 'epoch': 0.8}\n","{'loss': 2.12, 'grad_norm': 3.3257362842559814, 'learning_rate': 8.214285714285714e-05, 'epoch': 0.82}\n","{'loss': 2.1227, 'grad_norm': 5.4583024978637695, 'learning_rate': 8.392857142857144e-05, 'epoch': 0.84}\n","{'loss': 2.1448, 'grad_norm': 3.455509901046753, 'learning_rate': 8.571428571428571e-05, 'epoch': 0.86}\n","{'loss': 2.138, 'grad_norm': 2.953312397003174, 'learning_rate': 8.75e-05, 'epoch': 0.87}\n","{'loss': 2.3248, 'grad_norm': 3.1288394927978516, 'learning_rate': 8.92857142857143e-05, 'epoch': 0.89}\n","{'loss': 2.2541, 'grad_norm': 3.630788803100586, 'learning_rate': 9.107142857142857e-05, 'epoch': 0.91}\n","{'loss': 2.1579, 'grad_norm': 4.1369805335998535, 'learning_rate': 9.285714285714286e-05, 'epoch': 0.93}\n","{'loss': 2.1881, 'grad_norm': 3.945438861846924, 'learning_rate': 9.464285714285715e-05, 'epoch': 0.95}\n","{'loss': 2.1433, 'grad_norm': 3.308486223220825, 'learning_rate': 9.642857142857143e-05, 'epoch': 0.96}\n","{'loss': 2.1414, 'grad_norm': 3.59633207321167, 'learning_rate': 9.821428571428572e-05, 'epoch': 0.98}\n","{'loss': 2.1674, 'grad_norm': 3.1946074962615967, 'learning_rate': 0.0001, 'epoch': 1.0}\n"," 10%|โโโโ | 560/5600 [11:54<1:46:21, 1.27s/it][INFO|trainer.py:3788] 2024-06-29 22:10:54,528 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 22:10:54,528 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 22:10:54,528 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 36.47it/s]\u001b[A\n"," 17%|โโโโโโโโ | 8/46 [00:00<00:01, 29.70it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 27.30it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 26.35it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 25.73it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:00<00:00, 25.56it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:00<00:00, 23.84it/s]\u001b[A\n"," 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:00, 23.72it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 23.16it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 22.87it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 23.02it/s]\u001b[A\n"," 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:01<00:00, 23.66it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:01<00:00, 24.46it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.084639310836792, 'eval_runtime': 1.9003, 'eval_samples_per_second': 24.206, 'eval_steps_per_second': 24.206, 'epoch': 1.0}\n"," 10%|โโโโ | 560/5600 [11:56<1:46:21, 1.27s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 24.20it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 22:10:56,429 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-560\n","[INFO|configuration_utils.py:733] 2024-06-29 22:10:57,646 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 22:10:57,646 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 22:10:57,680 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-560/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 22:10:57,680 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-560/special_tokens_map.json\n","{'loss': 2.0604, 'grad_norm': 3.1620354652404785, 'learning_rate': 9.999902864657691e-05, 'epoch': 1.02}\n","{'loss': 1.8643, 'grad_norm': 3.8117380142211914, 'learning_rate': 9.999611462404875e-05, 'epoch': 1.04}\n","{'loss': 2.0455, 'grad_norm': 3.2619926929473877, 'learning_rate': 9.999125804563732e-05, 'epoch': 1.05}\n","{'loss': 1.9864, 'grad_norm': 4.930575370788574, 'learning_rate': 9.998445910004082e-05, 'epoch': 1.07}\n","{'loss': 1.9508, 'grad_norm': 3.7913410663604736, 'learning_rate': 9.997571805142639e-05, 'epoch': 1.09}\n","{'loss': 1.9803, 'grad_norm': 4.443136215209961, 'learning_rate': 9.996503523941994e-05, 'epoch': 1.11}\n","{'loss': 1.9275, 'grad_norm': 3.6109349727630615, 'learning_rate': 9.99524110790929e-05, 'epoch': 1.12}\n","{'loss': 1.9954, 'grad_norm': 5.655592918395996, 'learning_rate': 9.993784606094612e-05, 'epoch': 1.14}\n","{'loss': 1.9877, 'grad_norm': 3.884321928024292, 'learning_rate': 9.992134075089084e-05, 'epoch': 1.16}\n","{'loss': 1.8812, 'grad_norm': 3.8242244720458984, 'learning_rate': 9.99028957902266e-05, 'epoch': 1.18}\n","{'loss': 1.8684, 'grad_norm': 2.90846586227417, 'learning_rate': 9.988251189561645e-05, 'epoch': 1.2}\n","{'loss': 1.993, 'grad_norm': 3.7888333797454834, 'learning_rate': 9.986018985905901e-05, 'epoch': 1.21}\n","{'loss': 1.8779, 'grad_norm': 4.632900714874268, 'learning_rate': 9.983593054785776e-05, 'epoch': 1.23}\n","{'loss': 1.999, 'grad_norm': 4.890506267547607, 'learning_rate': 9.980973490458728e-05, 'epoch': 1.25}\n","{'loss': 1.9202, 'grad_norm': 4.923672676086426, 'learning_rate': 9.978160394705668e-05, 'epoch': 1.27}\n","{'loss': 2.1275, 'grad_norm': 4.535311222076416, 'learning_rate': 9.975153876827008e-05, 'epoch': 1.29}\n","{'loss': 2.0239, 'grad_norm': 3.3138980865478516, 'learning_rate': 9.971954053638399e-05, 'epoch': 1.3}\n","{'loss': 2.0188, 'grad_norm': 3.4345853328704834, 'learning_rate': 9.968561049466214e-05, 'epoch': 1.32}\n","{'loss': 2.0142, 'grad_norm': 8.660140991210938, 'learning_rate': 9.964974996142698e-05, 'epoch': 1.34}\n","{'loss': 1.7658, 'grad_norm': 4.349238872528076, 'learning_rate': 9.961196033000861e-05, 'epoch': 1.36}\n","{'loss': 1.9781, 'grad_norm': 4.784688949584961, 'learning_rate': 9.957224306869053e-05, 'epoch': 1.37}\n","{'loss': 1.8478, 'grad_norm': 5.488619327545166, 'learning_rate': 9.953059972065265e-05, 'epoch': 1.39}\n","{'loss': 1.719, 'grad_norm': 3.712329387664795, 'learning_rate': 9.948703190391131e-05, 'epoch': 1.41}\n","{'loss': 2.0146, 'grad_norm': 4.957381248474121, 'learning_rate': 9.944154131125642e-05, 'epoch': 1.43}\n","{'loss': 1.8996, 'grad_norm': 4.802273273468018, 'learning_rate': 9.939412971018574e-05, 'epoch': 1.45}\n","{'loss': 1.9122, 'grad_norm': 3.6675291061401367, 'learning_rate': 9.934479894283606e-05, 'epoch': 1.46}\n","{'loss': 2.0716, 'grad_norm': 3.885627031326294, 'learning_rate': 9.92935509259118e-05, 'epoch': 1.48}\n","{'loss': 1.8866, 'grad_norm': 5.027438640594482, 'learning_rate': 9.924038765061042e-05, 'epoch': 1.5}\n","{'loss': 1.8188, 'grad_norm': 3.39078426361084, 'learning_rate': 9.918531118254507e-05, 'epoch': 1.52}\n","{'loss': 1.948, 'grad_norm': 4.390409469604492, 'learning_rate': 9.912832366166442e-05, 'epoch': 1.54}\n","{'loss': 1.9499, 'grad_norm': 5.019458770751953, 'learning_rate': 9.906942730216939e-05, 'epoch': 1.55}\n","{'loss': 1.8564, 'grad_norm': 3.9593818187713623, 'learning_rate': 9.900862439242719e-05, 'epoch': 1.57}\n","{'loss': 1.9739, 'grad_norm': 4.117242336273193, 'learning_rate': 9.894591729488242e-05, 'epoch': 1.59}\n","{'loss': 2.0614, 'grad_norm': 3.597482204437256, 'learning_rate': 9.888130844596524e-05, 'epoch': 1.61}\n","{'loss': 1.8292, 'grad_norm': 3.4714455604553223, 'learning_rate': 9.881480035599667e-05, 'epoch': 1.62}\n","{'loss': 1.8707, 'grad_norm': 3.4483628273010254, 'learning_rate': 9.874639560909117e-05, 'epoch': 1.64}\n","{'loss': 1.8787, 'grad_norm': 3.199208974838257, 'learning_rate': 9.867609686305617e-05, 'epoch': 1.66}\n","{'loss': 1.8856, 'grad_norm': 3.4779880046844482, 'learning_rate': 9.860390684928873e-05, 'epoch': 1.68}\n","{'loss': 1.8618, 'grad_norm': 5.559018135070801, 'learning_rate': 9.852982837266955e-05, 'epoch': 1.7}\n","{'loss': 1.864, 'grad_norm': 4.512182235717773, 'learning_rate': 9.84538643114539e-05, 'epoch': 1.71}\n","{'loss': 1.9054, 'grad_norm': 3.1477646827697754, 'learning_rate': 9.837601761715983e-05, 'epoch': 1.73}\n","{'loss': 2.0045, 'grad_norm': 3.805159091949463, 'learning_rate': 9.829629131445342e-05, 'epoch': 1.75}\n","{'loss': 1.9549, 'grad_norm': 3.356356143951416, 'learning_rate': 9.82146885010314e-05, 'epoch': 1.77}\n","{'loss': 1.8738, 'grad_norm': 4.890620231628418, 'learning_rate': 9.81312123475006e-05, 'epoch': 1.78}\n","{'loss': 1.906, 'grad_norm': 3.6688284873962402, 'learning_rate': 9.804586609725499e-05, 'epoch': 1.8}\n","{'loss': 1.8104, 'grad_norm': 3.987600564956665, 'learning_rate': 9.79586530663494e-05, 'epoch': 1.82}\n","{'loss': 1.7931, 'grad_norm': 3.517052173614502, 'learning_rate': 9.78695766433709e-05, 'epoch': 1.84}\n","{'loss': 1.984, 'grad_norm': 3.507730722427368, 'learning_rate': 9.777864028930705e-05, 'epoch': 1.86}\n","{'loss': 1.8427, 'grad_norm': 4.782810211181641, 'learning_rate': 9.768584753741134e-05, 'epoch': 1.87}\n","{'loss': 1.8765, 'grad_norm': 4.302423000335693, 'learning_rate': 9.759120199306613e-05, 'epoch': 1.89}\n","{'loss': 2.0702, 'grad_norm': 4.296674728393555, 'learning_rate': 9.74947073336423e-05, 'epoch': 1.91}\n","{'loss': 2.0158, 'grad_norm': 4.246646881103516, 'learning_rate': 9.73963673083566e-05, 'epoch': 1.93}\n","{'loss': 1.9104, 'grad_norm': 3.5928955078125, 'learning_rate': 9.72961857381258e-05, 'epoch': 1.95}\n","{'loss': 1.7656, 'grad_norm': 3.674893379211426, 'learning_rate': 9.719416651541839e-05, 'epoch': 1.96}\n","{'loss': 1.8906, 'grad_norm': 3.089376211166382, 'learning_rate': 9.709031360410318e-05, 'epoch': 1.98}\n","{'loss': 1.9109, 'grad_norm': 4.134565830230713, 'learning_rate': 9.698463103929542e-05, 'epoch': 2.0}\n"," 20%|โโโโโโโโ | 1120/5600 [23:38<1:33:12, 1.25s/it][INFO|trainer.py:3788] 2024-06-29 22:22:38,708 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 22:22:38,709 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 22:22:38,709 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 36.96it/s]\u001b[A\n"," 17%|โโโโโโโโ | 8/46 [00:00<00:01, 29.81it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 27.33it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 27.15it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 27.40it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:00<00:00, 27.60it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:00<00:00, 26.59it/s]\u001b[A\n"," 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:00<00:00, 25.78it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 25.93it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 26.01it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 26.28it/s]\u001b[A\n"," 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:01<00:00, 25.90it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:01<00:00, 25.86it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.014204502105713, 'eval_runtime': 1.7748, 'eval_samples_per_second': 25.919, 'eval_steps_per_second': 25.919, 'epoch': 2.0}\n"," 20%|โโโโโโโโ | 1120/5600 [23:40<1:33:12, 1.25s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 25.25it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 22:22:40,485 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-1120\n","[INFO|configuration_utils.py:733] 2024-06-29 22:22:41,066 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 22:22:41,066 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 22:22:41,093 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-1120/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 22:22:41,093 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-1120/special_tokens_map.json\n","{'loss': 1.6353, 'grad_norm': 4.750072479248047, 'learning_rate': 9.687712292719997e-05, 'epoch': 2.02}\n","{'loss': 1.4907, 'grad_norm': 3.403005838394165, 'learning_rate': 9.67677934449517e-05, 'epoch': 2.03}\n","{'loss': 1.5814, 'grad_norm': 4.471570014953613, 'learning_rate': 9.665664684045333e-05, 'epoch': 2.05}\n","{'loss': 1.5284, 'grad_norm': 5.069768905639648, 'learning_rate': 9.654368743221022e-05, 'epoch': 2.07}\n","{'loss': 1.424, 'grad_norm': 3.740079641342163, 'learning_rate': 9.642891960916268e-05, 'epoch': 2.09}\n","{'loss': 1.5123, 'grad_norm': 3.6874232292175293, 'learning_rate': 9.631234783051544e-05, 'epoch': 2.11}\n","{'loss': 1.7386, 'grad_norm': 4.291646480560303, 'learning_rate': 9.619397662556435e-05, 'epoch': 2.12}\n","{'loss': 1.5025, 'grad_norm': 4.665364742279053, 'learning_rate': 9.607381059352038e-05, 'epoch': 2.14}\n","{'loss': 1.5494, 'grad_norm': 4.6409173011779785, 'learning_rate': 9.595185440333103e-05, 'epoch': 2.16}\n","{'loss': 1.596, 'grad_norm': 5.967792987823486, 'learning_rate': 9.582811279349882e-05, 'epoch': 2.18}\n","{'loss': 1.5333, 'grad_norm': 3.9247050285339355, 'learning_rate': 9.570259057189717e-05, 'epoch': 2.2}\n","{'loss': 1.5773, 'grad_norm': 5.318151950836182, 'learning_rate': 9.557529261558367e-05, 'epoch': 2.21}\n","{'loss': 1.6159, 'grad_norm': 5.290398120880127, 'learning_rate': 9.544622387061055e-05, 'epoch': 2.23}\n","{'loss': 1.6438, 'grad_norm': 4.89390230178833, 'learning_rate': 9.53153893518325e-05, 'epoch': 2.25}\n","{'loss': 1.486, 'grad_norm': 4.651273250579834, 'learning_rate': 9.518279414271183e-05, 'epoch': 2.27}\n","{'loss': 1.531, 'grad_norm': 5.672192573547363, 'learning_rate': 9.504844339512095e-05, 'epoch': 2.28}\n","{'loss': 1.5734, 'grad_norm': 3.6605958938598633, 'learning_rate': 9.491234232914221e-05, 'epoch': 2.3}\n","{'loss': 1.6449, 'grad_norm': 4.812197685241699, 'learning_rate': 9.477449623286505e-05, 'epoch': 2.32}\n","{'loss': 1.659, 'grad_norm': 4.542179584503174, 'learning_rate': 9.463491046218058e-05, 'epoch': 2.34}\n","{'loss': 1.6723, 'grad_norm': 4.588232517242432, 'learning_rate': 9.449359044057345e-05, 'epoch': 2.36}\n","{'loss': 1.619, 'grad_norm': 6.938955783843994, 'learning_rate': 9.435054165891109e-05, 'epoch': 2.37}\n","{'loss': 1.7966, 'grad_norm': 4.723308563232422, 'learning_rate': 9.420576967523049e-05, 'epoch': 2.39}\n","{'loss': 1.6444, 'grad_norm': 4.64656925201416, 'learning_rate': 9.405928011452211e-05, 'epoch': 2.41}\n","{'loss': 1.5322, 'grad_norm': 5.396662712097168, 'learning_rate': 9.391107866851143e-05, 'epoch': 2.43}\n","{'loss': 1.6074, 'grad_norm': 4.109992027282715, 'learning_rate': 9.376117109543769e-05, 'epoch': 2.45}\n","{'loss': 1.5129, 'grad_norm': 4.073942184448242, 'learning_rate': 9.360956321983028e-05, 'epoch': 2.46}\n","{'loss': 1.5839, 'grad_norm': 5.0658698081970215, 'learning_rate': 9.345626093228233e-05, 'epoch': 2.48}\n","{'loss': 1.726, 'grad_norm': 4.494250297546387, 'learning_rate': 9.330127018922194e-05, 'epoch': 2.5}\n","{'loss': 1.6129, 'grad_norm': 5.197183609008789, 'learning_rate': 9.314459701268065e-05, 'epoch': 2.52}\n","{'loss': 1.5691, 'grad_norm': 4.414649486541748, 'learning_rate': 9.298624749005951e-05, 'epoch': 2.53}\n","{'loss': 1.6516, 'grad_norm': 6.023291110992432, 'learning_rate': 9.282622777389258e-05, 'epoch': 2.55}\n","{'loss': 1.4793, 'grad_norm': 5.750635147094727, 'learning_rate': 9.266454408160779e-05, 'epoch': 2.57}\n","{'loss': 1.761, 'grad_norm': 6.335220813751221, 'learning_rate': 9.250120269528546e-05, 'epoch': 2.59}\n","{'loss': 1.5627, 'grad_norm': 6.77303409576416, 'learning_rate': 9.233620996141421e-05, 'epoch': 2.61}\n","{'loss': 1.656, 'grad_norm': 3.9022696018218994, 'learning_rate': 9.21695722906443e-05, 'epoch': 2.62}\n","{'loss': 1.5537, 'grad_norm': 3.297802209854126, 'learning_rate': 9.200129615753859e-05, 'epoch': 2.64}\n","{'loss': 1.5451, 'grad_norm': 4.561464309692383, 'learning_rate': 9.183138810032099e-05, 'epoch': 2.66}\n","{'loss': 1.7119, 'grad_norm': 5.242650508880615, 'learning_rate': 9.165985472062246e-05, 'epoch': 2.68}\n","{'loss': 1.499, 'grad_norm': 5.535559177398682, 'learning_rate': 9.148670268322438e-05, 'epoch': 2.7}\n","{'loss': 1.4735, 'grad_norm': 5.1633100509643555, 'learning_rate': 9.131193871579975e-05, 'epoch': 2.71}\n","{'loss': 1.7502, 'grad_norm': 5.2197184562683105, 'learning_rate': 9.113556960865167e-05, 'epoch': 2.73}\n","{'loss': 1.6312, 'grad_norm': 4.655239105224609, 'learning_rate': 9.09576022144496e-05, 'epoch': 2.75}\n","{'loss': 1.6455, 'grad_norm': 4.8979644775390625, 'learning_rate': 9.077804344796302e-05, 'epoch': 2.77}\n","{'loss': 1.6535, 'grad_norm': 4.097564220428467, 'learning_rate': 9.059690028579284e-05, 'epoch': 2.78}\n","{'loss': 1.5067, 'grad_norm': 3.5800154209136963, 'learning_rate': 9.041417976610027e-05, 'epoch': 2.8}\n","{'loss': 1.5709, 'grad_norm': 4.4109787940979, 'learning_rate': 9.022988898833342e-05, 'epoch': 2.82}\n","{'loss': 1.5517, 'grad_norm': 4.352450370788574, 'learning_rate': 9.004403511295141e-05, 'epoch': 2.84}\n","{'loss': 1.5259, 'grad_norm': 6.1658525466918945, 'learning_rate': 8.985662536114613e-05, 'epoch': 2.86}\n","{'loss': 1.7768, 'grad_norm': 4.468559265136719, 'learning_rate': 8.966766701456177e-05, 'epoch': 2.87}\n","{'loss': 1.5683, 'grad_norm': 4.119050979614258, 'learning_rate': 8.947716741501177e-05, 'epoch': 2.89}\n","{'loss': 1.5665, 'grad_norm': 5.216476917266846, 'learning_rate': 8.928513396419368e-05, 'epoch': 2.91}\n","{'loss': 1.524, 'grad_norm': 4.42177152633667, 'learning_rate': 8.90915741234015e-05, 'epoch': 2.93}\n","{'loss': 1.7169, 'grad_norm': 4.006609916687012, 'learning_rate': 8.889649541323574e-05, 'epoch': 2.95}\n","{'loss': 1.7248, 'grad_norm': 4.3928914070129395, 'learning_rate': 8.869990541331138e-05, 'epoch': 2.96}\n","{'loss': 1.5582, 'grad_norm': 5.347744464874268, 'learning_rate': 8.850181176196315e-05, 'epoch': 2.98}\n","{'loss': 1.6926, 'grad_norm': 4.4702019691467285, 'learning_rate': 8.83022221559489e-05, 'epoch': 3.0}\n"," 30%|โโโโโโโโโโโ | 1680/5600 [35:12<1:20:28, 1.23s/it][INFO|trainer.py:3788] 2024-06-29 22:34:12,572 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 22:34:12,573 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 22:34:12,573 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 31.72it/s]\u001b[A\n"," 17%|โโโโโโโโ | 8/46 [00:00<00:01, 28.02it/s]\u001b[A\n"," 24%|โโโโโโโโโโโ | 11/46 [00:00<00:01, 27.68it/s]\u001b[A\n"," 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 24.83it/s]\u001b[A\n"," 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 25.62it/s]\u001b[A\n"," 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:00, 26.00it/s]\u001b[A\n"," 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:00<00:00, 24.73it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:00, 25.05it/s]\u001b[A\n"," 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 24.42it/s]\u001b[A\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 25.46it/s]\u001b[A\n"," 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 25.59it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 25.79it/s]\u001b[A\n"," 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:01<00:00, 25.49it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.074129343032837, 'eval_runtime': 1.8246, 'eval_samples_per_second': 25.211, 'eval_steps_per_second': 25.211, 'epoch': 3.0}\n"," 30%|โโโโโโโโโโโ | 1680/5600 [35:14<1:20:28, 1.23s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 25.23it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 22:34:14,397 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-1680\n","[INFO|configuration_utils.py:733] 2024-06-29 22:34:15,015 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 22:34:15,015 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 22:34:15,046 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-1680/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 22:34:15,047 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-1680/special_tokens_map.json\n","{'loss': 1.4704, 'grad_norm': 4.0504021644592285, 'learning_rate': 8.810114435015054e-05, 'epoch': 3.02}\n","{'loss': 1.1325, 'grad_norm': 4.55043363571167, 'learning_rate': 8.789858615727265e-05, 'epoch': 3.03}\n","{'loss': 1.3183, 'grad_norm': 5.89686393737793, 'learning_rate': 8.7694555447539e-05, 'epoch': 3.05}\n","{'loss': 1.2266, 'grad_norm': 6.354063510894775, 'learning_rate': 8.748906014838672e-05, 'epoch': 3.07}\n","{'loss': 1.2011, 'grad_norm': 5.328189849853516, 'learning_rate': 8.728210824415827e-05, 'epoch': 3.09}\n","{'loss': 1.3191, 'grad_norm': 5.733210563659668, 'learning_rate': 8.707370777579133e-05, 'epoch': 3.11}\n","{'loss': 1.242, 'grad_norm': 4.455051422119141, 'learning_rate': 8.68638668405062e-05, 'epoch': 3.12}\n","{'loss': 1.3565, 'grad_norm': 5.194347381591797, 'learning_rate': 8.665259359149132e-05, 'epoch': 3.14}\n","{'loss': 1.2447, 'grad_norm': 5.317370414733887, 'learning_rate': 8.643989623758643e-05, 'epoch': 3.16}\n","{'loss': 1.3055, 'grad_norm': 6.149751663208008, 'learning_rate': 8.622578304296364e-05, 'epoch': 3.18}\n","{'loss': 1.2915, 'grad_norm': 5.196756839752197, 'learning_rate': 8.601026232680634e-05, 'epoch': 3.2}\n","{'loss': 1.2714, 'grad_norm': 6.269302845001221, 'learning_rate': 8.579334246298593e-05, 'epoch': 3.21}\n","{'loss': 1.4645, 'grad_norm': 4.260131359100342, 'learning_rate': 8.557503187973651e-05, 'epoch': 3.23}\n","{'loss': 1.266, 'grad_norm': 4.698756217956543, 'learning_rate': 8.535533905932738e-05, 'epoch': 3.25}\n","{'loss': 1.242, 'grad_norm': 5.149835109710693, 'learning_rate': 8.513427253773346e-05, 'epoch': 3.27}\n","{'loss': 1.2025, 'grad_norm': 5.154805660247803, 'learning_rate': 8.491184090430364e-05, 'epoch': 3.28}\n","{'loss': 1.3668, 'grad_norm': 6.301427364349365, 'learning_rate': 8.468805280142709e-05, 'epoch': 3.3}\n","{'loss': 1.3626, 'grad_norm': 4.72573709487915, 'learning_rate': 8.446291692419736e-05, 'epoch': 3.32}\n","{'loss': 1.3326, 'grad_norm': 4.458547592163086, 'learning_rate': 8.423644202007467e-05, 'epoch': 3.34}\n","{'loss': 1.324, 'grad_norm': 4.596677303314209, 'learning_rate': 8.400863688854597e-05, 'epoch': 3.36}\n","{'loss': 1.3217, 'grad_norm': 5.230796813964844, 'learning_rate': 8.377951038078302e-05, 'epoch': 3.37}\n","{'loss': 1.2399, 'grad_norm': 4.330605983734131, 'learning_rate': 8.354907139929851e-05, 'epoch': 3.39}\n","{'loss': 1.2958, 'grad_norm': 5.9636945724487305, 'learning_rate': 8.33173288976002e-05, 'epoch': 3.41}\n","{'loss': 1.3935, 'grad_norm': 4.622984409332275, 'learning_rate': 8.308429187984297e-05, 'epoch': 3.43}\n","{'loss': 1.3421, 'grad_norm': 4.806463718414307, 'learning_rate': 8.284996940047903e-05, 'epoch': 3.44}\n","{'loss': 1.3621, 'grad_norm': 4.162802219390869, 'learning_rate': 8.261437056390606e-05, 'epoch': 3.46}\n","{'loss': 1.2849, 'grad_norm': 5.431687355041504, 'learning_rate': 8.237750452411353e-05, 'epoch': 3.48}\n","{'loss': 1.2827, 'grad_norm': 6.106764316558838, 'learning_rate': 8.213938048432697e-05, 'epoch': 3.5}\n","{'loss': 1.2381, 'grad_norm': 5.98523473739624, 'learning_rate': 8.190000769665044e-05, 'epoch': 3.52}\n","{'loss': 1.3037, 'grad_norm': 4.923933029174805, 'learning_rate': 8.1659395461707e-05, 'epoch': 3.53}\n","{'loss': 1.2708, 'grad_norm': 6.869691371917725, 'learning_rate': 8.141755312827736e-05, 'epoch': 3.55}\n","{'loss': 1.414, 'grad_norm': 4.601339340209961, 'learning_rate': 8.117449009293668e-05, 'epoch': 3.57}\n","{'loss': 1.1949, 'grad_norm': 4.767725944519043, 'learning_rate': 8.093021579968941e-05, 'epoch': 3.59}\n","{'loss': 1.2801, 'grad_norm': 4.9436211585998535, 'learning_rate': 8.068473973960238e-05, 'epoch': 3.61}\n","{'loss': 1.3493, 'grad_norm': 5.783080577850342, 'learning_rate': 8.043807145043604e-05, 'epoch': 3.62}\n","{'loss': 1.3132, 'grad_norm': 4.968575477600098, 'learning_rate': 8.019022051627388e-05, 'epoch': 3.64}\n","{'loss': 1.2486, 'grad_norm': 5.723098278045654, 'learning_rate': 7.994119656715002e-05, 'epoch': 3.66}\n","{'loss': 1.4033, 'grad_norm': 7.168787956237793, 'learning_rate': 7.969100927867507e-05, 'epoch': 3.68}\n","{'loss': 1.3969, 'grad_norm': 5.891693592071533, 'learning_rate': 7.943966837166023e-05, 'epoch': 3.69}\n","{'loss': 1.4086, 'grad_norm': 4.852097511291504, 'learning_rate': 7.91871836117395e-05, 'epoch': 3.71}\n","{'loss': 1.208, 'grad_norm': 5.643867015838623, 'learning_rate': 7.89335648089903e-05, 'epoch': 3.73}\n","{'loss': 1.3448, 'grad_norm': 5.375209808349609, 'learning_rate': 7.86788218175523e-05, 'epoch': 3.75}\n","{'loss': 1.316, 'grad_norm': 5.470929145812988, 'learning_rate': 7.842296453524463e-05, 'epoch': 3.77}\n","{'loss': 1.2369, 'grad_norm': 4.993719577789307, 'learning_rate': 7.81660029031811e-05, 'epoch': 3.78}\n","{'loss': 1.3265, 'grad_norm': 5.081270217895508, 'learning_rate': 7.79079469053842e-05, 'epoch': 3.8}\n","{'loss': 1.3366, 'grad_norm': 5.608216285705566, 'learning_rate': 7.764880656839696e-05, 'epoch': 3.82}\n","{'loss': 1.2876, 'grad_norm': 5.217581272125244, 'learning_rate': 7.738859196089358e-05, 'epoch': 3.84}\n","{'loss': 1.1134, 'grad_norm': 5.468497276306152, 'learning_rate': 7.712731319328798e-05, 'epoch': 3.86}\n","{'loss': 1.2106, 'grad_norm': 5.239170074462891, 'learning_rate': 7.68649804173412e-05, 'epoch': 3.87}\n","{'loss': 1.3169, 'grad_norm': 4.908669471740723, 'learning_rate': 7.660160382576683e-05, 'epoch': 3.89}\n","{'loss': 1.3074, 'grad_norm': 6.217924118041992, 'learning_rate': 7.633719365183504e-05, 'epoch': 3.91}\n","{'loss': 1.2273, 'grad_norm': 5.6632513999938965, 'learning_rate': 7.60717601689749e-05, 'epoch': 3.93}\n","{'loss': 1.0764, 'grad_norm': 4.552252769470215, 'learning_rate': 7.580531369037533e-05, 'epoch': 3.94}\n","{'loss': 1.5051, 'grad_norm': 5.1463823318481445, 'learning_rate': 7.553786456858429e-05, 'epoch': 3.96}\n","{'loss': 1.3593, 'grad_norm': 4.828197956085205, 'learning_rate': 7.526942319510655e-05, 'epoch': 3.98}\n","{'loss': 1.1506, 'grad_norm': 3.7453665733337402, 'learning_rate': 7.500000000000001e-05, 'epoch': 4.0}\n"," 40%|โโโโโโโโโโโโโโโ | 2240/5600 [46:40<1:08:55, 1.23s/it][INFO|trainer.py:3788] 2024-06-29 22:45:40,494 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 22:45:40,495 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 22:45:40,495 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 32.23it/s]\u001b[A\n"," 17%|โโโโโโโโ | 8/46 [00:00<00:01, 26.02it/s]\u001b[A\n"," 24%|โโโโโโโโโโโ | 11/46 [00:00<00:01, 25.56it/s]\u001b[A\n"," 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 26.22it/s]\u001b[A\n"," 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 24.55it/s]\u001b[A\n"," 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:01, 24.60it/s]\u001b[A\n"," 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:00<00:00, 25.74it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:00, 25.16it/s]\u001b[A\n"," 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 23.90it/s]\u001b[A\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 24.77it/s]\u001b[A\n"," 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 24.88it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 25.18it/s]\u001b[A\n"," 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:01<00:00, 24.56it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.1811766624450684, 'eval_runtime': 1.8622, 'eval_samples_per_second': 24.702, 'eval_steps_per_second': 24.702, 'epoch': 4.0}\n"," 40%|โโโโโโโโโโโ๏ฟฝ๏ฟฝโโโ | 2240/5600 [46:42<1:08:55, 1.23s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 25.50it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 22:45:42,357 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-2240\n","[INFO|configuration_utils.py:733] 2024-06-29 22:45:42,919 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 22:45:42,919 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 22:45:42,944 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-2240/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 22:45:42,944 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-2240/special_tokens_map.json\n","{'loss': 1.0956, 'grad_norm': 3.8211286067962646, 'learning_rate': 7.472960545147038e-05, 'epoch': 4.02}\n","{'loss': 0.9704, 'grad_norm': 4.844677448272705, 'learning_rate': 7.445825005546448e-05, 'epoch': 4.03}\n","{'loss': 1.0473, 'grad_norm': 5.256546497344971, 'learning_rate': 7.4185944355262e-05, 'epoch': 4.05}\n","{'loss': 0.9475, 'grad_norm': 5.321354389190674, 'learning_rate': 7.391269893106592e-05, 'epoch': 4.07}\n","{'loss': 0.963, 'grad_norm': 5.462662220001221, 'learning_rate': 7.363852439959135e-05, 'epoch': 4.09}\n","{'loss': 1.1535, 'grad_norm': 4.1576056480407715, 'learning_rate': 7.33634314136531e-05, 'epoch': 4.11}\n","{'loss': 0.868, 'grad_norm': 5.811347484588623, 'learning_rate': 7.308743066175172e-05, 'epoch': 4.12}\n","{'loss': 0.9822, 'grad_norm': 4.920297145843506, 'learning_rate': 7.281053286765815e-05, 'epoch': 4.14}\n","{'loss': 1.0763, 'grad_norm': 4.794600963592529, 'learning_rate': 7.253274878999727e-05, 'epoch': 4.16}\n","{'loss': 0.9458, 'grad_norm': 5.9534010887146, 'learning_rate': 7.225408922182961e-05, 'epoch': 4.18}\n","{'loss': 0.9607, 'grad_norm': 5.4286298751831055, 'learning_rate': 7.197456499023225e-05, 'epoch': 4.19}\n","{'loss': 1.0148, 'grad_norm': 5.160229206085205, 'learning_rate': 7.169418695587791e-05, 'epoch': 4.21}\n","{'loss': 0.9381, 'grad_norm': 5.876075267791748, 'learning_rate': 7.141296601261314e-05, 'epoch': 4.23}\n","{'loss': 1.0924, 'grad_norm': 4.944575786590576, 'learning_rate': 7.113091308703498e-05, 'epoch': 4.25}\n","{'loss': 0.9265, 'grad_norm': 5.3727545738220215, 'learning_rate': 7.084803913806641e-05, 'epoch': 4.27}\n","{'loss': 0.9944, 'grad_norm': 7.851329803466797, 'learning_rate': 7.056435515653059e-05, 'epoch': 4.28}\n","{'loss': 1.0168, 'grad_norm': 4.633089542388916, 'learning_rate': 7.027987216472377e-05, 'epoch': 4.3}\n","{'loss': 0.8804, 'grad_norm': 5.784977912902832, 'learning_rate': 6.999460121598704e-05, 'epoch': 4.32}\n","{'loss': 1.0768, 'grad_norm': 8.059889793395996, 'learning_rate': 6.970855339427698e-05, 'epoch': 4.34}\n","{'loss': 0.9443, 'grad_norm': 6.03590726852417, 'learning_rate': 6.942173981373474e-05, 'epoch': 4.36}\n","{'loss': 0.9489, 'grad_norm': 12.74445915222168, 'learning_rate': 6.91341716182545e-05, 'epoch': 4.37}\n","{'loss': 1.049, 'grad_norm': 5.12004280090332, 'learning_rate': 6.884585998105026e-05, 'epoch': 4.39}\n","{'loss': 1.0488, 'grad_norm': 10.477398872375488, 'learning_rate': 6.855681610422189e-05, 'epoch': 4.41}\n","{'loss': 1.1279, 'grad_norm': 5.085097789764404, 'learning_rate': 6.826705121831976e-05, 'epoch': 4.43}\n","{'loss': 1.088, 'grad_norm': 5.412323474884033, 'learning_rate': 6.797657658190839e-05, 'epoch': 4.44}\n","{'loss': 0.9124, 'grad_norm': 11.975234985351562, 'learning_rate': 6.768540348112907e-05, 'epoch': 4.46}\n","{'loss': 1.1052, 'grad_norm': 4.589102745056152, 'learning_rate': 6.739354322926136e-05, 'epoch': 4.48}\n","{'loss': 1.0427, 'grad_norm': 5.322690963745117, 'learning_rate': 6.710100716628344e-05, 'epoch': 4.5}\n","{'loss': 1.0325, 'grad_norm': 5.0710577964782715, 'learning_rate': 6.680780665843155e-05, 'epoch': 4.52}\n","{'loss': 1.059, 'grad_norm': 5.193735122680664, 'learning_rate': 6.651395309775837e-05, 'epoch': 4.53}\n","{'loss': 1.0252, 'grad_norm': 6.584225177764893, 'learning_rate': 6.621945790169036e-05, 'epoch': 4.55}\n","{'loss': 0.9911, 'grad_norm': 6.5015435218811035, 'learning_rate': 6.592433251258423e-05, 'epoch': 4.57}\n","{'loss': 1.0211, 'grad_norm': 5.898025035858154, 'learning_rate': 6.562858839728223e-05, 'epoch': 4.59}\n","{'loss': 0.9131, 'grad_norm': 5.380829811096191, 'learning_rate': 6.533223704666672e-05, 'epoch': 4.61}\n","{'loss': 1.0094, 'grad_norm': 5.253726959228516, 'learning_rate': 6.503528997521366e-05, 'epoch': 4.62}\n","{'loss': 1.0349, 'grad_norm': 4.567104339599609, 'learning_rate': 6.473775872054521e-05, 'epoch': 4.64}\n","{'loss': 1.0492, 'grad_norm': 5.842156410217285, 'learning_rate': 6.44396548429815e-05, 'epoch': 4.66}\n","{'loss': 1.0739, 'grad_norm': 5.842441082000732, 'learning_rate': 6.414098992509138e-05, 'epoch': 4.68}\n","{'loss': 0.9745, 'grad_norm': 5.929434299468994, 'learning_rate': 6.384177557124247e-05, 'epoch': 4.69}\n","{'loss': 0.9974, 'grad_norm': 6.804376125335693, 'learning_rate': 6.354202340715026e-05, 'epoch': 4.71}\n","{'loss': 0.9628, 'grad_norm': 4.4478020668029785, 'learning_rate': 6.324174507942637e-05, 'epoch': 4.73}\n","{'loss': 0.9861, 'grad_norm': 4.888654708862305, 'learning_rate': 6.294095225512603e-05, 'epoch': 4.75}\n","{'loss': 1.0063, 'grad_norm': 5.920362949371338, 'learning_rate': 6.263965662129487e-05, 'epoch': 4.77}\n","{'loss': 1.0548, 'grad_norm': 5.524910926818848, 'learning_rate': 6.233786988451468e-05, 'epoch': 4.78}\n","{'loss': 1.0929, 'grad_norm': 4.255885601043701, 'learning_rate': 6.203560377044866e-05, 'epoch': 4.8}\n","{'loss': 0.9848, 'grad_norm': 6.053053379058838, 'learning_rate': 6.173287002338577e-05, 'epoch': 4.82}\n","{'loss': 0.9432, 'grad_norm': 5.0641655921936035, 'learning_rate': 6.142968040578449e-05, 'epoch': 4.84}\n","{'loss': 1.0374, 'grad_norm': 7.123205661773682, 'learning_rate': 6.112604669781572e-05, 'epoch': 4.85}\n","{'loss': 1.0961, 'grad_norm': 5.436131954193115, 'learning_rate': 6.0821980696905146e-05, 'epoch': 4.87}\n","{'loss': 1.0741, 'grad_norm': 5.909348964691162, 'learning_rate': 6.0517494217274794e-05, 'epoch': 4.89}\n","{'loss': 1.1491, 'grad_norm': 5.223842144012451, 'learning_rate': 6.021259908948402e-05, 'epoch': 4.91}\n","{'loss': 1.0061, 'grad_norm': 5.396011829376221, 'learning_rate': 5.9907307159969884e-05, 'epoch': 4.93}\n","{'loss': 1.1121, 'grad_norm': 5.92130184173584, 'learning_rate': 5.960163029058682e-05, 'epoch': 4.94}\n","{'loss': 1.1207, 'grad_norm': 8.12635326385498, 'learning_rate': 5.9295580358145744e-05, 'epoch': 4.96}\n","{'loss': 1.141, 'grad_norm': 6.187139511108398, 'learning_rate': 5.898916925395264e-05, 'epoch': 4.98}\n","{'loss': 1.0886, 'grad_norm': 5.036999702453613, 'learning_rate': 5.868240888334653e-05, 'epoch': 5.0}\n"," 50%|โโโโโโโโโโโโโโโโโโโ | 2800/5600 [59:01<1:00:22, 1.29s/it][INFO|trainer.py:3788] 2024-06-29 22:58:01,067 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 22:58:01,067 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 22:58:01,067 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 34.43it/s]\u001b[A\n"," 17%|โโโโโโโโ | 8/46 [00:00<00:01, 29.26it/s]\u001b[A\n"," 24%|โโโโโโโโโโโ | 11/46 [00:00<00:01, 26.19it/s]\u001b[A\n"," 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 24.64it/s]\u001b[A\n"," 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 24.02it/s]\u001b[A\n"," 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:01, 24.72it/s]\u001b[A\n"," 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:00<00:00, 24.49it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:00, 24.79it/s]\u001b[A\n"," 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 25.03it/s]\u001b[A\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 23.74it/s]\u001b[A\n"," 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 24.89it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 25.06it/s]\u001b[A\n"," 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:01<00:00, 25.82it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.340395927429199, 'eval_runtime': 1.8444, 'eval_samples_per_second': 24.94, 'eval_steps_per_second': 24.94, 'epoch': 5.0}\n"," 50%|โโโโโโโโโโโโโโโโโโโ | 2800/5600 [59:02<1:00:22, 1.29s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 26.16it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 22:58:02,913 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-2800\n","[INFO|configuration_utils.py:733] 2024-06-29 22:58:03,475 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 22:58:03,475 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 22:58:03,501 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-2800/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 22:58:03,501 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-2800/special_tokens_map.json\n","{'loss': 0.7949, 'grad_norm': 5.331892490386963, 'learning_rate': 5.837531116523682e-05, 'epoch': 5.02}\n","{'loss': 0.7411, 'grad_norm': 6.530398368835449, 'learning_rate': 5.806788803164034e-05, 'epoch': 5.03}\n","{'loss': 0.7875, 'grad_norm': 5.302547454833984, 'learning_rate': 5.7760151427217576e-05, 'epoch': 5.05}\n","{'loss': 0.7917, 'grad_norm': 5.69765567779541, 'learning_rate': 5.745211330880872e-05, 'epoch': 5.07}\n","{'loss': 0.8076, 'grad_norm': 4.74371862411499, 'learning_rate': 5.714378564496901e-05, 'epoch': 5.09}\n","{'loss': 0.9093, 'grad_norm': 5.205414772033691, 'learning_rate': 5.683518041550368e-05, 'epoch': 5.1}\n","{'loss': 0.6875, 'grad_norm': 5.478267192840576, 'learning_rate': 5.6526309611002594e-05, 'epoch': 5.12}\n","{'loss': 0.8528, 'grad_norm': 6.004168510437012, 'learning_rate': 5.621718523237427e-05, 'epoch': 5.14}\n","{'loss': 0.7398, 'grad_norm': 6.278602600097656, 'learning_rate': 5.590781929037965e-05, 'epoch': 5.16}\n","{'loss': 0.6957, 'grad_norm': 4.800085544586182, 'learning_rate': 5.559822380516539e-05, 'epoch': 5.18}\n","{'loss': 0.8193, 'grad_norm': 5.28222131729126, 'learning_rate': 5.5288410805796895e-05, 'epoch': 5.19}\n","{'loss': 0.825, 'grad_norm': 5.969717502593994, 'learning_rate': 5.497839232979084e-05, 'epoch': 5.21}\n","{'loss': 0.8286, 'grad_norm': 7.2066216468811035, 'learning_rate': 5.466818042264753e-05, 'epoch': 5.23}\n","{'loss': 0.8905, 'grad_norm': 5.272522449493408, 'learning_rate': 5.435778713738292e-05, 'epoch': 5.25}\n","{'loss': 0.7673, 'grad_norm': 4.872743606567383, 'learning_rate': 5.404722453406017e-05, 'epoch': 5.27}\n","{'loss': 0.7346, 'grad_norm': 7.293342113494873, 'learning_rate': 5.373650467932122e-05, 'epoch': 5.28}\n","{'loss': 0.737, 'grad_norm': 5.229869365692139, 'learning_rate': 5.3425639645917834e-05, 'epoch': 5.3}\n","{'loss': 0.7656, 'grad_norm': 4.550146579742432, 'learning_rate': 5.311464151224261e-05, 'epoch': 5.32}\n","{'loss': 0.7256, 'grad_norm': 4.5456223487854, 'learning_rate': 5.2803522361859594e-05, 'epoch': 5.34}\n","{'loss': 0.7886, 'grad_norm': 5.301971912384033, 'learning_rate': 5.249229428303486e-05, 'epoch': 5.35}\n","{'loss': 0.7801, 'grad_norm': 7.148138523101807, 'learning_rate': 5.218096936826681e-05, 'epoch': 5.37}\n","{'loss': 0.8076, 'grad_norm': 6.13567590713501, 'learning_rate': 5.18695597138163e-05, 'epoch': 5.39}\n","{'loss': 0.6323, 'grad_norm': 5.02504825592041, 'learning_rate': 5.155807741923666e-05, 'epoch': 5.41}\n","{'loss': 0.707, 'grad_norm': 6.629558563232422, 'learning_rate': 5.124653458690365e-05, 'epoch': 5.43}\n","{'loss': 0.7033, 'grad_norm': 6.333116054534912, 'learning_rate': 5.0934943321545115e-05, 'epoch': 5.44}\n","{'loss': 0.797, 'grad_norm': 10.160740852355957, 'learning_rate': 5.062331572977076e-05, 'epoch': 5.46}\n","{'loss': 0.9649, 'grad_norm': 5.644074440002441, 'learning_rate': 5.031166391960168e-05, 'epoch': 5.48}\n","{'loss': 0.7452, 'grad_norm': 5.57203483581543, 'learning_rate': 5e-05, 'epoch': 5.5}\n","{'loss': 0.7909, 'grad_norm': 6.231649875640869, 'learning_rate': 4.968833608039832e-05, 'epoch': 5.52}\n","{'loss': 0.8125, 'grad_norm': 6.369471073150635, 'learning_rate': 4.9376684270229254e-05, 'epoch': 5.53}\n","{'loss': 0.7572, 'grad_norm': 14.980217933654785, 'learning_rate': 4.9065056678454904e-05, 'epoch': 5.55}\n","{'loss': 0.9001, 'grad_norm': 6.943779468536377, 'learning_rate': 4.875346541309637e-05, 'epoch': 5.57}\n","{'loss': 0.8092, 'grad_norm': 6.565555572509766, 'learning_rate': 4.844192258076336e-05, 'epoch': 5.59}\n","{'loss': 0.9073, 'grad_norm': 8.527596473693848, 'learning_rate': 4.813044028618373e-05, 'epoch': 5.6}\n","{'loss': 0.8484, 'grad_norm': 5.995067119598389, 'learning_rate': 4.781903063173321e-05, 'epoch': 5.62}\n","{'loss': 0.7387, 'grad_norm': 16.719541549682617, 'learning_rate': 4.750770571696514e-05, 'epoch': 5.64}\n","{'loss': 0.8061, 'grad_norm': 5.842343807220459, 'learning_rate': 4.7196477638140404e-05, 'epoch': 5.66}\n","{'loss': 0.7958, 'grad_norm': 7.201180458068848, 'learning_rate': 4.68853584877574e-05, 'epoch': 5.68}\n","{'loss': 0.8338, 'grad_norm': 6.153838634490967, 'learning_rate': 4.657436035408217e-05, 'epoch': 5.69}\n","{'loss': 0.7411, 'grad_norm': 5.899301528930664, 'learning_rate': 4.626349532067879e-05, 'epoch': 5.71}\n","{'loss': 0.8199, 'grad_norm': 5.865950107574463, 'learning_rate': 4.595277546593984e-05, 'epoch': 5.73}\n","{'loss': 0.7367, 'grad_norm': 4.905264377593994, 'learning_rate': 4.564221286261709e-05, 'epoch': 5.75}\n","{'loss': 0.9049, 'grad_norm': 6.099426746368408, 'learning_rate': 4.5331819577352474e-05, 'epoch': 5.77}\n","{'loss': 0.7343, 'grad_norm': 7.31098747253418, 'learning_rate': 4.502160767020918e-05, 'epoch': 5.78}\n","{'loss': 0.7381, 'grad_norm': 5.501935958862305, 'learning_rate': 4.471158919420312e-05, 'epoch': 5.8}\n","{'loss': 0.7133, 'grad_norm': 7.434685707092285, 'learning_rate': 4.4401776194834613e-05, 'epoch': 5.82}\n","{'loss': 0.7352, 'grad_norm': 9.345376968383789, 'learning_rate': 4.409218070962036e-05, 'epoch': 5.84}\n","{'loss': 0.8973, 'grad_norm': 6.876387119293213, 'learning_rate': 4.378281476762576e-05, 'epoch': 5.85}\n","{'loss': 0.6906, 'grad_norm': 14.176045417785645, 'learning_rate': 4.347369038899744e-05, 'epoch': 5.87}\n","{'loss': 0.9098, 'grad_norm': 4.8011040687561035, 'learning_rate': 4.316481958449634e-05, 'epoch': 5.89}\n","{'loss': 0.7331, 'grad_norm': 5.1314697265625, 'learning_rate': 4.285621435503101e-05, 'epoch': 5.91}\n","{'loss': 0.8579, 'grad_norm': 7.106369495391846, 'learning_rate': 4.254788669119127e-05, 'epoch': 5.93}\n","{'loss': 0.748, 'grad_norm': 4.865246295928955, 'learning_rate': 4.223984857278242e-05, 'epoch': 5.94}\n","{'loss': 0.8392, 'grad_norm': 5.906892776489258, 'learning_rate': 4.1932111968359664e-05, 'epoch': 5.96}\n","{'loss': 0.8015, 'grad_norm': 5.705036163330078, 'learning_rate': 4.162468883476319e-05, 'epoch': 5.98}\n","{'loss': 0.8302, 'grad_norm': 5.603642463684082, 'learning_rate': 4.131759111665349e-05, 'epoch': 6.0}\n"," 60%|โโโโโโโโโโโโโโโโโโโโโโโ | 3360/5600 [1:11:19<47:06, 1.26s/it][INFO|trainer.py:3788] 2024-06-29 23:10:19,379 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 23:10:19,379 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 23:10:19,379 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 33.50it/s]\u001b[A\n"," 17%|โโโโโโโโ | 8/46 [00:00<00:01, 29.86it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 24.66it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 24.52it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 24.31it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:00<00:01, 24.94it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:00<00:00, 24.74it/s]\u001b[A\n"," 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:00, 25.63it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 25.39it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 25.47it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 25.85it/s]\u001b[A\n"," 85%|โโโโโโโโโโโโโโโโโ๏ฟฝ๏ฟฝ๏ฟฝโโโโโโโโโโโโโโโโโโโ | 39/46 [00:01<00:00, 25.45it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:01<00:00, 26.26it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.5864341259002686, 'eval_runtime': 1.8502, 'eval_samples_per_second': 24.863, 'eval_steps_per_second': 24.863, 'epoch': 6.0}\n"," 60%|โโโโโโโโโโโโโโโโโโโโโโโ | 3360/5600 [1:11:21<47:06, 1.26s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 24.74it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 23:10:21,229 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-3360\n","[INFO|configuration_utils.py:733] 2024-06-29 23:10:21,875 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 23:10:21,876 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 23:10:21,913 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-3360/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 23:10:21,914 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-3360/special_tokens_map.json\n","{'loss': 0.6066, 'grad_norm': 5.037189960479736, 'learning_rate': 4.101083074604737e-05, 'epoch': 6.02}\n","{'loss': 0.5565, 'grad_norm': 15.377517700195312, 'learning_rate': 4.0704419641854274e-05, 'epoch': 6.03}\n","{'loss': 0.5423, 'grad_norm': 5.809545516967773, 'learning_rate': 4.03983697094132e-05, 'epoch': 6.05}\n","{'loss': 0.6718, 'grad_norm': 6.553403377532959, 'learning_rate': 4.0092692840030134e-05, 'epoch': 6.07}\n","{'loss': 0.6391, 'grad_norm': 5.6180901527404785, 'learning_rate': 3.978740091051599e-05, 'epoch': 6.09}\n","{'loss': 0.6823, 'grad_norm': 5.028639793395996, 'learning_rate': 3.9482505782725224e-05, 'epoch': 6.1}\n","{'loss': 0.5271, 'grad_norm': 5.877006530761719, 'learning_rate': 3.917801930309486e-05, 'epoch': 6.12}\n","{'loss': 0.6207, 'grad_norm': 5.783533096313477, 'learning_rate': 3.887395330218429e-05, 'epoch': 6.14}\n","{'loss': 0.6231, 'grad_norm': 7.259427547454834, 'learning_rate': 3.857031959421553e-05, 'epoch': 6.16}\n","{'loss': 0.6113, 'grad_norm': 5.142063140869141, 'learning_rate': 3.8267129976614254e-05, 'epoch': 6.18}\n","{'loss': 0.6245, 'grad_norm': 6.205945014953613, 'learning_rate': 3.7964396229551364e-05, 'epoch': 6.19}\n","{'loss': 0.6885, 'grad_norm': 5.446501731872559, 'learning_rate': 3.7662130115485314e-05, 'epoch': 6.21}\n","{'loss': 0.6127, 'grad_norm': 5.06493616104126, 'learning_rate': 3.7360343378705124e-05, 'epoch': 6.23}\n","{'loss': 0.6773, 'grad_norm': 6.783751964569092, 'learning_rate': 3.705904774487396e-05, 'epoch': 6.25}\n","{'loss': 0.5281, 'grad_norm': 9.579876899719238, 'learning_rate': 3.675825492057364e-05, 'epoch': 6.27}\n","{'loss': 0.642, 'grad_norm': 5.472534656524658, 'learning_rate': 3.6457976592849754e-05, 'epoch': 6.28}\n","{'loss': 0.5956, 'grad_norm': 4.942157745361328, 'learning_rate': 3.6158224428757535e-05, 'epoch': 6.3}\n","{'loss': 0.6226, 'grad_norm': 4.671095848083496, 'learning_rate': 3.585901007490863e-05, 'epoch': 6.32}\n","{'loss': 0.6398, 'grad_norm': 7.523804664611816, 'learning_rate': 3.556034515701852e-05, 'epoch': 6.34}\n","{'loss': 0.5765, 'grad_norm': 5.049912929534912, 'learning_rate': 3.5262241279454785e-05, 'epoch': 6.35}\n","{'loss': 0.5686, 'grad_norm': 4.921656608581543, 'learning_rate': 3.4964710024786354e-05, 'epoch': 6.37}\n","{'loss': 0.5776, 'grad_norm': 4.165037155151367, 'learning_rate': 3.4667762953333295e-05, 'epoch': 6.39}\n","{'loss': 0.5287, 'grad_norm': 14.588775634765625, 'learning_rate': 3.4371411602717784e-05, 'epoch': 6.41}\n","{'loss': 0.6236, 'grad_norm': 4.320646286010742, 'learning_rate': 3.4075667487415785e-05, 'epoch': 6.43}\n","{'loss': 0.6174, 'grad_norm': 4.169257164001465, 'learning_rate': 3.3780542098309654e-05, 'epoch': 6.44}\n","{'loss': 0.6235, 'grad_norm': 3.5882270336151123, 'learning_rate': 3.3486046902241664e-05, 'epoch': 6.46}\n","{'loss': 0.6929, 'grad_norm': 5.139246940612793, 'learning_rate': 3.319219334156847e-05, 'epoch': 6.48}\n","{'loss': 0.6181, 'grad_norm': 6.403084754943848, 'learning_rate': 3.289899283371657e-05, 'epoch': 6.5}\n","{'loss': 0.702, 'grad_norm': 5.330471038818359, 'learning_rate': 3.2606456770738636e-05, 'epoch': 6.51}\n","{'loss': 0.558, 'grad_norm': 6.444238662719727, 'learning_rate': 3.231459651887093e-05, 'epoch': 6.53}\n","{'loss': 0.6338, 'grad_norm': 5.4946417808532715, 'learning_rate': 3.2023423418091626e-05, 'epoch': 6.55}\n","{'loss': 0.5606, 'grad_norm': 5.147060871124268, 'learning_rate': 3.173294878168025e-05, 'epoch': 6.57}\n","{'loss': 0.7059, 'grad_norm': 5.5029754638671875, 'learning_rate': 3.1443183895778105e-05, 'epoch': 6.59}\n","{'loss': 0.647, 'grad_norm': 5.451030731201172, 'learning_rate': 3.115414001894974e-05, 'epoch': 6.6}\n","{'loss': 0.6321, 'grad_norm': 5.880076885223389, 'learning_rate': 3.086582838174551e-05, 'epoch': 6.62}\n","{'loss': 0.6221, 'grad_norm': 12.090547561645508, 'learning_rate': 3.0578260186265265e-05, 'epoch': 6.64}\n","{'loss': 0.5515, 'grad_norm': 4.961390495300293, 'learning_rate': 3.029144660572304e-05, 'epoch': 6.66}\n","{'loss': 0.6303, 'grad_norm': 8.35487174987793, 'learning_rate': 3.000539878401296e-05, 'epoch': 6.68}\n","{'loss': 0.6175, 'grad_norm': 5.784793376922607, 'learning_rate': 2.9720127835276256e-05, 'epoch': 6.69}\n","{'loss': 0.6365, 'grad_norm': 5.296642780303955, 'learning_rate': 2.9435644843469436e-05, 'epoch': 6.71}\n","{'loss': 0.6053, 'grad_norm': 5.430149078369141, 'learning_rate': 2.9151960861933614e-05, 'epoch': 6.73}\n","{'loss': 0.6135, 'grad_norm': 5.0150980949401855, 'learning_rate': 2.886908691296504e-05, 'epoch': 6.75}\n","{'loss': 0.6041, 'grad_norm': 5.136585235595703, 'learning_rate': 2.858703398738686e-05, 'epoch': 6.76}\n","{'loss': 0.5429, 'grad_norm': 4.231466293334961, 'learning_rate': 2.8305813044122097e-05, 'epoch': 6.78}\n","{'loss': 0.5956, 'grad_norm': 5.151216983795166, 'learning_rate': 2.8025435009767747e-05, 'epoch': 6.8}\n","{'loss': 0.5732, 'grad_norm': 3.7542734146118164, 'learning_rate': 2.774591077817038e-05, 'epoch': 6.82}\n","{'loss': 0.6358, 'grad_norm': 6.12777042388916, 'learning_rate': 2.746725121000273e-05, 'epoch': 6.84}\n","{'loss': 0.5031, 'grad_norm': 11.638378143310547, 'learning_rate': 2.718946713234185e-05, 'epoch': 6.85}\n","{'loss': 0.6171, 'grad_norm': 9.199576377868652, 'learning_rate': 2.6912569338248315e-05, 'epoch': 6.87}\n","{'loss': 0.6104, 'grad_norm': 10.14255428314209, 'learning_rate': 2.66365685863469e-05, 'epoch': 6.89}\n","{'loss': 0.7077, 'grad_norm': 9.090829849243164, 'learning_rate': 2.636147560040866e-05, 'epoch': 6.91}\n","{'loss': 0.5531, 'grad_norm': 9.668030738830566, 'learning_rate': 2.6087301068934106e-05, 'epoch': 6.93}\n","{'loss': 0.6159, 'grad_norm': 6.352726936340332, 'learning_rate': 2.581405564473801e-05, 'epoch': 6.94}\n","{'loss': 0.6046, 'grad_norm': 5.168361663818359, 'learning_rate': 2.5541749944535554e-05, 'epoch': 6.96}\n","{'loss': 0.7733, 'grad_norm': 7.233384132385254, 'learning_rate': 2.527039454852963e-05, 'epoch': 6.98}\n","{'loss': 0.6154, 'grad_norm': 9.114374160766602, 'learning_rate': 2.500000000000001e-05, 'epoch': 7.0}\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 3920/5600 [1:23:31<34:46, 1.24s/it][INFO|trainer.py:3788] 2024-06-29 23:22:31,824 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 23:22:31,824 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 23:22:31,824 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 35.06it/s]\u001b[A\n"," 17%|โโโโโโโโ | 8/46 [00:00<00:01, 30.15it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 27.73it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 27.79it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 27.85it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:00<00:00, 27.80it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:00<00:00, 27.45it/s]\u001b[A\n"," 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:00<00:00, 27.28it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 27.03it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 27.21it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 27.28it/s]\u001b[A\n"," 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:01<00:00, 27.24it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:01<00:00, 27.01it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.8612773418426514, 'eval_runtime': 1.7012, 'eval_samples_per_second': 27.04, 'eval_steps_per_second': 27.04, 'epoch': 7.0}\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 3920/5600 [1:23:33<34:46, 1.24s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 27.52it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 23:22:33,526 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-3920\n","[INFO|configuration_utils.py:733] 2024-06-29 23:22:34,201 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 23:22:34,202 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 23:22:34,235 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-3920/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 23:22:34,235 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-3920/special_tokens_map.json\n","{'loss': 0.4505, 'grad_norm': 4.652220726013184, 'learning_rate': 2.473057680489348e-05, 'epoch': 7.01}\n","{'loss': 0.385, 'grad_norm': 2.926722526550293, 'learning_rate': 2.4462135431415733e-05, 'epoch': 7.03}\n","{'loss': 0.4096, 'grad_norm': 6.222466468811035, 'learning_rate': 2.4194686309624663e-05, 'epoch': 7.05}\n","{'loss': 0.553, 'grad_norm': 3.829651117324829, 'learning_rate': 2.39282398310251e-05, 'epoch': 7.07}\n","{'loss': 0.403, 'grad_norm': 5.209712982177734, 'learning_rate': 2.366280634816496e-05, 'epoch': 7.09}\n","{'loss': 0.5494, 'grad_norm': 4.316225051879883, 'learning_rate': 2.3398396174233178e-05, 'epoch': 7.1}\n","{'loss': 0.4251, 'grad_norm': 5.665122985839844, 'learning_rate': 2.3135019582658802e-05, 'epoch': 7.12}\n","{'loss': 0.4833, 'grad_norm': 5.162817478179932, 'learning_rate': 2.2872686806712035e-05, 'epoch': 7.14}\n","{'loss': 0.4832, 'grad_norm': 4.767073631286621, 'learning_rate': 2.261140803910644e-05, 'epoch': 7.16}\n","{'loss': 0.4627, 'grad_norm': 6.984405994415283, 'learning_rate': 2.235119343160303e-05, 'epoch': 7.18}\n","{'loss': 0.48, 'grad_norm': 5.248043060302734, 'learning_rate': 2.2092053094615813e-05, 'epoch': 7.19}\n","{'loss': 0.4851, 'grad_norm': 5.531778812408447, 'learning_rate': 2.1833997096818898e-05, 'epoch': 7.21}\n","{'loss': 0.4751, 'grad_norm': 5.573154926300049, 'learning_rate': 2.157703546475539e-05, 'epoch': 7.23}\n","{'loss': 0.5816, 'grad_norm': 4.959446430206299, 'learning_rate': 2.132117818244771e-05, 'epoch': 7.25}\n","{'loss': 0.4175, 'grad_norm': 4.046441078186035, 'learning_rate': 2.1066435191009715e-05, 'epoch': 7.26}\n","{'loss': 0.5647, 'grad_norm': 7.062335968017578, 'learning_rate': 2.0812816388260518e-05, 'epoch': 7.28}\n","{'loss': 0.479, 'grad_norm': 4.6393914222717285, 'learning_rate': 2.056033162833977e-05, 'epoch': 7.3}\n","{'loss': 0.471, 'grad_norm': 5.455317497253418, 'learning_rate': 2.0308990721324927e-05, 'epoch': 7.32}\n","{'loss': 0.5885, 'grad_norm': 4.32041597366333, 'learning_rate': 2.0058803432849987e-05, 'epoch': 7.34}\n","{'loss': 0.4522, 'grad_norm': 4.541329383850098, 'learning_rate': 1.980977948372612e-05, 'epoch': 7.35}\n","{'loss': 0.4865, 'grad_norm': 5.104362964630127, 'learning_rate': 1.9561928549563968e-05, 'epoch': 7.37}\n","{'loss': 0.554, 'grad_norm': 5.151457786560059, 'learning_rate': 1.931526026039764e-05, 'epoch': 7.39}\n","{'loss': 0.4381, 'grad_norm': 5.234814643859863, 'learning_rate': 1.906978420031059e-05, 'epoch': 7.41}\n","{'loss': 0.5294, 'grad_norm': 6.009786128997803, 'learning_rate': 1.8825509907063327e-05, 'epoch': 7.43}\n","{'loss': 0.4886, 'grad_norm': 6.153667449951172, 'learning_rate': 1.8582446871722636e-05, 'epoch': 7.44}\n","{'loss': 0.5583, 'grad_norm': 5.528926849365234, 'learning_rate': 1.8340604538293015e-05, 'epoch': 7.46}\n","{'loss': 0.5186, 'grad_norm': 6.47043514251709, 'learning_rate': 1.8099992303349577e-05, 'epoch': 7.48}\n","{'loss': 0.4369, 'grad_norm': 4.640471458435059, 'learning_rate': 1.7860619515673033e-05, 'epoch': 7.5}\n","{'loss': 0.485, 'grad_norm': 4.996728420257568, 'learning_rate': 1.7622495475886487e-05, 'epoch': 7.51}\n","{'loss': 0.5824, 'grad_norm': 7.510169982910156, 'learning_rate': 1.738562943609396e-05, 'epoch': 7.53}\n","{'loss': 0.5401, 'grad_norm': 5.8573503494262695, 'learning_rate': 1.7150030599520984e-05, 'epoch': 7.55}\n","{'loss': 0.4099, 'grad_norm': 4.604180335998535, 'learning_rate': 1.691570812015704e-05, 'epoch': 7.57}\n","{'loss': 0.4631, 'grad_norm': 9.454184532165527, 'learning_rate': 1.6682671102399805e-05, 'epoch': 7.59}\n","{'loss': 0.4046, 'grad_norm': 3.9995360374450684, 'learning_rate': 1.6450928600701504e-05, 'epoch': 7.6}\n","{'loss': 0.3599, 'grad_norm': 5.843255043029785, 'learning_rate': 1.622048961921699e-05, 'epoch': 7.62}\n","{'loss': 0.5126, 'grad_norm': 5.647862434387207, 'learning_rate': 1.599136311145402e-05, 'epoch': 7.64}\n","{'loss': 0.6103, 'grad_norm': 6.46891450881958, 'learning_rate': 1.5763557979925324e-05, 'epoch': 7.66}\n","{'loss': 0.5807, 'grad_norm': 6.223480224609375, 'learning_rate': 1.553708307580265e-05, 'epoch': 7.68}\n","{'loss': 0.5201, 'grad_norm': 4.753687381744385, 'learning_rate': 1.531194719857292e-05, 'epoch': 7.69}\n","{'loss': 0.3547, 'grad_norm': 5.846710681915283, 'learning_rate': 1.5088159095696363e-05, 'epoch': 7.71}\n","{'loss': 0.5295, 'grad_norm': 5.13261079788208, 'learning_rate': 1.4865727462266543e-05, 'epoch': 7.73}\n","{'loss': 0.5368, 'grad_norm': 4.849207401275635, 'learning_rate': 1.4644660940672627e-05, 'epoch': 7.75}\n","{'loss': 0.5151, 'grad_norm': 4.458810806274414, 'learning_rate': 1.4424968120263504e-05, 'epoch': 7.76}\n","{'loss': 0.4958, 'grad_norm': 7.0515360832214355, 'learning_rate': 1.4206657537014079e-05, 'epoch': 7.78}\n","{'loss': 0.5166, 'grad_norm': 6.9797258377075195, 'learning_rate': 1.398973767319368e-05, 'epoch': 7.8}\n","{'loss': 0.5007, 'grad_norm': 8.272122383117676, 'learning_rate': 1.3774216957036367e-05, 'epoch': 7.82}\n","{'loss': 0.4178, 'grad_norm': 5.713352203369141, 'learning_rate': 1.3560103762413584e-05, 'epoch': 7.84}\n","{'loss': 0.4001, 'grad_norm': 7.498878479003906, 'learning_rate': 1.3347406408508695e-05, 'epoch': 7.85}\n","{'loss': 0.5782, 'grad_norm': 6.81415319442749, 'learning_rate': 1.3136133159493802e-05, 'epoch': 7.87}\n","{'loss': 0.493, 'grad_norm': 5.0307936668396, 'learning_rate': 1.2926292224208664e-05, 'epoch': 7.89}\n","{'loss': 0.4523, 'grad_norm': 4.477788925170898, 'learning_rate': 1.2717891755841722e-05, 'epoch': 7.91}\n","{'loss': 0.496, 'grad_norm': 5.846407413482666, 'learning_rate': 1.2510939851613285e-05, 'epoch': 7.93}\n","{'loss': 0.5292, 'grad_norm': 7.384892463684082, 'learning_rate': 1.230544455246101e-05, 'epoch': 7.94}\n","{'loss': 0.425, 'grad_norm': 6.020524978637695, 'learning_rate': 1.2101413842727345e-05, 'epoch': 7.96}\n","{'loss': 0.5331, 'grad_norm': 5.7436699867248535, 'learning_rate': 1.1898855649849461e-05, 'epoch': 7.98}\n","{'loss': 0.3988, 'grad_norm': 4.166412353515625, 'learning_rate': 1.1697777844051105e-05, 'epoch': 8.0}\n"," 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 4480/5600 [1:35:21<23:03, 1.24s/it][INFO|trainer.py:3788] 2024-06-29 23:34:21,043 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 23:34:21,043 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 23:34:21,043 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 36.84it/s]\u001b[A\n"," 17%|โโโโโโโโ | 8/46 [00:00<00:01, 31.27it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 26.39it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 26.67it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 26.22it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:00<00:00, 26.57it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:00<00:00, 27.11it/s]\u001b[A\n"," 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:00<00:00, 26.68it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 26.95it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 27.15it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 26.95it/s]\u001b[A\n"," 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:01<00:00, 26.54it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:01<00:00, 26.66it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 3.1611218452453613, 'eval_runtime': 1.7732, 'eval_samples_per_second': 25.942, 'eval_steps_per_second': 25.942, 'epoch': 8.0}\n"," 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 4480/5600 [1:35:22<23:03, 1.24s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 25.30it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 23:34:22,817 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-4480\n","[INFO|configuration_utils.py:733] 2024-06-29 23:34:23,861 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 23:34:23,861 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 23:34:23,897 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-4480/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 23:34:23,898 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-4480/special_tokens_map.json\n","{'loss': 0.3943, 'grad_norm': 3.8637747764587402, 'learning_rate': 1.1498188238036861e-05, 'epoch': 8.01}\n","{'loss': 0.3449, 'grad_norm': 4.5357465744018555, 'learning_rate': 1.130009458668863e-05, 'epoch': 8.03}\n","{'loss': 0.4843, 'grad_norm': 4.340099334716797, 'learning_rate': 1.1103504586764263e-05, 'epoch': 8.05}\n","{'loss': 0.3624, 'grad_norm': 5.39348030090332, 'learning_rate': 1.090842587659851e-05, 'epoch': 8.07}\n","{'loss': 0.3457, 'grad_norm': 5.173300743103027, 'learning_rate': 1.0714866035806326e-05, 'epoch': 8.09}\n","{'loss': 0.4419, 'grad_norm': 3.9911515712738037, 'learning_rate': 1.0522832584988234e-05, 'epoch': 8.1}\n","{'loss': 0.344, 'grad_norm': 4.810797214508057, 'learning_rate': 1.0332332985438248e-05, 'epoch': 8.12}\n","{'loss': 0.4852, 'grad_norm': 4.6731462478637695, 'learning_rate': 1.0143374638853891e-05, 'epoch': 8.14}\n","{'loss': 0.3417, 'grad_norm': 4.519662857055664, 'learning_rate': 9.955964887048607e-06, 'epoch': 8.16}\n","{'loss': 0.3707, 'grad_norm': 6.008825302124023, 'learning_rate': 9.770111011666583e-06, 'epoch': 8.17}\n","{'loss': 0.4661, 'grad_norm': 4.404787540435791, 'learning_rate': 9.58582023389974e-06, 'epoch': 8.19}\n","{'loss': 0.4659, 'grad_norm': 3.959002733230591, 'learning_rate': 9.403099714207175e-06, 'epoch': 8.21}\n","{'loss': 0.4714, 'grad_norm': 5.200716972351074, 'learning_rate': 9.221956552036992e-06, 'epoch': 8.23}\n","{'loss': 0.4455, 'grad_norm': 4.942255973815918, 'learning_rate': 9.042397785550405e-06, 'epoch': 8.25}\n","{'loss': 0.4403, 'grad_norm': 5.782726764678955, 'learning_rate': 8.864430391348332e-06, 'epoch': 8.26}\n","{'loss': 0.3306, 'grad_norm': 3.7129645347595215, 'learning_rate': 8.688061284200266e-06, 'epoch': 8.28}\n","{'loss': 0.407, 'grad_norm': 8.037576675415039, 'learning_rate': 8.513297316775625e-06, 'epoch': 8.3}\n","{'loss': 0.3798, 'grad_norm': 5.382339000701904, 'learning_rate': 8.34014527937756e-06, 'epoch': 8.32}\n","{'loss': 0.4849, 'grad_norm': 4.840189456939697, 'learning_rate': 8.168611899679013e-06, 'epoch': 8.34}\n","{'loss': 0.417, 'grad_norm': 5.7303619384765625, 'learning_rate': 7.998703842461431e-06, 'epoch': 8.35}\n","{'loss': 0.4589, 'grad_norm': 6.085827350616455, 'learning_rate': 7.830427709355725e-06, 'epoch': 8.37}\n","{'loss': 0.4225, 'grad_norm': 4.722183704376221, 'learning_rate': 7.663790038585793e-06, 'epoch': 8.39}\n","{'loss': 0.3773, 'grad_norm': 5.256956100463867, 'learning_rate': 7.498797304714544e-06, 'epoch': 8.41}\n","{'loss': 0.4246, 'grad_norm': 5.674898624420166, 'learning_rate': 7.33545591839222e-06, 'epoch': 8.42}\n","{'loss': 0.4219, 'grad_norm': 4.489896774291992, 'learning_rate': 7.173772226107434e-06, 'epoch': 8.44}\n","{'loss': 0.4486, 'grad_norm': 5.115447521209717, 'learning_rate': 7.013752509940485e-06, 'epoch': 8.46}\n","{'loss': 0.4512, 'grad_norm': 4.548392295837402, 'learning_rate': 6.855402987319348e-06, 'epoch': 8.48}\n","{'loss': 0.3836, 'grad_norm': 6.2048258781433105, 'learning_rate': 6.698729810778065e-06, 'epoch': 8.5}\n","{'loss': 0.3778, 'grad_norm': 4.5989766120910645, 'learning_rate': 6.54373906771768e-06, 'epoch': 8.51}\n","{'loss': 0.4007, 'grad_norm': 5.147210121154785, 'learning_rate': 6.390436780169734e-06, 'epoch': 8.53}\n","{'loss': 0.4504, 'grad_norm': 4.499249458312988, 'learning_rate': 6.238828904562316e-06, 'epoch': 8.55}\n","{'loss': 0.4176, 'grad_norm': 4.788080215454102, 'learning_rate': 6.088921331488568e-06, 'epoch': 8.57}\n","{'loss': 0.2845, 'grad_norm': 3.5535483360290527, 'learning_rate': 5.94071988547788e-06, 'epoch': 8.59}\n","{'loss': 0.3807, 'grad_norm': 4.653518199920654, 'learning_rate': 5.794230324769517e-06, 'epoch': 8.6}\n","{'loss': 0.3348, 'grad_norm': 4.7170915603637695, 'learning_rate': 5.649458341088915e-06, 'epoch': 8.62}\n","{'loss': 0.3807, 'grad_norm': 17.9665584564209, 'learning_rate': 5.506409559426573e-06, 'epoch': 8.64}\n","{'loss': 0.4922, 'grad_norm': 4.38849401473999, 'learning_rate': 5.365089537819434e-06, 'epoch': 8.66}\n","{'loss': 0.4016, 'grad_norm': 4.990530967712402, 'learning_rate': 5.2255037671349535e-06, 'epoch': 8.67}\n","{'loss': 0.4209, 'grad_norm': 4.245598793029785, 'learning_rate': 5.087657670857798e-06, 'epoch': 8.69}\n","{'loss': 0.3529, 'grad_norm': 3.6876637935638428, 'learning_rate': 4.951556604879048e-06, 'epoch': 8.71}\n","{'loss': 0.4205, 'grad_norm': 6.267766952514648, 'learning_rate': 4.8172058572881765e-06, 'epoch': 8.73}\n","{'loss': 0.391, 'grad_norm': 4.628519535064697, 'learning_rate': 4.684610648167503e-06, 'epoch': 8.75}\n","{'loss': 0.4038, 'grad_norm': 5.335127353668213, 'learning_rate': 4.5537761293894535e-06, 'epoch': 8.76}\n","{'loss': 0.4519, 'grad_norm': 5.06191349029541, 'learning_rate': 4.424707384416344e-06, 'epoch': 8.78}\n","{'loss': 0.4043, 'grad_norm': 3.3718318939208984, 'learning_rate': 4.29740942810285e-06, 'epoch': 8.8}\n","{'loss': 0.4329, 'grad_norm': 5.270512104034424, 'learning_rate': 4.1718872065011904e-06, 'epoch': 8.82}\n","{'loss': 0.4345, 'grad_norm': 4.938543796539307, 'learning_rate': 4.048145596668967e-06, 'epoch': 8.84}\n","{'loss': 0.4661, 'grad_norm': 4.726830005645752, 'learning_rate': 3.9261894064796135e-06, 'epoch': 8.85}\n","{'loss': 0.4037, 'grad_norm': 4.747579574584961, 'learning_rate': 3.8060233744356633e-06, 'epoch': 8.87}\n","{'loss': 0.3594, 'grad_norm': 3.65122652053833, 'learning_rate': 3.687652169484568e-06, 'epoch': 8.89}\n","{'loss': 0.3756, 'grad_norm': 3.7553329467773438, 'learning_rate': 3.5710803908373224e-06, 'epoch': 8.91}\n","{'loss': 0.4363, 'grad_norm': 6.1218132972717285, 'learning_rate': 3.4563125677897932e-06, 'epoch': 8.92}\n","{'loss': 0.5039, 'grad_norm': 6.221901893615723, 'learning_rate': 3.343353159546675e-06, 'epoch': 8.94}\n","{'loss': 0.4145, 'grad_norm': 4.449114799499512, 'learning_rate': 3.2322065550483007e-06, 'epoch': 8.96}\n","{'loss': 0.3358, 'grad_norm': 3.244713306427002, 'learning_rate': 3.1228770728000455e-06, 'epoch': 8.98}\n","{'loss': 0.3726, 'grad_norm': 5.383361339569092, 'learning_rate': 3.0153689607045845e-06, 'epoch': 9.0}\n"," 90%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 5040/5600 [1:47:11<11:48, 1.26s/it][INFO|trainer.py:3788] 2024-06-29 23:46:11,764 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 23:46:11,764 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 23:46:11,764 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 28.58it/s]\u001b[A\n"," 15%|โโโโโโโ | 7/46 [00:00<00:01, 27.42it/s]\u001b[A\n"," 22%|โโโโโโโโโโ | 10/46 [00:00<00:01, 27.02it/s]\u001b[A\n"," 28%|โโโโโโโโโโโโโ | 13/46 [00:00<00:01, 27.51it/s]\u001b[A\n"," 35%|โโโโโโโโโโโโโโโ | 16/46 [00:00<00:01, 27.44it/s]\u001b[A\n"," 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:00<00:00, 27.84it/s]\u001b[A\n"," 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:00<00:00, 26.64it/s]\u001b[A\n"," 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:00<00:00, 25.39it/s]\u001b[A\n"," 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:00, 25.72it/s]\u001b[A\n"," 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 25.99it/s]\u001b[A\n"," 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 26.47it/s]\u001b[A\n"," 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:01<00:00, 26.99it/s]\u001b[A\n"," 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:01<00:00, 25.12it/s]\u001b[A\n"," 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:01<00:00, 26.14it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 3.362522602081299, 'eval_runtime': 1.7721, 'eval_samples_per_second': 25.958, 'eval_steps_per_second': 25.958, 'epoch': 9.0}\n"," 90%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 5040/5600 [1:47:13<11:48, 1.26s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 26.66it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 23:46:13,536 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-5040\n","[INFO|configuration_utils.py:733] 2024-06-29 23:46:14,139 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 23:46:14,139 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 23:46:14,173 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-5040/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 23:46:14,173 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-5040/special_tokens_map.json\n","{'loss': 0.3451, 'grad_norm': 5.190840244293213, 'learning_rate': 2.9096863958968268e-06, 'epoch': 9.01}\n","{'loss': 0.33, 'grad_norm': 3.5857107639312744, 'learning_rate': 2.8058334845816213e-06, 'epoch': 9.03}\n","{'loss': 0.3675, 'grad_norm': 4.7077860832214355, 'learning_rate': 2.7038142618741992e-06, 'epoch': 9.05}\n","{'loss': 0.4356, 'grad_norm': 4.774041175842285, 'learning_rate': 2.603632691643415e-06, 'epoch': 9.07}\n","{'loss': 0.3459, 'grad_norm': 3.2734451293945312, 'learning_rate': 2.5052926663577e-06, 'epoch': 9.09}\n","{'loss': 0.3926, 'grad_norm': 5.444535732269287, 'learning_rate': 2.408798006933882e-06, 'epoch': 9.1}\n","{'loss': 0.295, 'grad_norm': 4.564394474029541, 'learning_rate': 2.314152462588659e-06, 'epoch': 9.12}\n","{'loss': 0.3274, 'grad_norm': 3.5276427268981934, 'learning_rate': 2.221359710692961e-06, 'epoch': 9.14}\n","{'loss': 0.3454, 'grad_norm': 4.8225603103637695, 'learning_rate': 2.1304233566290964e-06, 'epoch': 9.16}\n","{'loss': 0.2982, 'grad_norm': 3.1064751148223877, 'learning_rate': 2.041346933650612e-06, 'epoch': 9.17}\n","{'loss': 0.3529, 'grad_norm': 3.431065082550049, 'learning_rate': 1.9541339027450256e-06, 'epoch': 9.19}\n","{'loss': 0.4354, 'grad_norm': 4.004822254180908, 'learning_rate': 1.8687876524993987e-06, 'epoch': 9.21}\n","{'loss': 0.3608, 'grad_norm': 5.244897842407227, 'learning_rate': 1.785311498968617e-06, 'epoch': 9.23}\n","{'loss': 0.3693, 'grad_norm': 4.393815517425537, 'learning_rate': 1.70370868554659e-06, 'epoch': 9.25}\n","{'loss': 0.3802, 'grad_norm': 4.819892883300781, 'learning_rate': 1.6239823828401945e-06, 'epoch': 9.26}\n","{'loss': 0.3838, 'grad_norm': 3.781949996948242, 'learning_rate': 1.5461356885461075e-06, 'epoch': 9.28}\n","{'loss': 0.4715, 'grad_norm': 4.076176166534424, 'learning_rate': 1.4701716273304521e-06, 'epoch': 9.3}\n","{'loss': 0.3256, 'grad_norm': 4.226771354675293, 'learning_rate': 1.3960931507112752e-06, 'epoch': 9.32}\n","{'loss': 0.3638, 'grad_norm': 3.562203884124756, 'learning_rate': 1.3239031369438326e-06, 'epoch': 9.34}\n","{'loss': 0.3687, 'grad_norm': 4.55058479309082, 'learning_rate': 1.2536043909088191e-06, 'epoch': 9.35}\n","{'loss': 0.3869, 'grad_norm': 4.373401165008545, 'learning_rate': 1.1851996440033319e-06, 'epoch': 9.37}\n","{'loss': 0.3151, 'grad_norm': 4.085133075714111, 'learning_rate': 1.118691554034773e-06, 'epoch': 9.39}\n","{'loss': 0.3557, 'grad_norm': 4.491430282592773, 'learning_rate': 1.0540827051175818e-06, 'epoch': 9.41}\n","{'loss': 0.405, 'grad_norm': 4.82833194732666, 'learning_rate': 9.913756075728087e-07, 'epoch': 9.42}\n","{'loss': 0.2972, 'grad_norm': 2.666112184524536, 'learning_rate': 9.305726978306173e-07, 'epoch': 9.44}\n","{'loss': 0.3194, 'grad_norm': 4.029996871948242, 'learning_rate': 8.716763383355864e-07, 'epoch': 9.46}\n","{'loss': 0.3984, 'grad_norm': 3.864152193069458, 'learning_rate': 8.146888174549339e-07, 'epoch': 9.48}\n","{'loss': 0.3483, 'grad_norm': 4.201892375946045, 'learning_rate': 7.596123493895991e-07, 'epoch': 9.5}\n","{'loss': 0.4642, 'grad_norm': 4.560868740081787, 'learning_rate': 7.064490740882057e-07, 'epoch': 9.51}\n","{'loss': 0.379, 'grad_norm': 4.305575370788574, 'learning_rate': 6.552010571639456e-07, 'epoch': 9.53}\n","{'loss': 0.445, 'grad_norm': 5.4909772872924805, 'learning_rate': 6.058702898142643e-07, 'epoch': 9.55}\n","{'loss': 0.3116, 'grad_norm': 4.831486225128174, 'learning_rate': 5.584586887435739e-07, 'epoch': 9.57}\n","{'loss': 0.3896, 'grad_norm': 4.905820846557617, 'learning_rate': 5.129680960887007e-07, 'epoch': 9.59}\n","{'loss': 0.3798, 'grad_norm': 3.7179861068725586, 'learning_rate': 4.6940027934735954e-07, 'epoch': 9.6}\n","{'loss': 0.3401, 'grad_norm': 4.62000036239624, 'learning_rate': 4.277569313094809e-07, 'epoch': 9.62}\n","{'loss': 0.4521, 'grad_norm': 4.725619792938232, 'learning_rate': 3.8803966999139684e-07, 'epoch': 9.64}\n","{'loss': 0.4075, 'grad_norm': 3.523742914199829, 'learning_rate': 3.50250038573019e-07, 'epoch': 9.66}\n","{'loss': 0.3438, 'grad_norm': 3.7823429107666016, 'learning_rate': 3.143895053378698e-07, 'epoch': 9.67}\n","{'loss': 0.2996, 'grad_norm': 3.2718749046325684, 'learning_rate': 2.8045946361601183e-07, 'epoch': 9.69}\n","{'loss': 0.4503, 'grad_norm': 5.158358097076416, 'learning_rate': 2.4846123172992954e-07, 'epoch': 9.71}\n","{'loss': 0.3938, 'grad_norm': 3.8553905487060547, 'learning_rate': 2.1839605294330933e-07, 'epoch': 9.73}\n","{'loss': 0.4459, 'grad_norm': 4.788202285766602, 'learning_rate': 1.9026509541272275e-07, 'epoch': 9.75}\n","{'loss': 0.3762, 'grad_norm': 4.024471759796143, 'learning_rate': 1.640694521422459e-07, 'epoch': 9.76}\n","{'loss': 0.4065, 'grad_norm': 5.944757461547852, 'learning_rate': 1.3981014094099353e-07, 'epoch': 9.78}\n","{'loss': 0.3105, 'grad_norm': 3.0800580978393555, 'learning_rate': 1.1748810438355628e-07, 'epoch': 9.8}\n","{'loss': 0.4782, 'grad_norm': 3.273432731628418, 'learning_rate': 9.710420977340762e-08, 'epoch': 9.82}\n","{'loss': 0.3914, 'grad_norm': 4.411673069000244, 'learning_rate': 7.865924910916977e-08, 'epoch': 9.83}\n","{'loss': 0.3274, 'grad_norm': 4.555184364318848, 'learning_rate': 6.215393905388278e-08, 'epoch': 9.85}\n","{'loss': 0.289, 'grad_norm': 5.107693672180176, 'learning_rate': 4.7588920907110094e-08, 'epoch': 9.87}\n","{'loss': 0.3202, 'grad_norm': 4.9626617431640625, 'learning_rate': 3.496476058006959e-08, 'epoch': 9.89}\n","{'loss': 0.433, 'grad_norm': 5.598171234130859, 'learning_rate': 2.4281948573617874e-08, 'epoch': 9.91}\n","{'loss': 0.4018, 'grad_norm': 4.289453029632568, 'learning_rate': 1.5540899959187727e-08, 'epoch': 9.92}\n","{'loss': 0.3691, 'grad_norm': 4.765395641326904, 'learning_rate': 8.741954362678772e-09, 'epoch': 9.94}\n","{'loss': 0.3645, 'grad_norm': 5.474503993988037, 'learning_rate': 3.885375951256931e-09, 'epoch': 9.96}\n","{'loss': 0.4003, 'grad_norm': 3.922280788421631, 'learning_rate': 9.713534230904041e-10, 'epoch': 9.98}\n","{'loss': 0.382, 'grad_norm': 4.276446342468262, 'learning_rate': 0.0, 'epoch': 10.0}\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 5600/5600 [1:59:00<00:00, 1.26s/it][INFO|trainer.py:3788] 2024-06-29 23:58:00,034 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 23:58:00,034 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 23:58:00,034 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 9%|โโโโ | 4/46 [00:00<00:01, 36.19it/s]\u001b[A\n"," 17%|โโโโโโโโ | 8/46 [00:00<00:01, 28.86it/s]\u001b[A\n"," 24%|โโโโโโโโโโโ | 11/46 [00:00<00:01, 23.27it/s]\u001b[A\n"," 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 25.05it/s]\u001b[A\n"," 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 25.40it/s]\u001b[A\n"," 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:01, 25.19it/s]\u001b[A\n"," 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:00<00:00, 25.41it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:00, 25.58it/s]\u001b[A\n"," 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 26.01it/s]\u001b[A\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 26.06it/s]\u001b[A\n"," 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 25.63it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 25.11it/s]\u001b[A\n"," 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:01<00:00, 25.13it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 3.466093063354492, 'eval_runtime': 1.8384, 'eval_samples_per_second': 25.021, 'eval_steps_per_second': 25.021, 'epoch': 10.0}\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 5600/5600 [1:59:01<00:00, 1.26s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 25.62it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-29 23:58:01,873 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-5600\n","[INFO|configuration_utils.py:733] 2024-06-29 23:58:02,446 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 23:58:02,446 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 23:58:02,476 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-5600/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 23:58:02,476 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-5600/special_tokens_map.json\n","[INFO|trainer.py:2383] 2024-06-29 23:58:02,637 >> \n","\n","Training completed. Do not forget to share your model on huggingface.co/models =)\n","\n","\n","{'train_runtime': 7142.6727, 'train_samples_per_second': 6.275, 'train_steps_per_second': 0.784, 'train_loss': 1.0784291120512144, 'epoch': 10.0}\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 5600/5600 [1:59:02<00:00, 1.28s/it]\n","[INFO|trainer.py:3478] 2024-06-29 23:58:02,640 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft\n","[INFO|configuration_utils.py:733] 2024-06-29 23:58:03,159 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n","[INFO|configuration_utils.py:800] 2024-06-29 23:58:03,160 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 896,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 4864,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 24,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 14,\n"," \"num_hidden_layers\": 24,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-29 23:58:03,220 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-29 23:58:03,220 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/special_tokens_map.json\n","***** train metrics *****\n"," epoch = 9.9955\n"," total_flos = 7657006GF\n"," train_loss = 1.0784\n"," train_runtime = 1:59:02.67\n"," train_samples_per_second = 6.275\n"," train_steps_per_second = 0.784\n","Figure saved at: saves/qwen2-0.5b/lora/sft/training_loss.png\n","Figure saved at: saves/qwen2-0.5b/lora/sft/training_eval_loss.png\n","[INFO|trainer.py:3788] 2024-06-29 23:58:03,541 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-29 23:58:03,541 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-29 23:58:03,541 >> Batch size = 1\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 25.51it/s]\n","***** eval metrics *****\n"," epoch = 9.9955\n"," eval_loss = 3.4661\n"," eval_runtime = 0:00:01.85\n"," eval_samples_per_second = 24.833\n"," eval_steps_per_second = 24.833\n","[INFO|modelcard.py:449] 2024-06-29 23:58:05,395 >> Dropping the following result as it does not have all the necessary fields:\n","{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}\n","CPU times: user 1min 32s, sys: 30.2 s, total: 2min 2s\n","Wall time: 1h 59min 52s\n"]}],"source":["%%time\n","\n","!./scripts/tune-lf.sh config/qwen2_0.5b_lora_sft.yaml"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"3gnpFF-H4-V4","outputId":"4d6cb943-d9e9-4e33-881b-fdbe7a71f627"},"outputs":[{"name":"stdout","output_type":"stream","text":["Current Directory:\n","/home/inflaton/code/projects/courses/llm-finetuning/llama-factory\n","06/30/2024 06:14:31 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n","[INFO|tokenization_utils_base.py:2161] 2024-06-30 06:14:31,888 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/vocab.json\n","[INFO|tokenization_utils_base.py:2161] 2024-06-30 06:14:31,888 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/merges.txt\n","[INFO|tokenization_utils_base.py:2161] 2024-06-30 06:14:31,888 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/tokenizer.json\n","[INFO|tokenization_utils_base.py:2161] 2024-06-30 06:14:31,888 >> loading file added_tokens.json from cache at None\n","[INFO|tokenization_utils_base.py:2161] 2024-06-30 06:14:31,888 >> loading file special_tokens_map.json from cache at None\n","[INFO|tokenization_utils_base.py:2161] 2024-06-30 06:14:31,888 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/tokenizer_config.json\n","[WARNING|logging.py:313] 2024-06-30 06:14:32,031 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n","06/30/2024 06:14:32 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n","06/30/2024 06:14:32 - INFO - llamafactory.data.template - Add <|im_start|> to stop words.\n","06/30/2024 06:14:32 - INFO - llamafactory.data.loader - Loading dataset alpaca_mac.json...\n","Converting format of dataset (num_proc=16): 100%|โ| 4528/4528 [00:00<00:00, 1488\n","Running tokenizer on dataset (num_proc=16): 100%|โ| 4528/4528 [00:01<00:00, 3433\n","input_ids:\n","[151644, 872, 198, 5501, 14683, 279, 2701, 8453, 1467, 1119, 6364, 323, 3410, 1172, 279, 24531, 2213, 11, 4302, 770, 624, 35987, 102895, 99164, 100324, 100717, 100095, 99509, 1773, 151645, 198, 151644, 77091, 198, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n","inputs:\n","<|im_start|>user\n","Please translate the following Chinese text into English and provide only the translated content, nothing else.\n","ๅ
จไป็็ไปๆญๆใ<|im_end|>\n","<|im_start|>assistant\n","Because I was protected by a fox fairy.<|im_end|>\n","label_ids:\n","[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n","labels:\n","Because I was protected by a fox fairy.<|im_end|>\n","[INFO|configuration_utils.py:733] 2024-06-30 06:14:35,044 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 06:14:35,045 >> Model config Qwen2Config {\n"," \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|modeling_utils.py:3556] 2024-06-30 06:14:35,702 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/model.safetensors\n","[INFO|modeling_utils.py:1531] 2024-06-30 06:14:37,609 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n","[INFO|configuration_utils.py:1000] 2024-06-30 06:14:37,613 >> Generate config GenerationConfig {\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645\n","}\n","\n","[INFO|modeling_utils.py:4364] 2024-06-30 06:16:33,749 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n","\n","[INFO|modeling_utils.py:4372] 2024-06-30 06:16:33,749 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at Qwen/Qwen2-1.5B-Instruct.\n","If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n","[INFO|configuration_utils.py:955] 2024-06-30 06:16:34,027 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/generation_config.json\n","[INFO|configuration_utils.py:1000] 2024-06-30 06:16:34,027 >> Generate config GenerationConfig {\n"," \"bos_token_id\": 151643,\n"," \"do_sample\": true,\n"," \"eos_token_id\": [\n"," 151645,\n"," 151643\n"," ],\n"," \"pad_token_id\": 151643,\n"," \"repetition_penalty\": 1.1,\n"," \"temperature\": 0.7,\n"," \"top_k\": 20,\n"," \"top_p\": 0.8\n","}\n","\n","06/30/2024 06:16:34 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n","06/30/2024 06:16:34 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.\n","06/30/2024 06:16:34 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n","06/30/2024 06:16:34 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n","06/30/2024 06:16:34 - INFO - llamafactory.model.model_utils.misc - Found linear modules: k_proj,q_proj,v_proj,gate_proj,up_proj,o_proj,down_proj\n","06/30/2024 06:16:34 - INFO - llamafactory.model.loader - trainable params: 9,232,384 || all params: 1,552,946,688 || trainable%: 0.5945\n","[INFO|trainer.py:642] 2024-06-30 06:16:34,928 >> Using auto half precision backend\n","[INFO|trainer.py:2128] 2024-06-30 06:16:35,081 >> ***** Running training *****\n","[INFO|trainer.py:2129] 2024-06-30 06:16:35,081 >> Num examples = 4,482\n","[INFO|trainer.py:2130] 2024-06-30 06:16:35,081 >> Num Epochs = 10\n","[INFO|trainer.py:2131] 2024-06-30 06:16:35,081 >> Instantaneous batch size per device = 1\n","[INFO|trainer.py:2134] 2024-06-30 06:16:35,081 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n","[INFO|trainer.py:2135] 2024-06-30 06:16:35,081 >> Gradient Accumulation steps = 8\n","[INFO|trainer.py:2136] 2024-06-30 06:16:35,081 >> Total optimization steps = 5,600\n","[INFO|trainer.py:2137] 2024-06-30 06:16:35,083 >> Number of trainable parameters = 9,232,384\n","{'loss': 2.1598, 'grad_norm': 1.7301031351089478, 'learning_rate': 1.7857142857142857e-06, 'epoch': 0.02}\n","{'loss': 2.2894, 'grad_norm': 1.9221487045288086, 'learning_rate': 3.5714285714285714e-06, 'epoch': 0.04}\n","{'loss': 2.152, 'grad_norm': 1.5344856977462769, 'learning_rate': 5.357142857142857e-06, 'epoch': 0.05}\n","{'loss': 2.1602, 'grad_norm': 3.0139236450195312, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.07}\n","{'loss': 2.3519, 'grad_norm': 1.501677393913269, 'learning_rate': 8.92857142857143e-06, 'epoch': 0.09}\n","{'loss': 2.05, 'grad_norm': 1.680209994316101, 'learning_rate': 1.0714285714285714e-05, 'epoch': 0.11}\n","{'loss': 2.1875, 'grad_norm': 1.694694995880127, 'learning_rate': 1.25e-05, 'epoch': 0.12}\n","{'loss': 1.9546, 'grad_norm': 1.5895333290100098, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.14}\n","{'loss': 2.075, 'grad_norm': 2.115245819091797, 'learning_rate': 1.6071428571428572e-05, 'epoch': 0.16}\n","{'loss': 1.9713, 'grad_norm': 1.3625324964523315, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.18}\n","{'loss': 2.0099, 'grad_norm': 1.7853630781173706, 'learning_rate': 1.9642857142857145e-05, 'epoch': 0.2}\n","{'loss': 1.9603, 'grad_norm': 1.3131749629974365, 'learning_rate': 2.1428571428571428e-05, 'epoch': 0.21}\n","{'loss': 1.9619, 'grad_norm': 1.6807270050048828, 'learning_rate': 2.3214285714285715e-05, 'epoch': 0.23}\n","{'loss': 1.9889, 'grad_norm': 2.136683464050293, 'learning_rate': 2.5e-05, 'epoch': 0.25}\n","{'loss': 1.8445, 'grad_norm': 1.5379092693328857, 'learning_rate': 2.6785714285714288e-05, 'epoch': 0.27}\n","{'loss': 1.8162, 'grad_norm': 1.4818131923675537, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.29}\n","{'loss': 1.9681, 'grad_norm': 1.3765653371810913, 'learning_rate': 3.0357142857142857e-05, 'epoch': 0.3}\n","{'loss': 1.7704, 'grad_norm': 1.7519148588180542, 'learning_rate': 3.2142857142857144e-05, 'epoch': 0.32}\n","{'loss': 1.8997, 'grad_norm': 2.2547669410705566, 'learning_rate': 3.392857142857143e-05, 'epoch': 0.34}\n","{'loss': 2.0083, 'grad_norm': 1.9038093090057373, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.36}\n","{'loss': 1.9641, 'grad_norm': 1.864136815071106, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.37}\n","{'loss': 1.8745, 'grad_norm': 2.456977605819702, 'learning_rate': 3.928571428571429e-05, 'epoch': 0.39}\n","{'loss': 1.8564, 'grad_norm': 2.0037779808044434, 'learning_rate': 4.107142857142857e-05, 'epoch': 0.41}\n","{'loss': 2.0248, 'grad_norm': 2.459550619125366, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.43}\n","{'loss': 1.9225, 'grad_norm': 2.4255712032318115, 'learning_rate': 4.464285714285715e-05, 'epoch': 0.45}\n","{'loss': 1.8559, 'grad_norm': 2.2272531986236572, 'learning_rate': 4.642857142857143e-05, 'epoch': 0.46}\n","{'loss': 1.916, 'grad_norm': 3.067957878112793, 'learning_rate': 4.8214285714285716e-05, 'epoch': 0.48}\n","{'loss': 1.9695, 'grad_norm': 2.689528226852417, 'learning_rate': 5e-05, 'epoch': 0.5}\n","{'loss': 1.7267, 'grad_norm': 1.640542984008789, 'learning_rate': 5.1785714285714296e-05, 'epoch': 0.52}\n","{'loss': 1.8751, 'grad_norm': 2.6767070293426514, 'learning_rate': 5.3571428571428575e-05, 'epoch': 0.54}\n","{'loss': 1.8821, 'grad_norm': 2.2540671825408936, 'learning_rate': 5.535714285714286e-05, 'epoch': 0.55}\n","{'loss': 1.7133, 'grad_norm': 3.7877705097198486, 'learning_rate': 5.714285714285714e-05, 'epoch': 0.57}\n","{'loss': 1.7552, 'grad_norm': 2.7244925498962402, 'learning_rate': 5.8928571428571435e-05, 'epoch': 0.59}\n","{'loss': 1.8089, 'grad_norm': 2.4050076007843018, 'learning_rate': 6.0714285714285715e-05, 'epoch': 0.61}\n","{'loss': 1.8102, 'grad_norm': 3.4505980014801025, 'learning_rate': 6.25e-05, 'epoch': 0.62}\n","{'loss': 1.7452, 'grad_norm': 1.8404840230941772, 'learning_rate': 6.428571428571429e-05, 'epoch': 0.64}\n","{'loss': 1.9171, 'grad_norm': 2.923614025115967, 'learning_rate': 6.607142857142857e-05, 'epoch': 0.66}\n","{'loss': 1.8893, 'grad_norm': 2.2417802810668945, 'learning_rate': 6.785714285714286e-05, 'epoch': 0.68}\n","{'loss': 1.6041, 'grad_norm': 1.8358319997787476, 'learning_rate': 6.964285714285715e-05, 'epoch': 0.7}\n","{'loss': 1.7782, 'grad_norm': 2.7531838417053223, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.71}\n","{'loss': 1.8365, 'grad_norm': 2.2503859996795654, 'learning_rate': 7.321428571428571e-05, 'epoch': 0.73}\n","{'loss': 1.863, 'grad_norm': 1.8987295627593994, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.75}\n","{'loss': 1.8407, 'grad_norm': 2.950441598892212, 'learning_rate': 7.67857142857143e-05, 'epoch': 0.77}\n","{'loss': 1.7695, 'grad_norm': 3.3668158054351807, 'learning_rate': 7.857142857142858e-05, 'epoch': 0.79}\n","{'loss': 1.6759, 'grad_norm': 1.843374252319336, 'learning_rate': 8.035714285714287e-05, 'epoch': 0.8}\n","{'loss': 1.7465, 'grad_norm': 2.3402576446533203, 'learning_rate': 8.214285714285714e-05, 'epoch': 0.82}\n","{'loss': 1.7852, 'grad_norm': 3.2396647930145264, 'learning_rate': 8.392857142857144e-05, 'epoch': 0.84}\n","{'loss': 1.7626, 'grad_norm': 2.432474136352539, 'learning_rate': 8.571428571428571e-05, 'epoch': 0.86}\n","{'loss': 1.8173, 'grad_norm': 1.9021589756011963, 'learning_rate': 8.75e-05, 'epoch': 0.87}\n","{'loss': 1.9716, 'grad_norm': 1.968782901763916, 'learning_rate': 8.92857142857143e-05, 'epoch': 0.89}\n","{'loss': 1.8814, 'grad_norm': 2.0488665103912354, 'learning_rate': 9.107142857142857e-05, 'epoch': 0.91}\n","{'loss': 1.7689, 'grad_norm': 2.5687661170959473, 'learning_rate': 9.285714285714286e-05, 'epoch': 0.93}\n","{'loss': 1.8, 'grad_norm': 3.141063690185547, 'learning_rate': 9.464285714285715e-05, 'epoch': 0.95}\n","{'loss': 1.8067, 'grad_norm': 2.3366873264312744, 'learning_rate': 9.642857142857143e-05, 'epoch': 0.96}\n","{'loss': 1.7689, 'grad_norm': 2.356125831604004, 'learning_rate': 9.821428571428572e-05, 'epoch': 0.98}\n","{'loss': 1.7444, 'grad_norm': 1.962470293045044, 'learning_rate': 0.0001, 'epoch': 1.0}\n"," 10%|โโโโ | 560/5600 [15:11<2:19:14, 1.66s/it][INFO|trainer.py:3788] 2024-06-30 06:31:46,942 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 06:31:46,942 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 06:31:46,942 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 27.37it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:01, 22.33it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 20.97it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 20.64it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 20.61it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 19.71it/s]\u001b[A\n"," 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:01, 19.38it/s]\u001b[A\n"," 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:01<00:01, 18.71it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 18.19it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 18.21it/s]\u001b[A\n"," 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:00, 18.29it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 17.90it/s]\u001b[A\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 16.92it/s]\u001b[A\n"," 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 17.03it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 17.24it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:02<00:00, 17.33it/s]\u001b[A\n"," 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 17.09it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 16.80it/s]\u001b[A\n"," 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 17.10it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 1.7407625913619995, 'eval_runtime': 2.569, 'eval_samples_per_second': 17.906, 'eval_steps_per_second': 17.906, 'epoch': 1.0}\n"," 10%|โโโโ | 560/5600 [15:14<2:19:14, 1.66s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 16.67it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 06:31:49,511 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-560\n","[INFO|configuration_utils.py:733] 2024-06-30 06:31:50,591 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 06:31:50,592 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 06:31:50,659 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-560/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 06:31:50,660 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-560/special_tokens_map.json\n","{'loss': 1.6815, 'grad_norm': 2.0423543453216553, 'learning_rate': 9.999902864657691e-05, 'epoch': 1.02}\n","{'loss': 1.5142, 'grad_norm': 3.1950676441192627, 'learning_rate': 9.999611462404875e-05, 'epoch': 1.04}\n","{'loss': 1.6976, 'grad_norm': 2.1450624465942383, 'learning_rate': 9.999125804563732e-05, 'epoch': 1.05}\n","{'loss': 1.636, 'grad_norm': 2.6176905632019043, 'learning_rate': 9.998445910004082e-05, 'epoch': 1.07}\n","{'loss': 1.6114, 'grad_norm': 2.9323713779449463, 'learning_rate': 9.997571805142639e-05, 'epoch': 1.09}\n","{'loss': 1.6291, 'grad_norm': 2.9673070907592773, 'learning_rate': 9.996503523941994e-05, 'epoch': 1.11}\n","{'loss': 1.6302, 'grad_norm': 2.463287591934204, 'learning_rate': 9.99524110790929e-05, 'epoch': 1.12}\n","{'loss': 1.6341, 'grad_norm': 4.124421119689941, 'learning_rate': 9.993784606094612e-05, 'epoch': 1.14}\n","{'loss': 1.6547, 'grad_norm': 2.851663589477539, 'learning_rate': 9.992134075089084e-05, 'epoch': 1.16}\n","{'loss': 1.5286, 'grad_norm': 2.5066659450531006, 'learning_rate': 9.99028957902266e-05, 'epoch': 1.18}\n","{'loss': 1.4801, 'grad_norm': 2.078930139541626, 'learning_rate': 9.988251189561645e-05, 'epoch': 1.2}\n","{'loss': 1.6318, 'grad_norm': 3.086003065109253, 'learning_rate': 9.986018985905901e-05, 'epoch': 1.21}\n","{'loss': 1.5591, 'grad_norm': 3.057227849960327, 'learning_rate': 9.983593054785776e-05, 'epoch': 1.23}\n","{'loss': 1.6401, 'grad_norm': 3.679922342300415, 'learning_rate': 9.980973490458728e-05, 'epoch': 1.25}\n","{'loss': 1.6262, 'grad_norm': 3.8075058460235596, 'learning_rate': 9.978160394705668e-05, 'epoch': 1.27}\n","{'loss': 1.7599, 'grad_norm': 3.5445713996887207, 'learning_rate': 9.975153876827008e-05, 'epoch': 1.29}\n","{'loss': 1.6814, 'grad_norm': 2.6588189601898193, 'learning_rate': 9.971954053638399e-05, 'epoch': 1.3}\n","{'loss': 1.6972, 'grad_norm': 2.6084141731262207, 'learning_rate': 9.968561049466214e-05, 'epoch': 1.32}\n","{'loss': 1.6675, 'grad_norm': 3.312152147293091, 'learning_rate': 9.964974996142698e-05, 'epoch': 1.34}\n","{'loss': 1.4381, 'grad_norm': 3.4132375717163086, 'learning_rate': 9.961196033000861e-05, 'epoch': 1.36}\n","{'loss': 1.6732, 'grad_norm': 3.6682002544403076, 'learning_rate': 9.957224306869053e-05, 'epoch': 1.37}\n","{'loss': 1.5185, 'grad_norm': 4.421182155609131, 'learning_rate': 9.953059972065265e-05, 'epoch': 1.39}\n","{'loss': 1.3911, 'grad_norm': 2.5544440746307373, 'learning_rate': 9.948703190391131e-05, 'epoch': 1.41}\n","{'loss': 1.6939, 'grad_norm': 3.4235222339630127, 'learning_rate': 9.944154131125642e-05, 'epoch': 1.43}\n","{'loss': 1.5821, 'grad_norm': 3.2818450927734375, 'learning_rate': 9.939412971018574e-05, 'epoch': 1.45}\n","{'loss': 1.551, 'grad_norm': 3.252692461013794, 'learning_rate': 9.934479894283606e-05, 'epoch': 1.46}\n","{'loss': 1.7187, 'grad_norm': 2.9500677585601807, 'learning_rate': 9.92935509259118e-05, 'epoch': 1.48}\n","{'loss': 1.5578, 'grad_norm': 3.451415538787842, 'learning_rate': 9.924038765061042e-05, 'epoch': 1.5}\n","{'loss': 1.4891, 'grad_norm': 2.3982598781585693, 'learning_rate': 9.918531118254507e-05, 'epoch': 1.52}\n","{'loss': 1.6728, 'grad_norm': 3.524627685546875, 'learning_rate': 9.912832366166442e-05, 'epoch': 1.54}\n","{'loss': 1.6112, 'grad_norm': 3.316537857055664, 'learning_rate': 9.906942730216939e-05, 'epoch': 1.55}\n","{'loss': 1.547, 'grad_norm': 2.789212465286255, 'learning_rate': 9.900862439242719e-05, 'epoch': 1.57}\n","{'loss': 1.6212, 'grad_norm': 3.1522133350372314, 'learning_rate': 9.894591729488242e-05, 'epoch': 1.59}\n","{'loss': 1.7589, 'grad_norm': 2.6350767612457275, 'learning_rate': 9.888130844596524e-05, 'epoch': 1.61}\n","{'loss': 1.5101, 'grad_norm': 2.931504487991333, 'learning_rate': 9.881480035599667e-05, 'epoch': 1.62}\n","{'loss': 1.6024, 'grad_norm': 2.5779600143432617, 'learning_rate': 9.874639560909117e-05, 'epoch': 1.64}\n","{'loss': 1.5994, 'grad_norm': 3.0192410945892334, 'learning_rate': 9.867609686305617e-05, 'epoch': 1.66}\n","{'loss': 1.5899, 'grad_norm': 2.50893497467041, 'learning_rate': 9.860390684928873e-05, 'epoch': 1.68}\n","{'loss': 1.5526, 'grad_norm': 3.570330858230591, 'learning_rate': 9.852982837266955e-05, 'epoch': 1.7}\n","{'loss': 1.5617, 'grad_norm': 4.337871074676514, 'learning_rate': 9.84538643114539e-05, 'epoch': 1.71}\n","{'loss': 1.5299, 'grad_norm': 2.3411428928375244, 'learning_rate': 9.837601761715983e-05, 'epoch': 1.73}\n","{'loss': 1.6652, 'grad_norm': 2.955780029296875, 'learning_rate': 9.829629131445342e-05, 'epoch': 1.75}\n","{'loss': 1.651, 'grad_norm': 2.441587209701538, 'learning_rate': 9.82146885010314e-05, 'epoch': 1.77}\n","{'loss': 1.5477, 'grad_norm': 2.947199821472168, 'learning_rate': 9.81312123475006e-05, 'epoch': 1.78}\n","{'loss': 1.5604, 'grad_norm': 2.740534543991089, 'learning_rate': 9.804586609725499e-05, 'epoch': 1.8}\n","{'loss': 1.5216, 'grad_norm': 2.7406256198883057, 'learning_rate': 9.79586530663494e-05, 'epoch': 1.82}\n","{'loss': 1.4901, 'grad_norm': 2.576497793197632, 'learning_rate': 9.78695766433709e-05, 'epoch': 1.84}\n","{'loss': 1.6326, 'grad_norm': 2.4222359657287598, 'learning_rate': 9.777864028930705e-05, 'epoch': 1.86}\n","{'loss': 1.4982, 'grad_norm': 3.2682604789733887, 'learning_rate': 9.768584753741134e-05, 'epoch': 1.87}\n","{'loss': 1.5688, 'grad_norm': 2.756934642791748, 'learning_rate': 9.759120199306613e-05, 'epoch': 1.89}\n","{'loss': 1.6835, 'grad_norm': 3.1586759090423584, 'learning_rate': 9.74947073336423e-05, 'epoch': 1.91}\n","{'loss': 1.7065, 'grad_norm': 3.218165874481201, 'learning_rate': 9.73963673083566e-05, 'epoch': 1.93}\n","{'loss': 1.6155, 'grad_norm': 2.732252836227417, 'learning_rate': 9.72961857381258e-05, 'epoch': 1.95}\n","{'loss': 1.5021, 'grad_norm': 2.702173948287964, 'learning_rate': 9.719416651541839e-05, 'epoch': 1.96}\n","{'loss': 1.6002, 'grad_norm': 2.3407227993011475, 'learning_rate': 9.709031360410318e-05, 'epoch': 1.98}\n","{'loss': 1.5955, 'grad_norm': 3.0833232402801514, 'learning_rate': 9.698463103929542e-05, 'epoch': 2.0}\n"," 20%|โโโโโโโโ | 1120/5600 [30:57<2:03:57, 1.66s/it][INFO|trainer.py:3788] 2024-06-30 06:47:32,631 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 06:47:32,631 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 06:47:32,631 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 28.46it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:01, 21.98it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 21.50it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 20.72it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 20.55it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 20.31it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 20.05it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 19.50it/s]\u001b[A\n"," 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:00, 19.98it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 19.97it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 19.91it/s]\u001b[A\n"," 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 19.41it/s]\u001b[A\n"," 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:01<00:00, 18.75it/s]\u001b[A\n"," 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:01<00:00, 18.79it/s]\u001b[A\n"," 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:02<00:00, 18.91it/s]\u001b[A\n"," 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:02<00:00, 18.56it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 1.6952180862426758, 'eval_runtime': 2.3895, 'eval_samples_per_second': 19.251, 'eval_steps_per_second': 19.251, 'epoch': 2.0}\n"," 20%|โโโโโโโโ | 1120/5600 [30:59<2:03:57, 1.66s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 18.53it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 06:47:35,021 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-1120\n","[INFO|configuration_utils.py:733] 2024-06-30 06:47:35,643 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 06:47:35,644 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 06:47:35,688 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1120/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 06:47:35,688 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1120/special_tokens_map.json\n","{'loss': 1.2986, 'grad_norm': 6.459508895874023, 'learning_rate': 9.687712292719997e-05, 'epoch': 2.02}\n","{'loss': 1.1686, 'grad_norm': 2.6047580242156982, 'learning_rate': 9.67677934449517e-05, 'epoch': 2.03}\n","{'loss': 1.2613, 'grad_norm': 4.400974273681641, 'learning_rate': 9.665664684045333e-05, 'epoch': 2.05}\n","{'loss': 1.1817, 'grad_norm': 3.368881940841675, 'learning_rate': 9.654368743221022e-05, 'epoch': 2.07}\n","{'loss': 1.1255, 'grad_norm': 2.8357393741607666, 'learning_rate': 9.642891960916268e-05, 'epoch': 2.09}\n","{'loss': 1.2003, 'grad_norm': 2.8627848625183105, 'learning_rate': 9.631234783051544e-05, 'epoch': 2.11}\n","{'loss': 1.3588, 'grad_norm': 4.006772041320801, 'learning_rate': 9.619397662556435e-05, 'epoch': 2.12}\n","{'loss': 1.1791, 'grad_norm': 3.8697452545166016, 'learning_rate': 9.607381059352038e-05, 'epoch': 2.14}\n","{'loss': 1.1847, 'grad_norm': 4.039665222167969, 'learning_rate': 9.595185440333103e-05, 'epoch': 2.16}\n","{'loss': 1.1875, 'grad_norm': 4.559266567230225, 'learning_rate': 9.582811279349882e-05, 'epoch': 2.18}\n","{'loss': 1.2245, 'grad_norm': 3.3498127460479736, 'learning_rate': 9.570259057189717e-05, 'epoch': 2.2}\n","{'loss': 1.2361, 'grad_norm': 4.742955684661865, 'learning_rate': 9.557529261558367e-05, 'epoch': 2.21}\n","{'loss': 1.2929, 'grad_norm': 5.568743705749512, 'learning_rate': 9.544622387061055e-05, 'epoch': 2.23}\n","{'loss': 1.2945, 'grad_norm': 5.399260997772217, 'learning_rate': 9.53153893518325e-05, 'epoch': 2.25}\n","{'loss': 1.1725, 'grad_norm': 3.5391037464141846, 'learning_rate': 9.518279414271183e-05, 'epoch': 2.27}\n","{'loss': 1.1956, 'grad_norm': 4.30355978012085, 'learning_rate': 9.504844339512095e-05, 'epoch': 2.28}\n","{'loss': 1.2378, 'grad_norm': 3.2837555408477783, 'learning_rate': 9.491234232914221e-05, 'epoch': 2.3}\n","{'loss': 1.2634, 'grad_norm': 4.105693340301514, 'learning_rate': 9.477449623286505e-05, 'epoch': 2.32}\n","{'loss': 1.3589, 'grad_norm': 3.694589614868164, 'learning_rate': 9.463491046218058e-05, 'epoch': 2.34}\n","{'loss': 1.3134, 'grad_norm': 3.689924716949463, 'learning_rate': 9.449359044057345e-05, 'epoch': 2.36}\n","{'loss': 1.2572, 'grad_norm': 4.2927374839782715, 'learning_rate': 9.435054165891109e-05, 'epoch': 2.37}\n","{'loss': 1.4522, 'grad_norm': 4.005749225616455, 'learning_rate': 9.420576967523049e-05, 'epoch': 2.39}\n","{'loss': 1.343, 'grad_norm': 4.006478309631348, 'learning_rate': 9.405928011452211e-05, 'epoch': 2.41}\n","{'loss': 1.181, 'grad_norm': 4.455829620361328, 'learning_rate': 9.391107866851143e-05, 'epoch': 2.43}\n","{'loss': 1.2442, 'grad_norm': 3.436230421066284, 'learning_rate': 9.376117109543769e-05, 'epoch': 2.45}\n","{'loss': 1.2157, 'grad_norm': 3.515488386154175, 'learning_rate': 9.360956321983028e-05, 'epoch': 2.46}\n","{'loss': 1.2723, 'grad_norm': 3.4698567390441895, 'learning_rate': 9.345626093228233e-05, 'epoch': 2.48}\n","{'loss': 1.3747, 'grad_norm': 4.542730808258057, 'learning_rate': 9.330127018922194e-05, 'epoch': 2.5}\n","{'loss': 1.2685, 'grad_norm': 3.6365323066711426, 'learning_rate': 9.314459701268065e-05, 'epoch': 2.52}\n","{'loss': 1.2574, 'grad_norm': 3.8041131496429443, 'learning_rate': 9.298624749005951e-05, 'epoch': 2.53}\n","{'loss': 1.3031, 'grad_norm': 3.81734037399292, 'learning_rate': 9.282622777389258e-05, 'epoch': 2.55}\n","{'loss': 1.168, 'grad_norm': 4.677352428436279, 'learning_rate': 9.266454408160779e-05, 'epoch': 2.57}\n","{'loss': 1.3771, 'grad_norm': 5.038273811340332, 'learning_rate': 9.250120269528546e-05, 'epoch': 2.59}\n","{'loss': 1.2421, 'grad_norm': 5.5514702796936035, 'learning_rate': 9.233620996141421e-05, 'epoch': 2.61}\n","{'loss': 1.2833, 'grad_norm': 3.1367263793945312, 'learning_rate': 9.21695722906443e-05, 'epoch': 2.62}\n","{'loss': 1.2831, 'grad_norm': 2.603522539138794, 'learning_rate': 9.200129615753859e-05, 'epoch': 2.64}\n","{'loss': 1.2421, 'grad_norm': 3.707820177078247, 'learning_rate': 9.183138810032099e-05, 'epoch': 2.66}\n","{'loss': 1.3674, 'grad_norm': 3.9344961643218994, 'learning_rate': 9.165985472062246e-05, 'epoch': 2.68}\n","{'loss': 1.1452, 'grad_norm': 4.652283668518066, 'learning_rate': 9.148670268322438e-05, 'epoch': 2.7}\n","{'loss': 1.1737, 'grad_norm': 4.732541084289551, 'learning_rate': 9.131193871579975e-05, 'epoch': 2.71}\n","{'loss': 1.4043, 'grad_norm': 3.7013778686523438, 'learning_rate': 9.113556960865167e-05, 'epoch': 2.73}\n","{'loss': 1.334, 'grad_norm': 3.8859188556671143, 'learning_rate': 9.09576022144496e-05, 'epoch': 2.75}\n","{'loss': 1.2964, 'grad_norm': 3.6818110942840576, 'learning_rate': 9.077804344796302e-05, 'epoch': 2.77}\n","{'loss': 1.3015, 'grad_norm': 3.5502216815948486, 'learning_rate': 9.059690028579284e-05, 'epoch': 2.78}\n","{'loss': 1.1433, 'grad_norm': 3.0337369441986084, 'learning_rate': 9.041417976610027e-05, 'epoch': 2.8}\n","{'loss': 1.2503, 'grad_norm': 3.4227890968322754, 'learning_rate': 9.022988898833342e-05, 'epoch': 2.82}\n","{'loss': 1.2781, 'grad_norm': 3.566080093383789, 'learning_rate': 9.004403511295141e-05, 'epoch': 2.84}\n","{'loss': 1.2557, 'grad_norm': 4.064306735992432, 'learning_rate': 8.985662536114613e-05, 'epoch': 2.86}\n","{'loss': 1.4121, 'grad_norm': 3.106153726577759, 'learning_rate': 8.966766701456177e-05, 'epoch': 2.87}\n","{'loss': 1.2789, 'grad_norm': 3.873041868209839, 'learning_rate': 8.947716741501177e-05, 'epoch': 2.89}\n","{'loss': 1.2759, 'grad_norm': 3.9415042400360107, 'learning_rate': 8.928513396419368e-05, 'epoch': 2.91}\n","{'loss': 1.2078, 'grad_norm': 3.456357002258301, 'learning_rate': 8.90915741234015e-05, 'epoch': 2.93}\n","{'loss': 1.3886, 'grad_norm': 3.5346779823303223, 'learning_rate': 8.889649541323574e-05, 'epoch': 2.95}\n","{'loss': 1.33, 'grad_norm': 3.6706087589263916, 'learning_rate': 8.869990541331138e-05, 'epoch': 2.96}\n","{'loss': 1.2564, 'grad_norm': 4.235021591186523, 'learning_rate': 8.850181176196315e-05, 'epoch': 2.98}\n","{'loss': 1.3518, 'grad_norm': 3.6379354000091553, 'learning_rate': 8.83022221559489e-05, 'epoch': 3.0}\n"," 30%|โโโโโโโโโโโ | 1680/5600 [46:41<1:48:23, 1.66s/it][INFO|trainer.py:3788] 2024-06-30 07:03:16,574 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 07:03:16,574 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 07:03:16,574 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 28.95it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:01, 21.09it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 19.49it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 19.05it/s]\u001b[A\n"," 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 18.97it/s]\u001b[A\n"," 35%|โโโโโโโโโโโโโโโ | 16/46 [00:00<00:01, 18.52it/s]\u001b[A\n"," 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:00<00:01, 19.03it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 19.12it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 19.40it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 19.15it/s]\u001b[A\n"," 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:00, 19.26it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 19.36it/s]\u001b[A\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 19.43it/s]\u001b[A\n"," 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 18.16it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 18.30it/s]\u001b[A\n"," 83%|โโโ๏ฟฝ๏ฟฝ๏ฟฝโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 18.69it/s]\u001b[A\n"," 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 18.74it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 19.09it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 1.7901949882507324, 'eval_runtime': 2.4532, 'eval_samples_per_second': 18.751, 'eval_steps_per_second': 18.751, 'epoch': 3.0}\n"," 30%|โโโโโโโโโโโ | 1680/5600 [46:43<1:48:23, 1.66s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 18.83it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 07:03:19,028 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-1680\n","[INFO|configuration_utils.py:733] 2024-06-30 07:03:19,590 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 07:03:19,590 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 07:03:19,633 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1680/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 07:03:19,633 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1680/special_tokens_map.json\n","{'loss': 1.15, 'grad_norm': 3.3373215198516846, 'learning_rate': 8.810114435015054e-05, 'epoch': 3.02}\n","{'loss': 0.8154, 'grad_norm': 4.1678571701049805, 'learning_rate': 8.789858615727265e-05, 'epoch': 3.03}\n","{'loss': 1.0096, 'grad_norm': 7.9504194259643555, 'learning_rate': 8.7694555447539e-05, 'epoch': 3.05}\n","{'loss': 0.9017, 'grad_norm': 3.666703462600708, 'learning_rate': 8.748906014838672e-05, 'epoch': 3.07}\n","{'loss': 0.8374, 'grad_norm': 3.569261312484741, 'learning_rate': 8.728210824415827e-05, 'epoch': 3.09}\n","{'loss': 0.9234, 'grad_norm': 5.156108856201172, 'learning_rate': 8.707370777579133e-05, 'epoch': 3.11}\n","{'loss': 0.8894, 'grad_norm': 3.83931827545166, 'learning_rate': 8.68638668405062e-05, 'epoch': 3.12}\n","{'loss': 0.9901, 'grad_norm': 4.340090274810791, 'learning_rate': 8.665259359149132e-05, 'epoch': 3.14}\n","{'loss': 0.8987, 'grad_norm': 5.530636310577393, 'learning_rate': 8.643989623758643e-05, 'epoch': 3.16}\n","{'loss': 0.94, 'grad_norm': 4.701400279998779, 'learning_rate': 8.622578304296364e-05, 'epoch': 3.18}\n","{'loss': 0.9471, 'grad_norm': 5.912676811218262, 'learning_rate': 8.601026232680634e-05, 'epoch': 3.2}\n","{'loss': 0.8883, 'grad_norm': 5.244345188140869, 'learning_rate': 8.579334246298593e-05, 'epoch': 3.21}\n","{'loss': 1.1187, 'grad_norm': 3.5720531940460205, 'learning_rate': 8.557503187973651e-05, 'epoch': 3.23}\n","{'loss': 0.8993, 'grad_norm': 4.100275993347168, 'learning_rate': 8.535533905932738e-05, 'epoch': 3.25}\n","{'loss': 0.9287, 'grad_norm': 3.9435741901397705, 'learning_rate': 8.513427253773346e-05, 'epoch': 3.27}\n","{'loss': 0.8239, 'grad_norm': 4.083703994750977, 'learning_rate': 8.491184090430364e-05, 'epoch': 3.28}\n","{'loss': 1.0248, 'grad_norm': 4.739283084869385, 'learning_rate': 8.468805280142709e-05, 'epoch': 3.3}\n","{'loss': 1.0099, 'grad_norm': 4.6722493171691895, 'learning_rate': 8.446291692419736e-05, 'epoch': 3.32}\n","{'loss': 0.9861, 'grad_norm': 3.677231550216675, 'learning_rate': 8.423644202007467e-05, 'epoch': 3.34}\n","{'loss': 0.9352, 'grad_norm': 3.738945245742798, 'learning_rate': 8.400863688854597e-05, 'epoch': 3.36}\n","{'loss': 1.0084, 'grad_norm': 4.615973949432373, 'learning_rate': 8.377951038078302e-05, 'epoch': 3.37}\n","{'loss': 0.8946, 'grad_norm': 4.280567169189453, 'learning_rate': 8.354907139929851e-05, 'epoch': 3.39}\n","{'loss': 0.9536, 'grad_norm': 5.548139572143555, 'learning_rate': 8.33173288976002e-05, 'epoch': 3.41}\n","{'loss': 1.032, 'grad_norm': 4.183009147644043, 'learning_rate': 8.308429187984297e-05, 'epoch': 3.43}\n","{'loss': 0.9905, 'grad_norm': 4.598621368408203, 'learning_rate': 8.284996940047903e-05, 'epoch': 3.44}\n","{'loss': 1.0168, 'grad_norm': 3.7102458477020264, 'learning_rate': 8.261437056390606e-05, 'epoch': 3.46}\n","{'loss': 0.9419, 'grad_norm': 3.9970738887786865, 'learning_rate': 8.237750452411353e-05, 'epoch': 3.48}\n","{'loss': 0.945, 'grad_norm': 5.531300067901611, 'learning_rate': 8.213938048432697e-05, 'epoch': 3.5}\n","{'loss': 0.8867, 'grad_norm': 5.528501510620117, 'learning_rate': 8.190000769665044e-05, 'epoch': 3.52}\n","{'loss': 0.9773, 'grad_norm': 5.0458807945251465, 'learning_rate': 8.1659395461707e-05, 'epoch': 3.53}\n","{'loss': 0.9484, 'grad_norm': 7.089639663696289, 'learning_rate': 8.141755312827736e-05, 'epoch': 3.55}\n","{'loss': 1.0592, 'grad_norm': 5.28053617477417, 'learning_rate': 8.117449009293668e-05, 'epoch': 3.57}\n","{'loss': 0.8727, 'grad_norm': 3.750885009765625, 'learning_rate': 8.093021579968941e-05, 'epoch': 3.59}\n","{'loss': 0.9438, 'grad_norm': 3.9763479232788086, 'learning_rate': 8.068473973960238e-05, 'epoch': 3.61}\n","{'loss': 1.0057, 'grad_norm': 8.926958084106445, 'learning_rate': 8.043807145043604e-05, 'epoch': 3.62}\n","{'loss': 0.9272, 'grad_norm': 4.707141399383545, 'learning_rate': 8.019022051627388e-05, 'epoch': 3.64}\n","{'loss': 0.9354, 'grad_norm': 4.845958232879639, 'learning_rate': 7.994119656715002e-05, 'epoch': 3.66}\n","{'loss': 1.0041, 'grad_norm': 6.272175312042236, 'learning_rate': 7.969100927867507e-05, 'epoch': 3.68}\n","{'loss': 1.0257, 'grad_norm': 5.634955883026123, 'learning_rate': 7.943966837166023e-05, 'epoch': 3.69}\n","{'loss': 1.0411, 'grad_norm': 4.726901054382324, 'learning_rate': 7.91871836117395e-05, 'epoch': 3.71}\n","{'loss': 0.8919, 'grad_norm': 5.341351509094238, 'learning_rate': 7.89335648089903e-05, 'epoch': 3.73}\n","{'loss': 0.9918, 'grad_norm': 4.697306156158447, 'learning_rate': 7.86788218175523e-05, 'epoch': 3.75}\n","{'loss': 1.0214, 'grad_norm': 7.20255708694458, 'learning_rate': 7.842296453524463e-05, 'epoch': 3.77}\n","{'loss': 0.8907, 'grad_norm': 4.981348037719727, 'learning_rate': 7.81660029031811e-05, 'epoch': 3.78}\n","{'loss': 0.9927, 'grad_norm': 4.630974292755127, 'learning_rate': 7.79079469053842e-05, 'epoch': 3.8}\n","{'loss': 0.9723, 'grad_norm': 4.9225921630859375, 'learning_rate': 7.764880656839696e-05, 'epoch': 3.82}\n","{'loss': 0.9968, 'grad_norm': 5.320995807647705, 'learning_rate': 7.738859196089358e-05, 'epoch': 3.84}\n","{'loss': 0.8093, 'grad_norm': 4.394636154174805, 'learning_rate': 7.712731319328798e-05, 'epoch': 3.86}\n","{'loss': 0.9058, 'grad_norm': 4.045576572418213, 'learning_rate': 7.68649804173412e-05, 'epoch': 3.87}\n","{'loss': 1.0048, 'grad_norm': 3.463576316833496, 'learning_rate': 7.660160382576683e-05, 'epoch': 3.89}\n","{'loss': 0.9774, 'grad_norm': 6.120863914489746, 'learning_rate': 7.633719365183504e-05, 'epoch': 3.91}\n","{'loss': 0.8715, 'grad_norm': 4.576050758361816, 'learning_rate': 7.60717601689749e-05, 'epoch': 3.93}\n","{'loss': 0.7799, 'grad_norm': 3.344226360321045, 'learning_rate': 7.580531369037533e-05, 'epoch': 3.94}\n","{'loss': 1.1199, 'grad_norm': 4.684515476226807, 'learning_rate': 7.553786456858429e-05, 'epoch': 3.96}\n","{'loss': 1.0056, 'grad_norm': 3.8074159622192383, 'learning_rate': 7.526942319510655e-05, 'epoch': 3.98}\n","{'loss': 0.8473, 'grad_norm': 3.2416229248046875, 'learning_rate': 7.500000000000001e-05, 'epoch': 4.0}\n"," 40%|โโโโโโโโโโโโโโ | 2240/5600 [1:02:28<1:32:15, 1.65s/it][INFO|trainer.py:3788] 2024-06-30 07:19:03,748 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 07:19:03,748 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 07:19:03,748 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 29.51it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:01, 22.92it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 21.24it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 20.58it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 20.33it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 19.87it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 19.88it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 19.79it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 19.64it/s]\u001b[A\n"," 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:00, 19.52it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 19.07it/s]\u001b[A\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 18.47it/s]\u001b[A\n"," 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 18.40it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 18.48it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 18.44it/s]\u001b[A\n"," 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 18.37it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 18.42it/s]\u001b[A\n"," 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 18.56it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 1.9558732509613037, 'eval_runtime': 2.4173, 'eval_samples_per_second': 19.03, 'eval_steps_per_second': 19.03, 'epoch': 4.0}\n"," 40%|โโโโโโโโโโโโโโ | 2240/5600 [1:02:31<1:32:15, 1.65s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 18.79it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 07:19:06,166 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-2240\n","[INFO|configuration_utils.py:733] 2024-06-30 07:19:06,736 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 07:19:06,736 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 07:19:06,775 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2240/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 07:19:06,776 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2240/special_tokens_map.json\n","{'loss': 0.7957, 'grad_norm': 4.7848052978515625, 'learning_rate': 7.472960545147038e-05, 'epoch': 4.02}\n","{'loss': 0.631, 'grad_norm': 4.0633649826049805, 'learning_rate': 7.445825005546448e-05, 'epoch': 4.03}\n","{'loss': 0.7215, 'grad_norm': 4.579054832458496, 'learning_rate': 7.4185944355262e-05, 'epoch': 4.05}\n","{'loss': 0.6143, 'grad_norm': 4.206972122192383, 'learning_rate': 7.391269893106592e-05, 'epoch': 4.07}\n","{'loss': 0.6206, 'grad_norm': 4.568854808807373, 'learning_rate': 7.363852439959135e-05, 'epoch': 4.09}\n","{'loss': 0.7532, 'grad_norm': 4.556376934051514, 'learning_rate': 7.33634314136531e-05, 'epoch': 4.11}\n","{'loss': 0.5763, 'grad_norm': 5.614088535308838, 'learning_rate': 7.308743066175172e-05, 'epoch': 4.12}\n","{'loss': 0.6721, 'grad_norm': 9.883162498474121, 'learning_rate': 7.281053286765815e-05, 'epoch': 4.14}\n","{'loss': 0.7292, 'grad_norm': 4.348223686218262, 'learning_rate': 7.253274878999727e-05, 'epoch': 4.16}\n","{'loss': 0.6231, 'grad_norm': 3.8863847255706787, 'learning_rate': 7.225408922182961e-05, 'epoch': 4.18}\n","{'loss': 0.6226, 'grad_norm': 8.888066291809082, 'learning_rate': 7.197456499023225e-05, 'epoch': 4.19}\n","{'loss': 0.6884, 'grad_norm': 4.336313247680664, 'learning_rate': 7.169418695587791e-05, 'epoch': 4.21}\n","{'loss': 0.6207, 'grad_norm': 4.8345112800598145, 'learning_rate': 7.141296601261314e-05, 'epoch': 4.23}\n","{'loss': 0.7374, 'grad_norm': 5.926130771636963, 'learning_rate': 7.113091308703498e-05, 'epoch': 4.25}\n","{'loss': 0.6514, 'grad_norm': 5.482864856719971, 'learning_rate': 7.084803913806641e-05, 'epoch': 4.27}\n","{'loss': 0.6823, 'grad_norm': 6.955780029296875, 'learning_rate': 7.056435515653059e-05, 'epoch': 4.28}\n","{'loss': 0.6645, 'grad_norm': 10.101131439208984, 'learning_rate': 7.027987216472377e-05, 'epoch': 4.3}\n","{'loss': 0.5592, 'grad_norm': 5.433172225952148, 'learning_rate': 6.999460121598704e-05, 'epoch': 4.32}\n","{'loss': 0.7167, 'grad_norm': 6.796948432922363, 'learning_rate': 6.970855339427698e-05, 'epoch': 4.34}\n","{'loss': 0.6169, 'grad_norm': 3.863734006881714, 'learning_rate': 6.942173981373474e-05, 'epoch': 4.36}\n","{'loss': 0.6398, 'grad_norm': 3.5763421058654785, 'learning_rate': 6.91341716182545e-05, 'epoch': 4.37}\n","{'loss': 0.7066, 'grad_norm': 4.580504894256592, 'learning_rate': 6.884585998105026e-05, 'epoch': 4.39}\n","{'loss': 0.647, 'grad_norm': 5.605465412139893, 'learning_rate': 6.855681610422189e-05, 'epoch': 4.41}\n","{'loss': 0.7305, 'grad_norm': 3.584751605987549, 'learning_rate': 6.826705121831976e-05, 'epoch': 4.43}\n","{'loss': 0.7089, 'grad_norm': 6.16217041015625, 'learning_rate': 6.797657658190839e-05, 'epoch': 4.44}\n","{'loss': 0.5937, 'grad_norm': 3.5875444412231445, 'learning_rate': 6.768540348112907e-05, 'epoch': 4.46}\n","{'loss': 0.7547, 'grad_norm': 4.757628917694092, 'learning_rate': 6.739354322926136e-05, 'epoch': 4.48}\n","{'loss': 0.6766, 'grad_norm': 5.012269020080566, 'learning_rate': 6.710100716628344e-05, 'epoch': 4.5}\n","{'loss': 0.6572, 'grad_norm': 5.2274861335754395, 'learning_rate': 6.680780665843155e-05, 'epoch': 4.52}\n","{'loss': 0.7324, 'grad_norm': 5.329851150512695, 'learning_rate': 6.651395309775837e-05, 'epoch': 4.53}\n","{'loss': 0.7048, 'grad_norm': 6.628935813903809, 'learning_rate': 6.621945790169036e-05, 'epoch': 4.55}\n","{'loss': 0.683, 'grad_norm': 5.611133575439453, 'learning_rate': 6.592433251258423e-05, 'epoch': 4.57}\n","{'loss': 0.678, 'grad_norm': 5.939394950866699, 'learning_rate': 6.562858839728223e-05, 'epoch': 4.59}\n","{'loss': 0.5917, 'grad_norm': 10.7606201171875, 'learning_rate': 6.533223704666672e-05, 'epoch': 4.61}\n","{'loss': 0.6841, 'grad_norm': 6.9346089363098145, 'learning_rate': 6.503528997521366e-05, 'epoch': 4.62}\n","{'loss': 0.7123, 'grad_norm': 5.5321364402771, 'learning_rate': 6.473775872054521e-05, 'epoch': 4.64}\n","{'loss': 0.6863, 'grad_norm': 4.588550567626953, 'learning_rate': 6.44396548429815e-05, 'epoch': 4.66}\n","{'loss': 0.6828, 'grad_norm': 4.912098407745361, 'learning_rate': 6.414098992509138e-05, 'epoch': 4.68}\n","{'loss': 0.6467, 'grad_norm': 7.303658485412598, 'learning_rate': 6.384177557124247e-05, 'epoch': 4.69}\n","{'loss': 0.6986, 'grad_norm': 4.651421546936035, 'learning_rate': 6.354202340715026e-05, 'epoch': 4.71}\n","{'loss': 0.6532, 'grad_norm': 4.812668800354004, 'learning_rate': 6.324174507942637e-05, 'epoch': 4.73}\n","{'loss': 0.6688, 'grad_norm': 4.208662509918213, 'learning_rate': 6.294095225512603e-05, 'epoch': 4.75}\n","{'loss': 0.674, 'grad_norm': 5.573670387268066, 'learning_rate': 6.263965662129487e-05, 'epoch': 4.77}\n","{'loss': 0.7383, 'grad_norm': 4.292681694030762, 'learning_rate': 6.233786988451468e-05, 'epoch': 4.78}\n","{'loss': 0.7485, 'grad_norm': 4.01066255569458, 'learning_rate': 6.203560377044866e-05, 'epoch': 4.8}\n","{'loss': 0.6504, 'grad_norm': 4.865781307220459, 'learning_rate': 6.173287002338577e-05, 'epoch': 4.82}\n","{'loss': 0.6004, 'grad_norm': 3.7839431762695312, 'learning_rate': 6.142968040578449e-05, 'epoch': 4.84}\n","{'loss': 0.6536, 'grad_norm': 7.742762565612793, 'learning_rate': 6.112604669781572e-05, 'epoch': 4.85}\n","{'loss': 0.7322, 'grad_norm': 5.1467719078063965, 'learning_rate': 6.0821980696905146e-05, 'epoch': 4.87}\n","{'loss': 0.7393, 'grad_norm': 6.317329406738281, 'learning_rate': 6.0517494217274794e-05, 'epoch': 4.89}\n","{'loss': 0.7548, 'grad_norm': 5.456260681152344, 'learning_rate': 6.021259908948402e-05, 'epoch': 4.91}\n","{'loss': 0.6722, 'grad_norm': 5.673567771911621, 'learning_rate': 5.9907307159969884e-05, 'epoch': 4.93}\n","{'loss': 0.7384, 'grad_norm': 4.8718366622924805, 'learning_rate': 5.960163029058682e-05, 'epoch': 4.94}\n","{'loss': 0.7634, 'grad_norm': 5.771657943725586, 'learning_rate': 5.9295580358145744e-05, 'epoch': 4.96}\n","{'loss': 0.7871, 'grad_norm': 4.919590473175049, 'learning_rate': 5.898916925395264e-05, 'epoch': 4.98}\n","{'loss': 0.7701, 'grad_norm': 4.445159912109375, 'learning_rate': 5.868240888334653e-05, 'epoch': 5.0}\n"," 50%|โโโโโโโโโโโโโโโโโโ | 2800/5600 [1:18:09<1:20:41, 1.73s/it][INFO|trainer.py:3788] 2024-06-30 07:34:44,634 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 07:34:44,634 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 07:34:44,634 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 24.62it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:02, 19.70it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 19.03it/s]\u001b[A\n"," 24%|โโโโโโโโโโโ | 11/46 [00:00<00:01, 18.15it/s]\u001b[A\n"," 28%|โโโโโโโโโโโโโ | 13/46 [00:00<00:01, 17.79it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 17.92it/s]\u001b[A\n"," 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 17.64it/s]\u001b[A\n"," 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:01<00:01, 18.03it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 17.86it/s]\u001b[A\n"," 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:01<00:01, 18.04it/s]\u001b[A\n"," 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:01<00:01, 17.74it/s]\u001b[A\n"," 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:01, 17.80it/s]\u001b[A\n"," 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 17.87it/s]\u001b[A\n"," 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 17.65it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 17.19it/s]\u001b[A\n"," 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 17.14it/s]\u001b[A\n"," 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:02<00:00, 17.11it/s]\u001b[A\n"," 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:02<00:00, 17.21it/s]\u001b[A\n"," 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:02<00:00, 17.35it/s]\u001b[A\n"," 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:02<00:00, 17.38it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.203040599822998, 'eval_runtime': 2.649, 'eval_samples_per_second': 17.365, 'eval_steps_per_second': 17.365, 'epoch': 5.0}\n"," 50%|โโโโโโโโโโโโโโโโโโ | 2800/5600 [1:18:12<1:20:41, 1.73s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 17.26it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 07:34:47,283 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-2800\n","[INFO|configuration_utils.py:733] 2024-06-30 07:34:47,846 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 07:34:47,846 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 07:34:47,887 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2800/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 07:34:47,887 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2800/special_tokens_map.json\n","{'loss': 0.5161, 'grad_norm': 3.8078582286834717, 'learning_rate': 5.837531116523682e-05, 'epoch': 5.02}\n","{'loss': 0.4045, 'grad_norm': 4.426279544830322, 'learning_rate': 5.806788803164034e-05, 'epoch': 5.03}\n","{'loss': 0.4832, 'grad_norm': 6.388131618499756, 'learning_rate': 5.7760151427217576e-05, 'epoch': 5.05}\n","{'loss': 0.4554, 'grad_norm': 4.689113616943359, 'learning_rate': 5.745211330880872e-05, 'epoch': 5.07}\n","{'loss': 0.4532, 'grad_norm': 4.104332447052002, 'learning_rate': 5.714378564496901e-05, 'epoch': 5.09}\n","{'loss': 0.5509, 'grad_norm': 4.345515727996826, 'learning_rate': 5.683518041550368e-05, 'epoch': 5.1}\n","{'loss': 0.4002, 'grad_norm': 6.301547527313232, 'learning_rate': 5.6526309611002594e-05, 'epoch': 5.12}\n","{'loss': 0.4822, 'grad_norm': 5.300792217254639, 'learning_rate': 5.621718523237427e-05, 'epoch': 5.14}\n","{'loss': 0.4324, 'grad_norm': 4.0373311042785645, 'learning_rate': 5.590781929037965e-05, 'epoch': 5.16}\n","{'loss': 0.4274, 'grad_norm': 6.742273330688477, 'learning_rate': 5.559822380516539e-05, 'epoch': 5.18}\n","{'loss': 0.494, 'grad_norm': 6.803271293640137, 'learning_rate': 5.5288410805796895e-05, 'epoch': 5.19}\n","{'loss': 0.4682, 'grad_norm': 3.1775426864624023, 'learning_rate': 5.497839232979084e-05, 'epoch': 5.21}\n","{'loss': 0.4614, 'grad_norm': 3.7366745471954346, 'learning_rate': 5.466818042264753e-05, 'epoch': 5.23}\n","{'loss': 0.5448, 'grad_norm': 5.096468448638916, 'learning_rate': 5.435778713738292e-05, 'epoch': 5.25}\n","{'loss': 0.4847, 'grad_norm': 4.3523712158203125, 'learning_rate': 5.404722453406017e-05, 'epoch': 5.27}\n","{'loss': 0.4473, 'grad_norm': 4.652655601501465, 'learning_rate': 5.373650467932122e-05, 'epoch': 5.28}\n","{'loss': 0.4453, 'grad_norm': 4.760082244873047, 'learning_rate': 5.3425639645917834e-05, 'epoch': 5.3}\n","{'loss': 0.4814, 'grad_norm': 5.638540267944336, 'learning_rate': 5.311464151224261e-05, 'epoch': 5.32}\n","{'loss': 0.4013, 'grad_norm': 3.9371888637542725, 'learning_rate': 5.2803522361859594e-05, 'epoch': 5.34}\n","{'loss': 0.4984, 'grad_norm': 4.2124152183532715, 'learning_rate': 5.249229428303486e-05, 'epoch': 5.35}\n","{'loss': 0.4914, 'grad_norm': 6.735795974731445, 'learning_rate': 5.218096936826681e-05, 'epoch': 5.37}\n","{'loss': 0.464, 'grad_norm': 4.825798988342285, 'learning_rate': 5.18695597138163e-05, 'epoch': 5.39}\n","{'loss': 0.3474, 'grad_norm': 4.686152458190918, 'learning_rate': 5.155807741923666e-05, 'epoch': 5.41}\n","{'loss': 0.3999, 'grad_norm': 4.344501972198486, 'learning_rate': 5.124653458690365e-05, 'epoch': 5.43}\n","{'loss': 0.3818, 'grad_norm': 3.8981587886810303, 'learning_rate': 5.0934943321545115e-05, 'epoch': 5.44}\n","{'loss': 0.4732, 'grad_norm': 8.811891555786133, 'learning_rate': 5.062331572977076e-05, 'epoch': 5.46}\n","{'loss': 0.609, 'grad_norm': 4.967749118804932, 'learning_rate': 5.031166391960168e-05, 'epoch': 5.48}\n","{'loss': 0.4442, 'grad_norm': 4.958866596221924, 'learning_rate': 5e-05, 'epoch': 5.5}\n","{'loss': 0.4474, 'grad_norm': 4.941844940185547, 'learning_rate': 4.968833608039832e-05, 'epoch': 5.52}\n","{'loss': 0.5222, 'grad_norm': 4.754947662353516, 'learning_rate': 4.9376684270229254e-05, 'epoch': 5.53}\n","{'loss': 0.4465, 'grad_norm': 4.058730125427246, 'learning_rate': 4.9065056678454904e-05, 'epoch': 5.55}\n","{'loss': 0.5767, 'grad_norm': 5.571474552154541, 'learning_rate': 4.875346541309637e-05, 'epoch': 5.57}\n","{'loss': 0.4737, 'grad_norm': 4.0056939125061035, 'learning_rate': 4.844192258076336e-05, 'epoch': 5.59}\n","{'loss': 0.5223, 'grad_norm': 5.950839042663574, 'learning_rate': 4.813044028618373e-05, 'epoch': 5.6}\n","{'loss': 0.5301, 'grad_norm': 4.6719255447387695, 'learning_rate': 4.781903063173321e-05, 'epoch': 5.62}\n","{'loss': 0.4188, 'grad_norm': 4.333907127380371, 'learning_rate': 4.750770571696514e-05, 'epoch': 5.64}\n","{'loss': 0.4934, 'grad_norm': 6.121321678161621, 'learning_rate': 4.7196477638140404e-05, 'epoch': 5.66}\n","{'loss': 0.4343, 'grad_norm': 5.436617374420166, 'learning_rate': 4.68853584877574e-05, 'epoch': 5.68}\n","{'loss': 0.4969, 'grad_norm': 5.086023330688477, 'learning_rate': 4.657436035408217e-05, 'epoch': 5.69}\n","{'loss': 0.4571, 'grad_norm': 5.212259769439697, 'learning_rate': 4.626349532067879e-05, 'epoch': 5.71}\n","{'loss': 0.5086, 'grad_norm': 4.355545997619629, 'learning_rate': 4.595277546593984e-05, 'epoch': 5.73}\n","{'loss': 0.4502, 'grad_norm': 3.553330421447754, 'learning_rate': 4.564221286261709e-05, 'epoch': 5.75}\n","{'loss': 0.5377, 'grad_norm': 4.984807014465332, 'learning_rate': 4.5331819577352474e-05, 'epoch': 5.77}\n","{'loss': 0.4203, 'grad_norm': 10.004477500915527, 'learning_rate': 4.502160767020918e-05, 'epoch': 5.78}\n","{'loss': 0.4515, 'grad_norm': 4.771313190460205, 'learning_rate': 4.471158919420312e-05, 'epoch': 5.8}\n","{'loss': 0.4102, 'grad_norm': 3.963116407394409, 'learning_rate': 4.4401776194834613e-05, 'epoch': 5.82}\n","{'loss': 0.4378, 'grad_norm': 5.920322895050049, 'learning_rate': 4.409218070962036e-05, 'epoch': 5.84}\n","{'loss': 0.5433, 'grad_norm': 5.597177505493164, 'learning_rate': 4.378281476762576e-05, 'epoch': 5.85}\n","{'loss': 0.3711, 'grad_norm': 10.070011138916016, 'learning_rate': 4.347369038899744e-05, 'epoch': 5.87}\n","{'loss': 0.5813, 'grad_norm': 3.5711491107940674, 'learning_rate': 4.316481958449634e-05, 'epoch': 5.89}\n","{'loss': 0.4328, 'grad_norm': 4.168658256530762, 'learning_rate': 4.285621435503101e-05, 'epoch': 5.91}\n","{'loss': 0.5556, 'grad_norm': 10.734298706054688, 'learning_rate': 4.254788669119127e-05, 'epoch': 5.93}\n","{'loss': 0.4497, 'grad_norm': 4.482186794281006, 'learning_rate': 4.223984857278242e-05, 'epoch': 5.94}\n","{'loss': 0.5173, 'grad_norm': 5.7400054931640625, 'learning_rate': 4.1932111968359664e-05, 'epoch': 5.96}\n","{'loss': 0.5062, 'grad_norm': 4.264299392700195, 'learning_rate': 4.162468883476319e-05, 'epoch': 5.98}\n","{'loss': 0.4793, 'grad_norm': 9.265963554382324, 'learning_rate': 4.131759111665349e-05, 'epoch': 6.0}\n"," 60%|โโโโโโโโโโโโโโโโโโโโโ | 3360/5600 [1:33:57<1:02:06, 1.66s/it][INFO|trainer.py:3788] 2024-06-30 07:50:32,520 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 07:50:32,520 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 07:50:32,520 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 29.07it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:01, 23.12it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 20.27it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 20.10it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 20.29it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 19.71it/s]\u001b[A\n"," 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:01, 19.50it/s]\u001b[A\n"," 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:01<00:01, 18.02it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 17.97it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 18.23it/s]\u001b[A\n"," 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:00, 18.18it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 18.06it/s]\u001b[A\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 18.33it/s]\u001b[A\n"," 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 18.31it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 18.47it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 18.61it/s]\u001b[A\n"," 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 18.26it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 18.35it/s]\u001b[A\n"," 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 18.73it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.468099594116211, 'eval_runtime': 2.4745, 'eval_samples_per_second': 18.59, 'eval_steps_per_second': 18.59, 'epoch': 6.0}\n"," 60%|โโโโโโโโโโโโโโโโโโโโโ | 3360/5600 [1:33:59<1:02:06, 1.66s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 18.73it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 07:50:34,996 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-3360\n","[INFO|configuration_utils.py:733] 2024-06-30 07:50:35,897 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 07:50:35,897 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 07:50:35,949 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-3360/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 07:50:35,949 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-3360/special_tokens_map.json\n","{'loss': 0.3304, 'grad_norm': 3.7002038955688477, 'learning_rate': 4.101083074604737e-05, 'epoch': 6.02}\n","{'loss': 0.2771, 'grad_norm': 4.872511863708496, 'learning_rate': 4.0704419641854274e-05, 'epoch': 6.03}\n","{'loss': 0.297, 'grad_norm': 3.524137020111084, 'learning_rate': 4.03983697094132e-05, 'epoch': 6.05}\n","{'loss': 0.3305, 'grad_norm': 1.7784379720687866, 'learning_rate': 4.0092692840030134e-05, 'epoch': 6.07}\n","{'loss': 0.3112, 'grad_norm': 4.346820831298828, 'learning_rate': 3.978740091051599e-05, 'epoch': 6.09}\n","{'loss': 0.349, 'grad_norm': 3.382376194000244, 'learning_rate': 3.9482505782725224e-05, 'epoch': 6.1}\n","{'loss': 0.256, 'grad_norm': 5.693515300750732, 'learning_rate': 3.917801930309486e-05, 'epoch': 6.12}\n","{'loss': 0.3141, 'grad_norm': 4.295924186706543, 'learning_rate': 3.887395330218429e-05, 'epoch': 6.14}\n","{'loss': 0.3338, 'grad_norm': 6.968880653381348, 'learning_rate': 3.857031959421553e-05, 'epoch': 6.16}\n","{'loss': 0.3056, 'grad_norm': 5.941532135009766, 'learning_rate': 3.8267129976614254e-05, 'epoch': 6.18}\n","{'loss': 0.3486, 'grad_norm': 3.909396171569824, 'learning_rate': 3.7964396229551364e-05, 'epoch': 6.19}\n","{'loss': 0.3718, 'grad_norm': 3.2766306400299072, 'learning_rate': 3.7662130115485314e-05, 'epoch': 6.21}\n","{'loss': 0.3308, 'grad_norm': 2.8996589183807373, 'learning_rate': 3.7360343378705124e-05, 'epoch': 6.23}\n","{'loss': 0.3928, 'grad_norm': 3.5176424980163574, 'learning_rate': 3.705904774487396e-05, 'epoch': 6.25}\n","{'loss': 0.2392, 'grad_norm': 10.349823951721191, 'learning_rate': 3.675825492057364e-05, 'epoch': 6.27}\n","{'loss': 0.3343, 'grad_norm': 7.081973552703857, 'learning_rate': 3.6457976592849754e-05, 'epoch': 6.28}\n","{'loss': 0.3097, 'grad_norm': 4.772485733032227, 'learning_rate': 3.6158224428757535e-05, 'epoch': 6.3}\n","{'loss': 0.3416, 'grad_norm': 3.539324998855591, 'learning_rate': 3.585901007490863e-05, 'epoch': 6.32}\n","{'loss': 0.34, 'grad_norm': 3.7091081142425537, 'learning_rate': 3.556034515701852e-05, 'epoch': 6.34}\n","{'loss': 0.3241, 'grad_norm': 5.218664646148682, 'learning_rate': 3.5262241279454785e-05, 'epoch': 6.35}\n","{'loss': 0.297, 'grad_norm': 3.1589152812957764, 'learning_rate': 3.4964710024786354e-05, 'epoch': 6.37}\n","{'loss': 0.3073, 'grad_norm': 2.8222711086273193, 'learning_rate': 3.4667762953333295e-05, 'epoch': 6.39}\n","{'loss': 0.2632, 'grad_norm': 5.614787578582764, 'learning_rate': 3.4371411602717784e-05, 'epoch': 6.41}\n","{'loss': 0.3375, 'grad_norm': 3.49419903755188, 'learning_rate': 3.4075667487415785e-05, 'epoch': 6.43}\n","{'loss': 0.323, 'grad_norm': 3.6888363361358643, 'learning_rate': 3.3780542098309654e-05, 'epoch': 6.44}\n","{'loss': 0.384, 'grad_norm': 2.8714163303375244, 'learning_rate': 3.3486046902241664e-05, 'epoch': 6.46}\n","{'loss': 0.36, 'grad_norm': 3.664397716522217, 'learning_rate': 3.319219334156847e-05, 'epoch': 6.48}\n","{'loss': 0.3239, 'grad_norm': 6.702901840209961, 'learning_rate': 3.289899283371657e-05, 'epoch': 6.5}\n","{'loss': 0.4023, 'grad_norm': 4.371044158935547, 'learning_rate': 3.2606456770738636e-05, 'epoch': 6.51}\n","{'loss': 0.2967, 'grad_norm': 7.265868663787842, 'learning_rate': 3.231459651887093e-05, 'epoch': 6.53}\n","{'loss': 0.3192, 'grad_norm': 4.020201683044434, 'learning_rate': 3.2023423418091626e-05, 'epoch': 6.55}\n","{'loss': 0.2575, 'grad_norm': 4.1831374168396, 'learning_rate': 3.173294878168025e-05, 'epoch': 6.57}\n","{'loss': 0.4101, 'grad_norm': 3.9656155109405518, 'learning_rate': 3.1443183895778105e-05, 'epoch': 6.59}\n","{'loss': 0.3168, 'grad_norm': 4.220931053161621, 'learning_rate': 3.115414001894974e-05, 'epoch': 6.6}\n","{'loss': 0.3144, 'grad_norm': 5.018192768096924, 'learning_rate': 3.086582838174551e-05, 'epoch': 6.62}\n","{'loss': 0.3115, 'grad_norm': 5.038303852081299, 'learning_rate': 3.0578260186265265e-05, 'epoch': 6.64}\n","{'loss': 0.2704, 'grad_norm': 2.8466811180114746, 'learning_rate': 3.029144660572304e-05, 'epoch': 6.66}\n","{'loss': 0.3401, 'grad_norm': 6.789051055908203, 'learning_rate': 3.000539878401296e-05, 'epoch': 6.68}\n","{'loss': 0.3225, 'grad_norm': 4.522548198699951, 'learning_rate': 2.9720127835276256e-05, 'epoch': 6.69}\n","{'loss': 0.3306, 'grad_norm': 3.3021113872528076, 'learning_rate': 2.9435644843469436e-05, 'epoch': 6.71}\n","{'loss': 0.3113, 'grad_norm': 6.549985885620117, 'learning_rate': 2.9151960861933614e-05, 'epoch': 6.73}\n","{'loss': 0.3258, 'grad_norm': 5.234971523284912, 'learning_rate': 2.886908691296504e-05, 'epoch': 6.75}\n","{'loss': 0.3151, 'grad_norm': 11.139360427856445, 'learning_rate': 2.858703398738686e-05, 'epoch': 6.76}\n","{'loss': 0.2697, 'grad_norm': 4.061713695526123, 'learning_rate': 2.8305813044122097e-05, 'epoch': 6.78}\n","{'loss': 0.2933, 'grad_norm': 5.231247425079346, 'learning_rate': 2.8025435009767747e-05, 'epoch': 6.8}\n","{'loss': 0.3019, 'grad_norm': 4.183421611785889, 'learning_rate': 2.774591077817038e-05, 'epoch': 6.82}\n","{'loss': 0.3599, 'grad_norm': 3.7946386337280273, 'learning_rate': 2.746725121000273e-05, 'epoch': 6.84}\n","{'loss': 0.2529, 'grad_norm': 4.079517364501953, 'learning_rate': 2.718946713234185e-05, 'epoch': 6.85}\n","{'loss': 0.3049, 'grad_norm': 4.9091033935546875, 'learning_rate': 2.6912569338248315e-05, 'epoch': 6.87}\n","{'loss': 0.3128, 'grad_norm': 4.097899436950684, 'learning_rate': 2.66365685863469e-05, 'epoch': 6.89}\n","{'loss': 0.3901, 'grad_norm': 5.156275749206543, 'learning_rate': 2.636147560040866e-05, 'epoch': 6.91}\n","{'loss': 0.2444, 'grad_norm': 4.529239177703857, 'learning_rate': 2.6087301068934106e-05, 'epoch': 6.93}\n","{'loss': 0.3085, 'grad_norm': 2.783036470413208, 'learning_rate': 2.581405564473801e-05, 'epoch': 6.94}\n","{'loss': 0.3202, 'grad_norm': 5.854332447052002, 'learning_rate': 2.5541749944535554e-05, 'epoch': 6.96}\n","{'loss': 0.3932, 'grad_norm': 5.028692722320557, 'learning_rate': 2.527039454852963e-05, 'epoch': 6.98}\n","{'loss': 0.2907, 'grad_norm': 7.342068672180176, 'learning_rate': 2.500000000000001e-05, 'epoch': 7.0}\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 3920/5600 [1:49:46<47:02, 1.68s/it][INFO|trainer.py:3788] 2024-06-30 08:06:21,782 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 08:06:21,782 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 08:06:21,782 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 28.22it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:01, 20.48it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 19.88it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 19.66it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 19.39it/s]\u001b[A\n"," 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 19.14it/s]\u001b[A\n"," 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:00<00:01, 18.76it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 18.05it/s]\u001b[A\n"," 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:01<00:01, 18.18it/s]\u001b[A\n"," 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:01<00:01, 17.78it/s]\u001b[A\n"," 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:01, 17.79it/s]\u001b[A\n"," 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 18.33it/s]\u001b[A\n"," 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 17.54it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 17.87it/s]\u001b[A\n"," 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 17.98it/s]\u001b[A\n"," 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:01<00:00, 18.37it/s]\u001b[A\n"," 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:02<00:00, 17.48it/s]\u001b[A\n"," 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:02<00:00, 17.09it/s]\u001b[A\n"," 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ๏ฟฝ๏ฟฝ๏ฟฝโโโ | 43/46 [00:02<00:00, 17.19it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 2.79950213432312, 'eval_runtime': 2.5687, 'eval_samples_per_second': 17.908, 'eval_steps_per_second': 17.908, 'epoch': 7.0}\n"," 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 3920/5600 [1:49:49<47:02, 1.68s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 17.22it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 08:06:24,351 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-3920\n","[INFO|configuration_utils.py:733] 2024-06-30 08:06:24,921 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 08:06:24,921 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 08:06:24,969 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-3920/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 08:06:24,969 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-3920/special_tokens_map.json\n","{'loss': 0.173, 'grad_norm': 2.5581254959106445, 'learning_rate': 2.473057680489348e-05, 'epoch': 7.01}\n","{'loss': 0.155, 'grad_norm': 1.571964144706726, 'learning_rate': 2.4462135431415733e-05, 'epoch': 7.03}\n","{'loss': 0.1707, 'grad_norm': 4.100607872009277, 'learning_rate': 2.4194686309624663e-05, 'epoch': 7.05}\n","{'loss': 0.2566, 'grad_norm': 3.124080181121826, 'learning_rate': 2.39282398310251e-05, 'epoch': 7.07}\n","{'loss': 0.1927, 'grad_norm': 2.3269150257110596, 'learning_rate': 2.366280634816496e-05, 'epoch': 7.09}\n","{'loss': 0.2814, 'grad_norm': 2.976987838745117, 'learning_rate': 2.3398396174233178e-05, 'epoch': 7.1}\n","{'loss': 0.1816, 'grad_norm': 7.45166015625, 'learning_rate': 2.3135019582658802e-05, 'epoch': 7.12}\n","{'loss': 0.2094, 'grad_norm': 3.6462252140045166, 'learning_rate': 2.2872686806712035e-05, 'epoch': 7.14}\n","{'loss': 0.1964, 'grad_norm': 4.667559623718262, 'learning_rate': 2.261140803910644e-05, 'epoch': 7.16}\n","{'loss': 0.2011, 'grad_norm': 1.900875449180603, 'learning_rate': 2.235119343160303e-05, 'epoch': 7.18}\n","{'loss': 0.1984, 'grad_norm': 3.4627413749694824, 'learning_rate': 2.2092053094615813e-05, 'epoch': 7.19}\n","{'loss': 0.2214, 'grad_norm': 3.783445358276367, 'learning_rate': 2.1833997096818898e-05, 'epoch': 7.21}\n","{'loss': 0.2047, 'grad_norm': 3.812368154525757, 'learning_rate': 2.157703546475539e-05, 'epoch': 7.23}\n","{'loss': 0.2822, 'grad_norm': 4.192997455596924, 'learning_rate': 2.132117818244771e-05, 'epoch': 7.25}\n","{'loss': 0.1728, 'grad_norm': 2.5023624897003174, 'learning_rate': 2.1066435191009715e-05, 'epoch': 7.26}\n","{'loss': 0.257, 'grad_norm': 7.863531589508057, 'learning_rate': 2.0812816388260518e-05, 'epoch': 7.28}\n","{'loss': 0.1866, 'grad_norm': 3.7875170707702637, 'learning_rate': 2.056033162833977e-05, 'epoch': 7.3}\n","{'loss': 0.1975, 'grad_norm': 1.9177666902542114, 'learning_rate': 2.0308990721324927e-05, 'epoch': 7.32}\n","{'loss': 0.2784, 'grad_norm': 3.343583345413208, 'learning_rate': 2.0058803432849987e-05, 'epoch': 7.34}\n","{'loss': 0.1964, 'grad_norm': 4.66720724105835, 'learning_rate': 1.980977948372612e-05, 'epoch': 7.35}\n","{'loss': 0.2273, 'grad_norm': 4.310459136962891, 'learning_rate': 1.9561928549563968e-05, 'epoch': 7.37}\n","{'loss': 0.2671, 'grad_norm': 3.40097975730896, 'learning_rate': 1.931526026039764e-05, 'epoch': 7.39}\n","{'loss': 0.1796, 'grad_norm': 4.316131591796875, 'learning_rate': 1.906978420031059e-05, 'epoch': 7.41}\n","{'loss': 0.2422, 'grad_norm': 3.5017640590667725, 'learning_rate': 1.8825509907063327e-05, 'epoch': 7.43}\n","{'loss': 0.2054, 'grad_norm': 3.1226840019226074, 'learning_rate': 1.8582446871722636e-05, 'epoch': 7.44}\n","{'loss': 0.2701, 'grad_norm': 3.492358684539795, 'learning_rate': 1.8340604538293015e-05, 'epoch': 7.46}\n","{'loss': 0.2063, 'grad_norm': 3.7369136810302734, 'learning_rate': 1.8099992303349577e-05, 'epoch': 7.48}\n","{'loss': 0.2125, 'grad_norm': 2.5832254886627197, 'learning_rate': 1.7860619515673033e-05, 'epoch': 7.5}\n","{'loss': 0.2224, 'grad_norm': 4.129978179931641, 'learning_rate': 1.7622495475886487e-05, 'epoch': 7.51}\n","{'loss': 0.2496, 'grad_norm': 3.395150661468506, 'learning_rate': 1.738562943609396e-05, 'epoch': 7.53}\n","{'loss': 0.2337, 'grad_norm': 3.5454814434051514, 'learning_rate': 1.7150030599520984e-05, 'epoch': 7.55}\n","{'loss': 0.166, 'grad_norm': 4.08375883102417, 'learning_rate': 1.691570812015704e-05, 'epoch': 7.57}\n","{'loss': 0.1812, 'grad_norm': 2.692218542098999, 'learning_rate': 1.6682671102399805e-05, 'epoch': 7.59}\n","{'loss': 0.1598, 'grad_norm': 4.0160231590271, 'learning_rate': 1.6450928600701504e-05, 'epoch': 7.6}\n","{'loss': 0.1428, 'grad_norm': 3.891842842102051, 'learning_rate': 1.622048961921699e-05, 'epoch': 7.62}\n","{'loss': 0.2096, 'grad_norm': 4.69844913482666, 'learning_rate': 1.599136311145402e-05, 'epoch': 7.64}\n","{'loss': 0.3205, 'grad_norm': 3.1264488697052, 'learning_rate': 1.5763557979925324e-05, 'epoch': 7.66}\n","{'loss': 0.2708, 'grad_norm': 4.53989315032959, 'learning_rate': 1.553708307580265e-05, 'epoch': 7.68}\n","{'loss': 0.2095, 'grad_norm': 2.3542563915252686, 'learning_rate': 1.531194719857292e-05, 'epoch': 7.69}\n","{'loss': 0.1645, 'grad_norm': 5.031445503234863, 'learning_rate': 1.5088159095696363e-05, 'epoch': 7.71}\n","{'loss': 0.2475, 'grad_norm': 5.543506145477295, 'learning_rate': 1.4865727462266543e-05, 'epoch': 7.73}\n","{'loss': 0.2831, 'grad_norm': 3.0280065536499023, 'learning_rate': 1.4644660940672627e-05, 'epoch': 7.75}\n","{'loss': 0.2326, 'grad_norm': 3.493994951248169, 'learning_rate': 1.4424968120263504e-05, 'epoch': 7.76}\n","{'loss': 0.2206, 'grad_norm': 3.2602357864379883, 'learning_rate': 1.4206657537014079e-05, 'epoch': 7.78}\n","{'loss': 0.2249, 'grad_norm': 5.095388412475586, 'learning_rate': 1.398973767319368e-05, 'epoch': 7.8}\n","{'loss': 0.2642, 'grad_norm': 13.810650825500488, 'learning_rate': 1.3774216957036367e-05, 'epoch': 7.82}\n","{'loss': 0.1592, 'grad_norm': 11.616129875183105, 'learning_rate': 1.3560103762413584e-05, 'epoch': 7.84}\n","{'loss': 0.1754, 'grad_norm': 4.300165176391602, 'learning_rate': 1.3347406408508695e-05, 'epoch': 7.85}\n","{'loss': 0.2721, 'grad_norm': 3.653315782546997, 'learning_rate': 1.3136133159493802e-05, 'epoch': 7.87}\n","{'loss': 0.1998, 'grad_norm': 3.611405611038208, 'learning_rate': 1.2926292224208664e-05, 'epoch': 7.89}\n","{'loss': 0.2173, 'grad_norm': 12.638509750366211, 'learning_rate': 1.2717891755841722e-05, 'epoch': 7.91}\n","{'loss': 0.2315, 'grad_norm': 12.878602981567383, 'learning_rate': 1.2510939851613285e-05, 'epoch': 7.93}\n","{'loss': 0.2456, 'grad_norm': 3.8157997131347656, 'learning_rate': 1.230544455246101e-05, 'epoch': 7.94}\n","{'loss': 0.1785, 'grad_norm': 4.757344722747803, 'learning_rate': 1.2101413842727345e-05, 'epoch': 7.96}\n","{'loss': 0.2599, 'grad_norm': 9.757575035095215, 'learning_rate': 1.1898855649849461e-05, 'epoch': 7.98}\n","{'loss': 0.1662, 'grad_norm': 2.5260682106018066, 'learning_rate': 1.1697777844051105e-05, 'epoch': 8.0}\n"," 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 4480/5600 [2:05:14<30:43, 1.65s/it][INFO|trainer.py:3788] 2024-06-30 08:21:50,010 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 08:21:50,010 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 08:21:50,010 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 26.94it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:01, 22.20it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 21.22it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 20.62it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 19.95it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 20.12it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 20.20it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 19.55it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 19.49it/s]\u001b[A\n"," 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:00, 19.50it/s]\u001b[A\n"," 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 19.86it/s]\u001b[A\n"," 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 19.99it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 19.87it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 19.38it/s]\u001b[A\n"," 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 19.02it/s]\u001b[A\n"," 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:02<00:00, 19.49it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 3.0680618286132812, 'eval_runtime': 2.3737, 'eval_samples_per_second': 19.379, 'eval_steps_per_second': 19.379, 'epoch': 8.0}\n"," 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 4480/5600 [2:05:17<30:43, 1.65s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 18.92it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 08:21:52,385 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-4480\n","[INFO|configuration_utils.py:733] 2024-06-30 08:21:52,924 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 08:21:52,925 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 08:21:52,971 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-4480/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 08:21:52,971 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-4480/special_tokens_map.json\n","{'loss': 0.1445, 'grad_norm': 1.7341903448104858, 'learning_rate': 1.1498188238036861e-05, 'epoch': 8.01}\n","{'loss': 0.113, 'grad_norm': 2.666015863418579, 'learning_rate': 1.130009458668863e-05, 'epoch': 8.03}\n","{'loss': 0.2103, 'grad_norm': 3.4498844146728516, 'learning_rate': 1.1103504586764263e-05, 'epoch': 8.05}\n","{'loss': 0.1438, 'grad_norm': 3.1626198291778564, 'learning_rate': 1.090842587659851e-05, 'epoch': 8.07}\n","{'loss': 0.1212, 'grad_norm': 2.0630691051483154, 'learning_rate': 1.0714866035806326e-05, 'epoch': 8.09}\n","{'loss': 0.1799, 'grad_norm': 3.088937282562256, 'learning_rate': 1.0522832584988234e-05, 'epoch': 8.1}\n","{'loss': 0.1107, 'grad_norm': 2.786867380142212, 'learning_rate': 1.0332332985438248e-05, 'epoch': 8.12}\n","{'loss': 0.1752, 'grad_norm': 1.7368676662445068, 'learning_rate': 1.0143374638853891e-05, 'epoch': 8.14}\n","{'loss': 0.1186, 'grad_norm': 2.039095878601074, 'learning_rate': 9.955964887048607e-06, 'epoch': 8.16}\n","{'loss': 0.133, 'grad_norm': 3.3849267959594727, 'learning_rate': 9.770111011666583e-06, 'epoch': 8.17}\n","{'loss': 0.1982, 'grad_norm': 3.557002305984497, 'learning_rate': 9.58582023389974e-06, 'epoch': 8.19}\n","{'loss': 0.1974, 'grad_norm': 2.3108747005462646, 'learning_rate': 9.403099714207175e-06, 'epoch': 8.21}\n","{'loss': 0.1912, 'grad_norm': 3.278822183609009, 'learning_rate': 9.221956552036992e-06, 'epoch': 8.23}\n","{'loss': 0.1803, 'grad_norm': 3.1523773670196533, 'learning_rate': 9.042397785550405e-06, 'epoch': 8.25}\n","{'loss': 0.1569, 'grad_norm': 4.3890862464904785, 'learning_rate': 8.864430391348332e-06, 'epoch': 8.26}\n","{'loss': 0.1246, 'grad_norm': 2.280132532119751, 'learning_rate': 8.688061284200266e-06, 'epoch': 8.28}\n","{'loss': 0.1357, 'grad_norm': 5.60243034362793, 'learning_rate': 8.513297316775625e-06, 'epoch': 8.3}\n","{'loss': 0.148, 'grad_norm': 4.402873992919922, 'learning_rate': 8.34014527937756e-06, 'epoch': 8.32}\n","{'loss': 0.1943, 'grad_norm': 3.0184195041656494, 'learning_rate': 8.168611899679013e-06, 'epoch': 8.34}\n","{'loss': 0.1651, 'grad_norm': 3.244899034500122, 'learning_rate': 7.998703842461431e-06, 'epoch': 8.35}\n","{'loss': 0.1725, 'grad_norm': 2.839618444442749, 'learning_rate': 7.830427709355725e-06, 'epoch': 8.37}\n","{'loss': 0.1764, 'grad_norm': 2.485934257507324, 'learning_rate': 7.663790038585793e-06, 'epoch': 8.39}\n","{'loss': 0.1308, 'grad_norm': 1.899274230003357, 'learning_rate': 7.498797304714544e-06, 'epoch': 8.41}\n","{'loss': 0.1754, 'grad_norm': 2.954799175262451, 'learning_rate': 7.33545591839222e-06, 'epoch': 8.42}\n","{'loss': 0.1637, 'grad_norm': 2.595350980758667, 'learning_rate': 7.173772226107434e-06, 'epoch': 8.44}\n","{'loss': 0.1736, 'grad_norm': 3.245035409927368, 'learning_rate': 7.013752509940485e-06, 'epoch': 8.46}\n","{'loss': 0.1975, 'grad_norm': 2.79209303855896, 'learning_rate': 6.855402987319348e-06, 'epoch': 8.48}\n","{'loss': 0.1608, 'grad_norm': 3.176992177963257, 'learning_rate': 6.698729810778065e-06, 'epoch': 8.5}\n","{'loss': 0.1425, 'grad_norm': 3.4199535846710205, 'learning_rate': 6.54373906771768e-06, 'epoch': 8.51}\n","{'loss': 0.1397, 'grad_norm': 2.7271015644073486, 'learning_rate': 6.390436780169734e-06, 'epoch': 8.53}\n","{'loss': 0.2058, 'grad_norm': 2.070603847503662, 'learning_rate': 6.238828904562316e-06, 'epoch': 8.55}\n","{'loss': 0.1612, 'grad_norm': 2.0566606521606445, 'learning_rate': 6.088921331488568e-06, 'epoch': 8.57}\n","{'loss': 0.0814, 'grad_norm': 2.8068063259124756, 'learning_rate': 5.94071988547788e-06, 'epoch': 8.59}\n","{'loss': 0.1446, 'grad_norm': 2.784498691558838, 'learning_rate': 5.794230324769517e-06, 'epoch': 8.6}\n","{'loss': 0.1159, 'grad_norm': 1.9032204151153564, 'learning_rate': 5.649458341088915e-06, 'epoch': 8.62}\n","{'loss': 0.1592, 'grad_norm': 3.4346718788146973, 'learning_rate': 5.506409559426573e-06, 'epoch': 8.64}\n","{'loss': 0.22, 'grad_norm': 1.5027986764907837, 'learning_rate': 5.365089537819434e-06, 'epoch': 8.66}\n","{'loss': 0.1448, 'grad_norm': 2.700094223022461, 'learning_rate': 5.2255037671349535e-06, 'epoch': 8.67}\n","{'loss': 0.173, 'grad_norm': 1.7306227684020996, 'learning_rate': 5.087657670857798e-06, 'epoch': 8.69}\n","{'loss': 0.1248, 'grad_norm': 2.1537575721740723, 'learning_rate': 4.951556604879048e-06, 'epoch': 8.71}\n","{'loss': 0.1908, 'grad_norm': 3.676980972290039, 'learning_rate': 4.8172058572881765e-06, 'epoch': 8.73}\n","{'loss': 0.1352, 'grad_norm': 3.5679290294647217, 'learning_rate': 4.684610648167503e-06, 'epoch': 8.75}\n","{'loss': 0.1403, 'grad_norm': 3.0709311962127686, 'learning_rate': 4.5537761293894535e-06, 'epoch': 8.76}\n","{'loss': 0.157, 'grad_norm': 4.296623706817627, 'learning_rate': 4.424707384416344e-06, 'epoch': 8.78}\n","{'loss': 0.1893, 'grad_norm': 2.2179601192474365, 'learning_rate': 4.29740942810285e-06, 'epoch': 8.8}\n","{'loss': 0.1651, 'grad_norm': 3.256356716156006, 'learning_rate': 4.1718872065011904e-06, 'epoch': 8.82}\n","{'loss': 0.1567, 'grad_norm': 2.9854118824005127, 'learning_rate': 4.048145596668967e-06, 'epoch': 8.84}\n","{'loss': 0.1463, 'grad_norm': 1.8327380418777466, 'learning_rate': 3.9261894064796135e-06, 'epoch': 8.85}\n","{'loss': 0.1511, 'grad_norm': 3.7827322483062744, 'learning_rate': 3.8060233744356633e-06, 'epoch': 8.87}\n","{'loss': 0.1404, 'grad_norm': 1.0582958459854126, 'learning_rate': 3.687652169484568e-06, 'epoch': 8.89}\n","{'loss': 0.1494, 'grad_norm': 2.8895883560180664, 'learning_rate': 3.5710803908373224e-06, 'epoch': 8.91}\n","{'loss': 0.1691, 'grad_norm': 4.131629467010498, 'learning_rate': 3.4563125677897932e-06, 'epoch': 8.92}\n","{'loss': 0.2159, 'grad_norm': 3.1218199729919434, 'learning_rate': 3.343353159546675e-06, 'epoch': 8.94}\n","{'loss': 0.1488, 'grad_norm': 1.3627033233642578, 'learning_rate': 3.2322065550483007e-06, 'epoch': 8.96}\n","{'loss': 0.1338, 'grad_norm': 0.7474280595779419, 'learning_rate': 3.1228770728000455e-06, 'epoch': 8.98}\n","{'loss': 0.1358, 'grad_norm': 7.197608947753906, 'learning_rate': 3.0153689607045845e-06, 'epoch': 9.0}\n"," 90%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 5040/5600 [2:20:27<14:49, 1.59s/it][INFO|trainer.py:3788] 2024-06-30 08:37:02,463 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 08:37:02,463 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 08:37:02,463 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 28.92it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:01, 22.11it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 20.35it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 19.98it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 19.91it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 20.28it/s]\u001b[A\n"," 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 20.37it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 20.46it/s]\u001b[A\n"," 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:00, 20.56it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 20.24it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 19.81it/s]\u001b[A\n"," 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 19.77it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 20.14it/s]\u001b[A\n"," 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:02<00:00, 20.31it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 3.3791232109069824, 'eval_runtime': 2.3157, 'eval_samples_per_second': 19.864, 'eval_steps_per_second': 19.864, 'epoch': 9.0}\n"," 90%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 5040/5600 [2:20:29<14:49, 1.59s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 20.63it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 08:37:04,779 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-5040\n","[INFO|configuration_utils.py:733] 2024-06-30 08:37:05,334 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 08:37:05,334 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 08:37:05,376 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-5040/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 08:37:05,376 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-5040/special_tokens_map.json\n","{'loss': 0.127, 'grad_norm': 2.002713441848755, 'learning_rate': 2.9096863958968268e-06, 'epoch': 9.01}\n","{'loss': 0.1209, 'grad_norm': 1.9945132732391357, 'learning_rate': 2.8058334845816213e-06, 'epoch': 9.03}\n","{'loss': 0.1432, 'grad_norm': 1.8608288764953613, 'learning_rate': 2.7038142618741992e-06, 'epoch': 9.05}\n","{'loss': 0.1707, 'grad_norm': 3.0099308490753174, 'learning_rate': 2.603632691643415e-06, 'epoch': 9.07}\n","{'loss': 0.1253, 'grad_norm': 2.0166702270507812, 'learning_rate': 2.5052926663577e-06, 'epoch': 9.09}\n","{'loss': 0.1316, 'grad_norm': 2.971637487411499, 'learning_rate': 2.408798006933882e-06, 'epoch': 9.1}\n","{'loss': 0.0958, 'grad_norm': 3.2099649906158447, 'learning_rate': 2.314152462588659e-06, 'epoch': 9.12}\n","{'loss': 0.1209, 'grad_norm': 3.63370418548584, 'learning_rate': 2.221359710692961e-06, 'epoch': 9.14}\n","{'loss': 0.0965, 'grad_norm': 1.7885162830352783, 'learning_rate': 2.1304233566290964e-06, 'epoch': 9.16}\n","{'loss': 0.0858, 'grad_norm': 1.2611875534057617, 'learning_rate': 2.041346933650612e-06, 'epoch': 9.17}\n","{'loss': 0.1183, 'grad_norm': 1.9412541389465332, 'learning_rate': 1.9541339027450256e-06, 'epoch': 9.19}\n","{'loss': 0.1395, 'grad_norm': 1.758565068244934, 'learning_rate': 1.8687876524993987e-06, 'epoch': 9.21}\n","{'loss': 0.0943, 'grad_norm': 2.854973316192627, 'learning_rate': 1.785311498968617e-06, 'epoch': 9.23}\n","{'loss': 0.1427, 'grad_norm': 6.644575119018555, 'learning_rate': 1.70370868554659e-06, 'epoch': 9.25}\n","{'loss': 0.1434, 'grad_norm': 3.3001134395599365, 'learning_rate': 1.6239823828401945e-06, 'epoch': 9.26}\n","{'loss': 0.1218, 'grad_norm': 0.8132327795028687, 'learning_rate': 1.5461356885461075e-06, 'epoch': 9.28}\n","{'loss': 0.1723, 'grad_norm': 1.5899766683578491, 'learning_rate': 1.4701716273304521e-06, 'epoch': 9.3}\n","{'loss': 0.1067, 'grad_norm': 1.619358777999878, 'learning_rate': 1.3960931507112752e-06, 'epoch': 9.32}\n","{'loss': 0.1708, 'grad_norm': 1.985873818397522, 'learning_rate': 1.3239031369438326e-06, 'epoch': 9.34}\n","{'loss': 0.1221, 'grad_norm': 2.568528175354004, 'learning_rate': 1.2536043909088191e-06, 'epoch': 9.35}\n","{'loss': 0.1285, 'grad_norm': 2.3798413276672363, 'learning_rate': 1.1851996440033319e-06, 'epoch': 9.37}\n","{'loss': 0.1118, 'grad_norm': 0.7661011815071106, 'learning_rate': 1.118691554034773e-06, 'epoch': 9.39}\n","{'loss': 0.1178, 'grad_norm': 2.5488016605377197, 'learning_rate': 1.0540827051175818e-06, 'epoch': 9.41}\n","{'loss': 0.1357, 'grad_norm': 3.0472471714019775, 'learning_rate': 9.913756075728087e-07, 'epoch': 9.42}\n","{'loss': 0.0948, 'grad_norm': 0.8541691899299622, 'learning_rate': 9.305726978306173e-07, 'epoch': 9.44}\n","{'loss': 0.1502, 'grad_norm': 0.9478998780250549, 'learning_rate': 8.716763383355864e-07, 'epoch': 9.46}\n","{'loss': 0.1266, 'grad_norm': 1.7219117879867554, 'learning_rate': 8.146888174549339e-07, 'epoch': 9.48}\n","{'loss': 0.0963, 'grad_norm': 2.673491954803467, 'learning_rate': 7.596123493895991e-07, 'epoch': 9.5}\n","{'loss': 0.186, 'grad_norm': 2.8655078411102295, 'learning_rate': 7.064490740882057e-07, 'epoch': 9.51}\n","{'loss': 0.1359, 'grad_norm': 2.7357897758483887, 'learning_rate': 6.552010571639456e-07, 'epoch': 9.53}\n","{'loss': 0.1508, 'grad_norm': 2.8306162357330322, 'learning_rate': 6.058702898142643e-07, 'epoch': 9.55}\n","{'loss': 0.1036, 'grad_norm': 3.270542621612549, 'learning_rate': 5.584586887435739e-07, 'epoch': 9.57}\n","{'loss': 0.1504, 'grad_norm': 2.821152925491333, 'learning_rate': 5.129680960887007e-07, 'epoch': 9.59}\n","{'loss': 0.1388, 'grad_norm': 1.7769047021865845, 'learning_rate': 4.6940027934735954e-07, 'epoch': 9.6}\n","{'loss': 0.1101, 'grad_norm': 2.468860387802124, 'learning_rate': 4.277569313094809e-07, 'epoch': 9.62}\n","{'loss': 0.1552, 'grad_norm': 2.018123149871826, 'learning_rate': 3.8803966999139684e-07, 'epoch': 9.64}\n","{'loss': 0.124, 'grad_norm': 1.8176459074020386, 'learning_rate': 3.50250038573019e-07, 'epoch': 9.66}\n","{'loss': 0.1084, 'grad_norm': 1.4483444690704346, 'learning_rate': 3.143895053378698e-07, 'epoch': 9.67}\n","{'loss': 0.1122, 'grad_norm': 2.107964038848877, 'learning_rate': 2.8045946361601183e-07, 'epoch': 9.69}\n","{'loss': 0.167, 'grad_norm': 3.6413228511810303, 'learning_rate': 2.4846123172992954e-07, 'epoch': 9.71}\n","{'loss': 0.1457, 'grad_norm': 3.200455904006958, 'learning_rate': 2.1839605294330933e-07, 'epoch': 9.73}\n","{'loss': 0.1466, 'grad_norm': 2.517279863357544, 'learning_rate': 1.9026509541272275e-07, 'epoch': 9.75}\n","{'loss': 0.1311, 'grad_norm': 1.4828776121139526, 'learning_rate': 1.640694521422459e-07, 'epoch': 9.76}\n","{'loss': 0.1393, 'grad_norm': 2.981771945953369, 'learning_rate': 1.3981014094099353e-07, 'epoch': 9.78}\n","{'loss': 0.0979, 'grad_norm': 1.9358062744140625, 'learning_rate': 1.1748810438355628e-07, 'epoch': 9.8}\n","{'loss': 0.1863, 'grad_norm': 1.4688208103179932, 'learning_rate': 9.710420977340762e-08, 'epoch': 9.82}\n","{'loss': 0.1427, 'grad_norm': 2.011298418045044, 'learning_rate': 7.865924910916977e-08, 'epoch': 9.83}\n","{'loss': 0.0873, 'grad_norm': 3.8186533451080322, 'learning_rate': 6.215393905388278e-08, 'epoch': 9.85}\n","{'loss': 0.099, 'grad_norm': 2.1099774837493896, 'learning_rate': 4.7588920907110094e-08, 'epoch': 9.87}\n","{'loss': 0.1162, 'grad_norm': 2.2421796321868896, 'learning_rate': 3.496476058006959e-08, 'epoch': 9.89}\n","{'loss': 0.1642, 'grad_norm': 3.2422990798950195, 'learning_rate': 2.4281948573617874e-08, 'epoch': 9.91}\n","{'loss': 0.1631, 'grad_norm': 2.7475242614746094, 'learning_rate': 1.5540899959187727e-08, 'epoch': 9.92}\n","{'loss': 0.145, 'grad_norm': 2.781863212585449, 'learning_rate': 8.741954362678772e-09, 'epoch': 9.94}\n","{'loss': 0.1242, 'grad_norm': 3.5185129642486572, 'learning_rate': 3.885375951256931e-09, 'epoch': 9.96}\n","{'loss': 0.1676, 'grad_norm': 2.617418050765991, 'learning_rate': 9.713534230904041e-10, 'epoch': 9.98}\n","{'loss': 0.1304, 'grad_norm': 2.882068395614624, 'learning_rate': 0.0, 'epoch': 10.0}\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 5600/5600 [2:35:33<00:00, 1.59s/it][INFO|trainer.py:3788] 2024-06-30 08:52:08,248 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 08:52:08,248 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 08:52:08,248 >> Batch size = 1\n","\n"," 0%| | 0/46 [00:00, ?it/s]\u001b[A\n"," 7%|โโโ | 3/46 [00:00<00:01, 27.21it/s]\u001b[A\n"," 13%|โโโโโโ | 6/46 [00:00<00:01, 22.26it/s]\u001b[A\n"," 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 21.37it/s]\u001b[A\n"," 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 21.20it/s]\u001b[A\n"," 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 20.31it/s]\u001b[A\n"," 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 19.25it/s]\u001b[A\n"," 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:01, 19.27it/s]\u001b[A\n"," 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:01<00:01, 19.44it/s]\u001b[A\n"," 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 18.62it/s]\u001b[A\n"," 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 18.46it/s]\u001b[A\n"," 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:00, 18.63it/s]\u001b[A\n"," 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 18.93it/s]\u001b[A\n"," 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 18.94it/s]\u001b[A\n"," 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 19.35it/s]\u001b[A\n"," 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 19.17it/s]\u001b[A\n"," 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 19.01it/s]\u001b[A\n"," 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 19.01it/s]\u001b[A\n"," \u001b[A\n","\u001b[A{'eval_loss': 3.5356574058532715, 'eval_runtime': 2.4075, 'eval_samples_per_second': 19.107, 'eval_steps_per_second': 19.107, 'epoch': 10.0}\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 5600/5600 [2:35:35<00:00, 1.59s/it]\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 19.46it/s]\u001b[A\n"," \u001b[A[INFO|trainer.py:3478] 2024-06-30 08:52:10,656 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-5600\n","[INFO|configuration_utils.py:733] 2024-06-30 08:52:11,263 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 08:52:11,263 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 08:52:11,334 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-5600/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 08:52:11,334 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-5600/special_tokens_map.json\n","[INFO|trainer.py:2383] 2024-06-30 08:52:11,559 >> \n","\n","Training completed. Do not forget to share your model on huggingface.co/models =)\n","\n","\n","{'train_runtime': 9336.476, 'train_samples_per_second': 4.801, 'train_steps_per_second': 0.6, 'train_loss': 0.7698830796884639, 'epoch': 10.0}\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 5600/5600 [2:35:36<00:00, 1.67s/it]\n","[INFO|trainer.py:3478] 2024-06-30 08:52:11,560 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft\n","[INFO|configuration_utils.py:733] 2024-06-30 08:52:12,070 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n","[INFO|configuration_utils.py:800] 2024-06-30 08:52:12,070 >> Model config Qwen2Config {\n"," \"architectures\": [\n"," \"Qwen2ForCausalLM\"\n"," ],\n"," \"attention_dropout\": 0.0,\n"," \"bos_token_id\": 151643,\n"," \"eos_token_id\": 151645,\n"," \"hidden_act\": \"silu\",\n"," \"hidden_size\": 1536,\n"," \"initializer_range\": 0.02,\n"," \"intermediate_size\": 8960,\n"," \"max_position_embeddings\": 32768,\n"," \"max_window_layers\": 28,\n"," \"model_type\": \"qwen2\",\n"," \"num_attention_heads\": 12,\n"," \"num_hidden_layers\": 28,\n"," \"num_key_value_heads\": 2,\n"," \"rms_norm_eps\": 1e-06,\n"," \"rope_theta\": 1000000.0,\n"," \"sliding_window\": 32768,\n"," \"tie_word_embeddings\": true,\n"," \"torch_dtype\": \"bfloat16\",\n"," \"transformers_version\": \"4.42.3\",\n"," \"use_cache\": true,\n"," \"use_sliding_window\": false,\n"," \"vocab_size\": 151936\n","}\n","\n","[INFO|tokenization_utils_base.py:2574] 2024-06-30 08:52:12,110 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/tokenizer_config.json\n","[INFO|tokenization_utils_base.py:2583] 2024-06-30 08:52:12,110 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/special_tokens_map.json\n","***** train metrics *****\n"," epoch = 9.9955\n"," total_flos = 27888647GF\n"," train_loss = 0.7699\n"," train_runtime = 2:35:36.47\n"," train_samples_per_second = 4.801\n"," train_steps_per_second = 0.6\n","Figure saved at: saves/qwen2-1.5b/lora/sft/training_loss.png\n","Figure saved at: saves/qwen2-1.5b/lora/sft/training_eval_loss.png\n","[INFO|trainer.py:3788] 2024-06-30 08:52:12,411 >> \n","***** Running Evaluation *****\n","[INFO|trainer.py:3790] 2024-06-30 08:52:12,411 >> Num examples = 46\n","[INFO|trainer.py:3793] 2024-06-30 08:52:12,411 >> Batch size = 1\n","100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 19.87it/s]\n","***** eval metrics *****\n"," epoch = 9.9955\n"," eval_loss = 3.5357\n"," eval_runtime = 0:00:02.39\n"," eval_samples_per_second = 19.224\n"," eval_steps_per_second = 19.224\n","[INFO|modelcard.py:449] 2024-06-30 08:52:14,805 >> Dropping the following result as it does not have all the necessary fields:\n","{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}\n","CPU times: user 2min 11s, sys: 44.5 s, total: 2min 56s\n","Wall time: 2h 37min 48s\n"]}],"source":["%%time\n","\n","!./scripts/tune-lf.sh config/qwen2_1.5b_lora_sft.yaml"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"NLLre-t64-V4"},"outputs":[],"source":["%%time\n","\n","!./scripts/tune-lf.sh config/qwen2_7b_lora_sft.yaml"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"},"widgets":{"application/vnd.jupyter.widget-state+json":{}}},"nbformat":4,"nbformat_minor":0}
\ No newline at end of file