kevinwsbr committed on
Commit
218f7e1
·
verified ·
1 Parent(s): 46c138a

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: bigcode-openrail-m
4
+ base_model: bigcode/starcoder2-15b
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - kevinwsbr/vulnfixes-web
9
+ model-index:
10
+ - name: outputs/starcoder-vulnfixes-web
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
18
+ <details><summary>See axolotl config</summary>
19
+
20
+ axolotl version: `0.8.0.dev0`
21
+ ```yaml
22
+ base_model: bigcode/starcoder2-15b
23
+ # Automatically upload checkpoint and final model to HF
24
+ # hub_model_id: username/custom_model_name
25
+
26
+ load_in_8bit: false
27
+ load_in_4bit: true
28
+ strict: false
29
+
30
+ datasets:
31
+ - path: kevinwsbr/vulnfixes-web
32
+ type: alpaca
33
+
34
+
35
+ dataset_prepared_path:
36
+ val_set_size: 0.05
37
+ output_dir: ./outputs/starcoder-vulnfixes-web
38
+
39
+ adapter: qlora
40
+ lora_model_dir:
41
+
42
+ sequence_len: 4096
43
+ sample_packing: true
44
+ pad_to_sequence_len: true
45
+
46
+ lora_r: 32
47
+ lora_alpha: 16
48
+ lora_dropout: 0.05
49
+ lora_target_modules:
50
+ lora_target_linear: true
51
+ lora_fan_in_fan_out:
52
+
53
+ wandb_project: starcoder
54
+ wandb_entity:
55
+ wandb_watch:
56
+ wandb_run_id:
57
+ wandb_log_model:
58
+
59
+ gradient_accumulation_steps: 8
60
+ micro_batch_size: 2
61
+ num_epochs: 3
62
+ optimizer: adamw_bnb_8bit
63
+ lr_scheduler: cosine
64
+ learning_rate: 2e-5
65
+
66
+ train_on_inputs: false
67
+ group_by_length: false
68
+ bf16: auto
69
+ fp16: false
70
+ tf32: false
71
+
72
+ gradient_checkpointing: true
73
+ early_stopping_patience:
74
+ resume_from_checkpoint:
75
+ local_rank:
76
+ logging_steps: 1
77
+ xformers_attention:
78
+ flash_attention: true
79
+
80
+ warmup_steps: 20
81
+ evals_per_epoch: 4
82
+ eval_steps:
83
+ eval_table_size:
84
+ saves_per_epoch: 4
85
+ save_steps:
86
+ save_total_limit: 2
87
+ debug:
88
+ deepspeed:
89
+ weight_decay:
90
+ fsdp:
91
+ fsdp_config:
92
+ special_tokens:
93
+ pad_token: "<|endoftext|>"
94
+ eos_token: "<|endoftext|>"
95
+
96
+ ```
97
+
98
+ </details><br>
99
+
100
+ # outputs/starcoder-vulnfixes-web
101
+
102
+ This model is a fine-tuned version of [bigcode/starcoder2-15b](https://huggingface.co/bigcode/starcoder2-15b) on the kevinwsbr/vulnfixes-web dataset.
103
+ It achieves the following results on the evaluation set:
104
+ - Loss: 0.0529
105
+
106
+ ## Model description
107
+
108
+ More information needed
109
+
110
+ ## Intended uses & limitations
111
+
112
+ More information needed
113
+
114
+ ## Training and evaluation data
115
+
116
+ More information needed
117
+
118
+ ## Training procedure
119
+
120
+ ### Training hyperparameters
121
+
122
+ The following hyperparameters were used during training:
123
+ - learning_rate: 2e-05
124
+ - train_batch_size: 2
125
+ - eval_batch_size: 2
126
+ - seed: 42
127
+ - gradient_accumulation_steps: 8
128
+ - total_train_batch_size: 16
129
+ - optimizer: OptimizerNames.ADAMW_BNB (8-bit AdamW via bitsandbytes) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
130
+ - lr_scheduler_type: cosine
131
+ - lr_scheduler_warmup_steps: 20
132
+ - num_epochs: 3.0
133
+
134
+ ### Training results
135
+
136
+ | Training Loss | Epoch | Step | Validation Loss |
137
+ |:-------------:|:------:|:----:|:---------------:|
138
+ | 0.1499 | 0.0092 | 1 | 0.0645 |
139
+ | 0.1554 | 0.2569 | 28 | 0.0622 |
140
+ | 0.0745 | 0.5138 | 56 | 0.0571 |
141
+ | 0.0616 | 0.7706 | 84 | 0.0559 |
142
+ | 0.0645 | 1.0275 | 112 | 0.0547 |
143
+ | 0.0601 | 1.2844 | 140 | 0.0542 |
144
+ | 0.0688 | 1.5413 | 168 | 0.0537 |
145
+ | 0.0424 | 1.7982 | 196 | 0.0534 |
146
+ | 0.086 | 2.0550 | 224 | 0.0532 |
147
+ | 0.0759 | 2.3119 | 252 | 0.0530 |
148
+ | 0.0583 | 2.5688 | 280 | 0.0529 |
149
+ | 0.1087 | 2.8257 | 308 | 0.0529 |
150
+
151
+
152
+ ### Framework versions
153
+
154
+ - PEFT 0.14.0
155
+ - Transformers 4.49.0
156
+ - Pytorch 2.5.1+cu124
157
+ - Datasets 3.2.0
158
+ - Tokenizers 0.21.0
adapter_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-15b",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": null,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.05,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 32,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "c_fc",
27
+ "k_proj",
28
+ "q_proj",
29
+ "o_proj",
30
+ "c_proj",
31
+ "v_proj"
32
+ ],
33
+ "task_type": "CAUSAL_LM",
34
+ "use_dora": false,
35
+ "use_rslora": false
36
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c90ba47f39c0a86cd549d28055b620e1922d97f151fa40bf696f678998cec60
3
+ size 508623712
checkpoint-308/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: bigcode/starcoder2-15b
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-308/adapter_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-15b",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": null,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.05,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 32,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "c_fc",
27
+ "k_proj",
28
+ "q_proj",
29
+ "o_proj",
30
+ "c_proj",
31
+ "v_proj"
32
+ ],
33
+ "task_type": "CAUSAL_LM",
34
+ "use_dora": false,
35
+ "use_rslora": false
36
+ }
checkpoint-308/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a9811f43732961fde30393eb326eac6b44f2118cbdc2c73c108fd63a3359a7e
3
+ size 508623712
checkpoint-308/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-308/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e544911fbe509ccbc4b3aec76962932edc0d3151ee5da3b0b5d67cf694f1270
3
+ size 258817236
checkpoint-308/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d716d46ab13fe967ada7a71526d8a1189dc582f3667ee74054a1c7be10a5e9a
3
+ size 14244
checkpoint-308/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e55eba8550a28f83d45d944034813f2f9b600be572840059513b77f292bf0b2
3
+ size 1064
checkpoint-308/special_tokens_map.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<repo_name>",
9
+ "<file_sep>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<jupyter_script>",
18
+ "<empty_output>",
19
+ "<code_to_intermediate>",
20
+ "<intermediate_to_code>",
21
+ "<pr>",
22
+ "<pr_status>",
23
+ "<pr_is_merged>",
24
+ "<pr_base>",
25
+ "<pr_file>",
26
+ "<pr_base_code>",
27
+ "<pr_diff>",
28
+ "<pr_diff_hunk>",
29
+ "<pr_comment>",
30
+ "<pr_event_id>",
31
+ "<pr_review>",
32
+ "<pr_review_state>",
33
+ "<pr_review_comment>",
34
+ "<pr_in_reply_to_review_id>",
35
+ "<pr_in_reply_to_comment_id>",
36
+ "<pr_diff_hunk_comment_line>",
37
+ "<NAME>",
38
+ "<EMAIL>",
39
+ "<KEY>",
40
+ "<PASSWORD>"
41
+ ],
42
+ "bos_token": {
43
+ "content": "<|endoftext|>",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false
48
+ },
49
+ "eos_token": {
50
+ "content": "<|endoftext|>",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ },
56
+ "pad_token": {
57
+ "content": "<|endoftext|>",
58
+ "lstrip": false,
59
+ "normalized": false,
60
+ "rstrip": false,
61
+ "single_word": false
62
+ },
63
+ "unk_token": {
64
+ "content": "<|endoftext|>",
65
+ "lstrip": false,
66
+ "normalized": false,
67
+ "rstrip": false,
68
+ "single_word": false
69
+ }
70
+ }
checkpoint-308/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-308/tokenizer_config.json ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<repo_name>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<file_sep>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<issue_start>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_comment>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_closed>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<jupyter_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_text>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_code>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_output>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_script>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<code_to_intermediate>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "17": {
141
+ "content": "<intermediate_to_code>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "18": {
149
+ "content": "<pr>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "19": {
157
+ "content": "<pr_status>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "20": {
165
+ "content": "<pr_is_merged>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "21": {
173
+ "content": "<pr_base>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "22": {
181
+ "content": "<pr_file>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "23": {
189
+ "content": "<pr_base_code>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "24": {
197
+ "content": "<pr_diff>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "25": {
205
+ "content": "<pr_diff_hunk>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "26": {
213
+ "content": "<pr_comment>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "27": {
221
+ "content": "<pr_event_id>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "28": {
229
+ "content": "<pr_review>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "29": {
237
+ "content": "<pr_review_state>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "30": {
245
+ "content": "<pr_review_comment>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "31": {
253
+ "content": "<pr_in_reply_to_review_id>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32": {
261
+ "content": "<pr_in_reply_to_comment_id>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "33": {
269
+ "content": "<pr_diff_hunk_comment_line>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "34": {
277
+ "content": "<NAME>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "35": {
285
+ "content": "<EMAIL>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "36": {
293
+ "content": "<KEY>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "37": {
301
+ "content": "<PASSWORD>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ }
308
+ },
309
+ "additional_special_tokens": [
310
+ "<|endoftext|>",
311
+ "<fim_prefix>",
312
+ "<fim_middle>",
313
+ "<fim_suffix>",
314
+ "<fim_pad>",
315
+ "<repo_name>",
316
+ "<file_sep>",
317
+ "<issue_start>",
318
+ "<issue_comment>",
319
+ "<issue_closed>",
320
+ "<jupyter_start>",
321
+ "<jupyter_text>",
322
+ "<jupyter_code>",
323
+ "<jupyter_output>",
324
+ "<jupyter_script>",
325
+ "<empty_output>",
326
+ "<code_to_intermediate>",
327
+ "<intermediate_to_code>",
328
+ "<pr>",
329
+ "<pr_status>",
330
+ "<pr_is_merged>",
331
+ "<pr_base>",
332
+ "<pr_file>",
333
+ "<pr_base_code>",
334
+ "<pr_diff>",
335
+ "<pr_diff_hunk>",
336
+ "<pr_comment>",
337
+ "<pr_event_id>",
338
+ "<pr_review>",
339
+ "<pr_review_state>",
340
+ "<pr_review_comment>",
341
+ "<pr_in_reply_to_review_id>",
342
+ "<pr_in_reply_to_comment_id>",
343
+ "<pr_diff_hunk_comment_line>",
344
+ "<NAME>",
345
+ "<EMAIL>",
346
+ "<KEY>",
347
+ "<PASSWORD>"
348
+ ],
349
+ "bos_token": "<|endoftext|>",
350
+ "clean_up_tokenization_spaces": true,
351
+ "eos_token": "<|endoftext|>",
352
+ "extra_special_tokens": {},
353
+ "model_max_length": 1000000000000000019884624838656,
354
+ "pad_token": "<|endoftext|>",
355
+ "tokenizer_class": "GPT2Tokenizer",
356
+ "unk_token": "<|endoftext|>",
357
+ "vocab_size": 49152
358
+ }
checkpoint-308/trainer_state.json ADDED
@@ -0,0 +1,2285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.8256880733944953,
5
+ "eval_steps": 28,
6
+ "global_step": 308,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009174311926605505,
13
+ "grad_norm": 0.01852019689977169,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 0.1499,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.009174311926605505,
20
+ "eval_loss": 0.06453218311071396,
21
+ "eval_runtime": 43.1581,
22
+ "eval_samples_per_second": 4.009,
23
+ "eval_steps_per_second": 2.016,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.01834862385321101,
28
+ "grad_norm": 0.02213262766599655,
29
+ "learning_rate": 2.0000000000000003e-06,
30
+ "loss": 0.1596,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 0.027522935779816515,
35
+ "grad_norm": 0.045894358307123184,
36
+ "learning_rate": 3e-06,
37
+ "loss": 0.1794,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.03669724770642202,
42
+ "grad_norm": 0.01868272013962269,
43
+ "learning_rate": 4.000000000000001e-06,
44
+ "loss": 0.1592,
45
+ "step": 4
46
+ },
47
+ {
48
+ "epoch": 0.045871559633027525,
49
+ "grad_norm": 0.017406364902853966,
50
+ "learning_rate": 5e-06,
51
+ "loss": 0.1696,
52
+ "step": 5
53
+ },
54
+ {
55
+ "epoch": 0.05504587155963303,
56
+ "grad_norm": 0.01861901767551899,
57
+ "learning_rate": 6e-06,
58
+ "loss": 0.1438,
59
+ "step": 6
60
+ },
61
+ {
62
+ "epoch": 0.06422018348623854,
63
+ "grad_norm": 0.020326999947428703,
64
+ "learning_rate": 7e-06,
65
+ "loss": 0.1627,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.07339449541284404,
70
+ "grad_norm": 0.02542084828019142,
71
+ "learning_rate": 8.000000000000001e-06,
72
+ "loss": 0.1558,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.08256880733944955,
77
+ "grad_norm": 0.022425655275583267,
78
+ "learning_rate": 9e-06,
79
+ "loss": 0.1517,
80
+ "step": 9
81
+ },
82
+ {
83
+ "epoch": 0.09174311926605505,
84
+ "grad_norm": 0.023916274309158325,
85
+ "learning_rate": 1e-05,
86
+ "loss": 0.1672,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.10091743119266056,
91
+ "grad_norm": 0.022392459213733673,
92
+ "learning_rate": 1.1000000000000001e-05,
93
+ "loss": 0.1504,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.11009174311926606,
98
+ "grad_norm": 0.02520025707781315,
99
+ "learning_rate": 1.2e-05,
100
+ "loss": 0.1749,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.11926605504587157,
105
+ "grad_norm": 0.028882062062621117,
106
+ "learning_rate": 1.3000000000000001e-05,
107
+ "loss": 0.1705,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.12844036697247707,
112
+ "grad_norm": 0.03628265857696533,
113
+ "learning_rate": 1.4e-05,
114
+ "loss": 0.166,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.13761467889908258,
119
+ "grad_norm": 0.02980518713593483,
120
+ "learning_rate": 1.5000000000000002e-05,
121
+ "loss": 0.1238,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.14678899082568808,
126
+ "grad_norm": 0.028387300670146942,
127
+ "learning_rate": 1.6000000000000003e-05,
128
+ "loss": 0.1326,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.1559633027522936,
133
+ "grad_norm": 0.03367001935839653,
134
+ "learning_rate": 1.7e-05,
135
+ "loss": 0.1347,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.1651376146788991,
140
+ "grad_norm": 0.03655758127570152,
141
+ "learning_rate": 1.8e-05,
142
+ "loss": 0.1423,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.1743119266055046,
147
+ "grad_norm": 0.04000673070549965,
148
+ "learning_rate": 1.9e-05,
149
+ "loss": 0.1651,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.1834862385321101,
154
+ "grad_norm": 0.03844478353857994,
155
+ "learning_rate": 2e-05,
156
+ "loss": 0.1649,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.1926605504587156,
161
+ "grad_norm": 0.04334944114089012,
162
+ "learning_rate": 1.99994764125734e-05,
163
+ "loss": 0.1292,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 0.2018348623853211,
168
+ "grad_norm": 0.04224175587296486,
169
+ "learning_rate": 1.9997905705122352e-05,
170
+ "loss": 0.1336,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 0.21100917431192662,
175
+ "grad_norm": 0.04647228121757507,
176
+ "learning_rate": 1.9995288042127396e-05,
177
+ "loss": 0.128,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.22018348623853212,
182
+ "grad_norm": 0.04862922057509422,
183
+ "learning_rate": 1.9991623697703613e-05,
184
+ "loss": 0.1611,
185
+ "step": 24
186
+ },
187
+ {
188
+ "epoch": 0.22935779816513763,
189
+ "grad_norm": 0.05552718788385391,
190
+ "learning_rate": 1.998691305557194e-05,
191
+ "loss": 0.1469,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 0.23853211009174313,
196
+ "grad_norm": 0.04372956603765488,
197
+ "learning_rate": 1.9981156609018977e-05,
198
+ "loss": 0.1341,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 0.24770642201834864,
203
+ "grad_norm": 0.04992978647351265,
204
+ "learning_rate": 1.9974354960845326e-05,
205
+ "loss": 0.1464,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 0.25688073394495414,
210
+ "grad_norm": 0.04873019829392433,
211
+ "learning_rate": 1.9966508823302484e-05,
212
+ "loss": 0.1554,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 0.25688073394495414,
217
+ "eval_loss": 0.06217445060610771,
218
+ "eval_runtime": 43.3972,
219
+ "eval_samples_per_second": 3.986,
220
+ "eval_steps_per_second": 2.005,
221
+ "step": 28
222
+ },
223
+ {
224
+ "epoch": 0.26605504587155965,
225
+ "grad_norm": 0.04296933487057686,
226
+ "learning_rate": 1.9957619018018243e-05,
227
+ "loss": 0.1231,
228
+ "step": 29
229
+ },
230
+ {
231
+ "epoch": 0.27522935779816515,
232
+ "grad_norm": 0.06265883892774582,
233
+ "learning_rate": 1.9947686475910656e-05,
234
+ "loss": 0.1292,
235
+ "step": 30
236
+ },
237
+ {
238
+ "epoch": 0.28440366972477066,
239
+ "grad_norm": 0.044797539710998535,
240
+ "learning_rate": 1.9936712237090554e-05,
241
+ "loss": 0.114,
242
+ "step": 31
243
+ },
244
+ {
245
+ "epoch": 0.29357798165137616,
246
+ "grad_norm": 0.05862847715616226,
247
+ "learning_rate": 1.9924697450752636e-05,
248
+ "loss": 0.1215,
249
+ "step": 32
250
+ },
251
+ {
252
+ "epoch": 0.30275229357798167,
253
+ "grad_norm": 0.04938759654760361,
254
+ "learning_rate": 1.991164337505511e-05,
255
+ "loss": 0.0683,
256
+ "step": 33
257
+ },
258
+ {
259
+ "epoch": 0.3119266055045872,
260
+ "grad_norm": 0.07766549289226532,
261
+ "learning_rate": 1.9897551376987948e-05,
262
+ "loss": 0.1051,
263
+ "step": 34
264
+ },
265
+ {
266
+ "epoch": 0.3211009174311927,
267
+ "grad_norm": 0.05474488437175751,
268
+ "learning_rate": 1.9882422932229765e-05,
269
+ "loss": 0.082,
270
+ "step": 35
271
+ },
272
+ {
273
+ "epoch": 0.3302752293577982,
274
+ "grad_norm": 0.04499150812625885,
275
+ "learning_rate": 1.9866259624993246e-05,
276
+ "loss": 0.1135,
277
+ "step": 36
278
+ },
279
+ {
280
+ "epoch": 0.3394495412844037,
281
+ "grad_norm": 0.07329924404621124,
282
+ "learning_rate": 1.9849063147859282e-05,
283
+ "loss": 0.1082,
284
+ "step": 37
285
+ },
286
+ {
287
+ "epoch": 0.3486238532110092,
288
+ "grad_norm": 0.23288355767726898,
289
+ "learning_rate": 1.983083530159971e-05,
290
+ "loss": 0.0899,
291
+ "step": 38
292
+ },
293
+ {
294
+ "epoch": 0.3577981651376147,
295
+ "grad_norm": 0.06561094522476196,
296
+ "learning_rate": 1.9811577994988755e-05,
297
+ "loss": 0.1096,
298
+ "step": 39
299
+ },
300
+ {
301
+ "epoch": 0.3669724770642202,
302
+ "grad_norm": 0.052528683096170425,
303
+ "learning_rate": 1.979129324460314e-05,
304
+ "loss": 0.107,
305
+ "step": 40
306
+ },
307
+ {
308
+ "epoch": 0.3761467889908257,
309
+ "grad_norm": 0.057943329215049744,
310
+ "learning_rate": 1.9769983174610918e-05,
311
+ "loss": 0.1121,
312
+ "step": 41
313
+ },
314
+ {
315
+ "epoch": 0.3853211009174312,
316
+ "grad_norm": 0.05784667655825615,
317
+ "learning_rate": 1.974765001654903e-05,
318
+ "loss": 0.1125,
319
+ "step": 42
320
+ },
321
+ {
322
+ "epoch": 0.3944954128440367,
323
+ "grad_norm": 0.04998760297894478,
324
+ "learning_rate": 1.9724296109089623e-05,
325
+ "loss": 0.0944,
326
+ "step": 43
327
+ },
328
+ {
329
+ "epoch": 0.4036697247706422,
330
+ "grad_norm": 0.056932978332042694,
331
+ "learning_rate": 1.9699923897795165e-05,
332
+ "loss": 0.0758,
333
+ "step": 44
334
+ },
335
+ {
336
+ "epoch": 0.41284403669724773,
337
+ "grad_norm": 0.05268337205052376,
338
+ "learning_rate": 1.9674535934862327e-05,
339
+ "loss": 0.0767,
340
+ "step": 45
341
+ },
342
+ {
343
+ "epoch": 0.42201834862385323,
344
+ "grad_norm": 0.04703257977962494,
345
+ "learning_rate": 1.9648134878854747e-05,
346
+ "loss": 0.076,
347
+ "step": 46
348
+ },
349
+ {
350
+ "epoch": 0.43119266055045874,
351
+ "grad_norm": 0.05632725730538368,
352
+ "learning_rate": 1.9620723494424627e-05,
353
+ "loss": 0.1143,
354
+ "step": 47
355
+ },
356
+ {
357
+ "epoch": 0.44036697247706424,
358
+ "grad_norm": 0.04887419193983078,
359
+ "learning_rate": 1.9592304652023208e-05,
360
+ "loss": 0.096,
361
+ "step": 48
362
+ },
363
+ {
364
+ "epoch": 0.44954128440366975,
365
+ "grad_norm": 0.06641880422830582,
366
+ "learning_rate": 1.9562881327600197e-05,
367
+ "loss": 0.1108,
368
+ "step": 49
369
+ },
370
+ {
371
+ "epoch": 0.45871559633027525,
372
+ "grad_norm": 0.08709923177957535,
373
+ "learning_rate": 1.9532456602292148e-05,
374
+ "loss": 0.0987,
375
+ "step": 50
376
+ },
377
+ {
378
+ "epoch": 0.46788990825688076,
379
+ "grad_norm": 0.06175887584686279,
380
+ "learning_rate": 1.950103366209978e-05,
381
+ "loss": 0.0821,
382
+ "step": 51
383
+ },
384
+ {
385
+ "epoch": 0.47706422018348627,
386
+ "grad_norm": 0.05565601587295532,
387
+ "learning_rate": 1.9468615797554374e-05,
388
+ "loss": 0.0727,
389
+ "step": 52
390
+ },
391
+ {
392
+ "epoch": 0.48623853211009177,
393
+ "grad_norm": 0.13676409423351288,
394
+ "learning_rate": 1.943520640337318e-05,
395
+ "loss": 0.0834,
396
+ "step": 53
397
+ },
398
+ {
399
+ "epoch": 0.4954128440366973,
400
+ "grad_norm": 0.0817922055721283,
401
+ "learning_rate": 1.9400808978103948e-05,
402
+ "loss": 0.0766,
403
+ "step": 54
404
+ },
405
+ {
406
+ "epoch": 0.5045871559633027,
407
+ "grad_norm": 0.04707655310630798,
408
+ "learning_rate": 1.936542712375855e-05,
409
+ "loss": 0.0753,
410
+ "step": 55
411
+ },
412
+ {
413
+ "epoch": 0.5137614678899083,
414
+ "grad_norm": 0.07192892581224442,
415
+ "learning_rate": 1.9329064545435803e-05,
416
+ "loss": 0.0745,
417
+ "step": 56
418
+ },
419
+ {
420
+ "epoch": 0.5137614678899083,
421
+ "eval_loss": 0.057128190994262695,
422
+ "eval_runtime": 43.3864,
423
+ "eval_samples_per_second": 3.987,
424
+ "eval_steps_per_second": 2.005,
425
+ "step": 56
426
+ },
427
+ {
428
+ "epoch": 0.5229357798165137,
429
+ "grad_norm": 0.07009316980838776,
430
+ "learning_rate": 1.929172505093347e-05,
431
+ "loss": 0.0696,
432
+ "step": 57
433
+ },
434
+ {
435
+ "epoch": 0.5321100917431193,
436
+ "grad_norm": 0.07050078362226486,
437
+ "learning_rate": 1.9253412550349507e-05,
438
+ "loss": 0.0846,
439
+ "step": 58
440
+ },
441
+ {
442
+ "epoch": 0.5412844036697247,
443
+ "grad_norm": 0.06861168146133423,
444
+ "learning_rate": 1.9214131055672648e-05,
445
+ "loss": 0.0659,
446
+ "step": 59
447
+ },
448
+ {
449
+ "epoch": 0.5504587155963303,
450
+ "grad_norm": 0.05825705826282501,
451
+ "learning_rate": 1.917388468036222e-05,
452
+ "loss": 0.1173,
453
+ "step": 60
454
+ },
455
+ {
456
+ "epoch": 0.5596330275229358,
457
+ "grad_norm": 0.11085808277130127,
458
+ "learning_rate": 1.913267763891745e-05,
459
+ "loss": 0.0715,
460
+ "step": 61
461
+ },
462
+ {
463
+ "epoch": 0.5688073394495413,
464
+ "grad_norm": 0.0637730062007904,
465
+ "learning_rate": 1.9090514246436085e-05,
466
+ "loss": 0.0936,
467
+ "step": 62
468
+ },
469
+ {
470
+ "epoch": 0.5779816513761468,
471
+ "grad_norm": 0.05786406993865967,
472
+ "learning_rate": 1.904739891816257e-05,
473
+ "loss": 0.0777,
474
+ "step": 63
475
+ },
476
+ {
477
+ "epoch": 0.5871559633027523,
478
+ "grad_norm": 0.09354288130998611,
479
+ "learning_rate": 1.9003336169025655e-05,
480
+ "loss": 0.0913,
481
+ "step": 64
482
+ },
483
+ {
484
+ "epoch": 0.5963302752293578,
485
+ "grad_norm": 0.1899336725473404,
486
+ "learning_rate": 1.8958330613165622e-05,
487
+ "loss": 0.0761,
488
+ "step": 65
489
+ },
490
+ {
491
+ "epoch": 0.6055045871559633,
492
+ "grad_norm": 0.05038751661777496,
493
+ "learning_rate": 1.891238696345111e-05,
494
+ "loss": 0.078,
495
+ "step": 66
496
+ },
497
+ {
498
+ "epoch": 0.6146788990825688,
499
+ "grad_norm": 0.16387197375297546,
500
+ "learning_rate": 1.8865510030985588e-05,
501
+ "loss": 0.0917,
502
+ "step": 67
503
+ },
504
+ {
505
+ "epoch": 0.6238532110091743,
506
+ "grad_norm": 0.054418135434389114,
507
+ "learning_rate": 1.8817704724603536e-05,
508
+ "loss": 0.0792,
509
+ "step": 68
510
+ },
511
+ {
512
+ "epoch": 0.6330275229357798,
513
+ "grad_norm": 0.0637287050485611,
514
+ "learning_rate": 1.8768976050356428e-05,
515
+ "loss": 0.0731,
516
+ "step": 69
517
+ },
518
+ {
519
+ "epoch": 0.6422018348623854,
520
+ "grad_norm": 0.27382200956344604,
521
+ "learning_rate": 1.8719329110988487e-05,
522
+ "loss": 0.0745,
523
+ "step": 70
524
+ },
525
+ {
526
+ "epoch": 0.6513761467889908,
527
+ "grad_norm": 0.05016474053263664,
528
+ "learning_rate": 1.8668769105402366e-05,
529
+ "loss": 0.0739,
530
+ "step": 71
531
+ },
532
+ {
533
+ "epoch": 0.6605504587155964,
534
+ "grad_norm": 0.08013670146465302,
535
+ "learning_rate": 1.8617301328114704e-05,
536
+ "loss": 0.1024,
537
+ "step": 72
538
+ },
539
+ {
540
+ "epoch": 0.6697247706422018,
541
+ "grad_norm": 0.06279715895652771,
542
+ "learning_rate": 1.8564931168701713e-05,
543
+ "loss": 0.0971,
544
+ "step": 73
545
+ },
546
+ {
547
+ "epoch": 0.6788990825688074,
548
+ "grad_norm": 0.048747751861810684,
549
+ "learning_rate": 1.85116641112348e-05,
550
+ "loss": 0.0657,
551
+ "step": 74
552
+ },
553
+ {
554
+ "epoch": 0.6880733944954128,
555
+ "grad_norm": 0.05987577140331268,
556
+ "learning_rate": 1.845750573370626e-05,
557
+ "loss": 0.1191,
558
+ "step": 75
559
+ },
560
+ {
561
+ "epoch": 0.6972477064220184,
562
+ "grad_norm": 0.08020060509443283,
563
+ "learning_rate": 1.8402461707445206e-05,
564
+ "loss": 0.1112,
565
+ "step": 76
566
+ },
567
+ {
568
+ "epoch": 0.7064220183486238,
569
+ "grad_norm": 0.05764961615204811,
570
+ "learning_rate": 1.8346537796523643e-05,
571
+ "loss": 0.1049,
572
+ "step": 77
573
+ },
574
+ {
575
+ "epoch": 0.7155963302752294,
576
+ "grad_norm": 0.08143055438995361,
577
+ "learning_rate": 1.8289739857152903e-05,
578
+ "loss": 0.0871,
579
+ "step": 78
580
+ },
581
+ {
582
+ "epoch": 0.7247706422018348,
583
+ "grad_norm": 0.08280878514051437,
584
+ "learning_rate": 1.823207383707036e-05,
585
+ "loss": 0.0724,
586
+ "step": 79
587
+ },
588
+ {
589
+ "epoch": 0.7339449541284404,
590
+ "grad_norm": 0.08486371487379074,
591
+ "learning_rate": 1.8173545774916628e-05,
592
+ "loss": 0.068,
593
+ "step": 80
594
+ },
595
+ {
596
+ "epoch": 0.7431192660550459,
597
+ "grad_norm": 0.061856675893068314,
598
+ "learning_rate": 1.8114161799603195e-05,
599
+ "loss": 0.0786,
600
+ "step": 81
601
+ },
602
+ {
603
+ "epoch": 0.7522935779816514,
604
+ "grad_norm": 0.05205192044377327,
605
+ "learning_rate": 1.8053928129670624e-05,
606
+ "loss": 0.0755,
607
+ "step": 82
608
+ },
609
+ {
610
+ "epoch": 0.7614678899082569,
611
+ "grad_norm": 0.07000340521335602,
612
+ "learning_rate": 1.7992851072637366e-05,
613
+ "loss": 0.1239,
614
+ "step": 83
615
+ },
616
+ {
617
+ "epoch": 0.7706422018348624,
618
+ "grad_norm": 0.07025006413459778,
619
+ "learning_rate": 1.793093702433924e-05,
620
+ "loss": 0.0616,
621
+ "step": 84
622
+ },
623
+ {
624
+ "epoch": 0.7706422018348624,
625
+ "eval_loss": 0.055874165147542953,
626
+ "eval_runtime": 43.2177,
627
+ "eval_samples_per_second": 4.003,
628
+ "eval_steps_per_second": 2.013,
629
+ "step": 84
630
+ },
631
+ {
632
+ "epoch": 0.7798165137614679,
633
+ "grad_norm": 0.06147678196430206,
634
+ "learning_rate": 1.7868192468259686e-05,
635
+ "loss": 0.0917,
636
+ "step": 85
637
+ },
638
+ {
639
+ "epoch": 0.7889908256880734,
640
+ "grad_norm": 0.045870471745729446,
641
+ "learning_rate": 1.7804623974850844e-05,
642
+ "loss": 0.092,
643
+ "step": 86
644
+ },
645
+ {
646
+ "epoch": 0.7981651376146789,
647
+ "grad_norm": 0.07622863352298737,
648
+ "learning_rate": 1.7740238200845485e-05,
649
+ "loss": 0.0983,
650
+ "step": 87
651
+ },
652
+ {
653
+ "epoch": 0.8073394495412844,
654
+ "grad_norm": 0.08215321600437164,
655
+ "learning_rate": 1.7675041888559952e-05,
656
+ "loss": 0.0971,
657
+ "step": 88
658
+ },
659
+ {
660
+ "epoch": 0.8165137614678899,
661
+ "grad_norm": 0.06286073476076126,
662
+ "learning_rate": 1.7609041865188122e-05,
663
+ "loss": 0.0876,
664
+ "step": 89
665
+ },
666
+ {
667
+ "epoch": 0.8256880733944955,
668
+ "grad_norm": 0.06399139016866684,
669
+ "learning_rate": 1.754224504208647e-05,
670
+ "loss": 0.1166,
671
+ "step": 90
672
+ },
673
+ {
674
+ "epoch": 0.8348623853211009,
675
+ "grad_norm": 0.07115256041288376,
676
+ "learning_rate": 1.7474658414050344e-05,
677
+ "loss": 0.119,
678
+ "step": 91
679
+ },
680
+ {
681
+ "epoch": 0.8440366972477065,
682
+ "grad_norm": 0.04867429658770561,
683
+ "learning_rate": 1.7406289058581466e-05,
684
+ "loss": 0.0609,
685
+ "step": 92
686
+ },
687
+ {
688
+ "epoch": 0.8532110091743119,
689
+ "grad_norm": 0.1833045333623886,
690
+ "learning_rate": 1.7337144135146818e-05,
691
+ "loss": 0.1247,
692
+ "step": 93
693
+ },
694
+ {
695
+ "epoch": 0.8623853211009175,
696
+ "grad_norm": 0.07023169845342636,
697
+ "learning_rate": 1.7267230884428905e-05,
698
+ "loss": 0.1271,
699
+ "step": 94
700
+ },
701
+ {
702
+ "epoch": 0.8715596330275229,
703
+ "grad_norm": 0.044201672077178955,
704
+ "learning_rate": 1.719655662756753e-05,
705
+ "loss": 0.0661,
706
+ "step": 95
707
+ },
708
+ {
709
+ "epoch": 0.8807339449541285,
710
+ "grad_norm": 0.05139552056789398,
711
+ "learning_rate": 1.7125128765393157e-05,
712
+ "loss": 0.1065,
713
+ "step": 96
714
+ },
715
+ {
716
+ "epoch": 0.8899082568807339,
717
+ "grad_norm": 0.06401531398296356,
718
+ "learning_rate": 1.705295477765188e-05,
719
+ "loss": 0.1006,
720
+ "step": 97
721
+ },
722
+ {
723
+ "epoch": 0.8990825688073395,
724
+ "grad_norm": 0.04898101091384888,
725
+ "learning_rate": 1.6980042222222216e-05,
726
+ "loss": 0.0522,
727
+ "step": 98
728
+ },
729
+ {
730
+ "epoch": 0.908256880733945,
731
+ "grad_norm": 0.053434859961271286,
732
+ "learning_rate": 1.690639873432361e-05,
733
+ "loss": 0.1214,
734
+ "step": 99
735
+ },
736
+ {
737
+ "epoch": 0.9174311926605505,
738
+ "grad_norm": 0.058289702981710434,
739
+ "learning_rate": 1.683203202571692e-05,
740
+ "loss": 0.0547,
741
+ "step": 100
742
+ },
743
+ {
744
+ "epoch": 0.926605504587156,
745
+ "grad_norm": 0.06972479820251465,
746
+ "learning_rate": 1.6756949883896874e-05,
747
+ "loss": 0.0867,
748
+ "step": 101
749
+ },
750
+ {
751
+ "epoch": 0.9357798165137615,
752
+ "grad_norm": 0.05255963280797005,
753
+ "learning_rate": 1.668116017127655e-05,
754
+ "loss": 0.0685,
755
+ "step": 102
756
+ },
757
+ {
758
+ "epoch": 0.944954128440367,
759
+ "grad_norm": 0.04849875345826149,
760
+ "learning_rate": 1.6604670824364067e-05,
761
+ "loss": 0.074,
762
+ "step": 103
763
+ },
764
+ {
765
+ "epoch": 0.9541284403669725,
766
+ "grad_norm": 0.08504751324653625,
767
+ "learning_rate": 1.652748985293149e-05,
768
+ "loss": 0.0901,
769
+ "step": 104
770
+ },
771
+ {
772
+ "epoch": 0.963302752293578,
773
+ "grad_norm": 0.07853557169437408,
774
+ "learning_rate": 1.6449625339176056e-05,
775
+ "loss": 0.0663,
776
+ "step": 105
777
+ },
778
+ {
779
+ "epoch": 0.9724770642201835,
780
+ "grad_norm": 0.0786280408501625,
781
+ "learning_rate": 1.6371085436873847e-05,
782
+ "loss": 0.0936,
783
+ "step": 106
784
+ },
785
+ {
786
+ "epoch": 0.981651376146789,
787
+ "grad_norm": 0.09738124907016754,
788
+ "learning_rate": 1.6291878370525925e-05,
789
+ "loss": 0.1001,
790
+ "step": 107
791
+ },
792
+ {
793
+ "epoch": 0.9908256880733946,
794
+ "grad_norm": 0.061633653938770294,
795
+ "learning_rate": 1.6212012434497103e-05,
796
+ "loss": 0.0728,
797
+ "step": 108
798
+ },
799
+ {
800
+ "epoch": 1.0,
801
+ "grad_norm": 0.06726932525634766,
802
+ "learning_rate": 1.6131495992147363e-05,
803
+ "loss": 0.0837,
804
+ "step": 109
805
+ },
806
+ {
807
+ "epoch": 1.0091743119266054,
808
+ "grad_norm": 0.05419926717877388,
809
+ "learning_rate": 1.605033747495607e-05,
810
+ "loss": 0.0841,
811
+ "step": 110
812
+ },
813
+ {
814
+ "epoch": 1.018348623853211,
815
+ "grad_norm": 0.056966476142406464,
816
+ "learning_rate": 1.596854538163906e-05,
817
+ "loss": 0.0774,
818
+ "step": 111
819
+ },
820
+ {
821
+ "epoch": 1.0275229357798166,
822
+ "grad_norm": 0.05710803344845772,
823
+ "learning_rate": 1.5886128277258665e-05,
824
+ "loss": 0.0645,
825
+ "step": 112
826
+ },
827
+ {
828
+ "epoch": 1.0275229357798166,
829
+ "eval_loss": 0.05474008619785309,
830
+ "eval_runtime": 43.3773,
831
+ "eval_samples_per_second": 3.988,
832
+ "eval_steps_per_second": 2.006,
833
+ "step": 112
834
+ },
835
+ {
836
+ "epoch": 1.036697247706422,
837
+ "grad_norm": 0.05264132842421532,
838
+ "learning_rate": 1.58030947923268e-05,
839
+ "loss": 0.0615,
840
+ "step": 113
841
+ },
842
+ {
843
+ "epoch": 1.0458715596330275,
844
+ "grad_norm": 0.11162517964839935,
845
+ "learning_rate": 1.571945362190121e-05,
846
+ "loss": 0.13,
847
+ "step": 114
848
+ },
849
+ {
850
+ "epoch": 1.0550458715596331,
851
+ "grad_norm": 0.05422775819897652,
852
+ "learning_rate": 1.563521352467493e-05,
853
+ "loss": 0.0677,
854
+ "step": 115
855
+ },
856
+ {
857
+ "epoch": 1.0642201834862386,
858
+ "grad_norm": 0.08082108199596405,
859
+ "learning_rate": 1.55503833220591e-05,
860
+ "loss": 0.0958,
861
+ "step": 116
862
+ },
863
+ {
864
+ "epoch": 1.073394495412844,
865
+ "grad_norm": 0.0667729526758194,
866
+ "learning_rate": 1.546497189725922e-05,
867
+ "loss": 0.1132,
868
+ "step": 117
869
+ },
870
+ {
871
+ "epoch": 1.0825688073394495,
872
+ "grad_norm": 0.06297166645526886,
873
+ "learning_rate": 1.5378988194344913e-05,
874
+ "loss": 0.0646,
875
+ "step": 118
876
+ },
877
+ {
878
+ "epoch": 1.091743119266055,
879
+ "grad_norm": 0.05654435604810715,
880
+ "learning_rate": 1.5292441217313324e-05,
881
+ "loss": 0.1083,
882
+ "step": 119
883
+ },
884
+ {
885
+ "epoch": 1.1009174311926606,
886
+ "grad_norm": 0.054773300886154175,
887
+ "learning_rate": 1.5205340029146256e-05,
888
+ "loss": 0.0833,
889
+ "step": 120
890
+ },
891
+ {
892
+ "epoch": 1.110091743119266,
893
+ "grad_norm": 0.0510234571993351,
894
+ "learning_rate": 1.5117693750861096e-05,
895
+ "loss": 0.0792,
896
+ "step": 121
897
+ },
898
+ {
899
+ "epoch": 1.1192660550458715,
900
+ "grad_norm": 0.054006725549697876,
901
+ "learning_rate": 1.5029511560555707e-05,
902
+ "loss": 0.1066,
903
+ "step": 122
904
+ },
905
+ {
906
+ "epoch": 1.1284403669724772,
907
+ "grad_norm": 0.06665553152561188,
908
+ "learning_rate": 1.4940802692447306e-05,
909
+ "loss": 0.0584,
910
+ "step": 123
911
+ },
912
+ {
913
+ "epoch": 1.1376146788990826,
914
+ "grad_norm": 0.105413518846035,
915
+ "learning_rate": 1.4851576435905489e-05,
916
+ "loss": 0.0907,
917
+ "step": 124
918
+ },
919
+ {
920
+ "epoch": 1.146788990825688,
921
+ "grad_norm": 0.0626402273774147,
922
+ "learning_rate": 1.4761842134479463e-05,
923
+ "loss": 0.0544,
924
+ "step": 125
925
+ },
926
+ {
927
+ "epoch": 1.1559633027522935,
928
+ "grad_norm": 0.06280255317687988,
929
+ "learning_rate": 1.4671609184919622e-05,
930
+ "loss": 0.0487,
931
+ "step": 126
932
+ },
933
+ {
934
+ "epoch": 1.165137614678899,
935
+ "grad_norm": 0.07118494808673859,
936
+ "learning_rate": 1.4580887036193539e-05,
937
+ "loss": 0.0797,
938
+ "step": 127
939
+ },
940
+ {
941
+ "epoch": 1.1743119266055047,
942
+ "grad_norm": 0.0436442606151104,
943
+ "learning_rate": 1.4489685188496488e-05,
944
+ "loss": 0.0568,
945
+ "step": 128
946
+ },
947
+ {
948
+ "epoch": 1.18348623853211,
949
+ "grad_norm": 0.06852032989263535,
950
+ "learning_rate": 1.4398013192256615e-05,
951
+ "loss": 0.1099,
952
+ "step": 129
953
+ },
954
+ {
955
+ "epoch": 1.1926605504587156,
956
+ "grad_norm": 0.05673675611615181,
957
+ "learning_rate": 1.4305880647134847e-05,
958
+ "loss": 0.0649,
959
+ "step": 130
960
+ },
961
+ {
962
+ "epoch": 1.2018348623853212,
963
+ "grad_norm": 0.05407770350575447,
964
+ "learning_rate": 1.4213297201019618e-05,
965
+ "loss": 0.0582,
966
+ "step": 131
967
+ },
968
+ {
969
+ "epoch": 1.2110091743119267,
970
+ "grad_norm": 0.06976728141307831,
971
+ "learning_rate": 1.4120272549016591e-05,
972
+ "loss": 0.0744,
973
+ "step": 132
974
+ },
975
+ {
976
+ "epoch": 1.2201834862385321,
977
+ "grad_norm": 0.044198598712682724,
978
+ "learning_rate": 1.40268164324334e-05,
979
+ "loss": 0.0456,
980
+ "step": 133
981
+ },
982
+ {
983
+ "epoch": 1.2293577981651376,
984
+ "grad_norm": 0.052981797605752945,
985
+ "learning_rate": 1.3932938637759555e-05,
986
+ "loss": 0.067,
987
+ "step": 134
988
+ },
989
+ {
990
+ "epoch": 1.238532110091743,
991
+ "grad_norm": 0.051931336522102356,
992
+ "learning_rate": 1.3838648995641645e-05,
993
+ "loss": 0.0662,
994
+ "step": 135
995
+ },
996
+ {
997
+ "epoch": 1.2477064220183487,
998
+ "grad_norm": 0.06196126341819763,
999
+ "learning_rate": 1.3743957379853885e-05,
1000
+ "loss": 0.0742,
1001
+ "step": 136
1002
+ },
1003
+ {
1004
+ "epoch": 1.2568807339449541,
1005
+ "grad_norm": 0.0772649347782135,
1006
+ "learning_rate": 1.3648873706264159e-05,
1007
+ "loss": 0.1016,
1008
+ "step": 137
1009
+ },
1010
+ {
1011
+ "epoch": 1.2660550458715596,
1012
+ "grad_norm": 0.06751381605863571,
1013
+ "learning_rate": 1.3553407931795662e-05,
1014
+ "loss": 0.0871,
1015
+ "step": 138
1016
+ },
1017
+ {
1018
+ "epoch": 1.2752293577981653,
1019
+ "grad_norm": 0.045472558587789536,
1020
+ "learning_rate": 1.3457570053384225e-05,
1021
+ "loss": 0.0548,
1022
+ "step": 139
1023
+ },
1024
+ {
1025
+ "epoch": 1.2844036697247707,
1026
+ "grad_norm": 0.061834368854761124,
1027
+ "learning_rate": 1.3361370106931486e-05,
1028
+ "loss": 0.0601,
1029
+ "step": 140
1030
+ },
1031
+ {
1032
+ "epoch": 1.2844036697247707,
1033
+ "eval_loss": 0.05419979989528656,
1034
+ "eval_runtime": 43.2558,
1035
+ "eval_samples_per_second": 3.999,
1036
+ "eval_steps_per_second": 2.011,
1037
+ "step": 140
1038
+ },
1039
+ {
1040
+ "epoch": 1.2935779816513762,
1041
+ "grad_norm": 0.062169574201107025,
1042
+ "learning_rate": 1.3264818166253917e-05,
1043
+ "loss": 0.0899,
1044
+ "step": 141
1045
+ },
1046
+ {
1047
+ "epoch": 1.3027522935779816,
1048
+ "grad_norm": 0.051288578659296036,
1049
+ "learning_rate": 1.3167924342027947e-05,
1050
+ "loss": 0.0758,
1051
+ "step": 142
1052
+ },
1053
+ {
1054
+ "epoch": 1.311926605504587,
1055
+ "grad_norm": 0.052053675055503845,
1056
+ "learning_rate": 1.3070698780731194e-05,
1057
+ "loss": 0.0989,
1058
+ "step": 143
1059
+ },
1060
+ {
1061
+ "epoch": 1.3211009174311927,
1062
+ "grad_norm": 0.05368887633085251,
1063
+ "learning_rate": 1.2973151663579948e-05,
1064
+ "loss": 0.0599,
1065
+ "step": 144
1066
+ },
1067
+ {
1068
+ "epoch": 1.3302752293577982,
1069
+ "grad_norm": 0.06439553946256638,
1070
+ "learning_rate": 1.2875293205463018e-05,
1071
+ "loss": 0.0808,
1072
+ "step": 145
1073
+ },
1074
+ {
1075
+ "epoch": 1.3394495412844036,
1076
+ "grad_norm": 0.05169299989938736,
1077
+ "learning_rate": 1.277713365387205e-05,
1078
+ "loss": 0.0668,
1079
+ "step": 146
1080
+ },
1081
+ {
1082
+ "epoch": 1.3486238532110093,
1083
+ "grad_norm": 0.0563080795109272,
1084
+ "learning_rate": 1.2678683287828451e-05,
1085
+ "loss": 0.0888,
1086
+ "step": 147
1087
+ },
1088
+ {
1089
+ "epoch": 1.3577981651376148,
1090
+ "grad_norm": 0.06615187972784042,
1091
+ "learning_rate": 1.257995241680698e-05,
1092
+ "loss": 0.123,
1093
+ "step": 148
1094
+ },
1095
+ {
1096
+ "epoch": 1.3669724770642202,
1097
+ "grad_norm": 0.0562995970249176,
1098
+ "learning_rate": 1.2480951379656175e-05,
1099
+ "loss": 0.0709,
1100
+ "step": 149
1101
+ },
1102
+ {
1103
+ "epoch": 1.3761467889908257,
1104
+ "grad_norm": 0.06163397431373596,
1105
+ "learning_rate": 1.2381690543515692e-05,
1106
+ "loss": 0.0663,
1107
+ "step": 150
1108
+ },
1109
+ {
1110
+ "epoch": 1.385321100917431,
1111
+ "grad_norm": 0.053988683968782425,
1112
+ "learning_rate": 1.2282180302730683e-05,
1113
+ "loss": 0.0812,
1114
+ "step": 151
1115
+ },
1116
+ {
1117
+ "epoch": 1.3944954128440368,
1118
+ "grad_norm": 0.062011417001485825,
1119
+ "learning_rate": 1.2182431077763317e-05,
1120
+ "loss": 0.0821,
1121
+ "step": 152
1122
+ },
1123
+ {
1124
+ "epoch": 1.4036697247706422,
1125
+ "grad_norm": 0.054933421313762665,
1126
+ "learning_rate": 1.2082453314101607e-05,
1127
+ "loss": 0.0726,
1128
+ "step": 153
1129
+ },
1130
+ {
1131
+ "epoch": 1.4128440366972477,
1132
+ "grad_norm": 0.0531560480594635,
1133
+ "learning_rate": 1.1982257481165547e-05,
1134
+ "loss": 0.052,
1135
+ "step": 154
1136
+ },
1137
+ {
1138
+ "epoch": 1.4220183486238533,
1139
+ "grad_norm": 0.0668332502245903,
1140
+ "learning_rate": 1.1881854071210805e-05,
1141
+ "loss": 0.0758,
1142
+ "step": 155
1143
+ },
1144
+ {
1145
+ "epoch": 1.4311926605504588,
1146
+ "grad_norm": 0.06946220993995667,
1147
+ "learning_rate": 1.1781253598229982e-05,
1148
+ "loss": 0.0702,
1149
+ "step": 156
1150
+ },
1151
+ {
1152
+ "epoch": 1.4403669724770642,
1153
+ "grad_norm": 0.06085884943604469,
1154
+ "learning_rate": 1.1680466596851635e-05,
1155
+ "loss": 0.0727,
1156
+ "step": 157
1157
+ },
1158
+ {
1159
+ "epoch": 1.4495412844036697,
1160
+ "grad_norm": 0.04978602007031441,
1161
+ "learning_rate": 1.1579503621237102e-05,
1162
+ "loss": 0.077,
1163
+ "step": 158
1164
+ },
1165
+ {
1166
+ "epoch": 1.4587155963302751,
1167
+ "grad_norm": 0.05822195112705231,
1168
+ "learning_rate": 1.1478375243975298e-05,
1169
+ "loss": 0.0769,
1170
+ "step": 159
1171
+ },
1172
+ {
1173
+ "epoch": 1.4678899082568808,
1174
+ "grad_norm": 0.06906376779079437,
1175
+ "learning_rate": 1.1377092054975586e-05,
1176
+ "loss": 0.094,
1177
+ "step": 160
1178
+ },
1179
+ {
1180
+ "epoch": 1.4770642201834863,
1181
+ "grad_norm": 0.0629039779305458,
1182
+ "learning_rate": 1.1275664660358818e-05,
1183
+ "loss": 0.0491,
1184
+ "step": 161
1185
+ },
1186
+ {
1187
+ "epoch": 1.4862385321100917,
1188
+ "grad_norm": 0.05990944802761078,
1189
+ "learning_rate": 1.1174103681346711e-05,
1190
+ "loss": 0.0514,
1191
+ "step": 162
1192
+ },
1193
+ {
1194
+ "epoch": 1.4954128440366974,
1195
+ "grad_norm": 0.055665481835603714,
1196
+ "learning_rate": 1.1072419753149585e-05,
1197
+ "loss": 0.0652,
1198
+ "step": 163
1199
+ },
1200
+ {
1201
+ "epoch": 1.5045871559633026,
1202
+ "grad_norm": 0.04991630092263222,
1203
+ "learning_rate": 1.0970623523852699e-05,
1204
+ "loss": 0.0576,
1205
+ "step": 164
1206
+ },
1207
+ {
1208
+ "epoch": 1.5137614678899083,
1209
+ "grad_norm": 0.052877284586429596,
1210
+ "learning_rate": 1.0868725653301206e-05,
1211
+ "loss": 0.075,
1212
+ "step": 165
1213
+ },
1214
+ {
1215
+ "epoch": 1.5229357798165137,
1216
+ "grad_norm": 0.0749133750796318,
1217
+ "learning_rate": 1.0766736811983864e-05,
1218
+ "loss": 0.092,
1219
+ "step": 166
1220
+ },
1221
+ {
1222
+ "epoch": 1.5321100917431192,
1223
+ "grad_norm": 0.050854723900556564,
1224
+ "learning_rate": 1.066466767991567e-05,
1225
+ "loss": 0.0282,
1226
+ "step": 167
1227
+ },
1228
+ {
1229
+ "epoch": 1.5412844036697249,
1230
+ "grad_norm": 0.05233798176050186,
1231
+ "learning_rate": 1.0562528945519463e-05,
1232
+ "loss": 0.0688,
1233
+ "step": 168
1234
+ },
1235
+ {
1236
+ "epoch": 1.5412844036697249,
1237
+ "eval_loss": 0.05367153137922287,
1238
+ "eval_runtime": 43.3753,
1239
+ "eval_samples_per_second": 3.988,
1240
+ "eval_steps_per_second": 2.006,
1241
+ "step": 168
1242
+ },
1243
+ {
1244
+ "epoch": 1.5504587155963303,
1245
+ "grad_norm": 0.06258448958396912,
1246
+ "learning_rate": 1.0460331304506658e-05,
1247
+ "loss": 0.0601,
1248
+ "step": 169
1249
+ },
1250
+ {
1251
+ "epoch": 1.5596330275229358,
1252
+ "grad_norm": 0.05399218574166298,
1253
+ "learning_rate": 1.0358085458757233e-05,
1254
+ "loss": 0.0642,
1255
+ "step": 170
1256
+ },
1257
+ {
1258
+ "epoch": 1.5688073394495414,
1259
+ "grad_norm": 0.052848368883132935,
1260
+ "learning_rate": 1.0255802115199034e-05,
1261
+ "loss": 0.054,
1262
+ "step": 171
1263
+ },
1264
+ {
1265
+ "epoch": 1.5779816513761467,
1266
+ "grad_norm": 0.054908327758312225,
1267
+ "learning_rate": 1.0153491984686595e-05,
1268
+ "loss": 0.069,
1269
+ "step": 172
1270
+ },
1271
+ {
1272
+ "epoch": 1.5871559633027523,
1273
+ "grad_norm": 0.08597759157419205,
1274
+ "learning_rate": 1.0051165780879503e-05,
1275
+ "loss": 0.0596,
1276
+ "step": 173
1277
+ },
1278
+ {
1279
+ "epoch": 1.5963302752293578,
1280
+ "grad_norm": 0.06400232017040253,
1281
+ "learning_rate": 9.9488342191205e-06,
1282
+ "loss": 0.0959,
1283
+ "step": 174
1284
+ },
1285
+ {
1286
+ "epoch": 1.6055045871559632,
1287
+ "grad_norm": 0.06301407516002655,
1288
+ "learning_rate": 9.846508015313407e-06,
1289
+ "loss": 0.0863,
1290
+ "step": 175
1291
+ },
1292
+ {
1293
+ "epoch": 1.614678899082569,
1294
+ "grad_norm": 0.09353712201118469,
1295
+ "learning_rate": 9.744197884800968e-06,
1296
+ "loss": 0.0835,
1297
+ "step": 176
1298
+ },
1299
+ {
1300
+ "epoch": 1.6238532110091743,
1301
+ "grad_norm": 0.14486946165561676,
1302
+ "learning_rate": 9.64191454124277e-06,
1303
+ "loss": 0.0786,
1304
+ "step": 177
1305
+ },
1306
+ {
1307
+ "epoch": 1.6330275229357798,
1308
+ "grad_norm": 0.06457175314426422,
1309
+ "learning_rate": 9.539668695493344e-06,
1310
+ "loss": 0.0752,
1311
+ "step": 178
1312
+ },
1313
+ {
1314
+ "epoch": 1.6422018348623855,
1315
+ "grad_norm": 0.059133633971214294,
1316
+ "learning_rate": 9.43747105448054e-06,
1317
+ "loss": 0.0747,
1318
+ "step": 179
1319
+ },
1320
+ {
1321
+ "epoch": 1.6513761467889907,
1322
+ "grad_norm": 0.04722464829683304,
1323
+ "learning_rate": 9.335332320084331e-06,
1324
+ "loss": 0.0503,
1325
+ "step": 180
1326
+ },
1327
+ {
1328
+ "epoch": 1.6605504587155964,
1329
+ "grad_norm": 0.06446046382188797,
1330
+ "learning_rate": 9.233263188016138e-06,
1331
+ "loss": 0.0794,
1332
+ "step": 181
1333
+ },
1334
+ {
1335
+ "epoch": 1.6697247706422018,
1336
+ "grad_norm": 0.05856352299451828,
1337
+ "learning_rate": 9.131274346698797e-06,
1338
+ "loss": 0.0917,
1339
+ "step": 182
1340
+ },
1341
+ {
1342
+ "epoch": 1.6788990825688073,
1343
+ "grad_norm": 0.050151705741882324,
1344
+ "learning_rate": 9.029376476147303e-06,
1345
+ "loss": 0.0534,
1346
+ "step": 183
1347
+ },
1348
+ {
1349
+ "epoch": 1.688073394495413,
1350
+ "grad_norm": 0.11409275978803635,
1351
+ "learning_rate": 8.927580246850418e-06,
1352
+ "loss": 0.0579,
1353
+ "step": 184
1354
+ },
1355
+ {
1356
+ "epoch": 1.6972477064220184,
1357
+ "grad_norm": 0.04147953912615776,
1358
+ "learning_rate": 8.825896318653294e-06,
1359
+ "loss": 0.0596,
1360
+ "step": 185
1361
+ },
1362
+ {
1363
+ "epoch": 1.7064220183486238,
1364
+ "grad_norm": 0.06895549595355988,
1365
+ "learning_rate": 8.724335339641185e-06,
1366
+ "loss": 0.1267,
1367
+ "step": 186
1368
+ },
1369
+ {
1370
+ "epoch": 1.7155963302752295,
1371
+ "grad_norm": 0.07597438991069794,
1372
+ "learning_rate": 8.622907945024418e-06,
1373
+ "loss": 0.0672,
1374
+ "step": 187
1375
+ },
1376
+ {
1377
+ "epoch": 1.7247706422018347,
1378
+ "grad_norm": 0.0447760745882988,
1379
+ "learning_rate": 8.521624756024706e-06,
1380
+ "loss": 0.0619,
1381
+ "step": 188
1382
+ },
1383
+ {
1384
+ "epoch": 1.7339449541284404,
1385
+ "grad_norm": 0.054984625428915024,
1386
+ "learning_rate": 8.420496378762901e-06,
1387
+ "loss": 0.0665,
1388
+ "step": 189
1389
+ },
1390
+ {
1391
+ "epoch": 1.7431192660550459,
1392
+ "grad_norm": 0.06113699823617935,
1393
+ "learning_rate": 8.319533403148368e-06,
1394
+ "loss": 0.0755,
1395
+ "step": 190
1396
+ },
1397
+ {
1398
+ "epoch": 1.7522935779816513,
1399
+ "grad_norm": 0.056827329099178314,
1400
+ "learning_rate": 8.218746401770021e-06,
1401
+ "loss": 0.0674,
1402
+ "step": 191
1403
+ },
1404
+ {
1405
+ "epoch": 1.761467889908257,
1406
+ "grad_norm": 0.06809786707162857,
1407
+ "learning_rate": 8.118145928789198e-06,
1408
+ "loss": 0.0595,
1409
+ "step": 192
1410
+ },
1411
+ {
1412
+ "epoch": 1.7706422018348624,
1413
+ "grad_norm": 0.05617869272828102,
1414
+ "learning_rate": 8.017742518834454e-06,
1415
+ "loss": 0.0823,
1416
+ "step": 193
1417
+ },
1418
+ {
1419
+ "epoch": 1.7798165137614679,
1420
+ "grad_norm": 0.05785394459962845,
1421
+ "learning_rate": 7.917546685898393e-06,
1422
+ "loss": 0.099,
1423
+ "step": 194
1424
+ },
1425
+ {
1426
+ "epoch": 1.7889908256880735,
1427
+ "grad_norm": 0.04695114120841026,
1428
+ "learning_rate": 7.817568922236683e-06,
1429
+ "loss": 0.0559,
1430
+ "step": 195
1431
+ },
1432
+ {
1433
+ "epoch": 1.7981651376146788,
1434
+ "grad_norm": 0.04976266250014305,
1435
+ "learning_rate": 7.717819697269322e-06,
1436
+ "loss": 0.0424,
1437
+ "step": 196
1438
+ },
1439
+ {
1440
+ "epoch": 1.7981651376146788,
1441
+ "eval_loss": 0.05340273305773735,
1442
+ "eval_runtime": 43.3761,
1443
+ "eval_samples_per_second": 3.988,
1444
+ "eval_steps_per_second": 2.006,
1445
+ "step": 196
1446
+ },
1447
+ {
1448
+ "epoch": 1.8073394495412844,
1449
+ "grad_norm": 0.04787492752075195,
1450
+ "learning_rate": 7.618309456484309e-06,
1451
+ "loss": 0.0439,
1452
+ "step": 197
1453
+ },
1454
+ {
1455
+ "epoch": 1.81651376146789,
1456
+ "grad_norm": 0.048712193965911865,
1457
+ "learning_rate": 7.519048620343825e-06,
1458
+ "loss": 0.0714,
1459
+ "step": 198
1460
+ },
1461
+ {
1462
+ "epoch": 1.8256880733944953,
1463
+ "grad_norm": 0.06686241924762726,
1464
+ "learning_rate": 7.42004758319302e-06,
1465
+ "loss": 0.0746,
1466
+ "step": 199
1467
+ },
1468
+ {
1469
+ "epoch": 1.834862385321101,
1470
+ "grad_norm": 0.06182721257209778,
1471
+ "learning_rate": 7.3213167121715514e-06,
1472
+ "loss": 0.068,
1473
+ "step": 200
1474
+ },
1475
+ {
1476
+ "epoch": 1.8440366972477065,
1477
+ "grad_norm": 0.05911610275506973,
1478
+ "learning_rate": 7.222866346127952e-06,
1479
+ "loss": 0.061,
1480
+ "step": 201
1481
+ },
1482
+ {
1483
+ "epoch": 1.853211009174312,
1484
+ "grad_norm": 0.048602789640426636,
1485
+ "learning_rate": 7.124706794536984e-06,
1486
+ "loss": 0.0545,
1487
+ "step": 202
1488
+ },
1489
+ {
1490
+ "epoch": 1.8623853211009176,
1491
+ "grad_norm": 0.09133545309305191,
1492
+ "learning_rate": 7.026848336420053e-06,
1493
+ "loss": 0.0579,
1494
+ "step": 203
1495
+ },
1496
+ {
1497
+ "epoch": 1.8715596330275228,
1498
+ "grad_norm": 0.05401468276977539,
1499
+ "learning_rate": 6.929301219268806e-06,
1500
+ "loss": 0.0498,
1501
+ "step": 204
1502
+ },
1503
+ {
1504
+ "epoch": 1.8807339449541285,
1505
+ "grad_norm": 0.057028841227293015,
1506
+ "learning_rate": 6.8320756579720545e-06,
1507
+ "loss": 0.1163,
1508
+ "step": 205
1509
+ },
1510
+ {
1511
+ "epoch": 1.889908256880734,
1512
+ "grad_norm": 0.06496277451515198,
1513
+ "learning_rate": 6.735181833746087e-06,
1514
+ "loss": 0.0687,
1515
+ "step": 206
1516
+ },
1517
+ {
1518
+ "epoch": 1.8990825688073394,
1519
+ "grad_norm": 0.04509355500340462,
1520
+ "learning_rate": 6.638629893068516e-06,
1521
+ "loss": 0.0833,
1522
+ "step": 207
1523
+ },
1524
+ {
1525
+ "epoch": 1.908256880733945,
1526
+ "grad_norm": 0.05287083983421326,
1527
+ "learning_rate": 6.542429946615774e-06,
1528
+ "loss": 0.0506,
1529
+ "step": 208
1530
+ },
1531
+ {
1532
+ "epoch": 1.9174311926605505,
1533
+ "grad_norm": 0.0586075522005558,
1534
+ "learning_rate": 6.446592068204341e-06,
1535
+ "loss": 0.0747,
1536
+ "step": 209
1537
+ },
1538
+ {
1539
+ "epoch": 1.926605504587156,
1540
+ "grad_norm": 0.05395258963108063,
1541
+ "learning_rate": 6.351126293735843e-06,
1542
+ "loss": 0.0703,
1543
+ "step": 210
1544
+ },
1545
+ {
1546
+ "epoch": 1.9357798165137616,
1547
+ "grad_norm": 0.04344907030463219,
1548
+ "learning_rate": 6.256042620146119e-06,
1549
+ "loss": 0.0445,
1550
+ "step": 211
1551
+ },
1552
+ {
1553
+ "epoch": 1.9449541284403669,
1554
+ "grad_norm": 0.047881245613098145,
1555
+ "learning_rate": 6.16135100435836e-06,
1556
+ "loss": 0.063,
1557
+ "step": 212
1558
+ },
1559
+ {
1560
+ "epoch": 1.9541284403669725,
1561
+ "grad_norm": 0.0735430121421814,
1562
+ "learning_rate": 6.06706136224045e-06,
1563
+ "loss": 0.0936,
1564
+ "step": 213
1565
+ },
1566
+ {
1567
+ "epoch": 1.963302752293578,
1568
+ "grad_norm": 0.0707215815782547,
1569
+ "learning_rate": 5.973183567566605e-06,
1570
+ "loss": 0.0906,
1571
+ "step": 214
1572
+ },
1573
+ {
1574
+ "epoch": 1.9724770642201834,
1575
+ "grad_norm": 0.05329213663935661,
1576
+ "learning_rate": 5.879727450983412e-06,
1577
+ "loss": 0.0665,
1578
+ "step": 215
1579
+ },
1580
+ {
1581
+ "epoch": 1.981651376146789,
1582
+ "grad_norm": 0.04940500482916832,
1583
+ "learning_rate": 5.786702798980388e-06,
1584
+ "loss": 0.0649,
1585
+ "step": 216
1586
+ },
1587
+ {
1588
+ "epoch": 1.9908256880733946,
1589
+ "grad_norm": 0.06716382503509521,
1590
+ "learning_rate": 5.69411935286516e-06,
1591
+ "loss": 0.0882,
1592
+ "step": 217
1593
+ },
1594
+ {
1595
+ "epoch": 2.0,
1596
+ "grad_norm": 0.06293365359306335,
1597
+ "learning_rate": 5.601986807743388e-06,
1598
+ "loss": 0.0649,
1599
+ "step": 218
1600
+ },
1601
+ {
1602
+ "epoch": 2.0091743119266057,
1603
+ "grad_norm": 0.06269426643848419,
1604
+ "learning_rate": 5.51031481150352e-06,
1605
+ "loss": 0.0757,
1606
+ "step": 219
1607
+ },
1608
+ {
1609
+ "epoch": 2.018348623853211,
1610
+ "grad_norm": 0.05706481635570526,
1611
+ "learning_rate": 5.419112963806468e-06,
1612
+ "loss": 0.0519,
1613
+ "step": 220
1614
+ },
1615
+ {
1616
+ "epoch": 2.0275229357798166,
1617
+ "grad_norm": 0.05423853173851967,
1618
+ "learning_rate": 5.328390815080381e-06,
1619
+ "loss": 0.0828,
1620
+ "step": 221
1621
+ },
1622
+ {
1623
+ "epoch": 2.036697247706422,
1624
+ "grad_norm": 0.04003476724028587,
1625
+ "learning_rate": 5.238157865520539e-06,
1626
+ "loss": 0.0431,
1627
+ "step": 222
1628
+ },
1629
+ {
1630
+ "epoch": 2.0458715596330275,
1631
+ "grad_norm": 0.047263018786907196,
1632
+ "learning_rate": 5.148423564094517e-06,
1633
+ "loss": 0.0498,
1634
+ "step": 223
1635
+ },
1636
+ {
1637
+ "epoch": 2.055045871559633,
1638
+ "grad_norm": 0.06567783653736115,
1639
+ "learning_rate": 5.059197307552698e-06,
1640
+ "loss": 0.086,
1641
+ "step": 224
1642
+ },
1643
+ {
1644
+ "epoch": 2.055045871559633,
1645
+ "eval_loss": 0.05316643789410591,
1646
+ "eval_runtime": 43.3865,
1647
+ "eval_samples_per_second": 3.987,
1648
+ "eval_steps_per_second": 2.005,
1649
+ "step": 224
1650
+ },
1651
+ {
1652
+ "epoch": 2.0642201834862384,
1653
+ "grad_norm": 0.04639436677098274,
1654
+ "learning_rate": 4.970488439444296e-06,
1655
+ "loss": 0.0589,
1656
+ "step": 225
1657
+ },
1658
+ {
1659
+ "epoch": 2.073394495412844,
1660
+ "grad_norm": 0.06990166008472443,
1661
+ "learning_rate": 4.882306249138909e-06,
1662
+ "loss": 0.0491,
1663
+ "step": 226
1664
+ },
1665
+ {
1666
+ "epoch": 2.0825688073394497,
1667
+ "grad_norm": 0.05951413884758949,
1668
+ "learning_rate": 4.7946599708537485e-06,
1669
+ "loss": 0.0736,
1670
+ "step": 227
1671
+ },
1672
+ {
1673
+ "epoch": 2.091743119266055,
1674
+ "grad_norm": 0.04998117685317993,
1675
+ "learning_rate": 4.707558782686677e-06,
1676
+ "loss": 0.073,
1677
+ "step": 228
1678
+ },
1679
+ {
1680
+ "epoch": 2.1009174311926606,
1681
+ "grad_norm": 0.05194695666432381,
1682
+ "learning_rate": 4.621011805655093e-06,
1683
+ "loss": 0.0797,
1684
+ "step": 229
1685
+ },
1686
+ {
1687
+ "epoch": 2.1100917431192663,
1688
+ "grad_norm": 0.058086711913347244,
1689
+ "learning_rate": 4.535028102740785e-06,
1690
+ "loss": 0.0612,
1691
+ "step": 230
1692
+ },
1693
+ {
1694
+ "epoch": 2.1192660550458715,
1695
+ "grad_norm": 0.05054466798901558,
1696
+ "learning_rate": 4.449616677940904e-06,
1697
+ "loss": 0.0623,
1698
+ "step": 231
1699
+ },
1700
+ {
1701
+ "epoch": 2.128440366972477,
1702
+ "grad_norm": 0.05400394648313522,
1703
+ "learning_rate": 4.364786475325072e-06,
1704
+ "loss": 0.0778,
1705
+ "step": 232
1706
+ },
1707
+ {
1708
+ "epoch": 2.1376146788990824,
1709
+ "grad_norm": 0.049106135964393616,
1710
+ "learning_rate": 4.280546378098792e-06,
1711
+ "loss": 0.0673,
1712
+ "step": 233
1713
+ },
1714
+ {
1715
+ "epoch": 2.146788990825688,
1716
+ "grad_norm": 0.05438579246401787,
1717
+ "learning_rate": 4.196905207673201e-06,
1718
+ "loss": 0.071,
1719
+ "step": 234
1720
+ },
1721
+ {
1722
+ "epoch": 2.1559633027522938,
1723
+ "grad_norm": 0.05540682002902031,
1724
+ "learning_rate": 4.113871722741337e-06,
1725
+ "loss": 0.0691,
1726
+ "step": 235
1727
+ },
1728
+ {
1729
+ "epoch": 2.165137614678899,
1730
+ "grad_norm": 0.07121812552213669,
1731
+ "learning_rate": 4.031454618360945e-06,
1732
+ "loss": 0.0859,
1733
+ "step": 236
1734
+ },
1735
+ {
1736
+ "epoch": 2.1743119266055047,
1737
+ "grad_norm": 0.06990870833396912,
1738
+ "learning_rate": 3.949662525043935e-06,
1739
+ "loss": 0.0808,
1740
+ "step": 237
1741
+ },
1742
+ {
1743
+ "epoch": 2.18348623853211,
1744
+ "grad_norm": 0.04688805714249611,
1745
+ "learning_rate": 3.868504007852641e-06,
1746
+ "loss": 0.0552,
1747
+ "step": 238
1748
+ },
1749
+ {
1750
+ "epoch": 2.1926605504587156,
1751
+ "grad_norm": 0.05368739366531372,
1752
+ "learning_rate": 3.7879875655029018e-06,
1753
+ "loss": 0.0708,
1754
+ "step": 239
1755
+ },
1756
+ {
1757
+ "epoch": 2.2018348623853212,
1758
+ "grad_norm": 0.05669724941253662,
1759
+ "learning_rate": 3.7081216294740773e-06,
1760
+ "loss": 0.0714,
1761
+ "step": 240
1762
+ },
1763
+ {
1764
+ "epoch": 2.2110091743119265,
1765
+ "grad_norm": 0.07314234972000122,
1766
+ "learning_rate": 3.628914563126156e-06,
1767
+ "loss": 0.0842,
1768
+ "step": 241
1769
+ },
1770
+ {
1771
+ "epoch": 2.220183486238532,
1772
+ "grad_norm": 0.06005195155739784,
1773
+ "learning_rate": 3.5503746608239487e-06,
1774
+ "loss": 0.0637,
1775
+ "step": 242
1776
+ },
1777
+ {
1778
+ "epoch": 2.229357798165138,
1779
+ "grad_norm": 0.05186507850885391,
1780
+ "learning_rate": 3.472510147068515e-06,
1781
+ "loss": 0.0606,
1782
+ "step": 243
1783
+ },
1784
+ {
1785
+ "epoch": 2.238532110091743,
1786
+ "grad_norm": 0.058381304144859314,
1787
+ "learning_rate": 3.3953291756359354e-06,
1788
+ "loss": 0.0626,
1789
+ "step": 244
1790
+ },
1791
+ {
1792
+ "epoch": 2.2477064220183487,
1793
+ "grad_norm": 0.05521458014845848,
1794
+ "learning_rate": 3.3188398287234504e-06,
1795
+ "loss": 0.0628,
1796
+ "step": 245
1797
+ },
1798
+ {
1799
+ "epoch": 2.2568807339449544,
1800
+ "grad_norm": 0.05869848653674126,
1801
+ "learning_rate": 3.243050116103128e-06,
1802
+ "loss": 0.0448,
1803
+ "step": 246
1804
+ },
1805
+ {
1806
+ "epoch": 2.2660550458715596,
1807
+ "grad_norm": 0.07123305648565292,
1808
+ "learning_rate": 3.1679679742830806e-06,
1809
+ "loss": 0.0991,
1810
+ "step": 247
1811
+ },
1812
+ {
1813
+ "epoch": 2.2752293577981653,
1814
+ "grad_norm": 0.05694476515054703,
1815
+ "learning_rate": 3.0936012656763937e-06,
1816
+ "loss": 0.1169,
1817
+ "step": 248
1818
+ },
1819
+ {
1820
+ "epoch": 2.2844036697247705,
1821
+ "grad_norm": 0.04774909466505051,
1822
+ "learning_rate": 3.019957777777788e-06,
1823
+ "loss": 0.0655,
1824
+ "step": 249
1825
+ },
1826
+ {
1827
+ "epoch": 2.293577981651376,
1828
+ "grad_norm": 0.05831342935562134,
1829
+ "learning_rate": 2.9470452223481206e-06,
1830
+ "loss": 0.0618,
1831
+ "step": 250
1832
+ },
1833
+ {
1834
+ "epoch": 2.302752293577982,
1835
+ "grad_norm": 0.08326123654842377,
1836
+ "learning_rate": 2.8748712346068464e-06,
1837
+ "loss": 0.0923,
1838
+ "step": 251
1839
+ },
1840
+ {
1841
+ "epoch": 2.311926605504587,
1842
+ "grad_norm": 0.06080171838402748,
1843
+ "learning_rate": 2.8034433724324716e-06,
1844
+ "loss": 0.0759,
1845
+ "step": 252
1846
+ },
1847
+ {
1848
+ "epoch": 2.311926605504587,
1849
+ "eval_loss": 0.052994709461927414,
1850
+ "eval_runtime": 43.3989,
1851
+ "eval_samples_per_second": 3.986,
1852
+ "eval_steps_per_second": 2.005,
1853
+ "step": 252
1854
+ },
1855
+ {
1856
+ "epoch": 2.3211009174311927,
1857
+ "grad_norm": 0.06857974827289581,
1858
+ "learning_rate": 2.7327691155710978e-06,
1859
+ "loss": 0.0791,
1860
+ "step": 253
1861
+ },
1862
+ {
1863
+ "epoch": 2.330275229357798,
1864
+ "grad_norm": 0.08638834208250046,
1865
+ "learning_rate": 2.6628558648531845e-06,
1866
+ "loss": 0.0894,
1867
+ "step": 254
1868
+ },
1869
+ {
1870
+ "epoch": 2.3394495412844036,
1871
+ "grad_norm": 0.07400725036859512,
1872
+ "learning_rate": 2.593710941418537e-06,
1873
+ "loss": 0.0624,
1874
+ "step": 255
1875
+ },
1876
+ {
1877
+ "epoch": 2.3486238532110093,
1878
+ "grad_norm": 0.05066663771867752,
1879
+ "learning_rate": 2.525341585949662e-06,
1880
+ "loss": 0.0755,
1881
+ "step": 256
1882
+ },
1883
+ {
1884
+ "epoch": 2.3577981651376145,
1885
+ "grad_norm": 0.051476072520017624,
1886
+ "learning_rate": 2.4577549579135318e-06,
1887
+ "loss": 0.074,
1888
+ "step": 257
1889
+ },
1890
+ {
1891
+ "epoch": 2.36697247706422,
1892
+ "grad_norm": 0.05693186819553375,
1893
+ "learning_rate": 2.3909581348118803e-06,
1894
+ "loss": 0.049,
1895
+ "step": 258
1896
+ },
1897
+ {
1898
+ "epoch": 2.376146788990826,
1899
+ "grad_norm": 0.043929580599069595,
1900
+ "learning_rate": 2.324958111440051e-06,
1901
+ "loss": 0.0558,
1902
+ "step": 259
1903
+ },
1904
+ {
1905
+ "epoch": 2.385321100917431,
1906
+ "grad_norm": 0.06177612394094467,
1907
+ "learning_rate": 2.259761799154516e-06,
1908
+ "loss": 0.0634,
1909
+ "step": 260
1910
+ },
1911
+ {
1912
+ "epoch": 2.3944954128440368,
1913
+ "grad_norm": 0.08901379257440567,
1914
+ "learning_rate": 2.195376025149156e-06,
1915
+ "loss": 0.0552,
1916
+ "step": 261
1917
+ },
1918
+ {
1919
+ "epoch": 2.4036697247706424,
1920
+ "grad_norm": 0.059478871524333954,
1921
+ "learning_rate": 2.1318075317403152e-06,
1922
+ "loss": 0.0834,
1923
+ "step": 262
1924
+ },
1925
+ {
1926
+ "epoch": 2.4128440366972477,
1927
+ "grad_norm": 0.14992526173591614,
1928
+ "learning_rate": 2.069062975660765e-06,
1929
+ "loss": 0.0582,
1930
+ "step": 263
1931
+ },
1932
+ {
1933
+ "epoch": 2.4220183486238533,
1934
+ "grad_norm": 0.04817449301481247,
1935
+ "learning_rate": 2.0071489273626376e-06,
1936
+ "loss": 0.0547,
1937
+ "step": 264
1938
+ },
1939
+ {
1940
+ "epoch": 2.4311926605504586,
1941
+ "grad_norm": 0.08196448534727097,
1942
+ "learning_rate": 1.946071870329377e-06,
1943
+ "loss": 0.078,
1944
+ "step": 265
1945
+ },
1946
+ {
1947
+ "epoch": 2.4403669724770642,
1948
+ "grad_norm": 0.07558903098106384,
1949
+ "learning_rate": 1.885838200396808e-06,
1950
+ "loss": 0.0507,
1951
+ "step": 266
1952
+ },
1953
+ {
1954
+ "epoch": 2.44954128440367,
1955
+ "grad_norm": 0.061492372304201126,
1956
+ "learning_rate": 1.826454225083375e-06,
1957
+ "loss": 0.0526,
1958
+ "step": 267
1959
+ },
1960
+ {
1961
+ "epoch": 2.458715596330275,
1962
+ "grad_norm": 0.04717002436518669,
1963
+ "learning_rate": 1.7679261629296408e-06,
1964
+ "loss": 0.05,
1965
+ "step": 268
1966
+ },
1967
+ {
1968
+ "epoch": 2.467889908256881,
1969
+ "grad_norm": 0.050578705966472626,
1970
+ "learning_rate": 1.7102601428470988e-06,
1971
+ "loss": 0.0694,
1972
+ "step": 269
1973
+ },
1974
+ {
1975
+ "epoch": 2.477064220183486,
1976
+ "grad_norm": 0.06575262546539307,
1977
+ "learning_rate": 1.6534622034763558e-06,
1978
+ "loss": 0.0537,
1979
+ "step": 270
1980
+ },
1981
+ {
1982
+ "epoch": 2.4862385321100917,
1983
+ "grad_norm": 0.0549924410879612,
1984
+ "learning_rate": 1.5975382925547966e-06,
1985
+ "loss": 0.0802,
1986
+ "step": 271
1987
+ },
1988
+ {
1989
+ "epoch": 2.4954128440366974,
1990
+ "grad_norm": 0.06130588427186012,
1991
+ "learning_rate": 1.5424942662937436e-06,
1992
+ "loss": 0.0766,
1993
+ "step": 272
1994
+ },
1995
+ {
1996
+ "epoch": 2.5045871559633026,
1997
+ "grad_norm": 0.07862205803394318,
1998
+ "learning_rate": 1.4883358887652044e-06,
1999
+ "loss": 0.0612,
2000
+ "step": 273
2001
+ },
2002
+ {
2003
+ "epoch": 2.5137614678899083,
2004
+ "grad_norm": 0.04936962202191353,
2005
+ "learning_rate": 1.4350688312982864e-06,
2006
+ "loss": 0.0556,
2007
+ "step": 274
2008
+ },
2009
+ {
2010
+ "epoch": 2.522935779816514,
2011
+ "grad_norm": 0.06410589069128036,
2012
+ "learning_rate": 1.3826986718852952e-06,
2013
+ "loss": 0.0483,
2014
+ "step": 275
2015
+ },
2016
+ {
2017
+ "epoch": 2.532110091743119,
2018
+ "grad_norm": 0.09402082115411758,
2019
+ "learning_rate": 1.3312308945976348e-06,
2020
+ "loss": 0.1031,
2021
+ "step": 276
2022
+ },
2023
+ {
2024
+ "epoch": 2.541284403669725,
2025
+ "grad_norm": 0.052867498248815536,
2026
+ "learning_rate": 1.2806708890115138e-06,
2027
+ "loss": 0.065,
2028
+ "step": 277
2029
+ },
2030
+ {
2031
+ "epoch": 2.5504587155963305,
2032
+ "grad_norm": 0.08837206661701202,
2033
+ "learning_rate": 1.2310239496435749e-06,
2034
+ "loss": 0.095,
2035
+ "step": 278
2036
+ },
2037
+ {
2038
+ "epoch": 2.5596330275229358,
2039
+ "grad_norm": 0.08973362296819687,
2040
+ "learning_rate": 1.1822952753964667e-06,
2041
+ "loss": 0.0765,
2042
+ "step": 279
2043
+ },
2044
+ {
2045
+ "epoch": 2.5688073394495414,
2046
+ "grad_norm": 0.061795495450496674,
2047
+ "learning_rate": 1.134489969014414e-06,
2048
+ "loss": 0.0583,
2049
+ "step": 280
2050
+ },
2051
+ {
2052
+ "epoch": 2.5688073394495414,
2053
+ "eval_loss": 0.05294761061668396,
2054
+ "eval_runtime": 43.223,
2055
+ "eval_samples_per_second": 4.002,
2056
+ "eval_steps_per_second": 2.013,
2057
+ "step": 280
2058
+ },
2059
+ {
2060
+ "epoch": 2.5779816513761467,
2061
+ "grad_norm": 0.047972485423088074,
2062
+ "learning_rate": 1.087613036548888e-06,
2063
+ "loss": 0.048,
2064
+ "step": 281
2065
+ },
2066
+ {
2067
+ "epoch": 2.5871559633027523,
2068
+ "grad_norm": 0.06896362453699112,
2069
+ "learning_rate": 1.0416693868343796e-06,
2070
+ "loss": 0.0771,
2071
+ "step": 282
2072
+ },
2073
+ {
2074
+ "epoch": 2.5963302752293576,
2075
+ "grad_norm": 0.06132780387997627,
2076
+ "learning_rate": 9.966638309743481e-07,
2077
+ "loss": 0.0854,
2078
+ "step": 283
2079
+ },
2080
+ {
2081
+ "epoch": 2.6055045871559632,
2082
+ "grad_norm": 0.06309553980827332,
2083
+ "learning_rate": 9.52601081837431e-07,
2084
+ "loss": 0.0827,
2085
+ "step": 284
2086
+ },
2087
+ {
2088
+ "epoch": 2.614678899082569,
2089
+ "grad_norm": 0.08898341655731201,
2090
+ "learning_rate": 9.094857535639157e-07,
2091
+ "loss": 0.0727,
2092
+ "step": 285
2093
+ },
2094
+ {
2095
+ "epoch": 2.623853211009174,
2096
+ "grad_norm": 0.05615299567580223,
2097
+ "learning_rate": 8.673223610825532e-07,
2098
+ "loss": 0.0827,
2099
+ "step": 286
2100
+ },
2101
+ {
2102
+ "epoch": 2.63302752293578,
2103
+ "grad_norm": 0.06234830617904663,
2104
+ "learning_rate": 8.261153196377814e-07,
2105
+ "loss": 0.0772,
2106
+ "step": 287
2107
+ },
2108
+ {
2109
+ "epoch": 2.6422018348623855,
2110
+ "grad_norm": 0.057416193187236786,
2111
+ "learning_rate": 7.858689443273548e-07,
2112
+ "loss": 0.0726,
2113
+ "step": 288
2114
+ },
2115
+ {
2116
+ "epoch": 2.6513761467889907,
2117
+ "grad_norm": 0.056388452649116516,
2118
+ "learning_rate": 7.465874496504944e-07,
2119
+ "loss": 0.0881,
2120
+ "step": 289
2121
+ },
2122
+ {
2123
+ "epoch": 2.6605504587155964,
2124
+ "grad_norm": 0.05161774531006813,
2125
+ "learning_rate": 7.082749490665353e-07,
2126
+ "loss": 0.0447,
2127
+ "step": 290
2128
+ },
2129
+ {
2130
+ "epoch": 2.669724770642202,
2131
+ "grad_norm": 0.047958966344594955,
2132
+ "learning_rate": 6.709354545641989e-07,
2133
+ "loss": 0.0772,
2134
+ "step": 291
2135
+ },
2136
+ {
2137
+ "epoch": 2.6788990825688073,
2138
+ "grad_norm": 0.0640062615275383,
2139
+ "learning_rate": 6.345728762414504e-07,
2140
+ "loss": 0.0607,
2141
+ "step": 292
2142
+ },
2143
+ {
2144
+ "epoch": 2.688073394495413,
2145
+ "grad_norm": 0.05299694091081619,
2146
+ "learning_rate": 5.99191021896055e-07,
2147
+ "loss": 0.047,
2148
+ "step": 293
2149
+ },
2150
+ {
2151
+ "epoch": 2.6972477064220186,
2152
+ "grad_norm": 0.057945024222135544,
2153
+ "learning_rate": 5.647935966268225e-07,
2154
+ "loss": 0.0731,
2155
+ "step": 294
2156
+ },
2157
+ {
2158
+ "epoch": 2.706422018348624,
2159
+ "grad_norm": 0.05141222104430199,
2160
+ "learning_rate": 5.313842024456306e-07,
2161
+ "loss": 0.039,
2162
+ "step": 295
2163
+ },
2164
+ {
2165
+ "epoch": 2.7155963302752295,
2166
+ "grad_norm": 0.08650866150856018,
2167
+ "learning_rate": 4.98966337900224e-07,
2168
+ "loss": 0.0551,
2169
+ "step": 296
2170
+ },
2171
+ {
2172
+ "epoch": 2.7247706422018347,
2173
+ "grad_norm": 0.13347071409225464,
2174
+ "learning_rate": 4.6754339770785474e-07,
2175
+ "loss": 0.0619,
2176
+ "step": 297
2177
+ },
2178
+ {
2179
+ "epoch": 2.7339449541284404,
2180
+ "grad_norm": 0.07346609234809875,
2181
+ "learning_rate": 4.3711867239980335e-07,
2182
+ "loss": 0.0423,
2183
+ "step": 298
2184
+ },
2185
+ {
2186
+ "epoch": 2.7431192660550456,
2187
+ "grad_norm": 0.05696272850036621,
2188
+ "learning_rate": 4.076953479767964e-07,
2189
+ "loss": 0.0903,
2190
+ "step": 299
2191
+ },
2192
+ {
2193
+ "epoch": 2.7522935779816513,
2194
+ "grad_norm": 0.058090586215257645,
2195
+ "learning_rate": 3.792765055753755e-07,
2196
+ "loss": 0.0994,
2197
+ "step": 300
2198
+ },
2199
+ {
2200
+ "epoch": 2.761467889908257,
2201
+ "grad_norm": 0.06176576018333435,
2202
+ "learning_rate": 3.5186512114525283e-07,
2203
+ "loss": 0.1243,
2204
+ "step": 301
2205
+ },
2206
+ {
2207
+ "epoch": 2.770642201834862,
2208
+ "grad_norm": 0.07493139058351517,
2209
+ "learning_rate": 3.25464065137675e-07,
2210
+ "loss": 0.0584,
2211
+ "step": 302
2212
+ },
2213
+ {
2214
+ "epoch": 2.779816513761468,
2215
+ "grad_norm": 0.04847017675638199,
2216
+ "learning_rate": 3.0007610220483927e-07,
2217
+ "loss": 0.0652,
2218
+ "step": 303
2219
+ },
2220
+ {
2221
+ "epoch": 2.7889908256880735,
2222
+ "grad_norm": 0.058301348239183426,
2223
+ "learning_rate": 2.757038909103793e-07,
2224
+ "loss": 0.0708,
2225
+ "step": 304
2226
+ },
2227
+ {
2228
+ "epoch": 2.7981651376146788,
2229
+ "grad_norm": 0.046219125390052795,
2230
+ "learning_rate": 2.523499834509724e-07,
2231
+ "loss": 0.0495,
2232
+ "step": 305
2233
+ },
2234
+ {
2235
+ "epoch": 2.8073394495412844,
2236
+ "grad_norm": 0.051395233720541,
2237
+ "learning_rate": 2.3001682538908333e-07,
2238
+ "loss": 0.0954,
2239
+ "step": 306
2240
+ },
2241
+ {
2242
+ "epoch": 2.81651376146789,
2243
+ "grad_norm": 0.0680239349603653,
2244
+ "learning_rate": 2.0870675539686024e-07,
2245
+ "loss": 0.0717,
2246
+ "step": 307
2247
+ },
2248
+ {
2249
+ "epoch": 2.8256880733944953,
2250
+ "grad_norm": 0.058481365442276,
2251
+ "learning_rate": 1.884220050112462e-07,
2252
+ "loss": 0.1087,
2253
+ "step": 308
2254
+ },
2255
+ {
2256
+ "epoch": 2.8256880733944953,
2257
+ "eval_loss": 0.05293623358011246,
2258
+ "eval_runtime": 43.297,
2259
+ "eval_samples_per_second": 3.996,
2260
+ "eval_steps_per_second": 2.009,
2261
+ "step": 308
2262
+ }
2263
+ ],
2264
+ "logging_steps": 1,
2265
+ "max_steps": 327,
2266
+ "num_input_tokens_seen": 0,
2267
+ "num_train_epochs": 3,
2268
+ "save_steps": 28,
2269
+ "stateful_callbacks": {
2270
+ "TrainerControl": {
2271
+ "args": {
2272
+ "should_epoch_stop": false,
2273
+ "should_evaluate": false,
2274
+ "should_log": false,
2275
+ "should_save": true,
2276
+ "should_training_stop": false
2277
+ },
2278
+ "attributes": {}
2279
+ }
2280
+ },
2281
+ "total_flos": 1.9114921838279393e+18,
2282
+ "train_batch_size": 2,
2283
+ "trial_name": null,
2284
+ "trial_params": null
2285
+ }
checkpoint-308/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8691334e7ff9d485bc39601a29a4096723c23e5fb7323cdb19a40a1c9c993c02
3
+ size 6520
checkpoint-308/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-327/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: bigcode/starcoder2-15b
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** LoRA adapter (PEFT, trained with QLoRA) for causal language modeling
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** bigcode-openrail-m
26
+ - **Finetuned from model [optional]:** bigcode/starcoder2-15b
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
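+ The adapter was fine-tuned on the `kevinwsbr/vulnfixes-web` dataset (Alpaca-style prompt format), with 5% of the data held out as a validation set.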
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-327/adapter_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-15b",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": null,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.05,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 32,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "c_fc",
27
+ "k_proj",
28
+ "q_proj",
29
+ "o_proj",
30
+ "c_proj",
31
+ "v_proj"
32
+ ],
33
+ "task_type": "CAUSAL_LM",
34
+ "use_dora": false,
35
+ "use_rslora": false
36
+ }
checkpoint-327/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c90ba47f39c0a86cd549d28055b620e1922d97f151fa40bf696f678998cec60
3
+ size 508623712
checkpoint-327/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-327/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f42239acaef6ca722dabee11b35f50eb2b911f45924c3928e00f1ddb0b34b5f
3
+ size 258817236
checkpoint-327/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03bce6c25ff9b0b569496e5db6947cc523f92f36cbda69995fac679c1bb7c540
3
+ size 14244
checkpoint-327/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fedb7ca125efe47bff8669910b66d94c1ed551f2a3528c1e3d15fd2eeb7a204
3
+ size 1064
checkpoint-327/special_tokens_map.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<repo_name>",
9
+ "<file_sep>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<jupyter_script>",
18
+ "<empty_output>",
19
+ "<code_to_intermediate>",
20
+ "<intermediate_to_code>",
21
+ "<pr>",
22
+ "<pr_status>",
23
+ "<pr_is_merged>",
24
+ "<pr_base>",
25
+ "<pr_file>",
26
+ "<pr_base_code>",
27
+ "<pr_diff>",
28
+ "<pr_diff_hunk>",
29
+ "<pr_comment>",
30
+ "<pr_event_id>",
31
+ "<pr_review>",
32
+ "<pr_review_state>",
33
+ "<pr_review_comment>",
34
+ "<pr_in_reply_to_review_id>",
35
+ "<pr_in_reply_to_comment_id>",
36
+ "<pr_diff_hunk_comment_line>",
37
+ "<NAME>",
38
+ "<EMAIL>",
39
+ "<KEY>",
40
+ "<PASSWORD>"
41
+ ],
42
+ "bos_token": {
43
+ "content": "<|endoftext|>",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false
48
+ },
49
+ "eos_token": {
50
+ "content": "<|endoftext|>",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ },
56
+ "pad_token": {
57
+ "content": "<|endoftext|>",
58
+ "lstrip": false,
59
+ "normalized": false,
60
+ "rstrip": false,
61
+ "single_word": false
62
+ },
63
+ "unk_token": {
64
+ "content": "<|endoftext|>",
65
+ "lstrip": false,
66
+ "normalized": false,
67
+ "rstrip": false,
68
+ "single_word": false
69
+ }
70
+ }
checkpoint-327/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-327/tokenizer_config.json ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<repo_name>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<file_sep>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<issue_start>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_comment>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_closed>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<jupyter_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_text>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_code>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_output>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_script>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<code_to_intermediate>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "17": {
141
+ "content": "<intermediate_to_code>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "18": {
149
+ "content": "<pr>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "19": {
157
+ "content": "<pr_status>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "20": {
165
+ "content": "<pr_is_merged>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "21": {
173
+ "content": "<pr_base>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "22": {
181
+ "content": "<pr_file>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "23": {
189
+ "content": "<pr_base_code>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "24": {
197
+ "content": "<pr_diff>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "25": {
205
+ "content": "<pr_diff_hunk>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "26": {
213
+ "content": "<pr_comment>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "27": {
221
+ "content": "<pr_event_id>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "28": {
229
+ "content": "<pr_review>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "29": {
237
+ "content": "<pr_review_state>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "30": {
245
+ "content": "<pr_review_comment>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "31": {
253
+ "content": "<pr_in_reply_to_review_id>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32": {
261
+ "content": "<pr_in_reply_to_comment_id>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "33": {
269
+ "content": "<pr_diff_hunk_comment_line>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "34": {
277
+ "content": "<NAME>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "35": {
285
+ "content": "<EMAIL>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "36": {
293
+ "content": "<KEY>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "37": {
301
+ "content": "<PASSWORD>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ }
308
+ },
309
+ "additional_special_tokens": [
310
+ "<|endoftext|>",
311
+ "<fim_prefix>",
312
+ "<fim_middle>",
313
+ "<fim_suffix>",
314
+ "<fim_pad>",
315
+ "<repo_name>",
316
+ "<file_sep>",
317
+ "<issue_start>",
318
+ "<issue_comment>",
319
+ "<issue_closed>",
320
+ "<jupyter_start>",
321
+ "<jupyter_text>",
322
+ "<jupyter_code>",
323
+ "<jupyter_output>",
324
+ "<jupyter_script>",
325
+ "<empty_output>",
326
+ "<code_to_intermediate>",
327
+ "<intermediate_to_code>",
328
+ "<pr>",
329
+ "<pr_status>",
330
+ "<pr_is_merged>",
331
+ "<pr_base>",
332
+ "<pr_file>",
333
+ "<pr_base_code>",
334
+ "<pr_diff>",
335
+ "<pr_diff_hunk>",
336
+ "<pr_comment>",
337
+ "<pr_event_id>",
338
+ "<pr_review>",
339
+ "<pr_review_state>",
340
+ "<pr_review_comment>",
341
+ "<pr_in_reply_to_review_id>",
342
+ "<pr_in_reply_to_comment_id>",
343
+ "<pr_diff_hunk_comment_line>",
344
+ "<NAME>",
345
+ "<EMAIL>",
346
+ "<KEY>",
347
+ "<PASSWORD>"
348
+ ],
349
+ "bos_token": "<|endoftext|>",
350
+ "clean_up_tokenization_spaces": true,
351
+ "eos_token": "<|endoftext|>",
352
+ "extra_special_tokens": {},
353
+ "model_max_length": 1000000000000000019884624838656,
354
+ "pad_token": "<|endoftext|>",
355
+ "tokenizer_class": "GPT2Tokenizer",
356
+ "unk_token": "<|endoftext|>",
357
+ "vocab_size": 49152
358
+ }
checkpoint-327/trainer_state.json ADDED
@@ -0,0 +1,2418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 28,
6
+ "global_step": 327,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009174311926605505,
13
+ "grad_norm": 0.01852019689977169,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 0.1499,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.009174311926605505,
20
+ "eval_loss": 0.06453218311071396,
21
+ "eval_runtime": 43.1581,
22
+ "eval_samples_per_second": 4.009,
23
+ "eval_steps_per_second": 2.016,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.01834862385321101,
28
+ "grad_norm": 0.02213262766599655,
29
+ "learning_rate": 2.0000000000000003e-06,
30
+ "loss": 0.1596,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 0.027522935779816515,
35
+ "grad_norm": 0.045894358307123184,
36
+ "learning_rate": 3e-06,
37
+ "loss": 0.1794,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.03669724770642202,
42
+ "grad_norm": 0.01868272013962269,
43
+ "learning_rate": 4.000000000000001e-06,
44
+ "loss": 0.1592,
45
+ "step": 4
46
+ },
47
+ {
48
+ "epoch": 0.045871559633027525,
49
+ "grad_norm": 0.017406364902853966,
50
+ "learning_rate": 5e-06,
51
+ "loss": 0.1696,
52
+ "step": 5
53
+ },
54
+ {
55
+ "epoch": 0.05504587155963303,
56
+ "grad_norm": 0.01861901767551899,
57
+ "learning_rate": 6e-06,
58
+ "loss": 0.1438,
59
+ "step": 6
60
+ },
61
+ {
62
+ "epoch": 0.06422018348623854,
63
+ "grad_norm": 0.020326999947428703,
64
+ "learning_rate": 7e-06,
65
+ "loss": 0.1627,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.07339449541284404,
70
+ "grad_norm": 0.02542084828019142,
71
+ "learning_rate": 8.000000000000001e-06,
72
+ "loss": 0.1558,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.08256880733944955,
77
+ "grad_norm": 0.022425655275583267,
78
+ "learning_rate": 9e-06,
79
+ "loss": 0.1517,
80
+ "step": 9
81
+ },
82
+ {
83
+ "epoch": 0.09174311926605505,
84
+ "grad_norm": 0.023916274309158325,
85
+ "learning_rate": 1e-05,
86
+ "loss": 0.1672,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.10091743119266056,
91
+ "grad_norm": 0.022392459213733673,
92
+ "learning_rate": 1.1000000000000001e-05,
93
+ "loss": 0.1504,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.11009174311926606,
98
+ "grad_norm": 0.02520025707781315,
99
+ "learning_rate": 1.2e-05,
100
+ "loss": 0.1749,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.11926605504587157,
105
+ "grad_norm": 0.028882062062621117,
106
+ "learning_rate": 1.3000000000000001e-05,
107
+ "loss": 0.1705,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.12844036697247707,
112
+ "grad_norm": 0.03628265857696533,
113
+ "learning_rate": 1.4e-05,
114
+ "loss": 0.166,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.13761467889908258,
119
+ "grad_norm": 0.02980518713593483,
120
+ "learning_rate": 1.5000000000000002e-05,
121
+ "loss": 0.1238,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.14678899082568808,
126
+ "grad_norm": 0.028387300670146942,
127
+ "learning_rate": 1.6000000000000003e-05,
128
+ "loss": 0.1326,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.1559633027522936,
133
+ "grad_norm": 0.03367001935839653,
134
+ "learning_rate": 1.7e-05,
135
+ "loss": 0.1347,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.1651376146788991,
140
+ "grad_norm": 0.03655758127570152,
141
+ "learning_rate": 1.8e-05,
142
+ "loss": 0.1423,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.1743119266055046,
147
+ "grad_norm": 0.04000673070549965,
148
+ "learning_rate": 1.9e-05,
149
+ "loss": 0.1651,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.1834862385321101,
154
+ "grad_norm": 0.03844478353857994,
155
+ "learning_rate": 2e-05,
156
+ "loss": 0.1649,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.1926605504587156,
161
+ "grad_norm": 0.04334944114089012,
162
+ "learning_rate": 1.99994764125734e-05,
163
+ "loss": 0.1292,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 0.2018348623853211,
168
+ "grad_norm": 0.04224175587296486,
169
+ "learning_rate": 1.9997905705122352e-05,
170
+ "loss": 0.1336,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 0.21100917431192662,
175
+ "grad_norm": 0.04647228121757507,
176
+ "learning_rate": 1.9995288042127396e-05,
177
+ "loss": 0.128,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.22018348623853212,
182
+ "grad_norm": 0.04862922057509422,
183
+ "learning_rate": 1.9991623697703613e-05,
184
+ "loss": 0.1611,
185
+ "step": 24
186
+ },
187
+ {
188
+ "epoch": 0.22935779816513763,
189
+ "grad_norm": 0.05552718788385391,
190
+ "learning_rate": 1.998691305557194e-05,
191
+ "loss": 0.1469,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 0.23853211009174313,
196
+ "grad_norm": 0.04372956603765488,
197
+ "learning_rate": 1.9981156609018977e-05,
198
+ "loss": 0.1341,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 0.24770642201834864,
203
+ "grad_norm": 0.04992978647351265,
204
+ "learning_rate": 1.9974354960845326e-05,
205
+ "loss": 0.1464,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 0.25688073394495414,
210
+ "grad_norm": 0.04873019829392433,
211
+ "learning_rate": 1.9966508823302484e-05,
212
+ "loss": 0.1554,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 0.25688073394495414,
217
+ "eval_loss": 0.06217445060610771,
218
+ "eval_runtime": 43.3972,
219
+ "eval_samples_per_second": 3.986,
220
+ "eval_steps_per_second": 2.005,
221
+ "step": 28
222
+ },
223
+ {
224
+ "epoch": 0.26605504587155965,
225
+ "grad_norm": 0.04296933487057686,
226
+ "learning_rate": 1.9957619018018243e-05,
227
+ "loss": 0.1231,
228
+ "step": 29
229
+ },
230
+ {
231
+ "epoch": 0.27522935779816515,
232
+ "grad_norm": 0.06265883892774582,
233
+ "learning_rate": 1.9947686475910656e-05,
234
+ "loss": 0.1292,
235
+ "step": 30
236
+ },
237
+ {
238
+ "epoch": 0.28440366972477066,
239
+ "grad_norm": 0.044797539710998535,
240
+ "learning_rate": 1.9936712237090554e-05,
241
+ "loss": 0.114,
242
+ "step": 31
243
+ },
244
+ {
245
+ "epoch": 0.29357798165137616,
246
+ "grad_norm": 0.05862847715616226,
247
+ "learning_rate": 1.9924697450752636e-05,
248
+ "loss": 0.1215,
249
+ "step": 32
250
+ },
251
+ {
252
+ "epoch": 0.30275229357798167,
253
+ "grad_norm": 0.04938759654760361,
254
+ "learning_rate": 1.991164337505511e-05,
255
+ "loss": 0.0683,
256
+ "step": 33
257
+ },
258
+ {
259
+ "epoch": 0.3119266055045872,
260
+ "grad_norm": 0.07766549289226532,
261
+ "learning_rate": 1.9897551376987948e-05,
262
+ "loss": 0.1051,
263
+ "step": 34
264
+ },
265
+ {
266
+ "epoch": 0.3211009174311927,
267
+ "grad_norm": 0.05474488437175751,
268
+ "learning_rate": 1.9882422932229765e-05,
269
+ "loss": 0.082,
270
+ "step": 35
271
+ },
272
+ {
273
+ "epoch": 0.3302752293577982,
274
+ "grad_norm": 0.04499150812625885,
275
+ "learning_rate": 1.9866259624993246e-05,
276
+ "loss": 0.1135,
277
+ "step": 36
278
+ },
279
+ {
280
+ "epoch": 0.3394495412844037,
281
+ "grad_norm": 0.07329924404621124,
282
+ "learning_rate": 1.9849063147859282e-05,
283
+ "loss": 0.1082,
284
+ "step": 37
285
+ },
286
+ {
287
+ "epoch": 0.3486238532110092,
288
+ "grad_norm": 0.23288355767726898,
289
+ "learning_rate": 1.983083530159971e-05,
290
+ "loss": 0.0899,
291
+ "step": 38
292
+ },
293
+ {
294
+ "epoch": 0.3577981651376147,
295
+ "grad_norm": 0.06561094522476196,
296
+ "learning_rate": 1.9811577994988755e-05,
297
+ "loss": 0.1096,
298
+ "step": 39
299
+ },
300
+ {
301
+ "epoch": 0.3669724770642202,
302
+ "grad_norm": 0.052528683096170425,
303
+ "learning_rate": 1.979129324460314e-05,
304
+ "loss": 0.107,
305
+ "step": 40
306
+ },
307
+ {
308
+ "epoch": 0.3761467889908257,
309
+ "grad_norm": 0.057943329215049744,
310
+ "learning_rate": 1.9769983174610918e-05,
311
+ "loss": 0.1121,
312
+ "step": 41
313
+ },
314
+ {
315
+ "epoch": 0.3853211009174312,
316
+ "grad_norm": 0.05784667655825615,
317
+ "learning_rate": 1.974765001654903e-05,
318
+ "loss": 0.1125,
319
+ "step": 42
320
+ },
321
+ {
322
+ "epoch": 0.3944954128440367,
323
+ "grad_norm": 0.04998760297894478,
324
+ "learning_rate": 1.9724296109089623e-05,
325
+ "loss": 0.0944,
326
+ "step": 43
327
+ },
328
+ {
329
+ "epoch": 0.4036697247706422,
330
+ "grad_norm": 0.056932978332042694,
331
+ "learning_rate": 1.9699923897795165e-05,
332
+ "loss": 0.0758,
333
+ "step": 44
334
+ },
335
+ {
336
+ "epoch": 0.41284403669724773,
337
+ "grad_norm": 0.05268337205052376,
338
+ "learning_rate": 1.9674535934862327e-05,
339
+ "loss": 0.0767,
340
+ "step": 45
341
+ },
342
+ {
343
+ "epoch": 0.42201834862385323,
344
+ "grad_norm": 0.04703257977962494,
345
+ "learning_rate": 1.9648134878854747e-05,
346
+ "loss": 0.076,
347
+ "step": 46
348
+ },
349
+ {
350
+ "epoch": 0.43119266055045874,
351
+ "grad_norm": 0.05632725730538368,
352
+ "learning_rate": 1.9620723494424627e-05,
353
+ "loss": 0.1143,
354
+ "step": 47
355
+ },
356
+ {
357
+ "epoch": 0.44036697247706424,
358
+ "grad_norm": 0.04887419193983078,
359
+ "learning_rate": 1.9592304652023208e-05,
360
+ "loss": 0.096,
361
+ "step": 48
362
+ },
363
+ {
364
+ "epoch": 0.44954128440366975,
365
+ "grad_norm": 0.06641880422830582,
366
+ "learning_rate": 1.9562881327600197e-05,
367
+ "loss": 0.1108,
368
+ "step": 49
369
+ },
370
+ {
371
+ "epoch": 0.45871559633027525,
372
+ "grad_norm": 0.08709923177957535,
373
+ "learning_rate": 1.9532456602292148e-05,
374
+ "loss": 0.0987,
375
+ "step": 50
376
+ },
377
+ {
378
+ "epoch": 0.46788990825688076,
379
+ "grad_norm": 0.06175887584686279,
380
+ "learning_rate": 1.950103366209978e-05,
381
+ "loss": 0.0821,
382
+ "step": 51
383
+ },
384
+ {
385
+ "epoch": 0.47706422018348627,
386
+ "grad_norm": 0.05565601587295532,
387
+ "learning_rate": 1.9468615797554374e-05,
388
+ "loss": 0.0727,
389
+ "step": 52
390
+ },
391
+ {
392
+ "epoch": 0.48623853211009177,
393
+ "grad_norm": 0.13676409423351288,
394
+ "learning_rate": 1.943520640337318e-05,
395
+ "loss": 0.0834,
396
+ "step": 53
397
+ },
398
+ {
399
+ "epoch": 0.4954128440366973,
400
+ "grad_norm": 0.0817922055721283,
401
+ "learning_rate": 1.9400808978103948e-05,
402
+ "loss": 0.0766,
403
+ "step": 54
404
+ },
405
+ {
406
+ "epoch": 0.5045871559633027,
407
+ "grad_norm": 0.04707655310630798,
408
+ "learning_rate": 1.936542712375855e-05,
409
+ "loss": 0.0753,
410
+ "step": 55
411
+ },
412
+ {
413
+ "epoch": 0.5137614678899083,
414
+ "grad_norm": 0.07192892581224442,
415
+ "learning_rate": 1.9329064545435803e-05,
416
+ "loss": 0.0745,
417
+ "step": 56
418
+ },
419
+ {
420
+ "epoch": 0.5137614678899083,
421
+ "eval_loss": 0.057128190994262695,
422
+ "eval_runtime": 43.3864,
423
+ "eval_samples_per_second": 3.987,
424
+ "eval_steps_per_second": 2.005,
425
+ "step": 56
426
+ },
427
+ {
428
+ "epoch": 0.5229357798165137,
429
+ "grad_norm": 0.07009316980838776,
430
+ "learning_rate": 1.929172505093347e-05,
431
+ "loss": 0.0696,
432
+ "step": 57
433
+ },
434
+ {
435
+ "epoch": 0.5321100917431193,
436
+ "grad_norm": 0.07050078362226486,
437
+ "learning_rate": 1.9253412550349507e-05,
438
+ "loss": 0.0846,
439
+ "step": 58
440
+ },
441
+ {
442
+ "epoch": 0.5412844036697247,
443
+ "grad_norm": 0.06861168146133423,
444
+ "learning_rate": 1.9214131055672648e-05,
445
+ "loss": 0.0659,
446
+ "step": 59
447
+ },
448
+ {
449
+ "epoch": 0.5504587155963303,
450
+ "grad_norm": 0.05825705826282501,
451
+ "learning_rate": 1.917388468036222e-05,
452
+ "loss": 0.1173,
453
+ "step": 60
454
+ },
455
+ {
456
+ "epoch": 0.5596330275229358,
457
+ "grad_norm": 0.11085808277130127,
458
+ "learning_rate": 1.913267763891745e-05,
459
+ "loss": 0.0715,
460
+ "step": 61
461
+ },
462
+ {
463
+ "epoch": 0.5688073394495413,
464
+ "grad_norm": 0.0637730062007904,
465
+ "learning_rate": 1.9090514246436085e-05,
466
+ "loss": 0.0936,
467
+ "step": 62
468
+ },
469
+ {
470
+ "epoch": 0.5779816513761468,
471
+ "grad_norm": 0.05786406993865967,
472
+ "learning_rate": 1.904739891816257e-05,
473
+ "loss": 0.0777,
474
+ "step": 63
475
+ },
476
+ {
477
+ "epoch": 0.5871559633027523,
478
+ "grad_norm": 0.09354288130998611,
479
+ "learning_rate": 1.9003336169025655e-05,
480
+ "loss": 0.0913,
481
+ "step": 64
482
+ },
483
+ {
484
+ "epoch": 0.5963302752293578,
485
+ "grad_norm": 0.1899336725473404,
486
+ "learning_rate": 1.8958330613165622e-05,
487
+ "loss": 0.0761,
488
+ "step": 65
489
+ },
490
+ {
491
+ "epoch": 0.6055045871559633,
492
+ "grad_norm": 0.05038751661777496,
493
+ "learning_rate": 1.891238696345111e-05,
494
+ "loss": 0.078,
495
+ "step": 66
496
+ },
497
+ {
498
+ "epoch": 0.6146788990825688,
499
+ "grad_norm": 0.16387197375297546,
500
+ "learning_rate": 1.8865510030985588e-05,
501
+ "loss": 0.0917,
502
+ "step": 67
503
+ },
504
+ {
505
+ "epoch": 0.6238532110091743,
506
+ "grad_norm": 0.054418135434389114,
507
+ "learning_rate": 1.8817704724603536e-05,
508
+ "loss": 0.0792,
509
+ "step": 68
510
+ },
511
+ {
512
+ "epoch": 0.6330275229357798,
513
+ "grad_norm": 0.0637287050485611,
514
+ "learning_rate": 1.8768976050356428e-05,
515
+ "loss": 0.0731,
516
+ "step": 69
517
+ },
518
+ {
519
+ "epoch": 0.6422018348623854,
520
+ "grad_norm": 0.27382200956344604,
521
+ "learning_rate": 1.8719329110988487e-05,
522
+ "loss": 0.0745,
523
+ "step": 70
524
+ },
525
+ {
526
+ "epoch": 0.6513761467889908,
527
+ "grad_norm": 0.05016474053263664,
528
+ "learning_rate": 1.8668769105402366e-05,
529
+ "loss": 0.0739,
530
+ "step": 71
531
+ },
532
+ {
533
+ "epoch": 0.6605504587155964,
534
+ "grad_norm": 0.08013670146465302,
535
+ "learning_rate": 1.8617301328114704e-05,
536
+ "loss": 0.1024,
537
+ "step": 72
538
+ },
539
+ {
540
+ "epoch": 0.6697247706422018,
541
+ "grad_norm": 0.06279715895652771,
542
+ "learning_rate": 1.8564931168701713e-05,
543
+ "loss": 0.0971,
544
+ "step": 73
545
+ },
546
+ {
547
+ "epoch": 0.6788990825688074,
548
+ "grad_norm": 0.048747751861810684,
549
+ "learning_rate": 1.85116641112348e-05,
550
+ "loss": 0.0657,
551
+ "step": 74
552
+ },
553
+ {
554
+ "epoch": 0.6880733944954128,
555
+ "grad_norm": 0.05987577140331268,
556
+ "learning_rate": 1.845750573370626e-05,
557
+ "loss": 0.1191,
558
+ "step": 75
559
+ },
560
+ {
561
+ "epoch": 0.6972477064220184,
562
+ "grad_norm": 0.08020060509443283,
563
+ "learning_rate": 1.8402461707445206e-05,
564
+ "loss": 0.1112,
565
+ "step": 76
566
+ },
567
+ {
568
+ "epoch": 0.7064220183486238,
569
+ "grad_norm": 0.05764961615204811,
570
+ "learning_rate": 1.8346537796523643e-05,
571
+ "loss": 0.1049,
572
+ "step": 77
573
+ },
574
+ {
575
+ "epoch": 0.7155963302752294,
576
+ "grad_norm": 0.08143055438995361,
577
+ "learning_rate": 1.8289739857152903e-05,
578
+ "loss": 0.0871,
579
+ "step": 78
580
+ },
581
+ {
582
+ "epoch": 0.7247706422018348,
583
+ "grad_norm": 0.08280878514051437,
584
+ "learning_rate": 1.823207383707036e-05,
585
+ "loss": 0.0724,
586
+ "step": 79
587
+ },
588
+ {
589
+ "epoch": 0.7339449541284404,
590
+ "grad_norm": 0.08486371487379074,
591
+ "learning_rate": 1.8173545774916628e-05,
592
+ "loss": 0.068,
593
+ "step": 80
594
+ },
595
+ {
596
+ "epoch": 0.7431192660550459,
597
+ "grad_norm": 0.061856675893068314,
598
+ "learning_rate": 1.8114161799603195e-05,
599
+ "loss": 0.0786,
600
+ "step": 81
601
+ },
602
+ {
603
+ "epoch": 0.7522935779816514,
604
+ "grad_norm": 0.05205192044377327,
605
+ "learning_rate": 1.8053928129670624e-05,
606
+ "loss": 0.0755,
607
+ "step": 82
608
+ },
609
+ {
610
+ "epoch": 0.7614678899082569,
611
+ "grad_norm": 0.07000340521335602,
612
+ "learning_rate": 1.7992851072637366e-05,
613
+ "loss": 0.1239,
614
+ "step": 83
615
+ },
616
+ {
617
+ "epoch": 0.7706422018348624,
618
+ "grad_norm": 0.07025006413459778,
619
+ "learning_rate": 1.793093702433924e-05,
620
+ "loss": 0.0616,
621
+ "step": 84
622
+ },
623
+ {
624
+ "epoch": 0.7706422018348624,
625
+ "eval_loss": 0.055874165147542953,
626
+ "eval_runtime": 43.2177,
627
+ "eval_samples_per_second": 4.003,
628
+ "eval_steps_per_second": 2.013,
629
+ "step": 84
630
+ },
631
+ {
632
+ "epoch": 0.7798165137614679,
633
+ "grad_norm": 0.06147678196430206,
634
+ "learning_rate": 1.7868192468259686e-05,
635
+ "loss": 0.0917,
636
+ "step": 85
637
+ },
638
+ {
639
+ "epoch": 0.7889908256880734,
640
+ "grad_norm": 0.045870471745729446,
641
+ "learning_rate": 1.7804623974850844e-05,
642
+ "loss": 0.092,
643
+ "step": 86
644
+ },
645
+ {
646
+ "epoch": 0.7981651376146789,
647
+ "grad_norm": 0.07622863352298737,
648
+ "learning_rate": 1.7740238200845485e-05,
649
+ "loss": 0.0983,
650
+ "step": 87
651
+ },
652
+ {
653
+ "epoch": 0.8073394495412844,
654
+ "grad_norm": 0.08215321600437164,
655
+ "learning_rate": 1.7675041888559952e-05,
656
+ "loss": 0.0971,
657
+ "step": 88
658
+ },
659
+ {
660
+ "epoch": 0.8165137614678899,
661
+ "grad_norm": 0.06286073476076126,
662
+ "learning_rate": 1.7609041865188122e-05,
663
+ "loss": 0.0876,
664
+ "step": 89
665
+ },
666
+ {
667
+ "epoch": 0.8256880733944955,
668
+ "grad_norm": 0.06399139016866684,
669
+ "learning_rate": 1.754224504208647e-05,
670
+ "loss": 0.1166,
671
+ "step": 90
672
+ },
673
+ {
674
+ "epoch": 0.8348623853211009,
675
+ "grad_norm": 0.07115256041288376,
676
+ "learning_rate": 1.7474658414050344e-05,
677
+ "loss": 0.119,
678
+ "step": 91
679
+ },
680
+ {
681
+ "epoch": 0.8440366972477065,
682
+ "grad_norm": 0.04867429658770561,
683
+ "learning_rate": 1.7406289058581466e-05,
684
+ "loss": 0.0609,
685
+ "step": 92
686
+ },
687
+ {
688
+ "epoch": 0.8532110091743119,
689
+ "grad_norm": 0.1833045333623886,
690
+ "learning_rate": 1.7337144135146818e-05,
691
+ "loss": 0.1247,
692
+ "step": 93
693
+ },
694
+ {
695
+ "epoch": 0.8623853211009175,
696
+ "grad_norm": 0.07023169845342636,
697
+ "learning_rate": 1.7267230884428905e-05,
698
+ "loss": 0.1271,
699
+ "step": 94
700
+ },
701
+ {
702
+ "epoch": 0.8715596330275229,
703
+ "grad_norm": 0.044201672077178955,
704
+ "learning_rate": 1.719655662756753e-05,
705
+ "loss": 0.0661,
706
+ "step": 95
707
+ },
708
+ {
709
+ "epoch": 0.8807339449541285,
710
+ "grad_norm": 0.05139552056789398,
711
+ "learning_rate": 1.7125128765393157e-05,
712
+ "loss": 0.1065,
713
+ "step": 96
714
+ },
715
+ {
716
+ "epoch": 0.8899082568807339,
717
+ "grad_norm": 0.06401531398296356,
718
+ "learning_rate": 1.705295477765188e-05,
719
+ "loss": 0.1006,
720
+ "step": 97
721
+ },
722
+ {
723
+ "epoch": 0.8990825688073395,
724
+ "grad_norm": 0.04898101091384888,
725
+ "learning_rate": 1.6980042222222216e-05,
726
+ "loss": 0.0522,
727
+ "step": 98
728
+ },
729
+ {
730
+ "epoch": 0.908256880733945,
731
+ "grad_norm": 0.053434859961271286,
732
+ "learning_rate": 1.690639873432361e-05,
733
+ "loss": 0.1214,
734
+ "step": 99
735
+ },
736
+ {
737
+ "epoch": 0.9174311926605505,
738
+ "grad_norm": 0.058289702981710434,
739
+ "learning_rate": 1.683203202571692e-05,
740
+ "loss": 0.0547,
741
+ "step": 100
742
+ },
743
+ {
744
+ "epoch": 0.926605504587156,
745
+ "grad_norm": 0.06972479820251465,
746
+ "learning_rate": 1.6756949883896874e-05,
747
+ "loss": 0.0867,
748
+ "step": 101
749
+ },
750
+ {
751
+ "epoch": 0.9357798165137615,
752
+ "grad_norm": 0.05255963280797005,
753
+ "learning_rate": 1.668116017127655e-05,
754
+ "loss": 0.0685,
755
+ "step": 102
756
+ },
757
+ {
758
+ "epoch": 0.944954128440367,
759
+ "grad_norm": 0.04849875345826149,
760
+ "learning_rate": 1.6604670824364067e-05,
761
+ "loss": 0.074,
762
+ "step": 103
763
+ },
764
+ {
765
+ "epoch": 0.9541284403669725,
766
+ "grad_norm": 0.08504751324653625,
767
+ "learning_rate": 1.652748985293149e-05,
768
+ "loss": 0.0901,
769
+ "step": 104
770
+ },
771
+ {
772
+ "epoch": 0.963302752293578,
773
+ "grad_norm": 0.07853557169437408,
774
+ "learning_rate": 1.6449625339176056e-05,
775
+ "loss": 0.0663,
776
+ "step": 105
777
+ },
778
+ {
779
+ "epoch": 0.9724770642201835,
780
+ "grad_norm": 0.0786280408501625,
781
+ "learning_rate": 1.6371085436873847e-05,
782
+ "loss": 0.0936,
783
+ "step": 106
784
+ },
785
+ {
786
+ "epoch": 0.981651376146789,
787
+ "grad_norm": 0.09738124907016754,
788
+ "learning_rate": 1.6291878370525925e-05,
789
+ "loss": 0.1001,
790
+ "step": 107
791
+ },
792
+ {
793
+ "epoch": 0.9908256880733946,
794
+ "grad_norm": 0.061633653938770294,
795
+ "learning_rate": 1.6212012434497103e-05,
796
+ "loss": 0.0728,
797
+ "step": 108
798
+ },
799
+ {
800
+ "epoch": 1.0,
801
+ "grad_norm": 0.06726932525634766,
802
+ "learning_rate": 1.6131495992147363e-05,
803
+ "loss": 0.0837,
804
+ "step": 109
805
+ },
806
+ {
807
+ "epoch": 1.0091743119266054,
808
+ "grad_norm": 0.05419926717877388,
809
+ "learning_rate": 1.605033747495607e-05,
810
+ "loss": 0.0841,
811
+ "step": 110
812
+ },
813
+ {
814
+ "epoch": 1.018348623853211,
815
+ "grad_norm": 0.056966476142406464,
816
+ "learning_rate": 1.596854538163906e-05,
817
+ "loss": 0.0774,
818
+ "step": 111
819
+ },
820
+ {
821
+ "epoch": 1.0275229357798166,
822
+ "grad_norm": 0.05710803344845772,
823
+ "learning_rate": 1.5886128277258665e-05,
824
+ "loss": 0.0645,
825
+ "step": 112
826
+ },
827
+ {
828
+ "epoch": 1.0275229357798166,
829
+ "eval_loss": 0.05474008619785309,
830
+ "eval_runtime": 43.3773,
831
+ "eval_samples_per_second": 3.988,
832
+ "eval_steps_per_second": 2.006,
833
+ "step": 112
834
+ },
835
+ {
836
+ "epoch": 1.036697247706422,
837
+ "grad_norm": 0.05264132842421532,
838
+ "learning_rate": 1.58030947923268e-05,
839
+ "loss": 0.0615,
840
+ "step": 113
841
+ },
842
+ {
843
+ "epoch": 1.0458715596330275,
844
+ "grad_norm": 0.11162517964839935,
845
+ "learning_rate": 1.571945362190121e-05,
846
+ "loss": 0.13,
847
+ "step": 114
848
+ },
849
+ {
850
+ "epoch": 1.0550458715596331,
851
+ "grad_norm": 0.05422775819897652,
852
+ "learning_rate": 1.563521352467493e-05,
853
+ "loss": 0.0677,
854
+ "step": 115
855
+ },
856
+ {
857
+ "epoch": 1.0642201834862386,
858
+ "grad_norm": 0.08082108199596405,
859
+ "learning_rate": 1.55503833220591e-05,
860
+ "loss": 0.0958,
861
+ "step": 116
862
+ },
863
+ {
864
+ "epoch": 1.073394495412844,
865
+ "grad_norm": 0.0667729526758194,
866
+ "learning_rate": 1.546497189725922e-05,
867
+ "loss": 0.1132,
868
+ "step": 117
869
+ },
870
+ {
871
+ "epoch": 1.0825688073394495,
872
+ "grad_norm": 0.06297166645526886,
873
+ "learning_rate": 1.5378988194344913e-05,
874
+ "loss": 0.0646,
875
+ "step": 118
876
+ },
877
+ {
878
+ "epoch": 1.091743119266055,
879
+ "grad_norm": 0.05654435604810715,
880
+ "learning_rate": 1.5292441217313324e-05,
881
+ "loss": 0.1083,
882
+ "step": 119
883
+ },
884
+ {
885
+ "epoch": 1.1009174311926606,
886
+ "grad_norm": 0.054773300886154175,
887
+ "learning_rate": 1.5205340029146256e-05,
888
+ "loss": 0.0833,
889
+ "step": 120
890
+ },
891
+ {
892
+ "epoch": 1.110091743119266,
893
+ "grad_norm": 0.0510234571993351,
894
+ "learning_rate": 1.5117693750861096e-05,
895
+ "loss": 0.0792,
896
+ "step": 121
897
+ },
898
+ {
899
+ "epoch": 1.1192660550458715,
900
+ "grad_norm": 0.054006725549697876,
901
+ "learning_rate": 1.5029511560555707e-05,
902
+ "loss": 0.1066,
903
+ "step": 122
904
+ },
905
+ {
906
+ "epoch": 1.1284403669724772,
907
+ "grad_norm": 0.06665553152561188,
908
+ "learning_rate": 1.4940802692447306e-05,
909
+ "loss": 0.0584,
910
+ "step": 123
911
+ },
912
+ {
913
+ "epoch": 1.1376146788990826,
914
+ "grad_norm": 0.105413518846035,
915
+ "learning_rate": 1.4851576435905489e-05,
916
+ "loss": 0.0907,
917
+ "step": 124
918
+ },
919
+ {
920
+ "epoch": 1.146788990825688,
921
+ "grad_norm": 0.0626402273774147,
922
+ "learning_rate": 1.4761842134479463e-05,
923
+ "loss": 0.0544,
924
+ "step": 125
925
+ },
926
+ {
927
+ "epoch": 1.1559633027522935,
928
+ "grad_norm": 0.06280255317687988,
929
+ "learning_rate": 1.4671609184919622e-05,
930
+ "loss": 0.0487,
931
+ "step": 126
932
+ },
933
+ {
934
+ "epoch": 1.165137614678899,
935
+ "grad_norm": 0.07118494808673859,
936
+ "learning_rate": 1.4580887036193539e-05,
937
+ "loss": 0.0797,
938
+ "step": 127
939
+ },
940
+ {
941
+ "epoch": 1.1743119266055047,
942
+ "grad_norm": 0.0436442606151104,
943
+ "learning_rate": 1.4489685188496488e-05,
944
+ "loss": 0.0568,
945
+ "step": 128
946
+ },
947
+ {
948
+ "epoch": 1.18348623853211,
949
+ "grad_norm": 0.06852032989263535,
950
+ "learning_rate": 1.4398013192256615e-05,
951
+ "loss": 0.1099,
952
+ "step": 129
953
+ },
954
+ {
955
+ "epoch": 1.1926605504587156,
956
+ "grad_norm": 0.05673675611615181,
957
+ "learning_rate": 1.4305880647134847e-05,
958
+ "loss": 0.0649,
959
+ "step": 130
960
+ },
961
+ {
962
+ "epoch": 1.2018348623853212,
963
+ "grad_norm": 0.05407770350575447,
964
+ "learning_rate": 1.4213297201019618e-05,
965
+ "loss": 0.0582,
966
+ "step": 131
967
+ },
968
+ {
969
+ "epoch": 1.2110091743119267,
970
+ "grad_norm": 0.06976728141307831,
971
+ "learning_rate": 1.4120272549016591e-05,
972
+ "loss": 0.0744,
973
+ "step": 132
974
+ },
975
+ {
976
+ "epoch": 1.2201834862385321,
977
+ "grad_norm": 0.044198598712682724,
978
+ "learning_rate": 1.40268164324334e-05,
979
+ "loss": 0.0456,
980
+ "step": 133
981
+ },
982
+ {
983
+ "epoch": 1.2293577981651376,
984
+ "grad_norm": 0.052981797605752945,
985
+ "learning_rate": 1.3932938637759555e-05,
986
+ "loss": 0.067,
987
+ "step": 134
988
+ },
989
+ {
990
+ "epoch": 1.238532110091743,
991
+ "grad_norm": 0.051931336522102356,
992
+ "learning_rate": 1.3838648995641645e-05,
993
+ "loss": 0.0662,
994
+ "step": 135
995
+ },
996
+ {
997
+ "epoch": 1.2477064220183487,
998
+ "grad_norm": 0.06196126341819763,
999
+ "learning_rate": 1.3743957379853885e-05,
1000
+ "loss": 0.0742,
1001
+ "step": 136
1002
+ },
1003
+ {
1004
+ "epoch": 1.2568807339449541,
1005
+ "grad_norm": 0.0772649347782135,
1006
+ "learning_rate": 1.3648873706264159e-05,
1007
+ "loss": 0.1016,
1008
+ "step": 137
1009
+ },
1010
+ {
1011
+ "epoch": 1.2660550458715596,
1012
+ "grad_norm": 0.06751381605863571,
1013
+ "learning_rate": 1.3553407931795662e-05,
1014
+ "loss": 0.0871,
1015
+ "step": 138
1016
+ },
1017
+ {
1018
+ "epoch": 1.2752293577981653,
1019
+ "grad_norm": 0.045472558587789536,
1020
+ "learning_rate": 1.3457570053384225e-05,
1021
+ "loss": 0.0548,
1022
+ "step": 139
1023
+ },
1024
+ {
1025
+ "epoch": 1.2844036697247707,
1026
+ "grad_norm": 0.061834368854761124,
1027
+ "learning_rate": 1.3361370106931486e-05,
1028
+ "loss": 0.0601,
1029
+ "step": 140
1030
+ },
1031
+ {
1032
+ "epoch": 1.2844036697247707,
1033
+ "eval_loss": 0.05419979989528656,
1034
+ "eval_runtime": 43.2558,
1035
+ "eval_samples_per_second": 3.999,
1036
+ "eval_steps_per_second": 2.011,
1037
+ "step": 140
1038
+ },
1039
+ {
1040
+ "epoch": 1.2935779816513762,
1041
+ "grad_norm": 0.062169574201107025,
1042
+ "learning_rate": 1.3264818166253917e-05,
1043
+ "loss": 0.0899,
1044
+ "step": 141
1045
+ },
1046
+ {
1047
+ "epoch": 1.3027522935779816,
1048
+ "grad_norm": 0.051288578659296036,
1049
+ "learning_rate": 1.3167924342027947e-05,
1050
+ "loss": 0.0758,
1051
+ "step": 142
1052
+ },
1053
+ {
1054
+ "epoch": 1.311926605504587,
1055
+ "grad_norm": 0.052053675055503845,
1056
+ "learning_rate": 1.3070698780731194e-05,
1057
+ "loss": 0.0989,
1058
+ "step": 143
1059
+ },
1060
+ {
1061
+ "epoch": 1.3211009174311927,
1062
+ "grad_norm": 0.05368887633085251,
1063
+ "learning_rate": 1.2973151663579948e-05,
1064
+ "loss": 0.0599,
1065
+ "step": 144
1066
+ },
1067
+ {
1068
+ "epoch": 1.3302752293577982,
1069
+ "grad_norm": 0.06439553946256638,
1070
+ "learning_rate": 1.2875293205463018e-05,
1071
+ "loss": 0.0808,
1072
+ "step": 145
1073
+ },
1074
+ {
1075
+ "epoch": 1.3394495412844036,
1076
+ "grad_norm": 0.05169299989938736,
1077
+ "learning_rate": 1.277713365387205e-05,
1078
+ "loss": 0.0668,
1079
+ "step": 146
1080
+ },
1081
+ {
1082
+ "epoch": 1.3486238532110093,
1083
+ "grad_norm": 0.0563080795109272,
1084
+ "learning_rate": 1.2678683287828451e-05,
1085
+ "loss": 0.0888,
1086
+ "step": 147
1087
+ },
1088
+ {
1089
+ "epoch": 1.3577981651376148,
1090
+ "grad_norm": 0.06615187972784042,
1091
+ "learning_rate": 1.257995241680698e-05,
1092
+ "loss": 0.123,
1093
+ "step": 148
1094
+ },
1095
+ {
1096
+ "epoch": 1.3669724770642202,
1097
+ "grad_norm": 0.0562995970249176,
1098
+ "learning_rate": 1.2480951379656175e-05,
1099
+ "loss": 0.0709,
1100
+ "step": 149
1101
+ },
1102
+ {
1103
+ "epoch": 1.3761467889908257,
1104
+ "grad_norm": 0.06163397431373596,
1105
+ "learning_rate": 1.2381690543515692e-05,
1106
+ "loss": 0.0663,
1107
+ "step": 150
1108
+ },
1109
+ {
1110
+ "epoch": 1.385321100917431,
1111
+ "grad_norm": 0.053988683968782425,
1112
+ "learning_rate": 1.2282180302730683e-05,
1113
+ "loss": 0.0812,
1114
+ "step": 151
1115
+ },
1116
+ {
1117
+ "epoch": 1.3944954128440368,
1118
+ "grad_norm": 0.062011417001485825,
1119
+ "learning_rate": 1.2182431077763317e-05,
1120
+ "loss": 0.0821,
1121
+ "step": 152
1122
+ },
1123
+ {
1124
+ "epoch": 1.4036697247706422,
1125
+ "grad_norm": 0.054933421313762665,
1126
+ "learning_rate": 1.2082453314101607e-05,
1127
+ "loss": 0.0726,
1128
+ "step": 153
1129
+ },
1130
+ {
1131
+ "epoch": 1.4128440366972477,
1132
+ "grad_norm": 0.0531560480594635,
1133
+ "learning_rate": 1.1982257481165547e-05,
1134
+ "loss": 0.052,
1135
+ "step": 154
1136
+ },
1137
+ {
1138
+ "epoch": 1.4220183486238533,
1139
+ "grad_norm": 0.0668332502245903,
1140
+ "learning_rate": 1.1881854071210805e-05,
1141
+ "loss": 0.0758,
1142
+ "step": 155
1143
+ },
1144
+ {
1145
+ "epoch": 1.4311926605504588,
1146
+ "grad_norm": 0.06946220993995667,
1147
+ "learning_rate": 1.1781253598229982e-05,
1148
+ "loss": 0.0702,
1149
+ "step": 156
1150
+ },
1151
+ {
1152
+ "epoch": 1.4403669724770642,
1153
+ "grad_norm": 0.06085884943604469,
1154
+ "learning_rate": 1.1680466596851635e-05,
1155
+ "loss": 0.0727,
1156
+ "step": 157
1157
+ },
1158
+ {
1159
+ "epoch": 1.4495412844036697,
1160
+ "grad_norm": 0.04978602007031441,
1161
+ "learning_rate": 1.1579503621237102e-05,
1162
+ "loss": 0.077,
1163
+ "step": 158
1164
+ },
1165
+ {
1166
+ "epoch": 1.4587155963302751,
1167
+ "grad_norm": 0.05822195112705231,
1168
+ "learning_rate": 1.1478375243975298e-05,
1169
+ "loss": 0.0769,
1170
+ "step": 159
1171
+ },
1172
+ {
1173
+ "epoch": 1.4678899082568808,
1174
+ "grad_norm": 0.06906376779079437,
1175
+ "learning_rate": 1.1377092054975586e-05,
1176
+ "loss": 0.094,
1177
+ "step": 160
1178
+ },
1179
+ {
1180
+ "epoch": 1.4770642201834863,
1181
+ "grad_norm": 0.0629039779305458,
1182
+ "learning_rate": 1.1275664660358818e-05,
1183
+ "loss": 0.0491,
1184
+ "step": 161
1185
+ },
1186
+ {
1187
+ "epoch": 1.4862385321100917,
1188
+ "grad_norm": 0.05990944802761078,
1189
+ "learning_rate": 1.1174103681346711e-05,
1190
+ "loss": 0.0514,
1191
+ "step": 162
1192
+ },
1193
+ {
1194
+ "epoch": 1.4954128440366974,
1195
+ "grad_norm": 0.055665481835603714,
1196
+ "learning_rate": 1.1072419753149585e-05,
1197
+ "loss": 0.0652,
1198
+ "step": 163
1199
+ },
1200
+ {
1201
+ "epoch": 1.5045871559633026,
1202
+ "grad_norm": 0.04991630092263222,
1203
+ "learning_rate": 1.0970623523852699e-05,
1204
+ "loss": 0.0576,
1205
+ "step": 164
1206
+ },
1207
+ {
1208
+ "epoch": 1.5137614678899083,
1209
+ "grad_norm": 0.052877284586429596,
1210
+ "learning_rate": 1.0868725653301206e-05,
1211
+ "loss": 0.075,
1212
+ "step": 165
1213
+ },
1214
+ {
1215
+ "epoch": 1.5229357798165137,
1216
+ "grad_norm": 0.0749133750796318,
1217
+ "learning_rate": 1.0766736811983864e-05,
1218
+ "loss": 0.092,
1219
+ "step": 166
1220
+ },
1221
+ {
1222
+ "epoch": 1.5321100917431192,
1223
+ "grad_norm": 0.050854723900556564,
1224
+ "learning_rate": 1.066466767991567e-05,
1225
+ "loss": 0.0282,
1226
+ "step": 167
1227
+ },
1228
+ {
1229
+ "epoch": 1.5412844036697249,
1230
+ "grad_norm": 0.05233798176050186,
1231
+ "learning_rate": 1.0562528945519463e-05,
1232
+ "loss": 0.0688,
1233
+ "step": 168
1234
+ },
1235
+ {
1236
+ "epoch": 1.5412844036697249,
1237
+ "eval_loss": 0.05367153137922287,
1238
+ "eval_runtime": 43.3753,
1239
+ "eval_samples_per_second": 3.988,
1240
+ "eval_steps_per_second": 2.006,
1241
+ "step": 168
1242
+ },
1243
+ {
1244
+ "epoch": 1.5504587155963303,
1245
+ "grad_norm": 0.06258448958396912,
1246
+ "learning_rate": 1.0460331304506658e-05,
1247
+ "loss": 0.0601,
1248
+ "step": 169
1249
+ },
1250
+ {
1251
+ "epoch": 1.5596330275229358,
1252
+ "grad_norm": 0.05399218574166298,
1253
+ "learning_rate": 1.0358085458757233e-05,
1254
+ "loss": 0.0642,
1255
+ "step": 170
1256
+ },
1257
+ {
1258
+ "epoch": 1.5688073394495414,
1259
+ "grad_norm": 0.052848368883132935,
1260
+ "learning_rate": 1.0255802115199034e-05,
1261
+ "loss": 0.054,
1262
+ "step": 171
1263
+ },
1264
+ {
1265
+ "epoch": 1.5779816513761467,
1266
+ "grad_norm": 0.054908327758312225,
1267
+ "learning_rate": 1.0153491984686595e-05,
1268
+ "loss": 0.069,
1269
+ "step": 172
1270
+ },
1271
+ {
1272
+ "epoch": 1.5871559633027523,
1273
+ "grad_norm": 0.08597759157419205,
1274
+ "learning_rate": 1.0051165780879503e-05,
1275
+ "loss": 0.0596,
1276
+ "step": 173
1277
+ },
1278
+ {
1279
+ "epoch": 1.5963302752293578,
1280
+ "grad_norm": 0.06400232017040253,
1281
+ "learning_rate": 9.9488342191205e-06,
1282
+ "loss": 0.0959,
1283
+ "step": 174
1284
+ },
1285
+ {
1286
+ "epoch": 1.6055045871559632,
1287
+ "grad_norm": 0.06301407516002655,
1288
+ "learning_rate": 9.846508015313407e-06,
1289
+ "loss": 0.0863,
1290
+ "step": 175
1291
+ },
1292
+ {
1293
+ "epoch": 1.614678899082569,
1294
+ "grad_norm": 0.09353712201118469,
1295
+ "learning_rate": 9.744197884800968e-06,
1296
+ "loss": 0.0835,
1297
+ "step": 176
1298
+ },
1299
+ {
1300
+ "epoch": 1.6238532110091743,
1301
+ "grad_norm": 0.14486946165561676,
1302
+ "learning_rate": 9.64191454124277e-06,
1303
+ "loss": 0.0786,
1304
+ "step": 177
1305
+ },
1306
+ {
1307
+ "epoch": 1.6330275229357798,
1308
+ "grad_norm": 0.06457175314426422,
1309
+ "learning_rate": 9.539668695493344e-06,
1310
+ "loss": 0.0752,
1311
+ "step": 178
1312
+ },
1313
+ {
1314
+ "epoch": 1.6422018348623855,
1315
+ "grad_norm": 0.059133633971214294,
1316
+ "learning_rate": 9.43747105448054e-06,
1317
+ "loss": 0.0747,
1318
+ "step": 179
1319
+ },
1320
+ {
1321
+ "epoch": 1.6513761467889907,
1322
+ "grad_norm": 0.04722464829683304,
1323
+ "learning_rate": 9.335332320084331e-06,
1324
+ "loss": 0.0503,
1325
+ "step": 180
1326
+ },
1327
+ {
1328
+ "epoch": 1.6605504587155964,
1329
+ "grad_norm": 0.06446046382188797,
1330
+ "learning_rate": 9.233263188016138e-06,
1331
+ "loss": 0.0794,
1332
+ "step": 181
1333
+ },
1334
+ {
1335
+ "epoch": 1.6697247706422018,
1336
+ "grad_norm": 0.05856352299451828,
1337
+ "learning_rate": 9.131274346698797e-06,
1338
+ "loss": 0.0917,
1339
+ "step": 182
1340
+ },
1341
+ {
1342
+ "epoch": 1.6788990825688073,
1343
+ "grad_norm": 0.050151705741882324,
1344
+ "learning_rate": 9.029376476147303e-06,
1345
+ "loss": 0.0534,
1346
+ "step": 183
1347
+ },
1348
+ {
1349
+ "epoch": 1.688073394495413,
1350
+ "grad_norm": 0.11409275978803635,
1351
+ "learning_rate": 8.927580246850418e-06,
1352
+ "loss": 0.0579,
1353
+ "step": 184
1354
+ },
1355
+ {
1356
+ "epoch": 1.6972477064220184,
1357
+ "grad_norm": 0.04147953912615776,
1358
+ "learning_rate": 8.825896318653294e-06,
1359
+ "loss": 0.0596,
1360
+ "step": 185
1361
+ },
1362
+ {
1363
+ "epoch": 1.7064220183486238,
1364
+ "grad_norm": 0.06895549595355988,
1365
+ "learning_rate": 8.724335339641185e-06,
1366
+ "loss": 0.1267,
1367
+ "step": 186
1368
+ },
1369
+ {
1370
+ "epoch": 1.7155963302752295,
1371
+ "grad_norm": 0.07597438991069794,
1372
+ "learning_rate": 8.622907945024418e-06,
1373
+ "loss": 0.0672,
1374
+ "step": 187
1375
+ },
1376
+ {
1377
+ "epoch": 1.7247706422018347,
1378
+ "grad_norm": 0.0447760745882988,
1379
+ "learning_rate": 8.521624756024706e-06,
1380
+ "loss": 0.0619,
1381
+ "step": 188
1382
+ },
1383
+ {
1384
+ "epoch": 1.7339449541284404,
1385
+ "grad_norm": 0.054984625428915024,
1386
+ "learning_rate": 8.420496378762901e-06,
1387
+ "loss": 0.0665,
1388
+ "step": 189
1389
+ },
1390
+ {
1391
+ "epoch": 1.7431192660550459,
1392
+ "grad_norm": 0.06113699823617935,
1393
+ "learning_rate": 8.319533403148368e-06,
1394
+ "loss": 0.0755,
1395
+ "step": 190
1396
+ },
1397
+ {
1398
+ "epoch": 1.7522935779816513,
1399
+ "grad_norm": 0.056827329099178314,
1400
+ "learning_rate": 8.218746401770021e-06,
1401
+ "loss": 0.0674,
1402
+ "step": 191
1403
+ },
1404
+ {
1405
+ "epoch": 1.761467889908257,
1406
+ "grad_norm": 0.06809786707162857,
1407
+ "learning_rate": 8.118145928789198e-06,
1408
+ "loss": 0.0595,
1409
+ "step": 192
1410
+ },
1411
+ {
1412
+ "epoch": 1.7706422018348624,
1413
+ "grad_norm": 0.05617869272828102,
1414
+ "learning_rate": 8.017742518834454e-06,
1415
+ "loss": 0.0823,
1416
+ "step": 193
1417
+ },
1418
+ {
1419
+ "epoch": 1.7798165137614679,
1420
+ "grad_norm": 0.05785394459962845,
1421
+ "learning_rate": 7.917546685898393e-06,
1422
+ "loss": 0.099,
1423
+ "step": 194
1424
+ },
1425
+ {
1426
+ "epoch": 1.7889908256880735,
1427
+ "grad_norm": 0.04695114120841026,
1428
+ "learning_rate": 7.817568922236683e-06,
1429
+ "loss": 0.0559,
1430
+ "step": 195
1431
+ },
1432
+ {
1433
+ "epoch": 1.7981651376146788,
1434
+ "grad_norm": 0.04976266250014305,
1435
+ "learning_rate": 7.717819697269322e-06,
1436
+ "loss": 0.0424,
1437
+ "step": 196
1438
+ },
1439
+ {
1440
+ "epoch": 1.7981651376146788,
1441
+ "eval_loss": 0.05340273305773735,
1442
+ "eval_runtime": 43.3761,
1443
+ "eval_samples_per_second": 3.988,
1444
+ "eval_steps_per_second": 2.006,
1445
+ "step": 196
1446
+ },
1447
+ {
1448
+ "epoch": 1.8073394495412844,
1449
+ "grad_norm": 0.04787492752075195,
1450
+ "learning_rate": 7.618309456484309e-06,
1451
+ "loss": 0.0439,
1452
+ "step": 197
1453
+ },
1454
+ {
1455
+ "epoch": 1.81651376146789,
1456
+ "grad_norm": 0.048712193965911865,
1457
+ "learning_rate": 7.519048620343825e-06,
1458
+ "loss": 0.0714,
1459
+ "step": 198
1460
+ },
1461
+ {
1462
+ "epoch": 1.8256880733944953,
1463
+ "grad_norm": 0.06686241924762726,
1464
+ "learning_rate": 7.42004758319302e-06,
1465
+ "loss": 0.0746,
1466
+ "step": 199
1467
+ },
1468
+ {
1469
+ "epoch": 1.834862385321101,
1470
+ "grad_norm": 0.06182721257209778,
1471
+ "learning_rate": 7.3213167121715514e-06,
1472
+ "loss": 0.068,
1473
+ "step": 200
1474
+ },
1475
+ {
1476
+ "epoch": 1.8440366972477065,
1477
+ "grad_norm": 0.05911610275506973,
1478
+ "learning_rate": 7.222866346127952e-06,
1479
+ "loss": 0.061,
1480
+ "step": 201
1481
+ },
1482
+ {
1483
+ "epoch": 1.853211009174312,
1484
+ "grad_norm": 0.048602789640426636,
1485
+ "learning_rate": 7.124706794536984e-06,
1486
+ "loss": 0.0545,
1487
+ "step": 202
1488
+ },
1489
+ {
1490
+ "epoch": 1.8623853211009176,
1491
+ "grad_norm": 0.09133545309305191,
1492
+ "learning_rate": 7.026848336420053e-06,
1493
+ "loss": 0.0579,
1494
+ "step": 203
1495
+ },
1496
+ {
1497
+ "epoch": 1.8715596330275228,
1498
+ "grad_norm": 0.05401468276977539,
1499
+ "learning_rate": 6.929301219268806e-06,
1500
+ "loss": 0.0498,
1501
+ "step": 204
1502
+ },
1503
+ {
1504
+ "epoch": 1.8807339449541285,
1505
+ "grad_norm": 0.057028841227293015,
1506
+ "learning_rate": 6.8320756579720545e-06,
1507
+ "loss": 0.1163,
1508
+ "step": 205
1509
+ },
1510
+ {
1511
+ "epoch": 1.889908256880734,
1512
+ "grad_norm": 0.06496277451515198,
1513
+ "learning_rate": 6.735181833746087e-06,
1514
+ "loss": 0.0687,
1515
+ "step": 206
1516
+ },
1517
+ {
1518
+ "epoch": 1.8990825688073394,
1519
+ "grad_norm": 0.04509355500340462,
1520
+ "learning_rate": 6.638629893068516e-06,
1521
+ "loss": 0.0833,
1522
+ "step": 207
1523
+ },
1524
+ {
1525
+ "epoch": 1.908256880733945,
1526
+ "grad_norm": 0.05287083983421326,
1527
+ "learning_rate": 6.542429946615774e-06,
1528
+ "loss": 0.0506,
1529
+ "step": 208
1530
+ },
1531
+ {
1532
+ "epoch": 1.9174311926605505,
1533
+ "grad_norm": 0.0586075522005558,
1534
+ "learning_rate": 6.446592068204341e-06,
1535
+ "loss": 0.0747,
1536
+ "step": 209
1537
+ },
1538
+ {
1539
+ "epoch": 1.926605504587156,
1540
+ "grad_norm": 0.05395258963108063,
1541
+ "learning_rate": 6.351126293735843e-06,
1542
+ "loss": 0.0703,
1543
+ "step": 210
1544
+ },
1545
+ {
1546
+ "epoch": 1.9357798165137616,
1547
+ "grad_norm": 0.04344907030463219,
1548
+ "learning_rate": 6.256042620146119e-06,
1549
+ "loss": 0.0445,
1550
+ "step": 211
1551
+ },
1552
+ {
1553
+ "epoch": 1.9449541284403669,
1554
+ "grad_norm": 0.047881245613098145,
1555
+ "learning_rate": 6.16135100435836e-06,
1556
+ "loss": 0.063,
1557
+ "step": 212
1558
+ },
1559
+ {
1560
+ "epoch": 1.9541284403669725,
1561
+ "grad_norm": 0.0735430121421814,
1562
+ "learning_rate": 6.06706136224045e-06,
1563
+ "loss": 0.0936,
1564
+ "step": 213
1565
+ },
1566
+ {
1567
+ "epoch": 1.963302752293578,
1568
+ "grad_norm": 0.0707215815782547,
1569
+ "learning_rate": 5.973183567566605e-06,
1570
+ "loss": 0.0906,
1571
+ "step": 214
1572
+ },
1573
+ {
1574
+ "epoch": 1.9724770642201834,
1575
+ "grad_norm": 0.05329213663935661,
1576
+ "learning_rate": 5.879727450983412e-06,
1577
+ "loss": 0.0665,
1578
+ "step": 215
1579
+ },
1580
+ {
1581
+ "epoch": 1.981651376146789,
1582
+ "grad_norm": 0.04940500482916832,
1583
+ "learning_rate": 5.786702798980388e-06,
1584
+ "loss": 0.0649,
1585
+ "step": 216
1586
+ },
1587
+ {
1588
+ "epoch": 1.9908256880733946,
1589
+ "grad_norm": 0.06716382503509521,
1590
+ "learning_rate": 5.69411935286516e-06,
1591
+ "loss": 0.0882,
1592
+ "step": 217
1593
+ },
1594
+ {
1595
+ "epoch": 2.0,
1596
+ "grad_norm": 0.06293365359306335,
1597
+ "learning_rate": 5.601986807743388e-06,
1598
+ "loss": 0.0649,
1599
+ "step": 218
1600
+ },
1601
+ {
1602
+ "epoch": 2.0091743119266057,
1603
+ "grad_norm": 0.06269426643848419,
1604
+ "learning_rate": 5.51031481150352e-06,
1605
+ "loss": 0.0757,
1606
+ "step": 219
1607
+ },
1608
+ {
1609
+ "epoch": 2.018348623853211,
1610
+ "grad_norm": 0.05706481635570526,
1611
+ "learning_rate": 5.419112963806468e-06,
1612
+ "loss": 0.0519,
1613
+ "step": 220
1614
+ },
1615
+ {
1616
+ "epoch": 2.0275229357798166,
1617
+ "grad_norm": 0.05423853173851967,
1618
+ "learning_rate": 5.328390815080381e-06,
1619
+ "loss": 0.0828,
1620
+ "step": 221
1621
+ },
1622
+ {
1623
+ "epoch": 2.036697247706422,
1624
+ "grad_norm": 0.04003476724028587,
1625
+ "learning_rate": 5.238157865520539e-06,
1626
+ "loss": 0.0431,
1627
+ "step": 222
1628
+ },
1629
+ {
1630
+ "epoch": 2.0458715596330275,
1631
+ "grad_norm": 0.047263018786907196,
1632
+ "learning_rate": 5.148423564094517e-06,
1633
+ "loss": 0.0498,
1634
+ "step": 223
1635
+ },
1636
+ {
1637
+ "epoch": 2.055045871559633,
1638
+ "grad_norm": 0.06567783653736115,
1639
+ "learning_rate": 5.059197307552698e-06,
1640
+ "loss": 0.086,
1641
+ "step": 224
1642
+ },
1643
+ {
1644
+ "epoch": 2.055045871559633,
1645
+ "eval_loss": 0.05316643789410591,
1646
+ "eval_runtime": 43.3865,
1647
+ "eval_samples_per_second": 3.987,
1648
+ "eval_steps_per_second": 2.005,
1649
+ "step": 224
1650
+ },
1651
+ {
1652
+ "epoch": 2.0642201834862384,
1653
+ "grad_norm": 0.04639436677098274,
1654
+ "learning_rate": 4.970488439444296e-06,
1655
+ "loss": 0.0589,
1656
+ "step": 225
1657
+ },
1658
+ {
1659
+ "epoch": 2.073394495412844,
1660
+ "grad_norm": 0.06990166008472443,
1661
+ "learning_rate": 4.882306249138909e-06,
1662
+ "loss": 0.0491,
1663
+ "step": 226
1664
+ },
1665
+ {
1666
+ "epoch": 2.0825688073394497,
1667
+ "grad_norm": 0.05951413884758949,
1668
+ "learning_rate": 4.7946599708537485e-06,
1669
+ "loss": 0.0736,
1670
+ "step": 227
1671
+ },
1672
+ {
1673
+ "epoch": 2.091743119266055,
1674
+ "grad_norm": 0.04998117685317993,
1675
+ "learning_rate": 4.707558782686677e-06,
1676
+ "loss": 0.073,
1677
+ "step": 228
1678
+ },
1679
+ {
1680
+ "epoch": 2.1009174311926606,
1681
+ "grad_norm": 0.05194695666432381,
1682
+ "learning_rate": 4.621011805655093e-06,
1683
+ "loss": 0.0797,
1684
+ "step": 229
1685
+ },
1686
+ {
1687
+ "epoch": 2.1100917431192663,
1688
+ "grad_norm": 0.058086711913347244,
1689
+ "learning_rate": 4.535028102740785e-06,
1690
+ "loss": 0.0612,
1691
+ "step": 230
1692
+ },
1693
+ {
1694
+ "epoch": 2.1192660550458715,
1695
+ "grad_norm": 0.05054466798901558,
1696
+ "learning_rate": 4.449616677940904e-06,
1697
+ "loss": 0.0623,
1698
+ "step": 231
1699
+ },
1700
+ {
1701
+ "epoch": 2.128440366972477,
1702
+ "grad_norm": 0.05400394648313522,
1703
+ "learning_rate": 4.364786475325072e-06,
1704
+ "loss": 0.0778,
1705
+ "step": 232
1706
+ },
1707
+ {
1708
+ "epoch": 2.1376146788990824,
1709
+ "grad_norm": 0.049106135964393616,
1710
+ "learning_rate": 4.280546378098792e-06,
1711
+ "loss": 0.0673,
1712
+ "step": 233
1713
+ },
1714
+ {
1715
+ "epoch": 2.146788990825688,
1716
+ "grad_norm": 0.05438579246401787,
1717
+ "learning_rate": 4.196905207673201e-06,
1718
+ "loss": 0.071,
1719
+ "step": 234
1720
+ },
1721
+ {
1722
+ "epoch": 2.1559633027522938,
1723
+ "grad_norm": 0.05540682002902031,
1724
+ "learning_rate": 4.113871722741337e-06,
1725
+ "loss": 0.0691,
1726
+ "step": 235
1727
+ },
1728
+ {
1729
+ "epoch": 2.165137614678899,
1730
+ "grad_norm": 0.07121812552213669,
1731
+ "learning_rate": 4.031454618360945e-06,
1732
+ "loss": 0.0859,
1733
+ "step": 236
1734
+ },
1735
+ {
1736
+ "epoch": 2.1743119266055047,
1737
+ "grad_norm": 0.06990870833396912,
1738
+ "learning_rate": 3.949662525043935e-06,
1739
+ "loss": 0.0808,
1740
+ "step": 237
1741
+ },
1742
+ {
1743
+ "epoch": 2.18348623853211,
1744
+ "grad_norm": 0.04688805714249611,
1745
+ "learning_rate": 3.868504007852641e-06,
1746
+ "loss": 0.0552,
1747
+ "step": 238
1748
+ },
1749
+ {
1750
+ "epoch": 2.1926605504587156,
1751
+ "grad_norm": 0.05368739366531372,
1752
+ "learning_rate": 3.7879875655029018e-06,
1753
+ "loss": 0.0708,
1754
+ "step": 239
1755
+ },
1756
+ {
1757
+ "epoch": 2.2018348623853212,
1758
+ "grad_norm": 0.05669724941253662,
1759
+ "learning_rate": 3.7081216294740773e-06,
1760
+ "loss": 0.0714,
1761
+ "step": 240
1762
+ },
1763
+ {
1764
+ "epoch": 2.2110091743119265,
1765
+ "grad_norm": 0.07314234972000122,
1766
+ "learning_rate": 3.628914563126156e-06,
1767
+ "loss": 0.0842,
1768
+ "step": 241
1769
+ },
1770
+ {
1771
+ "epoch": 2.220183486238532,
1772
+ "grad_norm": 0.06005195155739784,
1773
+ "learning_rate": 3.5503746608239487e-06,
1774
+ "loss": 0.0637,
1775
+ "step": 242
1776
+ },
1777
+ {
1778
+ "epoch": 2.229357798165138,
1779
+ "grad_norm": 0.05186507850885391,
1780
+ "learning_rate": 3.472510147068515e-06,
1781
+ "loss": 0.0606,
1782
+ "step": 243
1783
+ },
1784
+ {
1785
+ "epoch": 2.238532110091743,
1786
+ "grad_norm": 0.058381304144859314,
1787
+ "learning_rate": 3.3953291756359354e-06,
1788
+ "loss": 0.0626,
1789
+ "step": 244
1790
+ },
1791
+ {
1792
+ "epoch": 2.2477064220183487,
1793
+ "grad_norm": 0.05521458014845848,
1794
+ "learning_rate": 3.3188398287234504e-06,
1795
+ "loss": 0.0628,
1796
+ "step": 245
1797
+ },
1798
+ {
1799
+ "epoch": 2.2568807339449544,
1800
+ "grad_norm": 0.05869848653674126,
1801
+ "learning_rate": 3.243050116103128e-06,
1802
+ "loss": 0.0448,
1803
+ "step": 246
1804
+ },
1805
+ {
1806
+ "epoch": 2.2660550458715596,
1807
+ "grad_norm": 0.07123305648565292,
1808
+ "learning_rate": 3.1679679742830806e-06,
1809
+ "loss": 0.0991,
1810
+ "step": 247
1811
+ },
1812
+ {
1813
+ "epoch": 2.2752293577981653,
1814
+ "grad_norm": 0.05694476515054703,
1815
+ "learning_rate": 3.0936012656763937e-06,
1816
+ "loss": 0.1169,
1817
+ "step": 248
1818
+ },
1819
+ {
1820
+ "epoch": 2.2844036697247705,
1821
+ "grad_norm": 0.04774909466505051,
1822
+ "learning_rate": 3.019957777777788e-06,
1823
+ "loss": 0.0655,
1824
+ "step": 249
1825
+ },
1826
+ {
1827
+ "epoch": 2.293577981651376,
1828
+ "grad_norm": 0.05831342935562134,
1829
+ "learning_rate": 2.9470452223481206e-06,
1830
+ "loss": 0.0618,
1831
+ "step": 250
1832
+ },
1833
+ {
1834
+ "epoch": 2.302752293577982,
1835
+ "grad_norm": 0.08326123654842377,
1836
+ "learning_rate": 2.8748712346068464e-06,
1837
+ "loss": 0.0923,
1838
+ "step": 251
1839
+ },
1840
+ {
1841
+ "epoch": 2.311926605504587,
1842
+ "grad_norm": 0.06080171838402748,
1843
+ "learning_rate": 2.8034433724324716e-06,
1844
+ "loss": 0.0759,
1845
+ "step": 252
1846
+ },
1847
+ {
1848
+ "epoch": 2.311926605504587,
1849
+ "eval_loss": 0.052994709461927414,
1850
+ "eval_runtime": 43.3989,
1851
+ "eval_samples_per_second": 3.986,
1852
+ "eval_steps_per_second": 2.005,
1853
+ "step": 252
1854
+ },
1855
+ {
1856
+ "epoch": 2.3211009174311927,
1857
+ "grad_norm": 0.06857974827289581,
1858
+ "learning_rate": 2.7327691155710978e-06,
1859
+ "loss": 0.0791,
1860
+ "step": 253
1861
+ },
1862
+ {
1863
+ "epoch": 2.330275229357798,
1864
+ "grad_norm": 0.08638834208250046,
1865
+ "learning_rate": 2.6628558648531845e-06,
1866
+ "loss": 0.0894,
1867
+ "step": 254
1868
+ },
1869
+ {
1870
+ "epoch": 2.3394495412844036,
1871
+ "grad_norm": 0.07400725036859512,
1872
+ "learning_rate": 2.593710941418537e-06,
1873
+ "loss": 0.0624,
1874
+ "step": 255
1875
+ },
1876
+ {
1877
+ "epoch": 2.3486238532110093,
1878
+ "grad_norm": 0.05066663771867752,
1879
+ "learning_rate": 2.525341585949662e-06,
1880
+ "loss": 0.0755,
1881
+ "step": 256
1882
+ },
1883
+ {
1884
+ "epoch": 2.3577981651376145,
1885
+ "grad_norm": 0.051476072520017624,
1886
+ "learning_rate": 2.4577549579135318e-06,
1887
+ "loss": 0.074,
1888
+ "step": 257
1889
+ },
1890
+ {
1891
+ "epoch": 2.36697247706422,
1892
+ "grad_norm": 0.05693186819553375,
1893
+ "learning_rate": 2.3909581348118803e-06,
1894
+ "loss": 0.049,
1895
+ "step": 258
1896
+ },
1897
+ {
1898
+ "epoch": 2.376146788990826,
1899
+ "grad_norm": 0.043929580599069595,
1900
+ "learning_rate": 2.324958111440051e-06,
1901
+ "loss": 0.0558,
1902
+ "step": 259
1903
+ },
1904
+ {
1905
+ "epoch": 2.385321100917431,
1906
+ "grad_norm": 0.06177612394094467,
1907
+ "learning_rate": 2.259761799154516e-06,
1908
+ "loss": 0.0634,
1909
+ "step": 260
1910
+ },
1911
+ {
1912
+ "epoch": 2.3944954128440368,
1913
+ "grad_norm": 0.08901379257440567,
1914
+ "learning_rate": 2.195376025149156e-06,
1915
+ "loss": 0.0552,
1916
+ "step": 261
1917
+ },
1918
+ {
1919
+ "epoch": 2.4036697247706424,
1920
+ "grad_norm": 0.059478871524333954,
1921
+ "learning_rate": 2.1318075317403152e-06,
1922
+ "loss": 0.0834,
1923
+ "step": 262
1924
+ },
1925
+ {
1926
+ "epoch": 2.4128440366972477,
1927
+ "grad_norm": 0.14992526173591614,
1928
+ "learning_rate": 2.069062975660765e-06,
1929
+ "loss": 0.0582,
1930
+ "step": 263
1931
+ },
1932
+ {
1933
+ "epoch": 2.4220183486238533,
1934
+ "grad_norm": 0.04817449301481247,
1935
+ "learning_rate": 2.0071489273626376e-06,
1936
+ "loss": 0.0547,
1937
+ "step": 264
1938
+ },
1939
+ {
1940
+ "epoch": 2.4311926605504586,
1941
+ "grad_norm": 0.08196448534727097,
1942
+ "learning_rate": 1.946071870329377e-06,
1943
+ "loss": 0.078,
1944
+ "step": 265
1945
+ },
1946
+ {
1947
+ "epoch": 2.4403669724770642,
1948
+ "grad_norm": 0.07558903098106384,
1949
+ "learning_rate": 1.885838200396808e-06,
1950
+ "loss": 0.0507,
1951
+ "step": 266
1952
+ },
1953
+ {
1954
+ "epoch": 2.44954128440367,
1955
+ "grad_norm": 0.061492372304201126,
1956
+ "learning_rate": 1.826454225083375e-06,
1957
+ "loss": 0.0526,
1958
+ "step": 267
1959
+ },
1960
+ {
1961
+ "epoch": 2.458715596330275,
1962
+ "grad_norm": 0.04717002436518669,
1963
+ "learning_rate": 1.7679261629296408e-06,
1964
+ "loss": 0.05,
1965
+ "step": 268
1966
+ },
1967
+ {
1968
+ "epoch": 2.467889908256881,
1969
+ "grad_norm": 0.050578705966472626,
1970
+ "learning_rate": 1.7102601428470988e-06,
1971
+ "loss": 0.0694,
1972
+ "step": 269
1973
+ },
1974
+ {
1975
+ "epoch": 2.477064220183486,
1976
+ "grad_norm": 0.06575262546539307,
1977
+ "learning_rate": 1.6534622034763558e-06,
1978
+ "loss": 0.0537,
1979
+ "step": 270
1980
+ },
1981
+ {
1982
+ "epoch": 2.4862385321100917,
1983
+ "grad_norm": 0.0549924410879612,
1984
+ "learning_rate": 1.5975382925547966e-06,
1985
+ "loss": 0.0802,
1986
+ "step": 271
1987
+ },
1988
+ {
1989
+ "epoch": 2.4954128440366974,
1990
+ "grad_norm": 0.06130588427186012,
1991
+ "learning_rate": 1.5424942662937436e-06,
1992
+ "loss": 0.0766,
1993
+ "step": 272
1994
+ },
1995
+ {
1996
+ "epoch": 2.5045871559633026,
1997
+ "grad_norm": 0.07862205803394318,
1998
+ "learning_rate": 1.4883358887652044e-06,
1999
+ "loss": 0.0612,
2000
+ "step": 273
2001
+ },
2002
+ {
2003
+ "epoch": 2.5137614678899083,
2004
+ "grad_norm": 0.04936962202191353,
2005
+ "learning_rate": 1.4350688312982864e-06,
2006
+ "loss": 0.0556,
2007
+ "step": 274
2008
+ },
2009
+ {
2010
+ "epoch": 2.522935779816514,
2011
+ "grad_norm": 0.06410589069128036,
2012
+ "learning_rate": 1.3826986718852952e-06,
2013
+ "loss": 0.0483,
2014
+ "step": 275
2015
+ },
2016
+ {
2017
+ "epoch": 2.532110091743119,
2018
+ "grad_norm": 0.09402082115411758,
2019
+ "learning_rate": 1.3312308945976348e-06,
2020
+ "loss": 0.1031,
2021
+ "step": 276
2022
+ },
2023
+ {
2024
+ "epoch": 2.541284403669725,
2025
+ "grad_norm": 0.052867498248815536,
2026
+ "learning_rate": 1.2806708890115138e-06,
2027
+ "loss": 0.065,
2028
+ "step": 277
2029
+ },
2030
+ {
2031
+ "epoch": 2.5504587155963305,
2032
+ "grad_norm": 0.08837206661701202,
2033
+ "learning_rate": 1.2310239496435749e-06,
2034
+ "loss": 0.095,
2035
+ "step": 278
2036
+ },
2037
+ {
2038
+ "epoch": 2.5596330275229358,
2039
+ "grad_norm": 0.08973362296819687,
2040
+ "learning_rate": 1.1822952753964667e-06,
2041
+ "loss": 0.0765,
2042
+ "step": 279
2043
+ },
2044
+ {
2045
+ "epoch": 2.5688073394495414,
2046
+ "grad_norm": 0.061795495450496674,
2047
+ "learning_rate": 1.134489969014414e-06,
2048
+ "loss": 0.0583,
2049
+ "step": 280
2050
+ },
2051
+ {
2052
+ "epoch": 2.5688073394495414,
2053
+ "eval_loss": 0.05294761061668396,
2054
+ "eval_runtime": 43.223,
2055
+ "eval_samples_per_second": 4.002,
2056
+ "eval_steps_per_second": 2.013,
2057
+ "step": 280
2058
+ },
2059
+ {
2060
+ "epoch": 2.5779816513761467,
2061
+ "grad_norm": 0.047972485423088074,
2062
+ "learning_rate": 1.087613036548888e-06,
2063
+ "loss": 0.048,
2064
+ "step": 281
2065
+ },
2066
+ {
2067
+ "epoch": 2.5871559633027523,
2068
+ "grad_norm": 0.06896362453699112,
2069
+ "learning_rate": 1.0416693868343796e-06,
2070
+ "loss": 0.0771,
2071
+ "step": 282
2072
+ },
2073
+ {
2074
+ "epoch": 2.5963302752293576,
2075
+ "grad_norm": 0.06132780387997627,
2076
+ "learning_rate": 9.966638309743481e-07,
2077
+ "loss": 0.0854,
2078
+ "step": 283
2079
+ },
2080
+ {
2081
+ "epoch": 2.6055045871559632,
2082
+ "grad_norm": 0.06309553980827332,
2083
+ "learning_rate": 9.52601081837431e-07,
2084
+ "loss": 0.0827,
2085
+ "step": 284
2086
+ },
2087
+ {
2088
+ "epoch": 2.614678899082569,
2089
+ "grad_norm": 0.08898341655731201,
2090
+ "learning_rate": 9.094857535639157e-07,
2091
+ "loss": 0.0727,
2092
+ "step": 285
2093
+ },
2094
+ {
2095
+ "epoch": 2.623853211009174,
2096
+ "grad_norm": 0.05615299567580223,
2097
+ "learning_rate": 8.673223610825532e-07,
2098
+ "loss": 0.0827,
2099
+ "step": 286
2100
+ },
2101
+ {
2102
+ "epoch": 2.63302752293578,
2103
+ "grad_norm": 0.06234830617904663,
2104
+ "learning_rate": 8.261153196377814e-07,
2105
+ "loss": 0.0772,
2106
+ "step": 287
2107
+ },
2108
+ {
2109
+ "epoch": 2.6422018348623855,
2110
+ "grad_norm": 0.057416193187236786,
2111
+ "learning_rate": 7.858689443273548e-07,
2112
+ "loss": 0.0726,
2113
+ "step": 288
2114
+ },
2115
+ {
2116
+ "epoch": 2.6513761467889907,
2117
+ "grad_norm": 0.056388452649116516,
2118
+ "learning_rate": 7.465874496504944e-07,
2119
+ "loss": 0.0881,
2120
+ "step": 289
2121
+ },
2122
+ {
2123
+ "epoch": 2.6605504587155964,
2124
+ "grad_norm": 0.05161774531006813,
2125
+ "learning_rate": 7.082749490665353e-07,
2126
+ "loss": 0.0447,
2127
+ "step": 290
2128
+ },
2129
+ {
2130
+ "epoch": 2.669724770642202,
2131
+ "grad_norm": 0.047958966344594955,
2132
+ "learning_rate": 6.709354545641989e-07,
2133
+ "loss": 0.0772,
2134
+ "step": 291
2135
+ },
2136
+ {
2137
+ "epoch": 2.6788990825688073,
2138
+ "grad_norm": 0.0640062615275383,
2139
+ "learning_rate": 6.345728762414504e-07,
2140
+ "loss": 0.0607,
2141
+ "step": 292
2142
+ },
2143
+ {
2144
+ "epoch": 2.688073394495413,
2145
+ "grad_norm": 0.05299694091081619,
2146
+ "learning_rate": 5.99191021896055e-07,
2147
+ "loss": 0.047,
2148
+ "step": 293
2149
+ },
2150
+ {
2151
+ "epoch": 2.6972477064220186,
2152
+ "grad_norm": 0.057945024222135544,
2153
+ "learning_rate": 5.647935966268225e-07,
2154
+ "loss": 0.0731,
2155
+ "step": 294
2156
+ },
2157
+ {
2158
+ "epoch": 2.706422018348624,
2159
+ "grad_norm": 0.05141222104430199,
2160
+ "learning_rate": 5.313842024456306e-07,
2161
+ "loss": 0.039,
2162
+ "step": 295
2163
+ },
2164
+ {
2165
+ "epoch": 2.7155963302752295,
2166
+ "grad_norm": 0.08650866150856018,
2167
+ "learning_rate": 4.98966337900224e-07,
2168
+ "loss": 0.0551,
2169
+ "step": 296
2170
+ },
2171
+ {
2172
+ "epoch": 2.7247706422018347,
2173
+ "grad_norm": 0.13347071409225464,
2174
+ "learning_rate": 4.6754339770785474e-07,
2175
+ "loss": 0.0619,
2176
+ "step": 297
2177
+ },
2178
+ {
2179
+ "epoch": 2.7339449541284404,
2180
+ "grad_norm": 0.07346609234809875,
2181
+ "learning_rate": 4.3711867239980335e-07,
2182
+ "loss": 0.0423,
2183
+ "step": 298
2184
+ },
2185
+ {
2186
+ "epoch": 2.7431192660550456,
2187
+ "grad_norm": 0.05696272850036621,
2188
+ "learning_rate": 4.076953479767964e-07,
2189
+ "loss": 0.0903,
2190
+ "step": 299
2191
+ },
2192
+ {
2193
+ "epoch": 2.7522935779816513,
2194
+ "grad_norm": 0.058090586215257645,
2195
+ "learning_rate": 3.792765055753755e-07,
2196
+ "loss": 0.0994,
2197
+ "step": 300
2198
+ },
2199
+ {
2200
+ "epoch": 2.761467889908257,
2201
+ "grad_norm": 0.06176576018333435,
2202
+ "learning_rate": 3.5186512114525283e-07,
2203
+ "loss": 0.1243,
2204
+ "step": 301
2205
+ },
2206
+ {
2207
+ "epoch": 2.770642201834862,
2208
+ "grad_norm": 0.07493139058351517,
2209
+ "learning_rate": 3.25464065137675e-07,
2210
+ "loss": 0.0584,
2211
+ "step": 302
2212
+ },
2213
+ {
2214
+ "epoch": 2.779816513761468,
2215
+ "grad_norm": 0.04847017675638199,
2216
+ "learning_rate": 3.0007610220483927e-07,
2217
+ "loss": 0.0652,
2218
+ "step": 303
2219
+ },
2220
+ {
2221
+ "epoch": 2.7889908256880735,
2222
+ "grad_norm": 0.058301348239183426,
2223
+ "learning_rate": 2.757038909103793e-07,
2224
+ "loss": 0.0708,
2225
+ "step": 304
2226
+ },
2227
+ {
2228
+ "epoch": 2.7981651376146788,
2229
+ "grad_norm": 0.046219125390052795,
2230
+ "learning_rate": 2.523499834509724e-07,
2231
+ "loss": 0.0495,
2232
+ "step": 305
2233
+ },
2234
+ {
2235
+ "epoch": 2.8073394495412844,
2236
+ "grad_norm": 0.051395233720541,
2237
+ "learning_rate": 2.3001682538908333e-07,
2238
+ "loss": 0.0954,
2239
+ "step": 306
2240
+ },
2241
+ {
2242
+ "epoch": 2.81651376146789,
2243
+ "grad_norm": 0.0680239349603653,
2244
+ "learning_rate": 2.0870675539686024e-07,
2245
+ "loss": 0.0717,
2246
+ "step": 307
2247
+ },
2248
+ {
2249
+ "epoch": 2.8256880733944953,
2250
+ "grad_norm": 0.058481365442276,
2251
+ "learning_rate": 1.884220050112462e-07,
2252
+ "loss": 0.1087,
2253
+ "step": 308
2254
+ },
2255
+ {
2256
+ "epoch": 2.8256880733944953,
2257
+ "eval_loss": 0.05293623358011246,
2258
+ "eval_runtime": 43.297,
2259
+ "eval_samples_per_second": 3.996,
2260
+ "eval_steps_per_second": 2.009,
2261
+ "step": 308
2262
+ },
2263
+ {
2264
+ "epoch": 2.834862385321101,
2265
+ "grad_norm": 0.06916595250368118,
2266
+ "learning_rate": 1.691646984002937e-07,
2267
+ "loss": 0.052,
2268
+ "step": 309
2269
+ },
2270
+ {
2271
+ "epoch": 2.8440366972477067,
2272
+ "grad_norm": 0.06048694625496864,
2273
+ "learning_rate": 1.5093685214072173e-07,
2274
+ "loss": 0.0508,
2275
+ "step": 310
2276
+ },
2277
+ {
2278
+ "epoch": 2.853211009174312,
2279
+ "grad_norm": 0.0559132881462574,
2280
+ "learning_rate": 1.3374037500675452e-07,
2281
+ "loss": 0.0992,
2282
+ "step": 311
2283
+ },
2284
+ {
2285
+ "epoch": 2.8623853211009176,
2286
+ "grad_norm": 0.05447855591773987,
2287
+ "learning_rate": 1.1757706777023592e-07,
2288
+ "loss": 0.101,
2289
+ "step": 312
2290
+ },
2291
+ {
2292
+ "epoch": 2.871559633027523,
2293
+ "grad_norm": 0.05133409798145294,
2294
+ "learning_rate": 1.024486230120525e-07,
2295
+ "loss": 0.0481,
2296
+ "step": 313
2297
+ },
2298
+ {
2299
+ "epoch": 2.8807339449541285,
2300
+ "grad_norm": 0.05679594352841377,
2301
+ "learning_rate": 8.835662494489638e-08,
2302
+ "loss": 0.0542,
2303
+ "step": 314
2304
+ },
2305
+ {
2306
+ "epoch": 2.8899082568807337,
2307
+ "grad_norm": 0.08849138766527176,
2308
+ "learning_rate": 7.530254924736691e-08,
2309
+ "loss": 0.0529,
2310
+ "step": 315
2311
+ },
2312
+ {
2313
+ "epoch": 2.8990825688073394,
2314
+ "grad_norm": 0.05840228870511055,
2315
+ "learning_rate": 6.32877629094475e-08,
2316
+ "loss": 0.0509,
2317
+ "step": 316
2318
+ },
2319
+ {
2320
+ "epoch": 2.908256880733945,
2321
+ "grad_norm": 0.05628720670938492,
2322
+ "learning_rate": 5.231352408934687e-08,
2323
+ "loss": 0.0684,
2324
+ "step": 317
2325
+ },
2326
+ {
2327
+ "epoch": 2.9174311926605503,
2328
+ "grad_norm": 0.09971769899129868,
2329
+ "learning_rate": 4.2380981981759994e-08,
2330
+ "loss": 0.0992,
2331
+ "step": 318
2332
+ },
2333
+ {
2334
+ "epoch": 2.926605504587156,
2335
+ "grad_norm": 0.05017199367284775,
2336
+ "learning_rate": 3.349117669751767e-08,
2337
+ "loss": 0.0506,
2338
+ "step": 319
2339
+ },
2340
+ {
2341
+ "epoch": 2.9357798165137616,
2342
+ "grad_norm": 0.041759029030799866,
2343
+ "learning_rate": 2.5645039154675867e-08,
2344
+ "loss": 0.0475,
2345
+ "step": 320
2346
+ },
2347
+ {
2348
+ "epoch": 2.944954128440367,
2349
+ "grad_norm": 0.04425744712352753,
2350
+ "learning_rate": 1.8843390981024835e-08,
2351
+ "loss": 0.0421,
2352
+ "step": 321
2353
+ },
2354
+ {
2355
+ "epoch": 2.9541284403669725,
2356
+ "grad_norm": 0.04827776178717613,
2357
+ "learning_rate": 1.3086944428060132e-08,
2358
+ "loss": 0.0628,
2359
+ "step": 322
2360
+ },
2361
+ {
2362
+ "epoch": 2.963302752293578,
2363
+ "grad_norm": 0.05134027451276779,
2364
+ "learning_rate": 8.376302296387862e-09,
2365
+ "loss": 0.068,
2366
+ "step": 323
2367
+ },
2368
+ {
2369
+ "epoch": 2.9724770642201834,
2370
+ "grad_norm": 0.04932725057005882,
2371
+ "learning_rate": 4.711957872606254e-09,
2372
+ "loss": 0.0787,
2373
+ "step": 324
2374
+ },
2375
+ {
2376
+ "epoch": 2.981651376146789,
2377
+ "grad_norm": 0.04113076254725456,
2378
+ "learning_rate": 2.0942948776481175e-09,
2379
+ "loss": 0.0384,
2380
+ "step": 325
2381
+ },
2382
+ {
2383
+ "epoch": 2.9908256880733948,
2384
+ "grad_norm": 0.06910436600446701,
2385
+ "learning_rate": 5.23587426601857e-10,
2386
+ "loss": 0.0805,
2387
+ "step": 326
2388
+ },
2389
+ {
2390
+ "epoch": 3.0,
2391
+ "grad_norm": 0.042928002774715424,
2392
+ "learning_rate": 0.0,
2393
+ "loss": 0.0588,
2394
+ "step": 327
2395
+ }
2396
+ ],
2397
+ "logging_steps": 1,
2398
+ "max_steps": 327,
2399
+ "num_input_tokens_seen": 0,
2400
+ "num_train_epochs": 3,
2401
+ "save_steps": 28,
2402
+ "stateful_callbacks": {
2403
+ "TrainerControl": {
2404
+ "args": {
2405
+ "should_epoch_stop": false,
2406
+ "should_evaluate": false,
2407
+ "should_log": false,
2408
+ "should_save": true,
2409
+ "should_training_stop": true
2410
+ },
2411
+ "attributes": {}
2412
+ }
2413
+ },
2414
+ "total_flos": 2.0294089094536888e+18,
2415
+ "train_batch_size": 2,
2416
+ "trial_name": null,
2417
+ "trial_params": null
2418
+ }
checkpoint-327/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8691334e7ff9d485bc39601a29a4096723c23e5fb7323cdb19a40a1c9c993c02
3
+ size 6520
checkpoint-327/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "bigcode/starcoder2-15b",
4
+ "architectures": [
5
+ "Starcoder2ForCausalLM"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "embedding_dropout": 0.1,
10
+ "eos_token_id": 0,
11
+ "hidden_act": "gelu_pytorch_tanh",
12
+ "hidden_size": 6144,
13
+ "initializer_range": 0.01275,
14
+ "intermediate_size": 24576,
15
+ "max_position_embeddings": 16384,
16
+ "mlp_type": "default",
17
+ "model_type": "starcoder2",
18
+ "norm_epsilon": 1e-05,
19
+ "norm_type": "layer_norm",
20
+ "num_attention_heads": 48,
21
+ "num_hidden_layers": 40,
22
+ "num_key_value_heads": 4,
23
+ "quantization_config": {
24
+ "_load_in_4bit": true,
25
+ "_load_in_8bit": false,
26
+ "bnb_4bit_compute_dtype": "bfloat16",
27
+ "bnb_4bit_quant_storage": "bfloat16",
28
+ "bnb_4bit_quant_type": "nf4",
29
+ "bnb_4bit_use_double_quant": true,
30
+ "llm_int8_enable_fp32_cpu_offload": false,
31
+ "llm_int8_has_fp16_weight": false,
32
+ "llm_int8_skip_modules": null,
33
+ "llm_int8_threshold": 6.0,
34
+ "load_in_4bit": true,
35
+ "load_in_8bit": false,
36
+ "quant_method": "bitsandbytes"
37
+ },
38
+ "residual_dropout": 0.1,
39
+ "rope_scaling": null,
40
+ "rope_theta": 100000,
41
+ "sliding_window": 4096,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "float32",
44
+ "transformers_version": "4.49.0",
45
+ "use_bias": true,
46
+ "use_cache": false,
47
+ "vocab_size": 49152
48
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<repo_name>",
9
+ "<file_sep>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<jupyter_script>",
18
+ "<empty_output>",
19
+ "<code_to_intermediate>",
20
+ "<intermediate_to_code>",
21
+ "<pr>",
22
+ "<pr_status>",
23
+ "<pr_is_merged>",
24
+ "<pr_base>",
25
+ "<pr_file>",
26
+ "<pr_base_code>",
27
+ "<pr_diff>",
28
+ "<pr_diff_hunk>",
29
+ "<pr_comment>",
30
+ "<pr_event_id>",
31
+ "<pr_review>",
32
+ "<pr_review_state>",
33
+ "<pr_review_comment>",
34
+ "<pr_in_reply_to_review_id>",
35
+ "<pr_in_reply_to_comment_id>",
36
+ "<pr_diff_hunk_comment_line>",
37
+ "<NAME>",
38
+ "<EMAIL>",
39
+ "<KEY>",
40
+ "<PASSWORD>"
41
+ ],
42
+ "bos_token": {
43
+ "content": "<|endoftext|>",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false
48
+ },
49
+ "eos_token": {
50
+ "content": "<|endoftext|>",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ },
56
+ "pad_token": {
57
+ "content": "<|endoftext|>",
58
+ "lstrip": false,
59
+ "normalized": false,
60
+ "rstrip": false,
61
+ "single_word": false
62
+ },
63
+ "unk_token": {
64
+ "content": "<|endoftext|>",
65
+ "lstrip": false,
66
+ "normalized": false,
67
+ "rstrip": false,
68
+ "single_word": false
69
+ }
70
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<repo_name>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<file_sep>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<issue_start>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_comment>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_closed>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<jupyter_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_text>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_code>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_output>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_script>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<code_to_intermediate>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "17": {
141
+ "content": "<intermediate_to_code>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "18": {
149
+ "content": "<pr>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "19": {
157
+ "content": "<pr_status>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "20": {
165
+ "content": "<pr_is_merged>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "21": {
173
+ "content": "<pr_base>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "22": {
181
+ "content": "<pr_file>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "23": {
189
+ "content": "<pr_base_code>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "24": {
197
+ "content": "<pr_diff>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "25": {
205
+ "content": "<pr_diff_hunk>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "26": {
213
+ "content": "<pr_comment>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "27": {
221
+ "content": "<pr_event_id>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "28": {
229
+ "content": "<pr_review>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "29": {
237
+ "content": "<pr_review_state>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "30": {
245
+ "content": "<pr_review_comment>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "31": {
253
+ "content": "<pr_in_reply_to_review_id>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32": {
261
+ "content": "<pr_in_reply_to_comment_id>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "33": {
269
+ "content": "<pr_diff_hunk_comment_line>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "34": {
277
+ "content": "<NAME>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "35": {
285
+ "content": "<EMAIL>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "36": {
293
+ "content": "<KEY>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "37": {
301
+ "content": "<PASSWORD>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ }
308
+ },
309
+ "additional_special_tokens": [
310
+ "<|endoftext|>",
311
+ "<fim_prefix>",
312
+ "<fim_middle>",
313
+ "<fim_suffix>",
314
+ "<fim_pad>",
315
+ "<repo_name>",
316
+ "<file_sep>",
317
+ "<issue_start>",
318
+ "<issue_comment>",
319
+ "<issue_closed>",
320
+ "<jupyter_start>",
321
+ "<jupyter_text>",
322
+ "<jupyter_code>",
323
+ "<jupyter_output>",
324
+ "<jupyter_script>",
325
+ "<empty_output>",
326
+ "<code_to_intermediate>",
327
+ "<intermediate_to_code>",
328
+ "<pr>",
329
+ "<pr_status>",
330
+ "<pr_is_merged>",
331
+ "<pr_base>",
332
+ "<pr_file>",
333
+ "<pr_base_code>",
334
+ "<pr_diff>",
335
+ "<pr_diff_hunk>",
336
+ "<pr_comment>",
337
+ "<pr_event_id>",
338
+ "<pr_review>",
339
+ "<pr_review_state>",
340
+ "<pr_review_comment>",
341
+ "<pr_in_reply_to_review_id>",
342
+ "<pr_in_reply_to_comment_id>",
343
+ "<pr_diff_hunk_comment_line>",
344
+ "<NAME>",
345
+ "<EMAIL>",
346
+ "<KEY>",
347
+ "<PASSWORD>"
348
+ ],
349
+ "bos_token": "<|endoftext|>",
350
+ "clean_up_tokenization_spaces": true,
351
+ "eos_token": "<|endoftext|>",
352
+ "extra_special_tokens": {},
353
+ "model_max_length": 1000000000000000019884624838656,
354
+ "pad_token": "<|endoftext|>",
355
+ "tokenizer_class": "GPT2Tokenizer",
356
+ "unk_token": "<|endoftext|>",
357
+ "vocab_size": 49152
358
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff