Commit
·
be0c125
1
Parent(s):
2598166
Add config
Browse files
- config.json +4 -4
- operative_config.gin +6 -8
config.json
CHANGED
@@ -3,9 +3,9 @@
|
|
3 |
"architectures": [
|
4 |
"T5ForConditionalGeneration"
|
5 |
],
|
6 |
-
"d_ff":
|
7 |
-
"d_kv":
|
8 |
-
"d_model":
|
9 |
"decoder_start_token_id": 0,
|
10 |
"dropout_rate": 0.1,
|
11 |
"eos_token_id": 1,
|
@@ -16,7 +16,7 @@
|
|
16 |
"model_type": "t5",
|
17 |
"n_positions": 512,
|
18 |
"num_decoder_layers": 12,
|
19 |
-
"num_heads":
|
20 |
"num_layers": 12,
|
21 |
"pad_token_id": 0,
|
22 |
"relative_attention_num_buckets": 32,
|
|
|
3 |
"architectures": [
|
4 |
"T5ForConditionalGeneration"
|
5 |
],
|
6 |
+
"d_ff": 3072,
|
7 |
+
"d_kv": 64,
|
8 |
+
"d_model": 768,
|
9 |
"decoder_start_token_id": 0,
|
10 |
"dropout_rate": 0.1,
|
11 |
"eos_token_id": 1,
|
|
|
16 |
"model_type": "t5",
|
17 |
"n_positions": 512,
|
18 |
"num_decoder_layers": 12,
|
19 |
+
"num_heads": 12,
|
20 |
"num_layers": 12,
|
21 |
"pad_token_id": 0,
|
22 |
"relative_attention_num_buckets": 32,
|
operative_config.gin
CHANGED
@@ -9,15 +9,15 @@ import t5.models.mesh_transformer
|
|
9 |
|
10 |
# Macros:
|
11 |
# ==============================================================================
|
12 |
-
d_ff =
|
13 |
-
d_kv =
|
14 |
-
d_model =
|
15 |
dropout_rate = 0.0
|
16 |
inputs_length = 512
|
17 |
mean_noise_span_length = 3.0
|
18 |
MIXTURE_NAME = 'c4_v220_unsupervised'
|
19 |
noise_density = 0.15
|
20 |
-
num_heads =
|
21 |
num_layers = 12
|
22 |
|
23 |
# Parameters for adafactor_decay_rate_pow:
|
@@ -146,7 +146,6 @@ encoder/make_layer_stack.num_layers = %num_layers
|
|
146 |
mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
|
147 |
mesh_train_dataset_fn.pack = True
|
148 |
mesh_train_dataset_fn.seed = None
|
149 |
-
mesh_train_dataset_fn.shuffle = True
|
150 |
mesh_train_dataset_fn.use_cached = 1
|
151 |
|
152 |
# Parameters for noise_span_to_unique_sentinel:
|
@@ -195,7 +194,6 @@ rewrite_stack_variables.max_combined_variable_size = 536870912
|
|
195 |
# ==============================================================================
|
196 |
run.autostack = True
|
197 |
run.batch_size = ('tokens_per_batch', 65536)
|
198 |
-
run.checkpoint_input_pipeline = False
|
199 |
run.dataset_split = 'train'
|
200 |
run.ensemble_inputs = None
|
201 |
run.eval_checkpoint_step = None
|
@@ -217,7 +215,7 @@ run.optimizer = @optimize.AdafactorOptimizer
|
|
217 |
run.output_eval_examples = True
|
218 |
run.perplexity_eval_steps = 100
|
219 |
run.predict_fn = None
|
220 |
-
run.save_checkpoints_steps =
|
221 |
run.seen_data_init_step = 0
|
222 |
run.sequence_length = {'inputs': 512, 'targets': 128}
|
223 |
run.skip_seen_data = False
|
@@ -312,7 +310,7 @@ tpu_estimator_model_fn.tpu_summaries = False
|
|
312 |
# Parameters for tpu_mesh_shape:
|
313 |
# ==============================================================================
|
314 |
tpu_mesh_shape.ensemble_parallelism = None
|
315 |
-
tpu_mesh_shape.model_parallelism =
|
316 |
tpu_mesh_shape.tpu_topology = '4x4'
|
317 |
|
318 |
# Parameters for unit_scaling_convention:
|
|
|
9 |
|
10 |
# Macros:
|
11 |
# ==============================================================================
|
12 |
+
d_ff = 3072
|
13 |
+
d_kv = 64
|
14 |
+
d_model = 768
|
15 |
dropout_rate = 0.0
|
16 |
inputs_length = 512
|
17 |
mean_noise_span_length = 3.0
|
18 |
MIXTURE_NAME = 'c4_v220_unsupervised'
|
19 |
noise_density = 0.15
|
20 |
+
num_heads = 12
|
21 |
num_layers = 12
|
22 |
|
23 |
# Parameters for adafactor_decay_rate_pow:
|
|
|
146 |
mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
|
147 |
mesh_train_dataset_fn.pack = True
|
148 |
mesh_train_dataset_fn.seed = None
|
|
|
149 |
mesh_train_dataset_fn.use_cached = 1
|
150 |
|
151 |
# Parameters for noise_span_to_unique_sentinel:
|
|
|
194 |
# ==============================================================================
|
195 |
run.autostack = True
|
196 |
run.batch_size = ('tokens_per_batch', 65536)
|
|
|
197 |
run.dataset_split = 'train'
|
198 |
run.ensemble_inputs = None
|
199 |
run.eval_checkpoint_step = None
|
|
|
215 |
run.output_eval_examples = True
|
216 |
run.perplexity_eval_steps = 100
|
217 |
run.predict_fn = None
|
218 |
+
run.save_checkpoints_steps = 10000
|
219 |
run.seen_data_init_step = 0
|
220 |
run.sequence_length = {'inputs': 512, 'targets': 128}
|
221 |
run.skip_seen_data = False
|
|
|
310 |
# Parameters for tpu_mesh_shape:
|
311 |
# ==============================================================================
|
312 |
tpu_mesh_shape.ensemble_parallelism = None
|
313 |
+
tpu_mesh_shape.model_parallelism = 1
|
314 |
tpu_mesh_shape.tpu_topology = '4x4'
|
315 |
|
316 |
# Parameters for unit_scaling_convention:
|