sync configuration_deepseek.py with upstream
configuration_deepseek.py  CHANGED  (+0 -11)
@@ -82,11 +82,6 @@ class DeepseekV3Config(PretrainedConfig):
             Beginning of stream token id.
         eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
-        pretraining_tp (`int`, *optional*, defaults to 1):
-            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
-            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
-            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
-            issue](https://github.com/pytorch/pytorch/issues/76232).
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 10000.0):
@@ -141,8 +136,6 @@ class DeepseekV3Config(PretrainedConfig):
         first_k_dense_replace = 3,
         norm_topk_prob = True,
         scoring_func = 'sigmoid',
-        aux_loss_alpha = 0.001,
-        seq_aux = True,
         hidden_act="silu",
         max_position_embeddings=4096,
         initializer_range=0.02,
@@ -151,7 +144,6 @@ class DeepseekV3Config(PretrainedConfig):
         pad_token_id=None,
         bos_token_id=0,
         eos_token_id=1,
-        pretraining_tp=1,
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
@@ -184,8 +176,6 @@ class DeepseekV3Config(PretrainedConfig):
         self.first_k_dense_replace = first_k_dense_replace
         self.norm_topk_prob = norm_topk_prob
         self.scoring_func = scoring_func
-        self.aux_loss_alpha = aux_loss_alpha
-        self.seq_aux = seq_aux
         # for backward compatibility
         if num_key_value_heads is None:
             num_key_value_heads = num_attention_heads
@@ -194,7 +184,6 @@ class DeepseekV3Config(PretrainedConfig):
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
-        self.pretraining_tp = pretraining_tp
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
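
Net effect for downstream users: `DeepseekV3Config` no longer declares `pretraining_tp`, `aux_loss_alpha`, or `seq_aux`, matching the upstream `transformers` implementation, and the `pretraining_tp` docstring entry goes away with the attribute so the documented arguments match what `__init__` actually accepts. Below is a minimal sketch of how a loaded config can be sanity-checked against this change; the repo id is an assumption for illustration, not part of the diff.

# Sketch only: the repo id below is illustrative and not part of this commit.
from transformers import AutoConfig

# trust_remote_code=True lets AutoConfig pick up configuration_deepseek.py from the repo.
config = AutoConfig.from_pretrained(
    "deepseek-ai/DeepSeek-V3",  # assumed checkpoint id; substitute the one you use
    trust_remote_code=True,
)

# The class itself no longer defines these three attributes after the sync;
# whether they still show up here depends on what the checkpoint's config.json carries,
# since PretrainedConfig stores unrecognized keys as extra attributes.
for name in ("pretraining_tp", "aux_loss_alpha", "seq_aux"):
    print(name, hasattr(config, name))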