KuangDW committed on
Commit
57a7224
·
1 Parent(s): ba00803

gemma version

Browse files
Files changed (2) hide show
  1. app.py +26 -13
  2. vecalign/plan2align.py +154 -49
app.py CHANGED
@@ -61,7 +61,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
61
  print(f"Using device: {device}")
62
  # Load models once
63
  print("Loading models...")
64
- model_id = "meta-llama/Llama-3.3-70B-Instruct"
65
  tokenizer = AutoTokenizer.from_pretrained(model_id)
66
  model = AutoModelForCausalLM.from_pretrained(
67
  model_id,
@@ -69,20 +69,33 @@ model = AutoModelForCausalLM.from_pretrained(
69
  torch_dtype=torch.float16
70
  )
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
def generate_translation(system_prompt, prompt):
    """Generate a chat-style completion for *prompt* under *system_prompt*.

    Builds a two-message chat (system + user), tokenizes it with the model's
    chat template, samples a completion, and returns only the newly generated
    text (prompt tokens stripped).

    Relies on the module-level ``tokenizer``, ``model``, and ``device``.
    """
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(device)
    generated = model.generate(
        input_ids,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    # Slice off the prompt so only the model's continuation is decoded.
    prompt_len = input_ids.shape[1]
    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
87
 
88
  def check_token_length(text, max_tokens=1024):
 
61
  print(f"Using device: {device}")
62
  # Load models once
63
  print("Loading models...")
64
+ model_id = "google/gemma-2-9b-it" # "meta-llama/Meta-Llama-3.1-8B-Instruct"
65
  tokenizer = AutoTokenizer.from_pretrained(model_id)
66
  model = AutoModelForCausalLM.from_pretrained(
67
  model_id,
 
69
  torch_dtype=torch.float16
70
  )
71
 
72
+ # def generate_translation(system_prompt, prompt):
73
+ # messages=[
74
+ # {"role": "system", "content": system_prompt},
75
+ # {"role": "user", "content": prompt}
76
+ # ]
77
+ # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
78
+ # outputs = model.generate(
79
+ # inputs,
80
+ # max_new_tokens=512,
81
+ # temperature=0.7,
82
+ # top_p=0.9,
83
+ # do_sample=True
84
+ # )
85
+ # translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
86
+ # return translation
87
+
88
def generate_translation(system_prompt, prompt):
    """Generate a completion using a plain-text prompt format.

    Formats the system and user messages into a single
    ``System:/User:/Assistant:`` string (used because gemma-2-it has no
    system role in its chat template — TODO confirm), samples a completion,
    and returns only the newly generated text.

    Relies on the module-level ``tokenizer``, ``model``, and ``device``.
    """
    full_prompt = f"System: {system_prompt}\nUser: {prompt}\nAssistant:"
    encoded = tokenizer(full_prompt, return_tensors="pt").to(device)
    out_ids = model.generate(
        **encoded,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    # Decode only the continuation: skip the prompt-length prefix.
    new_tokens = out_ids[0][encoded["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
100
 
101
  def check_token_length(text, max_tokens=1024):
vecalign/plan2align.py CHANGED
@@ -183,6 +183,76 @@ def external_find_best_translation(evals, language, session_id):
183
 
184
  ################################# generating translation #################################
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer, good_sent_size, src_language, tgt_language):
187
  system_prompts = [
188
  "You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
@@ -190,17 +260,6 @@ def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer,
190
  "You are a creative and expressive translator. Render the text in a vivid way, as if narrating a captivating story."
191
  ]
192
 
193
- context_prompt = f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
194
- context_prompt += f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
195
- context_prompt += f"These provided translations are rough references – they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
196
- context_prompt += f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
197
- context_prompt += f"1. **Refinement:** For sections with existing {tgt_language} translations (in parentheses), refine and polish them so that they are fluent, accurate, and coherent, fully capturing the meaning of the corresponding {src_language} text.\n"
198
- context_prompt += f"2. **Completion:** For sections that remain untranslated, translate the {src_language} text accurately and naturally in the specified style.\n"
199
- context_prompt += f"3. **Translation Order and Structure Preservation:** Maintain the original order and structure of the text. Every {src_language} sentence must appear in the same sequence as in the source text, with its corresponding {tgt_language} translation (if available) inserted immediately after it. Do not rearrange or reorder any part of the text.\n"
200
- context_prompt += f"4. **Consistency:** Ensure a uniform tone and style across the entire translation, adhering to the translator role specified.\n"
201
- context_prompt += f"5. **Final Output:** Provide the final output as a single, well-structured {tgt_language} text. Do not include any extraneous commentary, explanations, annotations, or headers – output only the translation in the correct order.\n\n"
202
- context_prompt += f"Note: This translation is an intermediate version that may later be merged with other translations. Focus on clarity, coherence, and fidelity to the source text.\n"
203
-
204
  # Process the buffer to extract relevant English translations
205
  processed_source = source_sentence
206
  if len(buffer) > 0:
@@ -214,35 +273,49 @@ def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer,
214
  key_sentence,
215
  f"{key_sentence}\n({translated_sentence})\n"
216
  )
217
-
218
- context_prompt += f"\nHere is the input data for translation:\n{processed_source}\n\n"
219
- context_prompt += "Apply the above guidelines to produce an improved, coherent translation that strictly follows the original order of the text.\n"
220
-
221
- if len(buffer) == 0:
222
- context_prompt = f"### Translate this from {src_language} to {tgt_language} and **only** output the result."
223
- context_prompt += f"\n### {src_language}:\n {source_sentence}"
224
- context_prompt += f"\n### {tgt_language}:\n"
225
-
226
- print("--------------------------------------------------------------------------------")
227
- print("\n context_prompt \n")
228
- print(context_prompt)
229
- print("--------------------------------------------------------------------------------")
230
 
231
  translations = []
232
- for prompt in system_prompts:
233
- messages=[
234
- {"role": "system", "content": prompt},
235
- {"role": "user", "content": context_prompt}
236
- ]
237
- inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  outputs = model.generate(
239
- inputs,
240
  max_new_tokens=512,
241
  temperature=0.7,
242
  top_p=0.9,
243
  do_sample=True
244
  )
245
- translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
246
 
247
  print("--------------------------------------------------------------------------------")
248
  print("\n rollout translation: \n")
@@ -272,11 +345,50 @@ def process_buffer_sentences(source_sentences, buffer):
272
  translations.append(translation_map[src_sent][0])
273
  return translations
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, source_segments, buffer, src_language, tgt_language):
276
  translations = process_buffer_sentences(source_segments, buffer)
277
  initial_translation = "\n".join(translations)
278
 
279
  rewrite_prompt = (
 
280
  f"Below is an initial translation of a {src_language} text into {tgt_language}. "
281
  f"This translation may include omissions, inaccuracies, or awkward phrasing. "
282
  f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
@@ -289,26 +401,19 @@ def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, so
289
  f"5. Output only the final refined translation without any additional commentary.\n\n"
290
  f"### Original {src_language} Text:\n{source_sentence}\n\n"
291
  f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
292
- f"### Refined Translation:"
293
  )
294
 
295
- print("rewrite prompt:")
296
- print(rewrite_prompt)
297
- messages=[
298
- {"role": "system", "content": "You are a helpful translator and only output the result."},
299
- {"role": "user", "content": rewrite_prompt}
300
- ]
301
- inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
302
  outputs = model.generate(
303
- inputs,
304
- max_new_tokens=512,
305
- temperature=0.7,
306
- top_p=0.9,
307
- do_sample=True
308
- )
309
- translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
310
- return translation
311
-
312
 
313
  ################################# alignment functions #################################
314
  def save_sentences_to_txt(sentences, filename):
 
183
 
184
  ################################# generating translation #################################
185
 
186
+ # def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer, good_sent_size, src_language, tgt_language):
187
+ # system_prompts = [
188
+ # "You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
189
+ # "You are a professional translator. Deliver a clear, formal, and precise translation that faithfully conveys the original meaning.",
190
+ # "You are a creative and expressive translator. Render the text in a vivid way, as if narrating a captivating story."
191
+ # ]
192
+
193
+ # context_prompt = f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
194
+ # context_prompt += f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
195
+ # context_prompt += f"These provided translations are rough references – they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
196
+ # context_prompt += f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
197
+ # context_prompt += f"1. **Refinement:** For sections with existing {tgt_language} translations (in parentheses), refine and polish them so that they are fluent, accurate, and coherent, fully capturing the meaning of the corresponding {src_language} text.\n"
198
+ # context_prompt += f"2. **Completion:** For sections that remain untranslated, translate the {src_language} text accurately and naturally in the specified style.\n"
199
+ # context_prompt += f"3. **Translation Order and Structure Preservation:** Maintain the original order and structure of the text. Every {src_language} sentence must appear in the same sequence as in the source text, with its corresponding {tgt_language} translation (if available) inserted immediately after it. Do not rearrange or reorder any part of the text.\n"
200
+ # context_prompt += f"4. **Consistency:** Ensure a uniform tone and style across the entire translation, adhering to the translator role specified.\n"
201
+ # context_prompt += f"5. **Final Output:** Provide the final output as a single, well-structured {tgt_language} text. Do not include any extraneous commentary, explanations, annotations, or headers – output only the translation in the correct order.\n\n"
202
+ # context_prompt += f"Note: This translation is an intermediate version that may later be merged with other translations. Focus on clarity, coherence, and fidelity to the source text.\n"
203
+
204
+ # # Process the buffer to extract relevant English translations
205
+ # processed_source = source_sentence
206
+ # if len(buffer) > 0:
207
+ # selected_keys = random.sample(buffer.keys(), min(len(buffer), good_sent_size))
208
+ # for key_sentence in selected_keys:
209
+ # key_sentence = key_sentence.strip()
210
+ # if key_sentence and (key_sentence in source_sentence) :
211
+ # translated_sentence = buffer[key_sentence][0][0]
212
+ # if f"\n({translated_sentence})\n" not in processed_source:
213
+ # processed_source = processed_source.replace(
214
+ # key_sentence,
215
+ # f"{key_sentence}\n({translated_sentence})\n"
216
+ # )
217
+
218
+ # context_prompt += f"\nHere is the input data for translation:\n{processed_source}\n\n"
219
+ # context_prompt += "Apply the above guidelines to produce an improved, coherent translation that strictly follows the original order of the text.\n"
220
+
221
+ # if len(buffer) == 0:
222
+ # context_prompt = f"### Translate this from {src_language} to {tgt_language} and **only** output the result."
223
+ # context_prompt += f"\n### {src_language}:\n {source_sentence}"
224
+ # context_prompt += f"\n### {tgt_language}:\n"
225
+
226
+ # print("--------------------------------------------------------------------------------")
227
+ # print("\n context_prompt \n")
228
+ # print(context_prompt)
229
+ # print("--------------------------------------------------------------------------------")
230
+
231
+ # translations = []
232
+ # for prompt in system_prompts:
233
+ # messages=[
234
+ # {"role": "system", "content": prompt},
235
+ # {"role": "user", "content": context_prompt}
236
+ # ]
237
+ # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
238
+ # outputs = model.generate(
239
+ # inputs,
240
+ # max_new_tokens=512,
241
+ # temperature=0.7,
242
+ # top_p=0.9,
243
+ # do_sample=True
244
+ # )
245
+ # translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
246
+
247
+ # print("--------------------------------------------------------------------------------")
248
+ # print("\n rollout translation: \n")
249
+ # print(translation)
250
+ # print("--------------------------------------------------------------------------------")
251
+
252
+ # translations.append(translation)
253
+
254
+ # return translations
255
+
256
  def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer, good_sent_size, src_language, tgt_language):
257
  system_prompts = [
258
  "You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
 
260
  "You are a creative and expressive translator. Render the text in a vivid way, as if narrating a captivating story."
261
  ]
262
 
 
 
 
 
 
 
 
 
 
 
 
263
  # Process the buffer to extract relevant English translations
264
  processed_source = source_sentence
265
  if len(buffer) > 0:
 
273
  key_sentence,
274
  f"{key_sentence}\n({translated_sentence})\n"
275
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
  translations = []
278
+ for system_prompt in system_prompts:
279
+ if len(buffer) == 0:
280
+ full_prompt = (
281
+ f"System: {system_prompt}\n\n"
282
+ f"### Translate this from {src_language} to {tgt_language}.\n"
283
+ f"{src_language}:\n{source_sentence}\n\n"
284
+ f"{tgt_language}:\n"
285
+ )
286
+ else:
287
+ context_prompt = (
288
+ f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
289
+ f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
290
+ f"These provided translations are rough references - they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
291
+ f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
292
+ f"1. Refinement: For sections with existing {tgt_language} translations (in parentheses), refine and polish them.\n"
293
+ f"2. Completion: For untranslated sections, translate the {src_language} text naturally.\n"
294
+ f"3. Translation Order: Maintain the original sequence - every source sentence must appear in order with its translation right after it.\n"
295
+ f"4. Consistency: Ensure a uniform tone and style.\n"
296
+ f"5. Output only the final {tgt_language} translation. No extra commentary.\n\n"
297
+ f"Note: This is an intermediate version that may later be merged. Focus on clarity and fidelity.\n\n"
298
+ f"Input Text:\n{processed_source}\n\n"
299
+ f"Assistant:"
300
+ )
301
+
302
+ full_prompt = f"System: {system_prompt}\n\n{context_prompt}"
303
+
304
+ print("--------------------------------------------------------------------------------")
305
+ print("\n full_prompt \n")
306
+ print(full_prompt)
307
+ print("--------------------------------------------------------------------------------")
308
+
309
+ # Tokenize and generate
310
+ inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
311
  outputs = model.generate(
312
+ **inputs,
313
  max_new_tokens=512,
314
  temperature=0.7,
315
  top_p=0.9,
316
  do_sample=True
317
  )
318
+ translation = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
319
 
320
  print("--------------------------------------------------------------------------------")
321
  print("\n rollout translation: \n")
 
345
  translations.append(translation_map[src_sent][0])
346
  return translations
347
 
348
+ # def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, source_segments, buffer, src_language, tgt_language):
349
+ # translations = process_buffer_sentences(source_segments, buffer)
350
+ # initial_translation = "\n".join(translations)
351
+
352
+ # rewrite_prompt = (
353
+ # f"Below is an initial translation of a {src_language} text into {tgt_language}. "
354
+ # f"This translation may include omissions, inaccuracies, or awkward phrasing. "
355
+ # f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
356
+ # f"while faithfully preserving the full meaning of the original {src_language} text.\n\n"
357
+ # f"### Instructions:\n"
358
+ # f"1. Ensure that every detail in the original {src_language} text is accurately represented.\n"
359
+ # f"2. Correct any grammatical errors, unnatural expressions, or inconsistencies.\n"
360
+ # f"3. Improve the natural flow so that the translation reads as if written by a native speaker.\n"
361
+ # f"4. Do not add, omit, or change any essential details from the source text.\n"
362
+ # f"5. Output only the final refined translation without any additional commentary.\n\n"
363
+ # f"### Original {src_language} Text:\n{source_sentence}\n\n"
364
+ # f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
365
+ # f"### Refined Translation:"
366
+ # )
367
+
368
+ # print("rewrite prompt:")
369
+ # print(rewrite_prompt)
370
+ # messages=[
371
+ # {"role": "system", "content": "You are a helpful translator and only output the result."},
372
+ # {"role": "user", "content": rewrite_prompt}
373
+ # ]
374
+ # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
375
+ # outputs = model.generate(
376
+ # inputs,
377
+ # max_new_tokens=512,
378
+ # temperature=0.7,
379
+ # top_p=0.9,
380
+ # do_sample=True
381
+ # )
382
+ # translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
383
+ # return translation
384
+
385
+
386
  def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, source_segments, buffer, src_language, tgt_language):
387
  translations = process_buffer_sentences(source_segments, buffer)
388
  initial_translation = "\n".join(translations)
389
 
390
  rewrite_prompt = (
391
+ f"System: You are a helpful translator and only output the result.\n\n"
392
  f"Below is an initial translation of a {src_language} text into {tgt_language}. "
393
  f"This translation may include omissions, inaccuracies, or awkward phrasing. "
394
  f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
 
401
  f"5. Output only the final refined translation without any additional commentary.\n\n"
402
  f"### Original {src_language} Text:\n{source_sentence}\n\n"
403
  f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
404
+ f"Assistant:"
405
  )
406
 
407
+ inputs = tokenizer(rewrite_prompt, return_tensors="pt").to(device)
 
 
 
 
 
 
408
  outputs = model.generate(
409
+ **inputs,
410
+ max_new_tokens=512,
411
+ temperature=0.7,
412
+ top_p=0.9,
413
+ do_sample=True
414
+ )
415
+ refined_translation = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
416
+ return refined_translation
 
417
 
418
  ################################# alignment functions #################################
419
  def save_sentences_to_txt(sentences, filename):