Spaces:
Sleeping
Sleeping
gemma version
Browse files
- app.py +26 -13
- vecalign/plan2align.py +154 -49
app.py
CHANGED
@@ -61,7 +61,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
|
61 |
print(f"Using device: {device}")
|
62 |
# Load models once
|
63 |
print("Loading models...")
|
64 |
-
model_id = "meta-llama/Llama-3.
|
65 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
66 |
model = AutoModelForCausalLM.from_pretrained(
|
67 |
model_id,
|
@@ -69,20 +69,33 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
69 |
torch_dtype=torch.float16
|
70 |
)
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
def generate_translation(system_prompt, prompt):
|
73 |
-
messages=[
|
74 |
-
{"role": "system", "content": system_prompt},
|
75 |
-
{"role": "user", "content": prompt}
|
76 |
-
]
|
77 |
-
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
|
78 |
outputs = model.generate(
|
79 |
-
inputs,
|
80 |
-
max_new_tokens=512,
|
81 |
-
temperature=0.7,
|
82 |
-
top_p=0.9,
|
83 |
-
do_sample=True
|
84 |
-
)
|
85 |
-
translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
|
86 |
return translation
|
87 |
|
88 |
def check_token_length(text, max_tokens=1024):
|
|
|
61 |
print(f"Using device: {device}")
|
62 |
# Load models once
|
63 |
print("Loading models...")
|
64 |
+
model_id = "google/gemma-2-9b-it" # "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
65 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
66 |
model = AutoModelForCausalLM.from_pretrained(
|
67 |
model_id,
|
|
|
69 |
torch_dtype=torch.float16
|
70 |
)
|
71 |
|
72 |
+
# def generate_translation(system_prompt, prompt):
|
73 |
+
# messages=[
|
74 |
+
# {"role": "system", "content": system_prompt},
|
75 |
+
# {"role": "user", "content": prompt}
|
76 |
+
# ]
|
77 |
+
# inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
|
78 |
+
# outputs = model.generate(
|
79 |
+
# inputs,
|
80 |
+
# max_new_tokens=512,
|
81 |
+
# temperature=0.7,
|
82 |
+
# top_p=0.9,
|
83 |
+
# do_sample=True
|
84 |
+
# )
|
85 |
+
# translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
|
86 |
+
# return translation
|
87 |
+
|
88 |
def generate_translation(system_prompt, prompt):
|
89 |
+
full_prompt = f"System: {system_prompt}\nUser: {prompt}\nAssistant:"
|
90 |
+
inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
|
|
|
|
|
|
|
91 |
outputs = model.generate(
|
92 |
+
**inputs,
|
93 |
+
max_new_tokens=512,
|
94 |
+
temperature=0.7,
|
95 |
+
top_p=0.9,
|
96 |
+
do_sample=True
|
97 |
+
)
|
98 |
+
translation = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
|
99 |
return translation
|
100 |
|
101 |
def check_token_length(text, max_tokens=1024):
|
vecalign/plan2align.py
CHANGED
@@ -183,6 +183,76 @@ def external_find_best_translation(evals, language, session_id):
|
|
183 |
|
184 |
################################# generating translation #################################
|
185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer, good_sent_size, src_language, tgt_language):
|
187 |
system_prompts = [
|
188 |
"You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
|
@@ -190,17 +260,6 @@ def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer,
|
|
190 |
"You are a creative and expressive translator. Render the text in a vivid way, as if narrating a captivating story."
|
191 |
]
|
192 |
|
193 |
-
context_prompt = f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
|
194 |
-
context_prompt += f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
|
195 |
-
context_prompt += f"These provided translations are rough references – they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
|
196 |
-
context_prompt += f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
|
197 |
-
context_prompt += f"1. **Refinement:** For sections with existing {tgt_language} translations (in parentheses), refine and polish them so that they are fluent, accurate, and coherent, fully capturing the meaning of the corresponding {src_language} text.\n"
|
198 |
-
context_prompt += f"2. **Completion:** For sections that remain untranslated, translate the {src_language} text accurately and naturally in the specified style.\n"
|
199 |
-
context_prompt += f"3. **Translation Order and Structure Preservation:** Maintain the original order and structure of the text. Every {src_language} sentence must appear in the same sequence as in the source text, with its corresponding {tgt_language} translation (if available) inserted immediately after it. Do not rearrange or reorder any part of the text.\n"
|
200 |
-
context_prompt += f"4. **Consistency:** Ensure a uniform tone and style across the entire translation, adhering to the translator role specified.\n"
|
201 |
-
context_prompt += f"5. **Final Output:** Provide the final output as a single, well-structured {tgt_language} text. Do not include any extraneous commentary, explanations, annotations, or headers – output only the translation in the correct order.\n\n"
|
202 |
-
context_prompt += f"Note: This translation is an intermediate version that may later be merged with other translations. Focus on clarity, coherence, and fidelity to the source text.\n"
|
203 |
-
|
204 |
# Process the buffer to extract relevant English translations
|
205 |
processed_source = source_sentence
|
206 |
if len(buffer) > 0:
|
@@ -214,35 +273,49 @@ def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer,
|
|
214 |
key_sentence,
|
215 |
f"{key_sentence}\n({translated_sentence})\n"
|
216 |
)
|
217 |
-
|
218 |
-
context_prompt += f"\nHere is the input data for translation:\n{processed_source}\n\n"
|
219 |
-
context_prompt += "Apply the above guidelines to produce an improved, coherent translation that strictly follows the original order of the text.\n"
|
220 |
-
|
221 |
-
if len(buffer) == 0:
|
222 |
-
context_prompt = f"### Translate this from {src_language} to {tgt_language} and **only** output the result."
|
223 |
-
context_prompt += f"\n### {src_language}:\n {source_sentence}"
|
224 |
-
context_prompt += f"\n### {tgt_language}:\n"
|
225 |
-
|
226 |
-
print("--------------------------------------------------------------------------------")
|
227 |
-
print("\n context_prompt \n")
|
228 |
-
print(context_prompt)
|
229 |
-
print("--------------------------------------------------------------------------------")
|
230 |
|
231 |
translations = []
|
232 |
-
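for prompt in system_prompts:
|
233 |
-
messages=[
|
234 |
-
{"role": "system", "content": prompt},
|
235 |
-
{"role": "user", "content": context_prompt}
|
236 |
-
]
|
237 |
-
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)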
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
outputs = model.generate(
|
239 |
-
inputs,
|
240 |
max_new_tokens=512,
|
241 |
temperature=0.7,
|
242 |
top_p=0.9,
|
243 |
do_sample=True
|
244 |
)
|
245 |
-
translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
|
246 |
|
247 |
print("--------------------------------------------------------------------------------")
|
248 |
print("\n rollout translation: \n")
|
@@ -272,11 +345,50 @@ def process_buffer_sentences(source_sentences, buffer):
|
|
272 |
translations.append(translation_map[src_sent][0])
|
273 |
return translations
|
274 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, source_segments, buffer, src_language, tgt_language):
|
276 |
translations = process_buffer_sentences(source_segments, buffer)
|
277 |
initial_translation = "\n".join(translations)
|
278 |
|
279 |
rewrite_prompt = (
|
|
|
280 |
f"Below is an initial translation of a {src_language} text into {tgt_language}. "
|
281 |
f"This translation may include omissions, inaccuracies, or awkward phrasing. "
|
282 |
f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
|
@@ -289,26 +401,19 @@ def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, so
|
|
289 |
f"5. Output only the final refined translation without any additional commentary.\n\n"
|
290 |
f"### Original {src_language} Text:\n{source_sentence}\n\n"
|
291 |
f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
|
292 |
-
f"### Refined Translation:"
|
293 |
)
|
294 |
|
295 |
-
print("rewrite prompt:")
|
296 |
-
print(rewrite_prompt)
|
297 |
-
messages=[
|
298 |
-
{"role": "system", "content": "You are a helpful translator and only output the result."},
|
299 |
-
{"role": "user", "content": rewrite_prompt}
|
300 |
-
]
|
301 |
-
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
|
302 |
outputs = model.generate(
|
303 |
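-
inputs,
|
304 |
-
max_new_tokens=512,
|
305 |
-
temperature=0.7,
|
306 |
-
top_p=0.9,
|
307 |
-
do_sample=True
|
308 |
-
)
|
309 |
-
translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
|
310 |
-
return translation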
|
311 |
-
|
312 |
|
313 |
################################# alignment functions #################################
|
314 |
def save_sentences_to_txt(sentences, filename):
|
|
|
183 |
|
184 |
################################# generating translation #################################
|
185 |
|
186 |
+
# def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer, good_sent_size, src_language, tgt_language):
|
187 |
+
# system_prompts = [
|
188 |
+
# "You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
|
189 |
+
# "You are a professional translator. Deliver a clear, formal, and precise translation that faithfully conveys the original meaning.",
|
190 |
+
# "You are a creative and expressive translator. Render the text in a vivid way, as if narrating a captivating story."
|
191 |
+
# ]
|
192 |
+
|
193 |
+
# context_prompt = f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
|
194 |
+
# context_prompt += f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
|
195 |
+
# context_prompt += f"These provided translations are rough references – they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
|
196 |
+
# context_prompt += f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
|
197 |
+
# context_prompt += f"1. **Refinement:** For sections with existing {tgt_language} translations (in parentheses), refine and polish them so that they are fluent, accurate, and coherent, fully capturing the meaning of the corresponding {src_language} text.\n"
|
198 |
+
# context_prompt += f"2. **Completion:** For sections that remain untranslated, translate the {src_language} text accurately and naturally in the specified style.\n"
|
199 |
+
# context_prompt += f"3. **Translation Order and Structure Preservation:** Maintain the original order and structure of the text. Every {src_language} sentence must appear in the same sequence as in the source text, with its corresponding {tgt_language} translation (if available) inserted immediately after it. Do not rearrange or reorder any part of the text.\n"
|
200 |
+
# context_prompt += f"4. **Consistency:** Ensure a uniform tone and style across the entire translation, adhering to the translator role specified.\n"
|
201 |
+
# context_prompt += f"5. **Final Output:** Provide the final output as a single, well-structured {tgt_language} text. Do not include any extraneous commentary, explanations, annotations, or headers – output only the translation in the correct order.\n\n"
|
202 |
+
# context_prompt += f"Note: This translation is an intermediate version that may later be merged with other translations. Focus on clarity, coherence, and fidelity to the source text.\n"
|
203 |
+
|
204 |
+
# # Process the buffer to extract relevant English translations
|
205 |
+
# processed_source = source_sentence
|
206 |
+
# if len(buffer) > 0:
|
207 |
+
# selected_keys = random.sample(buffer.keys(), min(len(buffer), good_sent_size))
|
208 |
+
# for key_sentence in selected_keys:
|
209 |
+
# key_sentence = key_sentence.strip()
|
210 |
+
# if key_sentence and (key_sentence in source_sentence) :
|
211 |
+
# translated_sentence = buffer[key_sentence][0][0]
|
212 |
+
# if f"\n({translated_sentence})\n" not in processed_source:
|
213 |
+
# processed_source = processed_source.replace(
|
214 |
+
# key_sentence,
|
215 |
+
# f"{key_sentence}\n({translated_sentence})\n"
|
216 |
+
# )
|
217 |
+
|
218 |
+
# context_prompt += f"\nHere is the input data for translation:\n{processed_source}\n\n"
|
219 |
+
# context_prompt += "Apply the above guidelines to produce an improved, coherent translation that strictly follows the original order of the text.\n"
|
220 |
+
|
221 |
+
# if len(buffer) == 0:
|
222 |
+
# context_prompt = f"### Translate this from {src_language} to {tgt_language} and **only** output the result."
|
223 |
+
# context_prompt += f"\n### {src_language}:\n {source_sentence}"
|
224 |
+
# context_prompt += f"\n### {tgt_language}:\n"
|
225 |
+
|
226 |
+
# print("--------------------------------------------------------------------------------")
|
227 |
+
# print("\n context_prompt \n")
|
228 |
+
# print(context_prompt)
|
229 |
+
# print("--------------------------------------------------------------------------------")
|
230 |
+
|
231 |
+
# translations = []
|
232 |
+
# for prompt in system_prompts:
|
233 |
+
# messages=[
|
234 |
+
# {"role": "system", "content": prompt},
|
235 |
+
# {"role": "user", "content": context_prompt}
|
236 |
+
# ]
|
237 |
+
# inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
|
238 |
+
# outputs = model.generate(
|
239 |
+
# inputs,
|
240 |
+
# max_new_tokens=512,
|
241 |
+
# temperature=0.7,
|
242 |
+
# top_p=0.9,
|
243 |
+
# do_sample=True
|
244 |
+
# )
|
245 |
+
# translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
|
246 |
+
|
247 |
+
# print("--------------------------------------------------------------------------------")
|
248 |
+
# print("\n rollout translation: \n")
|
249 |
+
# print(translation)
|
250 |
+
# print("--------------------------------------------------------------------------------")
|
251 |
+
|
252 |
+
# translations.append(translation)
|
253 |
+
|
254 |
+
# return translations
|
255 |
+
|
256 |
def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer, good_sent_size, src_language, tgt_language):
|
257 |
system_prompts = [
|
258 |
"You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
|
|
|
260 |
"You are a creative and expressive translator. Render the text in a vivid way, as if narrating a captivating story."
|
261 |
]
|
262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
# Process the buffer to extract relevant English translations
|
264 |
processed_source = source_sentence
|
265 |
if len(buffer) > 0:
|
|
|
273 |
key_sentence,
|
274 |
f"{key_sentence}\n({translated_sentence})\n"
|
275 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
|
277 |
translations = []
|
278 |
+
for system_prompt in system_prompts:
|
279 |
+
if len(buffer) == 0:
|
280 |
+
full_prompt = (
|
281 |
+
f"System: {system_prompt}\n\n"
|
282 |
+
f"### Translate this from {src_language} to {tgt_language}.\n"
|
283 |
+
f"{src_language}:\n{source_sentence}\n\n"
|
284 |
+
f"{tgt_language}:\n"
|
285 |
+
)
|
286 |
+
else:
|
287 |
+
context_prompt = (
|
288 |
+
f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
|
289 |
+
f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
|
290 |
+
f"These provided translations are rough references - they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
|
291 |
+
f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
|
292 |
+
f"1. Refinement: For sections with existing {tgt_language} translations (in parentheses), refine and polish them.\n"
|
293 |
+
f"2. Completion: For untranslated sections, translate the {src_language} text naturally.\n"
|
294 |
+
f"3. Translation Order: Maintain the original sequence - every source sentence must appear in order with its translation right after it.\n"
|
295 |
+
f"4. Consistency: Ensure a uniform tone and style.\n"
|
296 |
+
f"5. Output only the final {tgt_language} translation. No extra commentary.\n\n"
|
297 |
+
f"Note: This is an intermediate version that may later be merged. Focus on clarity and fidelity.\n\n"
|
298 |
+
f"Input Text:\n{processed_source}\n\n"
|
299 |
+
f"Assistant:"
|
300 |
+
)
|
301 |
+
|
302 |
+
full_prompt = f"System: {system_prompt}\n\n{context_prompt}"
|
303 |
+
|
304 |
+
print("--------------------------------------------------------------------------------")
|
305 |
+
print("\n full_prompt \n")
|
306 |
+
print(full_prompt)
|
307 |
+
print("--------------------------------------------------------------------------------")
|
308 |
+
|
309 |
+
# Tokenize and generate
|
310 |
+
inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
|
311 |
outputs = model.generate(
|
312 |
+
**inputs,
|
313 |
max_new_tokens=512,
|
314 |
temperature=0.7,
|
315 |
top_p=0.9,
|
316 |
do_sample=True
|
317 |
)
|
318 |
+
translation = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
|
319 |
|
320 |
print("--------------------------------------------------------------------------------")
|
321 |
print("\n rollout translation: \n")
|
|
|
345 |
translations.append(translation_map[src_sent][0])
|
346 |
return translations
|
347 |
|
348 |
+
# def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, source_segments, buffer, src_language, tgt_language):
|
349 |
+
# translations = process_buffer_sentences(source_segments, buffer)
|
350 |
+
# initial_translation = "\n".join(translations)
|
351 |
+
|
352 |
+
# rewrite_prompt = (
|
353 |
+
# f"Below is an initial translation of a {src_language} text into {tgt_language}. "
|
354 |
+
# f"This translation may include omissions, inaccuracies, or awkward phrasing. "
|
355 |
+
# f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
|
356 |
+
# f"while faithfully preserving the full meaning of the original {src_language} text.\n\n"
|
357 |
+
# f"### Instructions:\n"
|
358 |
+
# f"1. Ensure that every detail in the original {src_language} text is accurately represented.\n"
|
359 |
+
# f"2. Correct any grammatical errors, unnatural expressions, or inconsistencies.\n"
|
360 |
+
# f"3. Improve the natural flow so that the translation reads as if written by a native speaker.\n"
|
361 |
+
# f"4. Do not add, omit, or change any essential details from the source text.\n"
|
362 |
+
# f"5. Output only the final refined translation without any additional commentary.\n\n"
|
363 |
+
# f"### Original {src_language} Text:\n{source_sentence}\n\n"
|
364 |
+
# f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
|
365 |
+
# f"### Refined Translation:"
|
366 |
+
# )
|
367 |
+
|
368 |
+
# print("rewrite prompt:")
|
369 |
+
# print(rewrite_prompt)
|
370 |
+
# messages=[
|
371 |
+
# {"role": "system", "content": "You are a helpful translator and only output the result."},
|
372 |
+
# {"role": "user", "content": rewrite_prompt}
|
373 |
+
# ]
|
374 |
+
# inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
|
375 |
+
# outputs = model.generate(
|
376 |
+
# inputs,
|
377 |
+
# max_new_tokens=512,
|
378 |
+
# temperature=0.7,
|
379 |
+
# top_p=0.9,
|
380 |
+
# do_sample=True
|
381 |
+
# )
|
382 |
+
# translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
|
383 |
+
# return translation
|
384 |
+
|
385 |
+
|
386 |
def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, source_segments, buffer, src_language, tgt_language):
|
387 |
translations = process_buffer_sentences(source_segments, buffer)
|
388 |
initial_translation = "\n".join(translations)
|
389 |
|
390 |
rewrite_prompt = (
|
391 |
+
f"System: You are a helpful translator and only output the result.\n\n"
|
392 |
f"Below is an initial translation of a {src_language} text into {tgt_language}. "
|
393 |
f"This translation may include omissions, inaccuracies, or awkward phrasing. "
|
394 |
f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
|
|
|
401 |
f"5. Output only the final refined translation without any additional commentary.\n\n"
|
402 |
f"### Original {src_language} Text:\n{source_sentence}\n\n"
|
403 |
f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
|
404 |
+
f"Assistant:"
|
405 |
)
|
406 |
|
407 |
+
inputs = tokenizer(rewrite_prompt, return_tensors="pt").to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
408 |
outputs = model.generate(
|
409 |
+
**inputs,
|
410 |
+
max_new_tokens=512,
|
411 |
+
temperature=0.7,
|
412 |
+
top_p=0.9,
|
413 |
+
do_sample=True
|
414 |
+
)
|
415 |
+
refined_translation = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
|
416 |
+
return refined_translation
|
|
|
417 |
|
418 |
################################# alignment functions #################################
|
419 |
def save_sentences_to_txt(sentences, filename):
|