KuangDW committed on
Commit
57a7224
·
1 Parent(s): ba00803

gemma version

Browse files
Files changed (2) hide show
  1. app.py +26 -13
  2. vecalign/plan2align.py +154 -49
app.py CHANGED
@@ -61,7 +61,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
61
  print(f"Using device: {device}")
62
  # Load models once
63
  print("Loading models...")
64
- model_id = "meta-llama/Llama-3.3-70B-Instruct"
65
  tokenizer = AutoTokenizer.from_pretrained(model_id)
66
  model = AutoModelForCausalLM.from_pretrained(
67
  model_id,
@@ -69,20 +69,33 @@ model = AutoModelForCausalLM.from_pretrained(
69
  torch_dtype=torch.float16
70
  )
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
def generate_translation(system_prompt, prompt):
    """Generate a chat-style completion for *prompt* under *system_prompt*.

    Builds a two-message chat (system + user), tokenizes it with the model's
    chat template, samples a completion, and returns only the newly generated
    text (prompt tokens stripped).

    Relies on the module-level ``tokenizer``, ``model``, and ``device``.
    """
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(device)
    generated = model.generate(
        input_ids,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    # Slice off the prompt so only the model's continuation is decoded.
    prompt_len = input_ids.shape[1]
    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
87
 
88
  def check_token_length(text, max_tokens=1024):
 
61
  print(f"Using device: {device}")
62
  # Load models once
63
  print("Loading models...")
64
+ model_id = "google/gemma-2-9b-it" # "meta-llama/Meta-Llama-3.1-8B-Instruct"
65
  tokenizer = AutoTokenizer.from_pretrained(model_id)
66
  model = AutoModelForCausalLM.from_pretrained(
67
  model_id,
 
69
  torch_dtype=torch.float16
70
  )
71
 
72
+ # def generate_translation(system_prompt, prompt):
73
+ # messages=[
74
+ # {"role": "system", "content": system_prompt},
75
+ # {"role": "user", "content": prompt}
76
+ # ]
77
+ # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
78
+ # outputs = model.generate(
79
+ # inputs,
80
+ # max_new_tokens=512,
81
+ # temperature=0.7,
82
+ # top_p=0.9,
83
+ # do_sample=True
84
+ # )
85
+ # translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
86
+ # return translation
87
+
88
def generate_translation(system_prompt, prompt):
    """Generate a completion using a plain-text prompt format.

    Formats the system and user messages into a single
    ``System:/User:/Assistant:`` string (used because gemma-2-it has no
    system role in its chat template — TODO confirm), samples a completion,
    and returns only the newly generated text.

    Relies on the module-level ``tokenizer``, ``model``, and ``device``.
    """
    full_prompt = f"System: {system_prompt}\nUser: {prompt}\nAssistant:"
    encoded = tokenizer(full_prompt, return_tensors="pt").to(device)
    out_ids = model.generate(
        **encoded,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    # Decode only the continuation: skip the prompt-length prefix.
    new_tokens = out_ids[0][encoded["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
100
 
101
  def check_token_length(text, max_tokens=1024):
vecalign/plan2align.py CHANGED
@@ -183,6 +183,76 @@ def external_find_best_translation(evals, language, session_id):
183
 
184
  ################################# generating translation #################################
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer, good_sent_size, src_language, tgt_language):
187
  system_prompts = [
188
  "You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
@@ -190,17 +260,6 @@ def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer,
190
  "You are a creative and expressive translator. Render the text in a vivid way, as if narrating a captivating story."
191
  ]
192
 
193
- context_prompt = f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
194
- context_prompt += f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
195
- context_prompt += f"These provided translations are rough references – they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
196
- context_prompt += f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
197
- context_prompt += f"1. **Refinement:** For sections with existing {tgt_language} translations (in parentheses), refine and polish them so that they are fluent, accurate, and coherent, fully capturing the meaning of the corresponding {src_language} text.\n"
198
- context_prompt += f"2. **Completion:** For sections that remain untranslated, translate the {src_language} text accurately and naturally in the specified style.\n"
199
- context_prompt += f"3. **Translation Order and Structure Preservation:** Maintain the original order and structure of the text. Every {src_language} sentence must appear in the same sequence as in the source text, with its corresponding {tgt_language} translation (if available) inserted immediately after it. Do not rearrange or reorder any part of the text.\n"
200
- context_prompt += f"4. **Consistency:** Ensure a uniform tone and style across the entire translation, adhering to the translator role specified.\n"
201
- context_prompt += f"5. **Final Output:** Provide the final output as a single, well-structured {tgt_language} text. Do not include any extraneous commentary, explanations, annotations, or headers – output only the translation in the correct order.\n\n"
202
- context_prompt += f"Note: This translation is an intermediate version that may later be merged with other translations. Focus on clarity, coherence, and fidelity to the source text.\n"
203
-
204
  # Process the buffer to extract relevant English translations
205
  processed_source = source_sentence
206
  if len(buffer) > 0:
@@ -214,35 +273,49 @@ def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer,
214
  key_sentence,
215
  f"{key_sentence}\n({translated_sentence})\n"
216
  )
217
-
218
- context_prompt += f"\nHere is the input data for translation:\n{processed_source}\n\n"
219
- context_prompt += "Apply the above guidelines to produce an improved, coherent translation that strictly follows the original order of the text.\n"
220
-
221
- if len(buffer) == 0:
222
- context_prompt = f"### Translate this from {src_language} to {tgt_language} and **only** output the result."
223
- context_prompt += f"\n### {src_language}:\n {source_sentence}"
224
- context_prompt += f"\n### {tgt_language}:\n"
225
-
226
- print("--------------------------------------------------------------------------------")
227
- print("\n context_prompt \n")
228
- print(context_prompt)
229
- print("--------------------------------------------------------------------------------")
230
 
231
  translations = []
232
- for prompt in system_prompts:
233
- messages=[
234
- {"role": "system", "content": prompt},
235
- {"role": "user", "content": context_prompt}
236
- ]
237
- inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  outputs = model.generate(
239
- inputs,
240
  max_new_tokens=512,
241
  temperature=0.7,
242
  top_p=0.9,
243
  do_sample=True
244
  )
245
- translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
246
 
247
  print("--------------------------------------------------------------------------------")
248
  print("\n rollout translation: \n")
@@ -272,11 +345,50 @@ def process_buffer_sentences(source_sentences, buffer):
272
  translations.append(translation_map[src_sent][0])
273
  return translations
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, source_segments, buffer, src_language, tgt_language):
276
  translations = process_buffer_sentences(source_segments, buffer)
277
  initial_translation = "\n".join(translations)
278
 
279
  rewrite_prompt = (
 
280
  f"Below is an initial translation of a {src_language} text into {tgt_language}. "
281
  f"This translation may include omissions, inaccuracies, or awkward phrasing. "
282
  f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
@@ -289,26 +401,19 @@ def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, so
289
  f"5. Output only the final refined translation without any additional commentary.\n\n"
290
  f"### Original {src_language} Text:\n{source_sentence}\n\n"
291
  f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
292
- f"### Refined Translation:"
293
  )
294
 
295
- print("rewrite prompt:")
296
- print(rewrite_prompt)
297
- messages=[
298
- {"role": "system", "content": "You are a helpful translator and only output the result."},
299
- {"role": "user", "content": rewrite_prompt}
300
- ]
301
- inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
302
  outputs = model.generate(
303
- inputs,
304
- max_new_tokens=512,
305
- temperature=0.7,
306
- top_p=0.9,
307
- do_sample=True
308
- )
309
- translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
310
- return translation
311
-
312
 
313
  ################################# alignment functions #################################
314
  def save_sentences_to_txt(sentences, filename):
 
183
 
184
  ################################# generating translation #################################
185
 
186
+ # def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer, good_sent_size, src_language, tgt_language):
187
+ # system_prompts = [
188
+ # "You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
189
+ # "You are a professional translator. Deliver a clear, formal, and precise translation that faithfully conveys the original meaning.",
190
+ # "You are a creative and expressive translator. Render the text in a vivid way, as if narrating a captivating story."
191
+ # ]
192
+
193
+ # context_prompt = f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
194
+ # context_prompt += f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
195
+ # context_prompt += f"These provided translations are rough references – they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
196
+ # context_prompt += f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
197
+ # context_prompt += f"1. **Refinement:** For sections with existing {tgt_language} translations (in parentheses), refine and polish them so that they are fluent, accurate, and coherent, fully capturing the meaning of the corresponding {src_language} text.\n"
198
+ # context_prompt += f"2. **Completion:** For sections that remain untranslated, translate the {src_language} text accurately and naturally in the specified style.\n"
199
+ # context_prompt += f"3. **Translation Order and Structure Preservation:** Maintain the original order and structure of the text. Every {src_language} sentence must appear in the same sequence as in the source text, with its corresponding {tgt_language} translation (if available) inserted immediately after it. Do not rearrange or reorder any part of the text.\n"
200
+ # context_prompt += f"4. **Consistency:** Ensure a uniform tone and style across the entire translation, adhering to the translator role specified.\n"
201
+ # context_prompt += f"5. **Final Output:** Provide the final output as a single, well-structured {tgt_language} text. Do not include any extraneous commentary, explanations, annotations, or headers – output only the translation in the correct order.\n\n"
202
+ # context_prompt += f"Note: This translation is an intermediate version that may later be merged with other translations. Focus on clarity, coherence, and fidelity to the source text.\n"
203
+
204
+ # # Process the buffer to extract relevant English translations
205
+ # processed_source = source_sentence
206
+ # if len(buffer) > 0:
207
+ # selected_keys = random.sample(buffer.keys(), min(len(buffer), good_sent_size))
208
+ # for key_sentence in selected_keys:
209
+ # key_sentence = key_sentence.strip()
210
+ # if key_sentence and (key_sentence in source_sentence) :
211
+ # translated_sentence = buffer[key_sentence][0][0]
212
+ # if f"\n({translated_sentence})\n" not in processed_source:
213
+ # processed_source = processed_source.replace(
214
+ # key_sentence,
215
+ # f"{key_sentence}\n({translated_sentence})\n"
216
+ # )
217
+
218
+ # context_prompt += f"\nHere is the input data for translation:\n{processed_source}\n\n"
219
+ # context_prompt += "Apply the above guidelines to produce an improved, coherent translation that strictly follows the original order of the text.\n"
220
+
221
+ # if len(buffer) == 0:
222
+ # context_prompt = f"### Translate this from {src_language} to {tgt_language} and **only** output the result."
223
+ # context_prompt += f"\n### {src_language}:\n {source_sentence}"
224
+ # context_prompt += f"\n### {tgt_language}:\n"
225
+
226
+ # print("--------------------------------------------------------------------------------")
227
+ # print("\n context_prompt \n")
228
+ # print(context_prompt)
229
+ # print("--------------------------------------------------------------------------------")
230
+
231
+ # translations = []
232
+ # for prompt in system_prompts:
233
+ # messages=[
234
+ # {"role": "system", "content": prompt},
235
+ # {"role": "user", "content": context_prompt}
236
+ # ]
237
+ # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
238
+ # outputs = model.generate(
239
+ # inputs,
240
+ # max_new_tokens=512,
241
+ # temperature=0.7,
242
+ # top_p=0.9,
243
+ # do_sample=True
244
+ # )
245
+ # translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
246
+
247
+ # print("--------------------------------------------------------------------------------")
248
+ # print("\n rollout translation: \n")
249
+ # print(translation)
250
+ # print("--------------------------------------------------------------------------------")
251
+
252
+ # translations.append(translation)
253
+
254
+ # return translations
255
+
256
  def translate_with_deepinfra(model, tokenizer, device, source_sentence, buffer, good_sent_size, src_language, tgt_language):
257
  system_prompts = [
258
  "You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
 
260
  "You are a creative and expressive translator. Render the text in a vivid way, as if narrating a captivating story."
261
  ]
262
 
 
 
 
 
 
 
 
 
 
 
 
263
  # Process the buffer to extract relevant English translations
264
  processed_source = source_sentence
265
  if len(buffer) > 0:
 
273
  key_sentence,
274
  f"{key_sentence}\n({translated_sentence})\n"
275
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
  translations = []
278
+ for system_prompt in system_prompts:
279
+ if len(buffer) == 0:
280
+ full_prompt = (
281
+ f"System: {system_prompt}\n\n"
282
+ f"### Translate this from {src_language} to {tgt_language}.\n"
283
+ f"{src_language}:\n{source_sentence}\n\n"
284
+ f"{tgt_language}:\n"
285
+ )
286
+ else:
287
+ context_prompt = (
288
+ f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
289
+ f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
290
+ f"These provided translations are rough references - they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
291
+ f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
292
+ f"1. Refinement: For sections with existing {tgt_language} translations (in parentheses), refine and polish them.\n"
293
+ f"2. Completion: For untranslated sections, translate the {src_language} text naturally.\n"
294
+ f"3. Translation Order: Maintain the original sequence - every source sentence must appear in order with its translation right after it.\n"
295
+ f"4. Consistency: Ensure a uniform tone and style.\n"
296
+ f"5. Output only the final {tgt_language} translation. No extra commentary.\n\n"
297
+ f"Note: This is an intermediate version that may later be merged. Focus on clarity and fidelity.\n\n"
298
+ f"Input Text:\n{processed_source}\n\n"
299
+ f"Assistant:"
300
+ )
301
+
302
+ full_prompt = f"System: {system_prompt}\n\n{context_prompt}"
303
+
304
+ print("--------------------------------------------------------------------------------")
305
+ print("\n full_prompt \n")
306
+ print(full_prompt)
307
+ print("--------------------------------------------------------------------------------")
308
+
309
+ # Tokenize and generate
310
+ inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
311
  outputs = model.generate(
312
+ **inputs,
313
  max_new_tokens=512,
314
  temperature=0.7,
315
  top_p=0.9,
316
  do_sample=True
317
  )
318
+ translation = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
319
 
320
  print("--------------------------------------------------------------------------------")
321
  print("\n rollout translation: \n")
 
345
  translations.append(translation_map[src_sent][0])
346
  return translations
347
 
348
+ # def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, source_segments, buffer, src_language, tgt_language):
349
+ # translations = process_buffer_sentences(source_segments, buffer)
350
+ # initial_translation = "\n".join(translations)
351
+
352
+ # rewrite_prompt = (
353
+ # f"Below is an initial translation of a {src_language} text into {tgt_language}. "
354
+ # f"This translation may include omissions, inaccuracies, or awkward phrasing. "
355
+ # f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
356
+ # f"while faithfully preserving the full meaning of the original {src_language} text.\n\n"
357
+ # f"### Instructions:\n"
358
+ # f"1. Ensure that every detail in the original {src_language} text is accurately represented.\n"
359
+ # f"2. Correct any grammatical errors, unnatural expressions, or inconsistencies.\n"
360
+ # f"3. Improve the natural flow so that the translation reads as if written by a native speaker.\n"
361
+ # f"4. Do not add, omit, or change any essential details from the source text.\n"
362
+ # f"5. Output only the final refined translation without any additional commentary.\n\n"
363
+ # f"### Original {src_language} Text:\n{source_sentence}\n\n"
364
+ # f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
365
+ # f"### Refined Translation:"
366
+ # )
367
+
368
+ # print("rewrite prompt:")
369
+ # print(rewrite_prompt)
370
+ # messages=[
371
+ # {"role": "system", "content": "You are a helpful translator and only output the result."},
372
+ # {"role": "user", "content": rewrite_prompt}
373
+ # ]
374
+ # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
375
+ # outputs = model.generate(
376
+ # inputs,
377
+ # max_new_tokens=512,
378
+ # temperature=0.7,
379
+ # top_p=0.9,
380
+ # do_sample=True
381
+ # )
382
+ # translation = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
383
+ # return translation
384
+
385
+
386
  def final_translate_with_deepinfra(model, tokenizer, device, source_sentence, source_segments, buffer, src_language, tgt_language):
387
  translations = process_buffer_sentences(source_segments, buffer)
388
  initial_translation = "\n".join(translations)
389
 
390
  rewrite_prompt = (
391
+ f"System: You are a helpful translator and only output the result.\n\n"
392
  f"Below is an initial translation of a {src_language} text into {tgt_language}. "
393
  f"This translation may include omissions, inaccuracies, or awkward phrasing. "
394
  f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
 
401
  f"5. Output only the final refined translation without any additional commentary.\n\n"
402
  f"### Original {src_language} Text:\n{source_sentence}\n\n"
403
  f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
404
+ f"Assistant:"
405
  )
406
 
407
+ inputs = tokenizer(rewrite_prompt, return_tensors="pt").to(device)
 
 
 
 
 
 
408
  outputs = model.generate(
409
+ **inputs,
410
+ max_new_tokens=512,
411
+ temperature=0.7,
412
+ top_p=0.9,
413
+ do_sample=True
414
+ )
415
+ refined_translation = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
416
+ return refined_translation
 
417
 
418
  ################################# alignment functions #################################
419
  def save_sentences_to_txt(sentences, filename):