KuangDW committed
Commit dd05f29 · 1 Parent(s): 7f92284

add alignment and specify encoder
app.py CHANGED
@@ -1,7 +1,337 @@
 import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
1
+ import os
2
+ import gc
3
  import gradio as gr
4
+ import torch
5
+ import random
6
+ import logging
7
+ import openai
8
+ from openai import OpenAI
9
+ from vecalign.plan2align import translate_text, external_find_best_translation
10
+ from transformers import AutoTokenizer, AutoModelForCausalLM
11
+ from trl import AutoModelForCausalLMWithValueHead
12
+ from huggingface_hub import login, snapshot_download
13
+ import spacy
14
+ import subprocess
15
+ import pkg_resources
16
+ import sys
17
 
18
+ laser_token = os.environ.get("align_enc")
19
+ laser_path = snapshot_download(repo_id="KuangDW/laser", use_auth_token=laser_token)
20
+ os.environ["LASER"] = laser_path
21
 
22
+ def check_and_install(package, required_version):
23
+ try:
24
+ dist = pkg_resources.get_distribution(package)
25
+ installed_version = dist.version
26
+ if installed_version != required_version:
27
+ print(f"[{package}] already installed {installed_version}. Required version {required_version},re-install...")
28
+ subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package}=={required_version}", "--force-reinstall"])
29
+ else:
30
+ print(f"[{package}] required version {required_version} finished")
31
+ except pkg_resources.DistributionNotFound:
32
+ print(f"[{package}] not found, install: {required_version}...")
33
+ subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package}=={required_version}"])
34
+ packages = {
35
+ "pip": "24.0",
36
+ "fairseq": "0.12.2"
37
+ }
38
+ for package, version in packages.items():
39
+ check_and_install(package, version)
40
+
41
+
42
+ models = ["en_core_web_sm", "ru_core_news_sm", "de_core_news_sm",
43
+ "ja_core_news_sm", "ko_core_news_sm", "es_core_news_sm"]
44
+ for model in models:
45
+ try:
46
+ spacy.load(model)
47
+ except OSError:
48
+ from spacy.cli import download
49
+ download(model)
50
+ try:
51
+ spacy.load("zh_core_web_sm")
52
+ except OSError:
53
+ from spacy.cli import download
54
+ download("zh_core_web_sm")
55
+ subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.24.0", "--force-reinstall"])
56
+
57
+ # ---------- deepinfra translation ----------
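+ # NOTE: supply your DeepInfra API key below (see "Prepare API Key" in vecalign/README.md)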
58
+ openai = OpenAI(
59
+ api_key="",
60
+ base_url="https://api.deepinfra.com/v1/openai",
61
+ )
62
+
63
+ def generate_translation(system_prompt, prompt):
64
+ response = openai.chat.completions.create(
65
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct",
66
+ messages=[
67
+ {"role": "system", "content": system_prompt},
68
+ {"role": "user", "content": prompt}
69
+ ]
70
+ )
71
+ translation = response.choices[0].message.content.strip()
72
+ return translation
73
+
74
+ def check_token_length(text, max_tokens=1024):
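+ # NOTE: character count is used here as a rough proxy for token length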
75
+ return len(text) <= max_tokens
76
+
77
+ import uuid
78
+ def get_user_session(state):
79
+ if not state.get("session_id"):
80
+ state["session_id"] = uuid.uuid4().hex
81
+ return state["session_id"]
82
+
83
+
84
+ # ---------- Translation Function ----------
85
+
86
+ def mpc_initial_translate(source_sentence, src_language, tgt_language):
87
+ system_prompts = [
88
+ "You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
89
+ "You are a professional translator. Deliver a clear, formal, and precise translation that faithfully conveys the original meaning.",
90
+ "You are a creative and expressive translator. Render the text in a vivid and imaginative way, as if narrating a captivating story."
91
+ ]
92
+ translations = []
93
+ for prompt_style in system_prompts:
94
+ prompt = f"### Translate this from {src_language} to {tgt_language} and only output the result."
95
+ prompt += f"\n### {src_language}:\n {source_sentence}"
96
+ prompt += f"\n### {tgt_language}:\n"
97
+ translation = generate_translation(prompt_style, prompt)
98
+ translations.append(translation)
99
+
100
+ print("mpc_initial_translate")
101
+ print(translations)
102
+ return translations
103
+
104
+ def mpc_improved_translate(source_sentence, current_translation, src_language, tgt_language):
105
+ system_prompts = [
106
+ "You are a meticulous translator. Please improve the following translation by ensuring it is a literal and structurally precise version.",
107
+ "You are a professional translator. Please refine the provided translation to be clear, formal, and accurate.",
108
+ "You are a creative translator. Please enhance the translation so that it is vivid, natural, and engaging."
109
+ ]
110
+ translations = []
111
+ for prompt_style in system_prompts:
112
+ prompt = (f"Source ({src_language}): {source_sentence}\n"
113
+ f"Current Translation ({tgt_language}): {current_translation}\n"
114
+ f"Please provide an improved translation into {tgt_language} and only output the result:")
115
+ translation = generate_translation(prompt_style, prompt)
116
+ translations.append(translation)
117
+
118
+ print("mpc_improved_translate")
119
+ print(translations)
120
+ return translations
121
+
122
+ def basic_translate(source_sentence, src_language, tgt_language):
123
+ system_prompts = ["You are a helpful translator and only output the result."]
124
+ translations = []
125
+ for prompt_style in system_prompts:
126
+ prompt = f"### Translate this from {src_language} to {tgt_language}."
127
+ prompt += f"\n### {src_language}:\n {source_sentence}"
128
+ prompt += f"\n### {tgt_language}:\n"
129
+ translation = generate_translation(prompt_style, prompt)
130
+ translations.append(translation)
131
+ return translations
132
+
133
+ def plan2align_translate_text(text, session_id, src_language, task_language, max_iterations_value, threshold_value, good_ref_contexts_num_value, reward_model_type):
134
+ result = translate_text(
135
+ text,
136
+ src_language=src_language,
137
+ task_language=task_language,
138
+ max_iterations_value=max_iterations_value,
139
+ threshold_value=threshold_value,
140
+ good_ref_contexts_num_value=good_ref_contexts_num_value,
141
+ reward_model_type=reward_model_type,
142
+ session_id=session_id
143
+ )
144
+ _, score = evaluate_candidates(text, [result], task_language, session_id)
145
+ return result, score
146
+
147
+ def evaluate_candidates(source, candidates, language, session_id):
148
+ evals = [(source, candidates)]
149
+ best_translations = external_find_best_translation(evals, language, session_id)
150
+ best_candidate, best_score = best_translations[0]
151
+ return best_candidate, best_score
152
+
153
+ def original_translation(text, src_language, target_language, session_id):
154
+ cand_list = basic_translate(text, src_language, target_language)
155
+ best, score = evaluate_candidates(text, cand_list, target_language, session_id)
156
+ if cand_list:
157
+ return best, score
158
+ return "", 0
159
+
160
+ def best_of_n_translation(text, src_language, target_language, n, session_id):
161
+ if not check_token_length(text, 2048):
162
+ return "Warning: Input text exceeds 2048 tokens.", None, ""
163
+ candidates = []
164
+ for i in range(n):
165
+ cand_list = basic_translate(text, src_language, target_language)
166
+ if cand_list:
167
+ candidates.append(cand_list[0])
168
+ best, score = evaluate_candidates(text, candidates, target_language, session_id)
169
+ print("best_of_n evaluate_candidates results:")
170
+ print(best, score)
171
+ return best, score
172
+
173
+ def mpc_translation(text, src_language, target_language, iterations, session_id):
174
+ if not check_token_length(text, 2048):
175
+ return "Warning: Input text exceeds 2048 tokens.", None, ""
176
+ current_trans = ""
177
+ best_score = None
178
+ for i in range(iterations):
179
+ if i == 0:
180
+ cand_list = mpc_initial_translate(text, src_language, target_language)
181
+ else:
182
+ cand_list = mpc_improved_translate(text, current_trans, src_language, target_language)
183
+
184
+ best, score = evaluate_candidates(text, cand_list, target_language, session_id)
185
+ print("mpc evaluate_candidates results:")
186
+ print(best, score)
187
+ current_trans = best
188
+ best_score = score
189
+ return current_trans, best_score
190
+
191
+ # ---------- Gradio main pipeline function ----------
192
+
193
+ def process_text(text, src_language, target_language, max_iterations_value, threshold_value,
194
+ good_ref_contexts_num_value, translation_methods, state):
195
+ session_id = get_user_session(state)
196
+
197
+ """
198
+ Takes the input text and the target language and generates, in order, four translation results:
+ 1. Original translation
+ 2. Plan2Align translation
+ 3. Best-of-N translation
+ 4. MPC translation
203
+ """
204
+
205
+ # Initialize the output fields
206
+ orig_output = ""
207
+ plan2align_output = ""
208
+ best_of_n_output = ""
209
+ mpc_output = ""
210
+
211
+
212
+ if "Original" in translation_methods:
213
+ orig, best_score = original_translation(text, src_language, target_language, session_id)
214
+ orig_output = f"{orig}\n\nScore: {best_score:.2f}"
215
+ if "Plan2Align" in translation_methods:
216
+ plan2align_trans, best_score = plan2align_translate_text(
217
+ text, session_id, src_language, target_language,
218
+ max_iterations_value, threshold_value, good_ref_contexts_num_value, "metricx"
219
+ )
220
+ plan2align_output = f"{plan2align_trans}\n\nScore: {best_score:.2f}"
221
+ if "Best-of-N" in translation_methods:
222
+ best_candidate, best_score = best_of_n_translation(text, src_language, target_language,
223
+ max_iterations_value, session_id)
224
+ best_of_n_output = f"{best_candidate}\n\nScore: {best_score:.2f}"
225
+ if "MPC" in translation_methods:
226
+ mpc_candidate, mpc_score = mpc_translation(text, src_language, target_language,
227
+ max_iterations_value, session_id)
228
+ mpc_output = f"{mpc_candidate}\n\nScore: {mpc_score:.2f}"
229
+
230
+ return orig_output, plan2align_output, best_of_n_output, mpc_output
231
+
232
+ # ---------- Gradio ----------
233
+ target_languages = ["Chinese", "English", "Russian", "German", "Japanese", "Korean"]
234
+ src_languages = ["Chinese", "English", "Russian", "German", "Japanese", "Korean"]
235
+
236
+ with gr.Blocks(title="Test-Time Machine Translation with Plan2Align") as demo:
237
+ state = gr.State({})
238
+
239
+ gr.Markdown("# Translation Demo: Multiple Translation Methods")
240
+ gr.Markdown("請選擇要執行的翻譯方法(可多選或全選):")
241
+
242
+ with gr.Row():
243
+ with gr.Column(scale=1):
244
+ source_text = gr.Textbox(
245
+ label="Source Text",
246
+ placeholder="Enter the text to translate...",
247
+ lines=5
248
+ )
249
+ src_language_input = gr.Dropdown(
250
+ choices=src_languages,
251
+ value="Chinese",
252
+ label="Source Language"
253
+ )
254
+ task_language_input = gr.Dropdown(
255
+ choices=target_languages,
256
+ value="English",
257
+ label="Task (Target) Language"
258
+ )
259
+ max_iterations_input = gr.Number(label="Max Iterations", value=6)
260
+ threshold_input = gr.Number(label="Threshold", value=0.7)
261
+ good_ref_contexts_num_input = gr.Number(label="Good Ref Contexts Num", value=5)
262
+ translation_methods_input = gr.CheckboxGroup(
263
+ choices=["Original", "Plan2Align", "Best-of-N", "MPC"],
264
+ value=["Original", "Plan2Align"],
265
+ label="Translation Methods"
266
+ )
267
+ translate_button = gr.Button("Translate")
268
+ with gr.Column(scale=2):
269
+ original_output = gr.Textbox(
270
+ label="Original Translation",
271
+ lines=5,
272
+ interactive=False
273
+ )
274
+ plan2align_output = gr.Textbox(
275
+ label="Plan2Align Translation",
276
+ lines=5,
277
+ interactive=False
278
+ )
279
+ best_of_n_output = gr.Textbox(
280
+ label="Best-of-N Translation",
281
+ lines=5,
282
+ interactive=False
283
+ )
284
+ mpc_output = gr.Textbox(
285
+ label="MPC Translation",
286
+ lines=5,
287
+ interactive=False
288
+ )
289
+
290
+ translate_button.click(
291
+ fn=process_text,
292
+ inputs=[
293
+ source_text,
294
+ src_language_input,
295
+ task_language_input,
296
+ max_iterations_input,
297
+ threshold_input,
298
+ good_ref_contexts_num_input,
299
+ translation_methods_input,
300
+ state
301
+ ],
302
+ outputs=[original_output, plan2align_output, best_of_n_output, mpc_output]
303
+ )
304
+
305
+ gr.Examples(
306
+ examples=[
307
+ ["夜市文化豐富多彩,從士林夜市到饒河街夜市,提供各種美食、遊戲和購物體驗,吸引了無數遊客。", "Chinese", "English", 2, 0.7, 1],
308
+ ["台北101曾經是世界最高的建築物,它不僅是台灣的地標,也象徵著經濟成就和創新精神。", "Chinese", "Russian", 2, 0.7, 1],
309
+ ["阿里山日出和森林鐵路是台灣最著名的自然景觀之一,每年吸引數十萬遊客前來欣賞雲海和壯麗的日出。", "Chinese", "German", 2, 0.7, 1],
310
+ ["珍珠奶茶,這款源自台灣的獨特飲品,不僅在台灣本地深受喜愛,更以其獨特的風味和口感,在全球掀起了一股熱潮,成為了一種跨越文化、風靡全球的時尚飲品。", "Chinese", "Japanese", 3, 0.7, 3],
311
+ ["原住民文化如同一片深邃的星空,閃爍著無數璀璨的傳統與藝術光芒。他們的歌舞,是與祖靈對話的旋律,是與自然共鳴的節奏,每一個舞步、每一聲吟唱,都承載著古老的傳說與智慧。編織,是他們巧手下的詩篇,一絲一線,交織出生命的紋理,也編織出對土地的熱愛與敬畏。木雕,則是他們與自然對話的雕塑,每一刀、每一鑿,都刻畫著對萬物的觀察與敬意,也雕琢出對祖先的追憶與傳承。", "Chinese", "Korean", 5, 0.7, 5]
312
+ ],
313
+ inputs=[
314
+ source_text,
315
+ src_language_input,
316
+ task_language_input,
317
+ max_iterations_input,
318
+ threshold_input,
319
+ good_ref_contexts_num_input,
320
+ state
321
+ ],
322
+ outputs=[original_output, plan2align_output, best_of_n_output, mpc_output],
323
+ fn=process_text
324
+ )
325
+
326
+ gr.Markdown("## How It Works")
327
+ gr.Markdown("""
328
+ 1. **Original Translation:** generates candidates from a fixed prompt and takes the first candidate as the baseline translation.
+ 2. **Plan2Align Translation:** translates with context alignment and a self-rewriting strategy; suited to long texts.
+ 3. **Best-of-N Translation:** generates candidates repeatedly and keeps the highest-scoring one; suited to short texts.
+ 4. **MPC Translation:** improves iteratively: each round generates and scores candidates, and the best translation seeds the next round; suited to short texts.
+
+ If the input text exceeds 2048 tokens, the Best-of-N and MPC methods return a warning message instead.
334
+ """)
335
+
336
+ if __name__ == "__main__":
337
+ demo.launch()
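Below is a minimal sketch of exercising the new `process_text` callback outside the Gradio UI, assuming the DeepInfra API key, the `align_enc` token, and the LASER environment are already configured; the example values mirror the demo defaults.

```python
# Illustrative only: call the Gradio callback directly.
# Importing app runs the module-level dependency checks and model downloads.
from app import process_text

state = {}  # per-session dict; get_user_session() adds a session_id on first use
orig, plan2align, best_of_n, mpc = process_text(
    "夜市文化豐富多彩,提供各種美食與購物體驗。",  # source text
    "Chinese",        # source language
    "English",        # target language
    2,                # max iterations
    0.7,              # threshold
    1,                # good ref contexts num
    ["Original"],     # run only the baseline method
    state,
)
print(orig)
```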
requirements.txt ADDED
@@ -0,0 +1,36 @@
1
+ pip==24.0
2
+ botok==0.9.0
3
+ Cython==3.0.11
4
+ datasets==3.4.1
5
+ emoji==2.14.1
6
+ Flask==3.1.0
7
+ func_argparse==1.1.1
8
+ gradio==5.25.0
9
+ indicnlp==0.0.1
10
+ laonlp==1.2.0
11
+ mcerp==0.12
12
+ openai==1.73.0
13
+ pandas==2.2.3
14
+ pytest==8.3.5
15
+ Requests==2.32.3
16
+ sacremoses==0.1.1
17
+ safetensors==0.5.3
18
+ scipy==1.15.2
19
+ sentence_splitter==1.4
20
+ sentence_transformers==4.0.2
21
+ setuptools==69.0.3
22
+ spacy==3.8.3
23
+ stopes==2.2.1
24
+ tabulate==0.9.0
25
+ tiktoken==0.8.0
26
+ torch==2.1.2
27
+ tqdm==4.67.1
28
+ transformers>=4.41.0,<5.0.0
29
+ transliterate==1.10.2
30
+ trl==0.16.1
31
+ unicategories==0.1.2
32
+ xxhash==3.5.0
33
+ blobfile==3.0.0
34
+ numpy==1.24.0
35
+ sentencepiece==0.2.0
36
+ protobuf==6.30.2
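The pinned dependencies above can be installed in one step from the repository root (a sketch; note that `app.py` additionally re-pins a few packages at startup):

```bash
pip install -r requirements.txt
```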
vecalign/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ build/
2
+ dp_core.c*
3
+ dp_core.html
4
+ __pycache__/
5
+ .idea
6
+ *~
7
+ .pytest_cache/
8
+ venv/
9
+ fairseq/
10
+ scores/
vecalign/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 NYCU-RL-Bandits-Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
vecalign/README.md ADDED
@@ -0,0 +1,102 @@
1
+ # Plan2Align
2
+
3
+ This is the official implementation for the paper **"Plan2Align: Predictive Planning Based Test-Time Preference Alignment in Paragraph-Level Machine Translation"**.
4
+
5
+ ## Environment Setup Guide for Plan2Align
6
+
7
+ This document provides a step-by-step guide for setting up the environment required to run Plan2Align efficiently. Please follow the instructions below to ensure a smooth installation process.
8
+
9
+ ### 1. Create a Conda Virtual Environment (Recommended)
10
+
11
+ It is highly recommended to use a Conda virtual environment to manage dependencies and avoid conflicts. Execute the following commands:
12
+
13
+ ```bash
14
+ conda create --name plan2align python=3.9
15
+ conda activate plan2align
16
+ ```
17
+
18
+ ### 2. Install VecAlign & SpaCy
19
+
20
+ Plan2Align relies on VecAlign for alignment tasks and on several spaCy language models. Please follow the installation instructions provided in the official repository (a minimal setup sketch follows the link below):
21
+ [VecAlign GitHub Repository](https://github.com/thompsonb/vecalign)
22
+
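+ A minimal sketch of that setup, assuming the default VecAlign repository layout and the spaCy models loaded by `app.py` (adjust to your environment):
+
+ ```bash
+ git clone https://github.com/thompsonb/vecalign.git
+ pip install -r vecalign/requirements.txt
+ python -m spacy download en_core_web_sm
+ python -m spacy download zh_core_web_sm
+ ```
+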
23
+ ### 3. Configure Environment Variables for LASER
24
+
25
+ LASER must be properly configured by setting up the required environment variables. Use the following steps:
26
+
27
+ ```bash
28
+ nano ~/.bashrc
29
+ export LASER="{PATH_TO_LASER}"
30
+ source ~/.bashrc
31
+ ```
32
+
33
+ Make sure to replace `{PATH_TO_LASER}` with the actual path where LASER is installed.
34
+
35
+ ### 4. Prepare API Key
36
+
37
+ Plan2Align requires an API key for OpenAI services. Ensure that you have the necessary credentials set up:
38
+
39
+ ```python
40
+ openai = OpenAI(
41
+ api_key='your-api-key',
42
+ base_url='your-base_url',
43
+ )
44
+ ```
45
+
46
+ Replace `'your-api-key'` and `'your-base_url'` with your actual API key and endpoint.
47
+
48
+ ### 5. Configure Reward Model
49
+
50
+ Plan2Align utilizes a reward model for alignment tasks. Ensure that you modify the following paths in your reward model setup before use:
51
+
52
+ ```python
53
+ self.RM = AutoModelForCausalLMWithValueHead.from_pretrained(
54
+ '../<path-to-rm>',
55
+ torch_dtype=torch.bfloat16
56
+ ).to(self.device)
57
+
58
+ value_head_weights = load_file("../<path-to-value_head>")
59
+ ```
60
+
61
+ Replace `<path-to-rm>` and `<path-to-value_head>` with the correct file paths in your system.
62
+
63
+ Before running the program, you can use `set_translation_model("rm")` to make Plan2Align perform alignment based on the reward model.
64
+
65
+ ### 6. Running Plan2Align
66
+
67
+ For ease of testing Plan2Align, we provide a small preference model for alignment. You can download its weights from the following link:
68
+ [Download Weights](https://drive.google.com/file/d/1us3pBmnJseI0-lozh999dDraql9m03im/view?usp=sharing).
69
+ Place it directly in the project directory, and use `set_translation_model("pm")` in `plan2align.py` to utilize it.
70
+
71
+ Regarding datasets, we used the dataset from [Hugging Face](https://huggingface.co/datasets/huckiyang/zh-tw-en-us-nv-tech-blog-v1) for validation. We selected longer, semantically structured samples from it, created a `valid_zh_en.csv`, and performed Chinese-to-English translation tasks.
72
+
73
+ To validate that Plan2Align is correctly installed and configured, execute the following command:
74
+
75
+ ```bash
76
+ python plan2align.py \
77
+ --input_file "valid_en_ja.csv" \
78
+ --rm "metricx" \
79
+ --src_language English \
80
+ --task_language Japanese \
81
+ --threshold 0.7 \
82
+ --max_iterations 6 \
83
+ --good_ref_contexts_num 5 \
84
+ --cuda_num 0
85
+ ```
86
+
87
+ ### 7. Evaluation Process
88
+
89
+ ---
90
+
91
+ ## Citation
92
+
93
+ If you would like to cite this work, please use the following BibTeX entry:
94
+
95
+ ```bibtex
96
+ @article{wang2025plan2align,
97
+ title={Plan2Align: Predictive Planning Based Test-Time Preference Alignment in Paragraph-Level Machine Translation},
98
+ author={Wang, Kuang-Da and Chen, Teng-Ruei and Hung, Yu Heng and Ding, Shuoyang and Wu, Yueh-Hua and Wang, Yu-Chiang Frank and Yang, Chao-Han Huck and Peng, Wen-Chih and Hsieh, Ping-Chun},
99
+ journal={arXiv preprint arXiv:2502.20795},
100
+ year={2025}
101
+ }
102
+ ```
vecalign/__init__.py ADDED
File without changes
vecalign/dp_core.pyx ADDED
@@ -0,0 +1,411 @@
1
+ # cython: language_level=3
2
+
3
+ """
4
+ Copyright 2019 Brian Thompson
5
+
6
+ Licensed under the Apache License, Version 2.0 (the "License");
7
+ you may not use this file except in compliance with the License.
8
+ You may obtain a copy of the License at
9
+
10
+ https://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+ """
18
+
19
+ import numpy as np
20
+
21
+ cimport numpy as np
22
+ cimport cython
23
+
24
+
25
+ def make_x_y_offsets(alignment_types):
26
+ # alignment types for which we will precompute costs
27
+
28
+ # deletion/insertion is added later
29
+ for x, y in alignment_types:
30
+ assert (x > 0)
31
+ assert (y > 0)
32
+
33
+ x_offsets = np.array([x for x, y in alignment_types], dtype=np.int32) # MUST **NOT** INCLUDE (0,1), (1,0)
34
+ y_offsets = np.array([y for x, y in alignment_types], dtype=np.int32) # MUST **NOT** INCLUDE (0,1), (1,0)
35
+ return x_offsets, y_offsets
36
+
37
+
38
+ def make_dense_costs(np.ndarray[float, ndim=3] vecs0, # input
39
+ np.ndarray[float, ndim=3] vecs1, # input
40
+ np.ndarray[float, ndim=2] norm0, # input
41
+ np.ndarray[float, ndim=2] norm1, # input
42
+ int offset0 = 0, # index into vecs0/norms0
43
+ int offset1 = 0, # index into vecs1/norms1
44
+ ):
45
+ """
46
+ Make a full N*M feature matrix. By default, makes 1-1 alignments,
47
+ can build others by specifying offset0, offset1 to index into
48
+ vecs0, norms0 and vecs1, norms1 respectively.
49
+ """
50
+ assert vecs0.shape[0] > offset0
51
+ assert vecs1.shape[0] > offset1
52
+ assert norm0.shape[0] > offset0
53
+ assert norm1.shape[0] > offset1
54
+
55
+ cdef int size0 = np.shape(vecs0)[1]
56
+ assert norm0.shape[1] == size0
57
+
58
+ cdef int size1 = np.shape(vecs1)[1]
59
+ assert norm1.shape[1] == size1
60
+
61
+ cdef int vecsize = np.shape(vecs0)[2]
62
+ assert vecs1.shape[2] == vecsize
63
+
64
+ cdef int xi, yi
65
+ cdef float sumx
66
+
67
+ cdef np.ndarray[float, ndim=2] costs = np.empty((size0, size1), dtype=np.float32)
68
+
69
+ for xi in range(size0):
70
+ for yi in range(size1):
71
+ sumx = 0.0
72
+ for jj in range(vecsize):
73
+ sumx += vecs0[offset0, xi, jj] * vecs1[offset1, yi, jj]
74
+
75
+ costs[xi, yi] = 2.0 * (1.0 - sumx) / (1e-6 + norm0[offset0, xi] + norm1[offset1, yi])
76
+ # normalize by alignment type
77
+ costs[xi, yi] = costs[xi, yi] * (offset0 + 1) * (offset1 + 1)
78
+
79
+ return costs
80
+
81
+
82
+ def dense_dp(np.ndarray[float, ndim=2] alignment_cost, float pen):
83
+ """
84
+ Compute cost matrix (csum) and backpointers (bp)
85
+ from full 2-D 1-1 alignment costs matrix (alignment_cost)
86
+ """
87
+
88
+ size0 = alignment_cost.shape[0]
89
+ size1 = alignment_cost.shape[1]
90
+ # csum and traceback matrix are both on nodes
91
+ # so they are +1 in each dimension compared to the jump costs matrix
92
+ # For anything being used in accumulation, use float64
93
+ cdef np.ndarray[double, ndim=2] csum = np.empty((size0 + 1, size1 + 1), dtype=np.float64)
94
+ cdef np.ndarray[int, ndim=2] bp = np.empty((size0 + 1, size1 + 1), dtype=np.int32)
95
+
96
+ # bp and csum are nodes,
97
+ # while alignment_cost is the cost of going between the nodes
98
+ # Size of nodes should be one larger than alignment costs
99
+ b0, b1 = np.shape(bp)
100
+ c0, c1 = np.shape(csum)
101
+ j0, j1 = np.shape(alignment_cost)
102
+ assert (b0 == c0 == j0 + 1)
103
+ assert (b1 == c1 == j1 + 1)
104
+
105
+ cdef int cmax = np.shape(csum)[1]
106
+ cdef int rmax = np.shape(csum)[0]
107
+ cdef int c, r
108
+ cdef double cost0, cost1, cost2
109
+
110
+ # initialize the all c-direction deletion path
111
+ for c in range(cmax):
112
+ csum[0, c] = c * pen
113
+ bp[0, c] = 1
114
+
115
+ # initialize the all r-direction deletion path
116
+ for r in range(rmax):
117
+ csum[r, 0] = r * pen
118
+ bp[r, 0] = 2
119
+
120
+ # Initial cost is 0.0
121
+ csum[0, 0] = 0.0 # noop
122
+ bp[0, 0] = 4 # should not matter
123
+
124
+ # Calculate the rest recursively
125
+ for c in range(1, cmax):
126
+ for r in range(1, rmax):
127
+
128
+ # alignment_cost indexes are off by 1 wrt
129
+ # csum/bp, since csum/bp are nodes
130
+ cost0 = csum[r - 1, c - 1] + alignment_cost[r - 1, c - 1]
131
+ cost1 = csum[r, c - 1] + pen
132
+ cost2 = csum[r - 1, c] + pen
133
+
134
+ csum[r, c] = cost0
135
+ bp[r, c] = 0
136
+
137
+ if cost1 < csum[r, c]:
138
+ csum[r, c] = cost1
139
+ bp[r, c] = 1
140
+ if cost2 < csum[r, c]:
141
+ csum[r, c] = cost2
142
+ bp[r, c] = 2
143
+
144
+ return csum, bp
145
+
146
+
147
+ def score_path(np.ndarray[int, ndim=1] xx,
148
+ np.ndarray[int, ndim=1] yy,
149
+ np.ndarray[float, ndim=1] norm1,
150
+ np.ndarray[float, ndim=1] norm2,
151
+ np.ndarray[float, ndim=2] vecs1,
152
+ np.ndarray[float, ndim=2] vecs2,
153
+ np.ndarray[float, ndim=1] out):
154
+ cdef int xi, yi, ii, jj
155
+ cdef float outx
156
+ cdef int lenxy = xx.shape[0]
157
+ cdef int vecsize = vecs1.shape[1]
158
+
159
+ for ii in range(lenxy):
160
+ xi = xx[ii]
161
+ yi = yy[ii]
162
+ outx = 0.0
163
+ for jj in range(vecsize):
164
+ outx += vecs1[xi, jj] * vecs2[yi, jj]
165
+ out[ii] = 2.0 * (1.0 - outx) / (norm1[xi] + norm2[yi])
166
+
167
+
168
+ # Bounds checking and wraparound slow things down by about 2x
169
+ # Division by 0 checking has minimal speed impact
170
+ @cython.boundscheck(False) # turn off bounds-checking for entire function
171
+ @cython.wraparound(False) # turn off negative index wrapping for entire function
172
+ @cython.cdivision(True) # use c-style division (no division-by-zero check)
173
+ def make_sparse_costs(np.ndarray[float, ndim=3] vecs0, # input: num aligns X num sents X dim
174
+ np.ndarray[float, ndim=3] vecs1, # input
175
+ np.ndarray[float, ndim=2] norms0, # input: num aligns X num sents
176
+ np.ndarray[float, ndim=2] norms1, # input
177
+ x_y_path,
178
+ alignment_types,
179
+ int width_over2):
180
+ """
181
+ Make features for DP, *for lines running across approximate path*, *for each alignment type*
182
+ x_offsets, y_offsets should not include (0,1), (1,0)
183
+
184
+ Basically, we take the feature matrix, rotate it 45 degrees,
185
+ and compute a "wavy" matrix for the features.
186
+ It's like the diagonal but it moves around to hopefully always include the true path.
187
+ """
188
+
189
+ cdef np.ndarray[int, ndim=2] x_y_path_ = np.array(x_y_path).astype(np.int32)
190
+
191
+ assert (vecs0.shape[0] == norms0.shape[0])
192
+ assert (vecs1.shape[0] == norms1.shape[0])
193
+
194
+ assert (vecs0.shape[1] == norms0.shape[1])
195
+ assert (vecs1.shape[1] == norms1.shape[1])
196
+
197
+ # check how many overlaps vectors were passed in
198
+ num_overlaps_in_vecs0 = vecs0.shape[0]
199
+ num_overlaps_in_vecs1 = vecs1.shape[0]
200
+
201
+ # check how many overlaps were requested
202
+ # edge case: alignment_types could be empty
203
+ # In that case, we should just return insertions/deletions
204
+ # and max_x_overlap == max_y_overlap == 0
205
+ max_x_overlap = max([0] + [x for x, y in alignment_types]) # add [0] in case alignment_types is empty
206
+ max_y_overlap = max([0] + [y for x, y in alignment_types]) # add [0] in case alignment_types is empty
207
+
208
+ # note: alignment types are specified 1-based, but vectors are stored 0-based
209
+ if max_x_overlap > num_overlaps_in_vecs0:
210
+ raise Exception('%d x overlaps requested (via alignment_types), but vecs0 only has %d' % (
211
+ max_x_overlap, num_overlaps_in_vecs0))
212
+ if max_y_overlap > num_overlaps_in_vecs1:
213
+ raise Exception('%d y overlaps requested (via alignment_types), but vecs1 only has %d' % (
214
+ max_y_overlap, num_overlaps_in_vecs1))
215
+
216
+ # number of sentences in each document
217
+ cdef int xsize = vecs0.shape[1]
218
+ cdef int ysize = vecs1.shape[1]
219
+
220
+ # vector dimensions should match
221
+ assert (vecs0.shape[2] == vecs1.shape[2])
222
+
223
+ cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
224
+ x_offsets, y_offsets = make_x_y_offsets(alignment_types)
225
+
226
+ # reserve outputs
227
+ a_len = x_y_path_.shape[0]
228
+ b_len = 2 * width_over2
229
+ cdef np.ndarray[float, ndim=3] a_b_feats = np.empty((len(alignment_types), a_len, b_len), dtype=np.float32)
230
+ cdef np.ndarray[int, ndim=1] b_offset = np.empty(a_len).astype(np.int32)
231
+
232
+ cdef int x, y, aa, bb, xx, yy, a_idx, b_idx, bb2, x_offset, y_offset, ii_align, x_offset_idx, y_offset_idx
233
+ cdef int vecsize = vecs0.shape[2]
234
+ cdef int num_alignments = x_offsets.shape[0]
235
+
236
+ cdef float sumx, feat
237
+ cdef float inf = np.inf
238
+
239
+ for ii in range(x_y_path_.shape[0]):
240
+ x = x_y_path_[ii, 0]
241
+ y = x_y_path_[ii, 1]
242
+
243
+ # convert xy to ab cords
244
+ aa = x + y
245
+ bb = y
246
+
247
+ a_idx = aa
248
+ b_offset[aa] = bb - width_over2
249
+ for b_idx, bb2 in enumerate(range(bb - width_over2, bb + width_over2)):
250
+ # convert ab to xy cords
251
+ xx = aa - bb2
252
+ yy = bb2
253
+
254
+ for ii_align in range(num_alignments):
255
+ x_offset = x_offsets[ii_align]
256
+ x_offset_idx = x_offset - 1 # overlaps start at 1, vectors stored 0-based
257
+ y_offset = y_offsets[ii_align]
258
+ y_offset_idx = y_offset - 1
259
+
260
+ if 0 <= xx < xsize and 0 <= yy < ysize:
261
+ sumx = 0.0
262
+ for jj in range(vecsize):
263
+ sumx += vecs0[x_offset_idx, xx, jj] * vecs1[y_offset_idx, yy, jj]
264
+ feat = 2.0 * x_offset * y_offset * (1.0 - sumx) / (
265
+ 1e-6 + norms0[x_offset_idx, xx] + norms1[y_offset_idx, yy])
266
+
267
+ else:
268
+ feat = inf
269
+
270
+ a_b_feats[ii_align, a_idx, b_idx] = feat
271
+
272
+ return a_b_feats, b_offset
273
+
274
+
275
+ def sparse_dp(np.ndarray[float, ndim=3] a_b_costs,
276
+ np.ndarray[int, ndim=1] b_offset_in,
277
+ alignment_types,
278
+ double del_penalty,
279
+ int x_in_size,
280
+ int y_in_size):
281
+ """
282
+ Do DP along a path, using features saved off along path.
283
+ x_offsets, y_offsets should not include (0,1), (1,0)
284
+
285
+ xsize, ysize refer to the costs a_b_csum, but in x/y space
286
+
287
+ As in the simpler full-DP case,
288
+ we compute cumulative costs and backpointers on nodes,
289
+ and there are COSTS associated with moving between them.
290
+
291
+ This means the size of the nodes is +1,+1 larger (in x,y) than the COSTS.
292
+
293
+ So the size of a_b_csum, a_b_xp, a_b_yp are all one larger in x and y compared to the costs
294
+
295
+ In order to save memory (and time, vs a sparse matrix with hashes to look up values), let:
296
+ a = x + y
297
+ b = y
298
+
299
+ b_offsets tells us how far from the left edge the features are computed for.
300
+ basically it's like we are computing along the diagonal,
301
+ but we shift the diagonal around based on our belief
302
+ about where the alignments are.
303
+
304
+ b_offsets is used for both costs AND csum, backpointers, so it needs to be
305
+ +2 longer (it is in the a-direction) than the costs (in the a direction)
306
+
307
+ """
308
+ cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
309
+ x_offsets, y_offsets = make_x_y_offsets(alignment_types)
310
+
311
+ # make x/y offsets, including (0,1), (1,0), i.e. including deletion and insertion
312
+ x_offsets = np.concatenate([x_offsets, np.array([0, 1], dtype=np.int32)])
313
+ y_offsets = np.concatenate([y_offsets, np.array([1, 0], dtype=np.int32)])
314
+
315
+ cdef int a_in_size = a_b_costs.shape[1]
316
+ cdef int b_in_size = a_b_costs.shape[2]
317
+
318
+ cdef int a_out_size = a_in_size + 2
319
+ cdef int b_out_size = b_in_size
320
+
321
+ cdef int x_out_size = x_in_size + 1
322
+ cdef int y_out_size = y_in_size + 1
323
+
324
+ # costs are the costs of going between nodes.
325
+ # in x,y for the nodes, we basically add a buffer
326
+ # at x=0 and y=0, and shift the cost by (x=+1,y=+1)
327
+ # In a,b space, this means adding two points (for the buffer)
328
+ # at the beginning, and shifting by (a=+0,b=+1) since
329
+ # a=x+y and b=y
330
+ # for the first two points, we can simply replicate the
331
+ # original b_offset, since it should be -width_over2
332
+ # i.e. b_offset_in[0] == -width_over2
333
+ extra_two_points = np.array([b_offset_in[0], b_offset_in[0]], dtype=np.int32)
334
+ cdef np.ndarray[int, ndim=1] b_offset_out = np.concatenate([extra_two_points, b_offset_in + 1])
335
+
336
+ # outputs
337
+ # For anything being used in accumulation, use float64
338
+ cdef np.ndarray[double, ndim=2] a_b_csum = np.zeros((a_in_size + 2, b_in_size),
339
+ dtype=np.float64) + np.inf # error cumulative sum
340
+ cdef np.ndarray[int, ndim=2] a_b_xp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for x
341
+ cdef np.ndarray[int, ndim=2] a_b_yp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for y
342
+
343
+ cdef int num_alignments = x_offsets.shape[0]
344
+ cdef double inf = np.inf
345
+ cdef int xx_out, yy_out, ii_align, x_offset, y_offset
346
+ cdef int aa_in_cost, bb_in_cost, aa_out, bb_out, aa_out_prev, bb_out_prev, xx_in_cost, yy_in_cost, xx_out_prev, yy_out_prev
347
+
348
+ cdef double alignment_cost, total_cost, prev_cost
349
+
350
+ # increasing in a is the same as going along diagonals in x/y, so DP order works
351
+ # (and any ordering is fine in b - nothing depends on values adjacent on diagonal in x/y)
352
+ for aa_out in range(a_in_size + 2):
353
+ for bb_out in range(b_in_size):
354
+ #xx_out, yy_out = ab2xy_w_offset(aa_out, bb_out, b_offset_out)
355
+ yy_out = bb_out + b_offset_out[aa_out]
356
+ xx_out = aa_out - yy_out
357
+
358
+ # edge case: all deletions in y-direction
359
+ if xx_out == 0 and 0 <= yy_out < y_out_size:
360
+ a_b_csum[aa_out, bb_out] = del_penalty * yy_out
361
+ a_b_xp[aa_out, bb_out] = 0
362
+ a_b_yp[aa_out, bb_out] = 1
363
+
364
+ # edge case: all deletions in x-direction
365
+ elif yy_out == 0 and 0 <= xx_out < x_out_size:
366
+ a_b_csum[aa_out, bb_out] = del_penalty * xx_out
367
+ a_b_xp[aa_out, bb_out] = 1
368
+ a_b_yp[aa_out, bb_out] = 0
369
+
370
+ else:
371
+ # initialize output to inf
372
+ a_b_csum[aa_out, bb_out] = inf
373
+ a_b_xp[aa_out, bb_out] = -42
374
+ a_b_yp[aa_out, bb_out] = -42
375
+
376
+ for ii_align in range(num_alignments):
377
+ x_offset = x_offsets[ii_align]
378
+ y_offset = y_offsets[ii_align]
379
+
380
+ # coords of location of alignment cost, in input x/y space
381
+ xx_in_cost = xx_out - 1 # features were front padded,
382
+ yy_in_cost = yy_out - 1 # so offset is always 1
383
+
384
+ # the coords of location of previous cumsum cost, in input x/y space
385
+ xx_out_prev = xx_out - x_offset
386
+ yy_out_prev = yy_out - y_offset
387
+
388
+ if 0 <= xx_in_cost < x_in_size and 0 <= yy_in_cost < y_in_size and 0 <= xx_out_prev < x_out_size and 0 <= yy_out_prev < y_out_size:
389
+ # convert x,y to a,b
390
+ aa_in_cost = xx_in_cost + yy_in_cost
391
+ bb_in_cost = yy_in_cost - b_offset_in[aa_in_cost]
392
+
393
+ aa_out_prev = xx_out_prev + yy_out_prev
394
+ bb_out_prev = yy_out_prev - b_offset_out[aa_out_prev]
395
+
396
+ if 0 <= aa_in_cost < a_in_size and 0 <= bb_in_cost < b_in_size and 0 <= aa_out_prev < a_out_size and 0 <= bb_out_prev < b_out_size:
397
+ if x_offset == 0 or y_offset == 0:
398
+ alignment_cost = del_penalty
399
+ else:
400
+ alignment_cost = a_b_costs[ii_align, aa_in_cost, bb_in_cost]
401
+
402
+ prev_cost = a_b_csum[aa_out_prev, bb_out_prev]
403
+
404
+ total_cost = prev_cost + alignment_cost
405
+
406
+ if total_cost < a_b_csum[aa_out, bb_out]:
407
+ a_b_csum[aa_out, bb_out] = total_cost
408
+ a_b_xp[aa_out, bb_out] = x_offset
409
+ a_b_yp[aa_out, bb_out] = y_offset
410
+
411
+ return a_b_csum, a_b_xp, a_b_yp, b_offset_out
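A minimal sketch of the banded coordinate change that `make_sparse_costs` and `sparse_dp` rely on (`a = x + y` indexes the anti-diagonal; `b` is the position of `y` inside a band shifted by a per-diagonal `b_offset`), mirroring `xy2ab_w_offset` / `ab2xy_w_offset` in `dp_utils.py`; the helper names here are illustrative only:

```python
# Toy (x, y) <-> (a, b) mapping used along the approximate search path.
def xy_to_ab(x, y, b_offset):
    a = x + y              # anti-diagonal index
    b = y - b_offset[a]    # position inside the band on that diagonal
    return a, b

def ab_to_xy(a, b, b_offset):
    y = b + b_offset[a]
    x = a - y
    return x, y

b_offset = [0, -1, -1, 0, 1]        # toy per-diagonal band offsets
a, b = xy_to_ab(2, 1, b_offset)     # -> (3, 1)
assert ab_to_xy(a, b, b_offset) == (2, 1)
```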
vecalign/dp_utils.py ADDED
@@ -0,0 +1,679 @@
1
+ """
2
+ Copyright 2019 Brian Thompson
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import logging
18
+ import sys
19
+ from ast import literal_eval
20
+ from collections import OrderedDict
21
+ from math import ceil
22
+ from time import time
23
+
24
+ import numpy as np
25
+
26
+ import pyximport
27
+ pyximport.install(setup_args={'include_dirs':np.get_include()}, inplace=True, reload_support=True)
28
+
29
+ from dp_core import make_dense_costs, score_path, sparse_dp, make_sparse_costs, dense_dp
30
+
31
+ logger = logging.getLogger('vecalign') # set up in vecalign.py
32
+
33
+
34
+ def preprocess_line(line):
35
+ line = line.strip()
36
+ if len(line) == 0:
37
+ line = 'BLANK_LINE'
38
+ return line
39
+
40
+
41
+ def yield_overlaps(lines, num_overlaps):
42
+ lines = [preprocess_line(line) for line in lines]
43
+ for overlap in range(1, num_overlaps + 1):
44
+ for out_line in layer(lines, overlap):
45
+ # check must be here so all outputs are unique
46
+ out_line2 = out_line[:10000] # limit line length so we don't encode arbitrarily long sentences
47
+ yield out_line2
48
+
49
+
50
+ def read_in_embeddings(text_file, embed_file):
51
+ """
52
+ Given a text file with candidate sentences and a corresponding embedding file,
53
+ make a mapping from candidate sentence to embedding index,
54
+ and a numpy array of the embeddings
55
+ """
56
+ sent2line = dict()
57
+ with open(text_file, 'rt', encoding="utf-8") as fin:
58
+ for ii, line in enumerate(fin):
59
+ if line.strip() in sent2line:
60
+ raise Exception('got multiple embeddings for the same line')
61
+ sent2line[line.strip()] = ii
62
+
63
+ line_embeddings = np.fromfile(embed_file, dtype=np.float32, count=-1)
64
+ if line_embeddings.size == 0:
65
+ raise Exception('Got empty embedding file')
66
+
67
+ laser_embedding_size = line_embeddings.size // len(sent2line) # currently hardcoded to 1024
68
+ if laser_embedding_size != 1024:
69
+ logger.warning('expected an embedding size of 1024, got %s', laser_embedding_size)
70
+ logger.info('laser_embedding_size determined to be %d', laser_embedding_size)
71
+ line_embeddings.resize(line_embeddings.shape[0] // laser_embedding_size, laser_embedding_size)
72
+ return sent2line, line_embeddings
73
+
74
+
75
+ def make_doc_embedding(sent2line, line_embeddings, lines, num_overlaps):
76
+ """
77
+ lines: sentences in input document to embed
78
+ sent2line, line_embeddings: precomputed embeddings for lines (and overlaps of lines)
79
+ """
80
+
81
+ lines = [preprocess_line(line) for line in lines]
82
+
83
+ vecsize = line_embeddings.shape[1]
84
+
85
+ vecs0 = np.empty((num_overlaps, len(lines), vecsize), dtype=np.float32)
86
+
87
+ for ii, overlap in enumerate(range(1, num_overlaps + 1)):
88
+ for jj, out_line in enumerate(layer(lines, overlap)):
89
+ try:
90
+ line_id = sent2line[out_line]
91
+ except KeyError:
92
+ logger.warning('Failed to find overlap=%d line "%s". Will use random vector.', overlap, out_line)
93
+ line_id = None
94
+
95
+ if line_id is not None:
96
+ vec = line_embeddings[line_id]
97
+ else:
98
+ vec = np.random.random(vecsize) - 0.5
99
+ vec = vec / np.linalg.norm(vec)
100
+
101
+ vecs0[ii, jj, :] = vec
102
+
103
+ return vecs0
104
+
105
+
106
+ def make_norm1(vecs0):
107
+ """
108
+ make vectors norm==1 so that cosine distance can be computed via dot product
109
+ """
110
+ for ii in range(vecs0.shape[0]):
111
+ for jj in range(vecs0.shape[1]):
112
+ norm = np.sqrt(np.square(vecs0[ii, jj, :]).sum())
113
+ vecs0[ii, jj, :] = vecs0[ii, jj, :] / (norm + 1e-5)
114
+
115
+
116
+ def layer(lines, num_overlaps, comb=' '):
117
+ """
118
+ make front-padded overlapping sentences
119
+ """
120
+ if num_overlaps < 1:
121
+ raise Exception('num_overlaps must be >= 1')
122
+ out = ['PAD', ] * min(num_overlaps - 1, len(lines))
123
+ for ii in range(len(lines) - num_overlaps + 1):
124
+ out.append(comb.join(lines[ii:ii + num_overlaps]))
125
+ return out
126
+
127
+
128
+ def read_alignments(fin):
129
+ alignments = []
130
+ with open(fin, 'rt', encoding="utf-8") as infile:
131
+ for line in infile:
132
+ fields = [x.strip() for x in line.split(':') if len(x.strip())]
133
+ if len(fields) < 2:
134
+ raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
135
+ try:
136
+ src = literal_eval(fields[0])
137
+ tgt = literal_eval(fields[1])
138
+ except:
139
+ raise Exception('Failed to parse line "%s"' % line.strip())
140
+ alignments.append((src, tgt))
141
+
142
+ # I know bluealign files have a few entries missing,
143
+ # but I don't fix them in order to be consistent with previously reported scores
144
+ return alignments
145
+
146
+
147
+ def print_alignments(alignments, scores=None, src_lines=None, tgt_lines=None, ofile=sys.stdout):
148
+ if scores is None:
149
+ scores = [None for _ in alignments]
150
+ for (x, y), s in zip(alignments, scores):
151
+ if s is None:
152
+ print('%s:%s' % (x, y), file=ofile)
153
+ else:
154
+ print('%s:%s:%.6f' % (x, y, s), file=ofile)
155
+ if src_lines is not None and tgt_lines is not None:
156
+ print(' '*40, 'SRC: ', ' '.join([src_lines[i].replace('\n', ' ').strip() for i in x]), file=ofile)
157
+ print(' '*40, 'TGT: ', ' '.join([tgt_lines[i].replace('\n', ' ').strip() for i in y]), file=ofile)
158
+
159
+
160
+ class DeletionKnob(object):
161
+ """
162
+ A good deletion penalty is dependent on normalization, and probably language, domain, etc, etc
163
+ I want a way to control deletion penalty that generalizes well...
164
+ Sampling costs and use percentile seems to work fairly well.
165
+ """
166
+ def __init__(self, samp, res_min, res_max):
167
+
168
+ self.res_min = res_min
169
+ self.res_max = res_max
170
+
171
+ if self.res_min >= self.res_max:
172
+ logger.warning('res_max <= res_min, increasing it')
173
+ self.res_max = self.res_min + 1e-4
174
+
175
+ num_bins = 1000
176
+ num_pts = 30
177
+
178
+ self.hist, self.bin_edges = np.histogram(samp, bins=num_bins,
179
+ range=[self.res_min, self.res_max],
180
+ density=True)
181
+
182
+ dx = self.bin_edges[1] - self.bin_edges[0]
183
+ self.cdf = np.cumsum(self.hist) * dx
184
+
185
+ interp_points = [(0, self.res_min), ]
186
+ for knob_val in np.linspace(0, 1, num_pts - 1)[1:-1]:
187
+ cdf_idx = np.searchsorted(self.cdf, knob_val)
188
+ cdf_val = self.res_min + cdf_idx / float(num_bins) * (self.res_max - self.res_min)
189
+ interp_points.append((knob_val, cdf_val))
190
+ interp_points.append((1, self.res_max))
191
+ self.x, self.y = zip(*interp_points)
192
+
193
+ def percentile_frac_to_del_penalty(self, knob_val):
194
+ del_pen = np.interp([knob_val], self.x, self.y)[0]
195
+ return del_pen
196
+
197
+
198
+ def make_alignment_types(max_alignment_size):
199
+ # return list of all (n,m) where n+m <= max_alignment_size
200
+ # does not include deletions, i.e. (1, 0) or (0, 1)
201
+ alignment_types = []
202
+ for x in range(1, max_alignment_size):
203
+ for y in range(1, max_alignment_size):
204
+ if x + y <= max_alignment_size:
205
+ alignment_types.append((x, y))
206
+ return alignment_types
207
+
208
+
209
+ def make_one_to_many_alignment_types(max_alignment_size):
210
+ # return list of all (1, m) where m <= max_alignment_size
211
+ # does not include deletions, i.e. (1, 0) or (0, 1)
212
+ alignment_types = []
213
+ for m in range(1, max_alignment_size + 1):
214
+ alignment_types.append((1, m))
215
+ return alignment_types
216
+
217
+
218
+ def ab2xy_w_offset(aa, bb_idx, bb_offset):
219
+ bb_from_side = bb_idx + bb_offset[aa]
220
+ xx = aa - bb_from_side
221
+ yy = bb_from_side
222
+ return (xx, yy)
223
+
224
+
225
+ def xy2ab_w_offset(xx, yy, bb_offset):
226
+ aa = xx + yy
227
+ bb_from_side = yy
228
+ bb = bb_from_side - bb_offset[aa]
229
+ return aa, bb
230
+
231
+
232
+ def process_scores(scores, alignments):
233
+ # floating point sometimes gives negative numbers, which is a little unnerving ...
234
+ scores = np.clip(scores, a_min=0, a_max=None)
235
+
236
+ for ii, (x_algn, y_algn) in enumerate(alignments):
237
+ # deletion penalty is pretty arbitrary, just report 0
238
+ if len(x_algn) == 0 or len(y_algn) == 0:
239
+ scores[ii] = 0.0
240
+ # report scores un-normalized by alignment sizes
241
+ # (still normalized with random vectors, though)
242
+ else:
243
+ scores[ii] = scores[ii] / len(x_algn) / len(y_algn)
244
+
245
+ return scores
246
+
247
+
248
+ def sparse_traceback(a_b_csum, a_b_xp, a_b_yp, b_offset, xsize, ysize):
249
+ alignments = []
250
+ xx = xsize
251
+ yy = ysize
252
+
253
+ cum_costs = []
254
+
255
+ while True:
256
+ aa, bb = xy2ab_w_offset(xx, yy, b_offset)
257
+
258
+ cum_costs.append(a_b_csum[aa, bb])
259
+
260
+ xp = a_b_xp[aa, bb]
261
+ yp = a_b_yp[aa, bb]
262
+
263
+ if xx == yy == 0:
264
+ break
265
+
266
+ if xx < 0 or yy < 0:
267
+ raise Exception('traceback bug')
268
+
269
+ x_side = list(range(xx - xp, xx))
270
+ y_side = list(range(yy - yp, yy))
271
+ alignments.append((x_side, y_side))
272
+
273
+ xx = xx - xp
274
+ yy = yy - yp
275
+
276
+ alignments.reverse()
277
+ cum_costs.reverse()
278
+ costs = np.array(cum_costs[1:]) - np.array(cum_costs[:-1])
279
+ # "costs" are scaled by x_alignment_size * y_alignment_size
280
+ # and the cost of a deletion is del_penalty
281
+ # "scores": 0 for deletion/insertion,
282
+ # and cosine distance, *not* scaled
283
+ # by len(x_alignment)*len(y_alignment)
284
+ scores = process_scores(scores=costs, alignments=alignments)
285
+
286
+ return alignments, scores
287
+
288
+
289
+ def dense_traceback(x_y_tb):
290
+ xsize, ysize = x_y_tb.shape
291
+
292
+ xx = xsize - 1
293
+ yy = ysize - 1
294
+
295
+ alignments = []
296
+ while True:
297
+ if xx == yy == 0:
298
+ break
299
+ bp = x_y_tb[xx, yy]
300
+ if bp == 0:
301
+ xp, yp = 1, 1
302
+ alignments.append(([xx - 1], [yy - 1]))
303
+ elif bp == 1:
304
+ xp, yp = 0, 1
305
+ alignments.append(([], [yy - 1]))
306
+ elif bp == 2:
307
+ xp, yp = 1, 0
308
+ alignments.append(([xx - 1], []))
309
+ else:
310
+ raise Exception('got unknown value')
311
+
312
+ xx = xx - xp
313
+ yy = yy - yp
314
+
315
+ alignments.reverse()
316
+
317
+ return alignments
318
+
319
+
320
+ def append_slant(path, xwidth, ywidth):
321
+ """
322
+ Append quantized approximation to a straight line
323
+ from current x,y to a point at (x+xwidth, y+ywidth)
324
+ """
325
+ NN = xwidth + ywidth
326
+ xstart, ystart = path[-1]
327
+ for ii in range(1, NN + 1):
328
+ x = xstart + round(xwidth * ii / NN)
329
+ y = ystart + round(ywidth * ii / NN)
330
+ # In the case of ties we want them to round differently,
331
+ # so explicitly make sure we take a step of 1, not 0 or 2
332
+ lastx, lasty = path[-1]
333
+ delta = x + y - lastx - lasty
334
+ if delta == 1:
335
+ path.append((x, y))
336
+ elif delta == 2:
337
+ path.append((x - 1, y))
338
+ elif delta == 0:
339
+ path.append((x + 1, y))
340
+
341
+
342
+ def alignment_to_search_path(algn):
343
+ """
344
+ Given an alignment, make searchpath.
345
+ Searchpath must step exactly one position in x XOR y at each time step.
346
+
347
+ In the case of a block of deletions, the order found by DP is not meaningful.
348
+ To make things consistent and to improve the probability of recovering
349
+ from search errors, we search an approximately straight line
350
+ through a block of deletions. We do the same through a many-many
351
+ alignment, even though we currently don't refine a many-many alignment...
352
+ """
353
+ path = [(0, 0), ]
354
+ xdel, ydel = 0, 0
355
+ ydel = 0
356
+ for x, y in algn:
357
+ if len(x) and len(y):
358
+ append_slant(path, xdel, ydel)
359
+ xdel, ydel = 0, 0
360
+ append_slant(path, len(x), len(y))
361
+ elif len(x):
362
+ xdel += len(x)
363
+ elif len(y):
364
+ ydel += len(y)
365
+
366
+ append_slant(path, xdel, ydel)
367
+
368
+ return path
369
+
370
+
371
+ def extend_alignments(course_alignments, size0, size1):
372
+ """
373
+ extend alignments to include new endpoints size0, size1
374
+ if alignments are larger than size0/size1, raise exception
375
+ """
376
+ # could be a string of deletions or insertions at end, so cannot just grab last one
377
+ xmax = 0 # maximum x value in course_alignments
378
+ ymax = 0 # maximum y value in course_alignments
379
+ for x, y in course_alignments:
380
+ for xval in x:
381
+ xmax = max(xmax, xval)
382
+ for yval in y:
383
+ ymax = max(ymax, yval)
384
+
385
+ if xmax > size0 or ymax > size1:
386
+ raise Exception('asked to extend alignments but already bigger than requested')
387
+
388
+ # do not duplicate xmax/ymax, do include size0/size1
389
+ extra_x = list(range(xmax + 1, size0 + 1))
390
+ extra_y = list(range(ymax + 1, size1 + 1))
391
+
392
+ logger.debug('extending alignments in x by %d and y by %d', len(extra_x), len(extra_y))
393
+
394
+ if len(extra_x) == 0:
395
+ for yval in extra_y:
396
+ course_alignments.append(([], [yval]))
397
+ elif len(extra_y) == 0:
398
+ for xval in extra_x:
399
+ course_alignments.append(([xval], []))
400
+ else:
401
+ course_alignments.append((extra_x, extra_y))
402
+
403
+
404
+ def upsample_alignment(algn):
405
+ def upsample_one_alignment(xx):
406
+ return list(range(min(xx) * 2, (max(xx) + 1) * 2))
407
+
408
+ new_algn = []
409
+ for xx, yy in algn:
410
+ if len(xx) == 0:
411
+ for yyy in upsample_one_alignment(yy):
412
+ new_algn.append(([], [yyy]))
413
+ elif len(yy) == 0:
414
+ for xxx in upsample_one_alignment(xx):
415
+ new_algn.append(([xxx], []))
416
+ else:
417
+ new_algn.append((upsample_one_alignment(xx), upsample_one_alignment(yy)))
418
+ return new_algn
419
+
420
+
421
+ def make_del_knob(e_laser,
422
+ f_laser,
423
+ e_laser_norms,
424
+ f_laser_norms,
425
+ sample_size):
426
+ e_size = e_laser.shape[0]
427
+ f_size = f_laser.shape[0]
428
+
429
+ if e_size > 0 and f_size > 0 and sample_size > 0:
430
+
431
+ if e_size * f_size < sample_size:
432
+ # dont sample, just compute full matrix
433
+ sample_size = e_size * f_size
434
+ x_idxs = np.zeros(sample_size, dtype=np.int32)
435
+ y_idxs = np.zeros(sample_size, dtype=np.int32)
436
+ c = 0
437
+ for ii in range(e_size):
438
+ for jj in range(f_size):
439
+ x_idxs[c] = ii
440
+ y_idxs[c] = jj
441
+ c += 1
442
+ else:
443
+ # get random samples
444
+ x_idxs = np.random.choice(range(e_size), size=sample_size, replace=True).astype(np.int32)
445
+ y_idxs = np.random.choice(range(f_size), size=sample_size, replace=True).astype(np.int32)
446
+
447
+ # output
448
+ random_scores = np.empty(sample_size, dtype=np.float32)
449
+
450
+ score_path(x_idxs, y_idxs,
451
+ e_laser_norms, f_laser_norms,
452
+ e_laser, f_laser,
453
+ random_scores, )
454
+
455
+ min_score = 0
456
+ max_score = max(random_scores) # could bump this up... but its probably fine
457
+
458
+ else:
459
+ # Not much we can do here...
460
+ random_scores = np.array([0.0, 0.5, 1.0]) # ???
461
+ min_score = 0
462
+ max_score = 1 # ????
463
+
464
+ del_knob = DeletionKnob(random_scores, min_score, max_score)
465
+
466
+ return del_knob
467
+
468
+
469
+ def compute_norms(vecs0, vecs1, num_samples, overlaps_to_use=None):
470
+ # overlaps_to_use = 10 # 10 matches before
471
+
472
+ overlaps1, size1, dim = vecs1.shape
473
+ overlaps0, size0, dim0 = vecs0.shape
474
+ assert (dim == dim0)
475
+
476
+ if overlaps_to_use is not None:
477
+ if overlaps_to_use > overlaps1:
478
+ raise Exception('Cannot use more overlaps than provided. You may want to re-run overlap.py with a larger -n value')
479
+ else:
480
+ overlaps_to_use = overlaps1
481
+
482
+ samps_per_overlap = ceil(num_samples / overlaps_to_use)
483
+
484
+ if size1 and samps_per_overlap:
485
+ # sample the other side (from all overlaps) to compare to this side
486
+ vecs1_rand_sample = np.empty((samps_per_overlap * overlaps_to_use, dim), dtype=np.float32)
487
+ for overlap_ii in range(overlaps_to_use):
488
+ idxs = np.random.choice(range(size1), size=samps_per_overlap, replace=True)
489
+ random_vecs = vecs1[overlap_ii, idxs, :]
490
+ vecs1_rand_sample[overlap_ii * samps_per_overlap:(overlap_ii + 1) * samps_per_overlap, :] = random_vecs
491
+
492
+ norms0 = np.empty((overlaps0, size0), dtype=np.float32)
493
+ for overlap_ii in range(overlaps0):
494
+ e_laser = vecs0[overlap_ii, :, :]
495
+ sim = np.matmul(e_laser, vecs1_rand_sample.T)
496
+ norms0[overlap_ii, :] = 1.0 - sim.mean(axis=1)
497
+
498
+ else: # no samples, no normalization
499
+ norms0 = np.ones((overlaps0, size0)).astype(np.float32)
500
+
501
+ return norms0
502
+
503
+
504
+ def downsample_vectors(vecs1):
505
+ a, b, c = vecs1.shape
506
+ half = np.empty((a, b // 2, c), dtype=np.float32)
507
+ for ii in range(a):
508
+ # average consecutive vectors
509
+ for jj in range(0, b - b % 2, 2):
510
+ v1 = vecs1[ii, jj, :]
511
+ v2 = vecs1[ii, jj + 1, :]
512
+ half[ii, jj // 2, :] = v1 + v2
513
+ # compute mean for all vectors
514
+ mean = np.mean(half[ii, :, :], axis=0)
515
+ for jj in range(0, b - b % 2, 2):
516
+ # remove mean
517
+ half[ii, jj // 2, :] = half[ii, jj // 2, :] - mean
518
+ # make vectors norm==1 so dot product is cosine distance
519
+ make_norm1(half)
520
+ return half
521
+
522
+
523
+ def vecalign(vecs0,
524
+ vecs1,
525
+ final_alignment_types,
526
+ del_percentile_frac,
527
+ width_over2,
528
+ max_size_full_dp,
529
+ costs_sample_size,
530
+ num_samps_for_norm,
531
+ norms0=None,
532
+ norms1=None):
533
+ if width_over2 < 3:
534
+ logger.warning('width_over2 was set to %d, which does not make sense. increasing to 3.', width_over2)
535
+ width_over2 = 3
536
+
537
+ # make sure input embeddings are norm==1
538
+ make_norm1(vecs0)
539
+ make_norm1(vecs1)
540
+
541
+ # save off runtime stats for summary
542
+ runtimes = OrderedDict()
543
+
544
+ # Determine stack depth
545
+ s0, s1 = vecs0.shape[1], vecs1.shape[1]
546
+ max_depth = 0
547
+ while s0 * s1 > max_size_full_dp ** 2:
548
+ max_depth += 1
549
+ s0 = s0 // 2
550
+ s1 = s1 // 2
551
+
552
+ # init recursion stack
553
+ # depth is 0-based (full size is 0, 1 is half, 2 is quarter, etc)
554
+ stack = {0: {'v0': vecs0, 'v1': vecs1}}
555
+
556
+ # downsample sentence vectors
557
+ t0 = time()
558
+ for depth in range(1, max_depth + 1):
559
+ stack[depth] = {'v0': downsample_vectors(stack[depth - 1]['v0']),
560
+ 'v1': downsample_vectors(stack[depth - 1]['v1'])}
561
+ runtimes['Downsample embeddings'] = time() - t0
562
+
563
+ # compute norms for all depths, add sizes, add alignment types
564
+ t0 = time()
565
+ for depth in stack:
566
+ stack[depth]['size0'] = stack[depth]['v0'].shape[1]
567
+ stack[depth]['size1'] = stack[depth]['v1'].shape[1]
568
+ stack[depth]['alignment_types'] = final_alignment_types if depth == 0 else [(1, 1)]
569
+
570
+ if depth == 0 and norms0 is not None:
571
+ if norms0.shape != vecs0.shape[:2]:
572
+ print('norms0.shape:', norms0.shape)
573
+ print('vecs0.shape[:2]:', vecs0.shape[:2])
574
+ raise Exception('norms0 wrong shape')
575
+ stack[depth]['n0'] = norms0
576
+ else:
577
+ stack[depth]['n0'] = compute_norms(stack[depth]['v0'], stack[depth]['v1'], num_samps_for_norm)
578
+
579
+ if depth == 0 and norms1 is not None:
580
+ if norms1.shape != vecs1.shape[:2]:
581
+ print('norms1.shape:', norms1.shape)
582
+ print('vecs1.shape[:2]:', vecs1.shape[:2])
583
+ raise Exception('norms1 wrong shape')
584
+ stack[depth]['n1'] = norms1
585
+ else:
586
+ stack[depth]['n1'] = compute_norms(stack[depth]['v1'], stack[depth]['v0'], num_samps_for_norm)
587
+
588
+ runtimes['Normalize embeddings'] = time() - t0
589
+
590
+ # Compute deletion penalty for all depths
591
+ t0 = time()
592
+ for depth in stack:
593
+ stack[depth]['del_knob'] = make_del_knob(e_laser=stack[depth]['v0'][0, :, :],
594
+ f_laser=stack[depth]['v1'][0, :, :],
595
+ e_laser_norms=stack[depth]['n0'][0, :],
596
+ f_laser_norms=stack[depth]['n1'][0, :],
597
+ sample_size=costs_sample_size)
598
+ stack[depth]['del_penalty'] = stack[depth]['del_knob'].percentile_frac_to_del_penalty(del_percentile_frac)
599
+ logger.debug('del_penalty at depth %d: %f', depth, stack[depth]['del_penalty'])
600
+ runtimes['Compute deletion penalties'] = time() - t0
601
+ tt = time() - t0
602
+ logger.debug('%d x %d full DP make features: %.6fs (%.3e per dot product)',
603
+ stack[max_depth]['size0'], stack[max_depth]['size1'], tt,
604
+ tt / (stack[max_depth]['size0'] + 1e-6) / (stack[max_depth]['size1'] + 1e-6))
605
+ # full DP at maximum recursion depth
606
+ t0 = time()
607
+ stack[max_depth]['costs_1to1'] = make_dense_costs(stack[max_depth]['v0'],
608
+ stack[max_depth]['v1'],
609
+ stack[max_depth]['n0'],
610
+ stack[max_depth]['n1'])
611
+
612
+ runtimes['Full DP make features'] = time() - t0
613
+ t0 = time()
614
+ _, stack[max_depth]['x_y_tb'] = dense_dp(stack[max_depth]['costs_1to1'], stack[max_depth]['del_penalty'])
615
+ stack[max_depth]['alignments'] = dense_traceback(stack[max_depth]['x_y_tb'])
616
+ runtimes['Full DP'] = time() - t0
617
+
618
+ # upsample the path up to the top resolution
619
+ compute_costs_times = []
620
+ dp_times = []
621
+ upsample_depths = [0, ] if max_depth == 0 else list(reversed(range(0, max_depth)))
622
+ for depth in upsample_depths:
623
+ if max_depth > 0: # upsample previous alignment to current resolution
624
+ course_alignments = upsample_alignment(stack[depth + 1]['alignments'])
625
+ # features may have been truncated when downsampling, so the alignment may need to be extended
626
+ extend_alignments(course_alignments, stack[depth]['size0'], stack[depth]['size1']) # in-place
627
+ else: # We did a full size 1-1 search, so search same size with more alignment types
628
+ course_alignments = stack[0]['alignments']
629
+
630
+ # convert coarse alignments to a search path
631
+ stack[depth]['searchpath'] = alignment_to_search_path(course_alignments)
632
+
633
+ # compute costs for sparse DP
634
+ t0 = time()
635
+ stack[depth]['a_b_costs'], stack[depth]['b_offset'] = make_sparse_costs(stack[depth]['v0'], stack[depth]['v1'],
636
+ stack[depth]['n0'], stack[depth]['n1'],
637
+ stack[depth]['searchpath'],
638
+ stack[depth]['alignment_types'],
639
+ width_over2)
640
+
641
+ tt = time() - t0
642
+ num_dot_products = len(stack[depth]['b_offset']) * len(stack[depth]['alignment_types']) * width_over2 * 2
643
+ logger.debug('%d x %d sparse DP (%d alignment types, %d window) make features: %.6fs (%.3e per dot product)',
644
+ stack[max_depth]['size0'], stack[max_depth]['size1'],
645
+ len(stack[depth]['alignment_types']), width_over2 * 2,
646
+ tt, tt / (num_dot_products + 1e-6))
647
+
648
+ compute_costs_times.append(time() - t0)
649
+ t0 = time()
650
+ # perform sparse DP
651
+ stack[depth]['a_b_csum'], stack[depth]['a_b_xp'], stack[depth]['a_b_yp'], \
652
+ stack[depth]['new_b_offset'] = sparse_dp(stack[depth]['a_b_costs'], stack[depth]['b_offset'],
653
+ stack[depth]['alignment_types'], stack[depth]['del_penalty'],
654
+ stack[depth]['size0'], stack[depth]['size1'])
655
+
656
+ # perform traceback to get alignments and alignment scores
657
+ # for debugging, avoid overwriting stack[depth]['alignments']
658
+ akey = 'final_alignments' if depth == 0 else 'alignments'
659
+ stack[depth][akey], stack[depth]['alignment_scores'] = sparse_traceback(stack[depth]['a_b_csum'],
660
+ stack[depth]['a_b_xp'],
661
+ stack[depth]['a_b_yp'],
662
+ stack[depth]['new_b_offset'],
663
+ stack[depth]['size0'],
664
+ stack[depth]['size1'])
665
+ dp_times.append(time() - t0)
666
+
667
+ runtimes['Upsample DP compute costs'] = sum(compute_costs_times[:-1])
668
+ runtimes['Upsample DP'] = sum(dp_times[:-1])
669
+
670
+ runtimes['Final DP compute costs'] = compute_costs_times[-1]
671
+ runtimes['Final DP'] = dp_times[-1]
672
+
673
+ # log time stats
674
+ max_key_str_len = max([len(key) for key in runtimes])
675
+ for key in runtimes:
676
+ if runtimes[key] > 5e-5:
677
+ logger.info(key + ' took ' + '.' * (max_key_str_len + 5 - len(key)) + ('%.4fs' % runtimes[key]).rjust(7))
678
+
679
+ return stack
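The recursion depth used above follows purely from the two sentence counts: both sides are halved until a dense 1-1 dynamic program would fit within max_size_full_dp in each dimension. A minimal, self-contained sketch of that rule (the function name and the 2000 cap here are illustrative, not taken from this commit):

def required_depth(size0: int, size1: int, max_size_full_dp: int = 2000) -> int:
    """Number of halvings before a dense DP of at most max_size_full_dp**2 cells fits."""
    depth = 0
    while size0 * size1 > max_size_full_dp ** 2:
        depth += 1
        size0 //= 2
        size1 //= 2
    return depth

# 12000 x 9000 sentences need 3 halvings: 1500 x 1125 is the first size within 2000**2 cells.
print(required_depth(12000, 9000))  # 3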
vecalign/json_for_metricx/f1e4c89f859b47a09a65f8a7e8329dfb_input.jsonl ADDED
File without changes
vecalign/long_context_eval.py ADDED
@@ -0,0 +1,1080 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Long Context Evaluation
4
+
5
+ This script performs the following steps:
6
+ 1. Reads a specified CSV file containing the evaluation data.
7
+ 2. Segments source, reference, and MT texts into sentences and sliding windows.
8
+ 3. Generates overlaps and embeddings for alignment.
9
+ 4. Runs vector alignment exploration and computes COMET and COMET-QE scores.
10
+ 5. Aggregates the scores and saves the results.
11
+ """
12
+
13
+ import os
14
+ import re
15
+ import json
16
+ import csv
17
+ import spacy
18
+ import torch
19
+ import random
20
+ import argparse
21
+ import numpy as np
22
+ import pandas as pd
23
+ import tempfile
24
+ import subprocess
25
+ import unicodedata
26
+ from multiprocessing import Pool
27
+ import datetime
28
+ from typing import Optional
29
+
30
+ # -----------------------------------------------------------------------------
31
+ # Utility Functions
32
+ # -----------------------------------------------------------------------------
33
+ def set_seed(seed: int = 42) -> None:
34
+ """
35
+ Set the global random seed for reproducibility.
36
+
37
+ Args:
38
+ seed (int): Random seed (default is 42).
39
+ """
40
+ random.seed(seed)
41
+ np.random.seed(seed)
42
+ torch.manual_seed(seed)
43
+ torch.cuda.manual_seed(seed)
44
+ torch.cuda.manual_seed_all(seed)
45
+ torch.backends.cudnn.deterministic = True
46
+ torch.backends.cudnn.benchmark = False
47
+
48
+
49
+ def normalize_text(text: str) -> str:
50
+ """
51
+ Normalize text using Unicode normalization (NFKC) to convert full-width characters to half-width.
52
+ Additional normalization (e.g., lowercasing) can be added if needed.
53
+
54
+ Args:
55
+ text (str): Input text.
56
+
57
+ Returns:
58
+ str: Normalized text.
59
+ """
60
+ normalized = unicodedata.normalize("NFKC", text)
61
+ # Uncomment the following if lowercase conversion is desired:
62
+ # normalized = normalized.lower()
63
+ return normalized
64
+
65
+
66
+ def segment_sentences_by_punctuation(text: str, lang: str) -> list:
67
+ """
68
+ Segment text into sentences based on punctuation and add an end-of-sentence separator.
69
+
70
+ Args:
71
+ text (str): Input text (may contain multiple paragraphs).
72
+ lang (str): Language code (e.g., "zh", "en", "ru", "de").
73
+
74
+ Returns:
75
+ list: List of segmented sentences with the SEPARATOR appended.
76
+ """
77
+ segmented_sentences = []
78
+ paragraphs = text.split('\n')
79
+ for paragraph in paragraphs:
80
+ if paragraph.strip():
81
+ if lang == SRC_LANG:
82
+ doc = src_nlp(paragraph)
83
+ else:
84
+ doc = mt_nlp(paragraph)
85
+ for sent in doc.sents:
86
+ segmented_sentences.append(normalize_text(sent.text.strip()) + SEPARATOR)
87
+ return segmented_sentences
88
+
89
+
90
+ def preprocess_sentences(sentences: list) -> str:
91
+ """
92
+ Preprocess sentences by removing the end-of-sentence token and joining them with newline characters.
93
+
94
+ Args:
95
+ sentences (list): List of sentences.
96
+
97
+ Returns:
98
+ str: Preprocessed text.
99
+ """
100
+ processed = [sentence.replace(SEPARATOR, "").strip() for sentence in sentences]
101
+ return "\n".join(processed)
102
+
103
+
104
+ def generate_overlap_and_embedding(text: str) -> tuple:
105
+ """
106
+ Generate overlap and embedding data from text using temporary files.
107
+
108
+ Args:
109
+ text (str): Input text.
110
+
111
+ Returns:
112
+ tuple: (overlap_content (str), embeddings_content (bytes))
113
+ """
114
+ with tempfile.NamedTemporaryFile(delete=True, mode="w+", encoding="utf-8", suffix=".txt") as txt_file:
115
+ txt_file.write(text)
116
+ txt_file.flush()
117
+ txt_filename = txt_file.name
118
+ overlaps_file = txt_filename + ".overlaps"
119
+ embed_file = txt_filename + ".emb"
120
+
121
+ # Generate overlap data
122
+ subprocess.run(["./overlap.py", "-i", txt_filename, "-o", overlaps_file, "-n", "10"], check=True)
123
+ # Generate embedding data
124
+ subprocess.run(" ".join(["$LASER/tasks/embed/embed.sh", overlaps_file, embed_file]),
125
+ shell=True, check=True)
126
+
127
+ with open(embed_file, "rb") as f:
128
+ embeddings_content = f.read()
129
+ with open(overlaps_file, "r", encoding="utf-8") as f:
130
+ overlap_content = f.read()
131
+
132
+ for need_to_del_file in [overlaps_file, embed_file]:
133
+ try:
134
+ os.remove(need_to_del_file)
135
+ print(f"Removed file: {need_to_del_file}")
136
+ except Exception as e:
137
+ print(f"Error removing {need_to_del_file}: {e}")
138
+
139
+ return overlap_content, embeddings_content
140
+
141
+
142
+ def compute_alignment_stats(alignment_results: list) -> tuple:
143
+ """
144
+ Compute the average alignment cost (ignoring zero-cost alignments) and the zero-cost ratio.
145
+
146
+ Args:
147
+ alignment_results (list): List of alignment result strings in the format "[src]:[tgt]:cost".
148
+
149
+ Returns:
150
+ tuple: (average_cost (float), zero_cost_ratio (float))
151
+ """
152
+ costs = []
153
+ zero_cost_count = 0
154
+
155
+ for entry in alignment_results:
156
+ try:
157
+ cost = float(entry.split(":")[-1])
158
+ if cost == 0.0:
159
+ zero_cost_count += 1
160
+ else:
161
+ costs.append(cost)
162
+ except ValueError:
163
+ continue
164
+
165
+ avg_cost = sum(costs) / len(costs) if costs else 0.0
166
+ zero_cost_ratio = zero_cost_count / len(alignment_results) if alignment_results else 0.0
167
+
168
+ return avg_cost, zero_cost_ratio
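# Worked example (costs invented for illustration): for the three vecalign output lines
# ["[0]:[0]:0.12", "[1]:[1]:0.0", "[2]:[2]:0.30"], the zero-cost entry is excluded from the
# average, so this returns (0.21, 0.3333...), i.e. the mean of [0.12, 0.30] and 1 zero-cost line out of 3.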
169
+
170
+
171
+ def run_vecalign_explore(src_text: str, tgt_text: str, src_overlap: str, tgt_overlap: str,
172
+ src_embed: bytes, tgt_embed: bytes) -> list:
173
+ """
174
+ Explore the best vector alignment parameters and return the best alignments.
175
+
176
+ Args:
177
+ src_text (str): Source text.
178
+ tgt_text (str): Target text.
179
+ src_overlap (str): Overlap data for the source.
180
+ tgt_overlap (str): Overlap data for the target.
181
+ src_embed (bytes): Embedding data for the source.
182
+ tgt_embed (bytes): Embedding data for the target.
183
+
184
+ Returns:
185
+ list: Parsed best alignments as a list of tuples [(src_indices, tgt_indices), ...].
186
+ """
187
+ del_percentile_frac = 0.2
188
+ step_size = 0.005
189
+ prev_zero_cost_ratio = None
190
+ prev_avg_cost = None
191
+
192
+ best_avg_cost = float('inf')
193
+ best_del_percentile_frac = del_percentile_frac
194
+ best_zero_cost_ratio = 0.0
195
+ best_alignments = []
196
+
197
+ first_flag = True
198
+
199
+ with tempfile.NamedTemporaryFile(delete=True, mode="w+", encoding="utf-8", suffix=".txt") as src_file, \
200
+ tempfile.NamedTemporaryFile(delete=True, mode="w+", encoding="utf-8", suffix=".txt") as tgt_file, \
201
+ tempfile.NamedTemporaryFile(delete=True, mode="w+", encoding="utf-8", suffix=".overlaps") as src_overlap_file, \
202
+ tempfile.NamedTemporaryFile(delete=True, mode="w+", encoding="utf-8", suffix=".overlaps") as tgt_overlap_file, \
203
+ tempfile.NamedTemporaryFile(delete=True, mode="wb", suffix=".emb") as src_embed_file, \
204
+ tempfile.NamedTemporaryFile(delete=True, mode="wb", suffix=".emb") as tgt_embed_file:
205
+
206
+ src_file.write(src_text)
207
+ src_file.flush()
208
+ tgt_file.write(tgt_text)
209
+ tgt_file.flush()
210
+
211
+ src_overlap_file.write(src_overlap)
212
+ src_overlap_file.flush()
213
+ tgt_overlap_file.write(tgt_overlap)
214
+ tgt_overlap_file.flush()
215
+
216
+ src_embed_file.write(src_embed)
217
+ src_embed_file.flush()
218
+ tgt_embed_file.write(tgt_embed)
219
+ tgt_embed_file.flush()
220
+
221
+ while del_percentile_frac > 0:
222
+ result = subprocess.run(
223
+ [
224
+ "./vecalign.py",
225
+ "--alignment_max_size", "8",
226
+ "--del_percentile_frac", str(del_percentile_frac),
227
+ "--src", src_file.name,
228
+ "--tgt", tgt_file.name,
229
+ "--src_embed", src_overlap_file.name, src_embed_file.name,
230
+ "--tgt_embed", tgt_overlap_file.name, tgt_embed_file.name,
231
+ ],
232
+ stdout=subprocess.PIPE,
233
+ text=True,
234
+ )
235
+
236
+ output_lines = result.stdout.strip().split("\n")
237
+ avg_cost, zero_cost_ratio = compute_alignment_stats(output_lines)
238
+ print(f"del_percentile_frac: {del_percentile_frac:.3f} | Avg Cost: {avg_cost:.6f} | Zero-Cost Ratio: {zero_cost_ratio:.2%}")
239
+
240
+ if first_flag:
241
+ first_flag = False
242
+
243
+ if prev_zero_cost_ratio is not None and prev_zero_cost_ratio != 0 and (zero_cost_ratio / prev_zero_cost_ratio) > 1.5:
244
+ print(f"Stopping exploration: Zero-cost ratio increased sharply at {del_percentile_frac:.3f}")
245
+ break
246
+ elif prev_zero_cost_ratio is not None and (
247
+ (zero_cost_ratio - prev_zero_cost_ratio) > 0.15 or
248
+ avg_cost > prev_avg_cost or
249
+ avg_cost < 0.3 or zero_cost_ratio > 0.7
250
+ ):
251
+ print(f"Stopping exploration: Zero-cost ratio increased sharply at {del_percentile_frac:.3f}")
252
+ break
253
+ else:
254
+ if avg_cost < best_avg_cost:
255
+ best_avg_cost = avg_cost
256
+ best_del_percentile_frac = del_percentile_frac
257
+ best_zero_cost_ratio = zero_cost_ratio
258
+ best_alignments = output_lines
259
+
260
+ prev_zero_cost_ratio = zero_cost_ratio
261
+ prev_avg_cost = avg_cost
262
+ del_percentile_frac -= step_size
263
+
264
+ # Parse the best alignments
265
+ parsed_alignments = []
266
+ for line in best_alignments:
267
+ if line:
268
+ src_part, tgt_part, _ = line.split(":")
269
+ src_indices = list(map(int, src_part.strip("[]").split(","))) if src_part.strip("[]") else []
270
+ tgt_indices = list(map(int, tgt_part.strip("[]").split(","))) if tgt_part.strip("[]") else []
271
+ parsed_alignments.append((src_indices, tgt_indices))
272
+
273
+ print("\nBest Found:")
274
+ print(f"del_percentile_frac: {best_del_percentile_frac:.3f} | Avg Cost: {best_avg_cost:.6f} | Zero-Cost Ratio: {best_zero_cost_ratio:.2%}")
275
+ return parsed_alignments
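# Output format (cost value invented): a vecalign line such as "[0, 1]:[0]:0.1234" is parsed
# into ([0, 1], [0]), i.e. source sentences 0 and 1 merged onto target sentence 0; a deletion
# on either side shows up as an empty list, e.g. ([], [4]) for an unaligned target sentence.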
276
+
277
+
278
+ def clean_sentence(sentence: str) -> str:
279
+ """
280
+ Clean a sentence by removing duplicate parts and reconnecting with the separator.
281
+
282
+ Args:
283
+ sentence (str): Input sentence.
284
+
285
+ Returns:
286
+ str: Cleaned sentence.
287
+ """
288
+ if not sentence:
289
+ return ""
290
+ parts = sentence.split(SEPARATOR)
291
+ unique_parts = list(dict.fromkeys(part.strip() for part in parts if part.strip()))
292
+ return f" {SEPARATOR} ".join(unique_parts) + f" {SEPARATOR}"
293
+
294
+
295
+ def sliding_windows(sentences: list, window_size: int) -> list:
296
+ """
297
+ Create sliding windows from a list of sentences.
298
+
299
+ Args:
300
+ sentences (list): List of sentences.
301
+ window_size (int): Window size.
302
+
303
+ Returns:
304
+ list: List of sliding windows (each is a list of sentences).
305
+ """
306
+ windows = []
307
+ for i in range(len(sentences) - window_size + 1):
308
+ window = [clean_sentence(s) for s in sentences[i:i + window_size]]
309
+ # Remove duplicate window contents
310
+ unique_window = list(dict.fromkeys(window))
311
+ windows.append(unique_window)
312
+ return windows
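# Worked example: sliding_windows(["A. </s>", "B. </s>", "C. </s>"], 2) returns
# [["A. </s>", "B. </s>"], ["B. </s>", "C. </s>"]]; each window holds window_size cleaned,
# de-duplicated sentences and slides forward by one sentence.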
313
+
314
+
315
+ def save_windows_to_file(paragraph_id: int, aligned_src: list, aligned_ref: list, aligned_mt: list,
316
+ src_windows: list, ref_windows: list, mt_windows: list,
317
+ qe_src_windows: list, qe_mt_windows: list, output_dir: str,
318
+ output_name: str) -> None:
319
+ """
320
+ Save window information and alignment data as JSON files.
321
+
322
+ Args:
323
+ paragraph_id (int): Paragraph ID.
324
+ aligned_src (list): Adjusted source alignment.
325
+ aligned_ref (list): Adjusted reference alignment.
326
+ aligned_mt (list): Adjusted MT alignment.
327
+ src_windows (list): Source sliding windows.
328
+ ref_windows (list): Reference sliding windows.
329
+ mt_windows (list): MT sliding windows.
330
+ qe_src_windows (list): QE source sliding windows.
331
+ qe_mt_windows (list): QE MT sliding windows.
332
+ output_dir (str): Output directory path.
333
+ output_name (str): Identifier for the output file.
334
+ """
335
+ os.makedirs(output_dir, exist_ok=True)
336
+
337
+ windows_data = {
338
+ "paragraph_id": paragraph_id,
339
+ "src_windows": src_windows,
340
+ "ref_windows": ref_windows,
341
+ "mt_windows": mt_windows,
342
+ }
343
+ windows_file = os.path.join(output_dir, f"windows_{paragraph_id}_{output_name}.json")
344
+ with open(windows_file, "w", encoding="utf-8") as f:
345
+ json.dump(windows_data, f, ensure_ascii=False, indent=2)
346
+
347
+ qe_windows_data = {
348
+ "paragraph_id": paragraph_id,
349
+ "src_windows": qe_src_windows,
350
+ "mt_windows": qe_mt_windows,
351
+ }
352
+ qe_windows_file = os.path.join(output_dir, f"qe_windows_{paragraph_id}_{output_name}.json")
353
+ with open(qe_windows_file, "w", encoding="utf-8") as f:
354
+ json.dump(qe_windows_data, f, ensure_ascii=False, indent=2)
355
+
356
+ aligned_info = {
357
+ "src": aligned_src,
358
+ "ref": aligned_ref,
359
+ "mt": aligned_mt,
360
+ }
361
+ aligned_file = os.path.join(output_dir, f"aligned_{paragraph_id}_{output_name}.json")
362
+ with open(aligned_file, "w", encoding="utf-8") as f:
363
+ json.dump(aligned_info, f, ensure_ascii=False, indent=2)
364
+
365
+
366
+ # -----------------------------------------------------------------------------
367
+ # Alignment Gap Processing Functions
368
+ # -----------------------------------------------------------------------------
369
+ def process_gaps(alignments: list) -> tuple:
370
+ """
371
+ Process alignment list blocks where the source is empty but target is non-empty,
372
+ converting them into gap alignments (source converted to a negative gap key).
373
+
374
+ Args:
375
+ alignments (list): Original alignment list (each element is (src_indices, tgt_indices)).
376
+
377
+ Returns:
378
+ tuple: (new_alignments (list), gap_counts (dict))
379
+ """
380
+ new_alignments = []
381
+ gap_counts = {}
382
+ n = len(alignments)
383
+ i = 0
384
+ while i < n:
385
+ src, tgt = alignments[i]
386
+ if not src and tgt:
387
+ block = []
388
+ while i < n and not alignments[i][0] and alignments[i][1]:
389
+ block.append(alignments[i])
390
+ i += 1
391
+ # Get the left neighbor's source index if available
392
+ left_src = new_alignments[-1][0][-1] if new_alignments and new_alignments[-1][0] else None
393
+ # Get the first non-empty source index on the right
394
+ right_src = None
395
+ j = i
396
+ while j < n:
397
+ if alignments[j][0]:
398
+ right_src = alignments[j][0][0]
399
+ break
400
+ j += 1
401
+ gap_key = left_src if left_src is not None else (right_src - 1 if right_src is not None else 0)
402
+ for item in block:
403
+ new_alignments.append(([-gap_key], item[1]))
404
+ gap_counts[gap_key] = gap_counts.get(gap_key, 0) + len(block)
405
+ else:
406
+ new_alignments.append(alignments[i])
407
+ i += 1
408
+ return new_alignments, gap_counts
409
+
410
+
411
+ def complement_gaps(processed: list, gap_counts: dict, desired_gaps: dict) -> list:
412
+ """
413
+ Complement the gaps in the processed alignment list by inserting dummy alignments until
414
+ the desired gap count is met.
415
+
416
+ Args:
417
+ processed (list): Processed alignment list.
418
+ gap_counts (dict): Counts of each gap key in the processed list.
419
+ desired_gaps (dict): Desired counts for each gap key from the other alignment list.
420
+
421
+ Returns:
422
+ list: Processed alignment list after gap completion.
423
+ """
424
+ all_keys = set(gap_counts.keys()) | set(desired_gaps.keys())
425
+ for gap in all_keys:
426
+ current = gap_counts.get(gap, 0)
427
+ desired = desired_gaps.get(gap, 0)
428
+ if current < desired:
429
+ indices = [i for i, (src, _) in enumerate(processed) if src and src[0] == -gap]
430
+ insert_idx = indices[0] if indices else next((i for i, (src, _) in enumerate(processed) if src and src[0] > gap), len(processed))
431
+ for _ in range(desired - current):
432
+ processed.insert(insert_idx, ([-gap], []))
433
+ gap_counts[gap] = desired
434
+ return processed
435
+
436
+
437
+ def custom_sort_key(item: tuple) -> tuple:
438
+ """
439
+ Custom sort key:
440
+ - For non-gap alignments (positive), key = (source, 0).
441
+ - For gap alignments (negative), key = (abs(source), 1).
442
+
443
+ Args:
444
+ item (tuple): Alignment tuple (src_indices, tgt_indices).
445
+
446
+ Returns:
447
+ tuple: Sorting key.
448
+ """
449
+ src, _ = item
450
+ if src:
451
+ val = src[0]
452
+ return (val, 0) if val >= 0 else (abs(val), 1)
453
+ return (float('inf'), 2)
454
+
455
+
456
+ def fill_empty_alignments(src_ref_alignments: list, src_mt_alignments: list) -> tuple:
457
+ """
458
+ Fill the empty alignments (gaps) in both source-reference and source-MT alignments so that
459
+ the gap key counts match, then sort them.
460
+
461
+ Args:
462
+ src_ref_alignments (list): Alignment list for source-reference.
463
+ src_mt_alignments (list): Alignment list for source-MT.
464
+
465
+ Returns:
466
+ tuple: (filled_src_ref_alignments, filled_src_mt_alignments)
467
+ """
468
+ proc_ref, gaps_ref = process_gaps(src_ref_alignments)
469
+ proc_mt, gaps_mt = process_gaps(src_mt_alignments)
470
+ proc_ref = complement_gaps(proc_ref, gaps_ref, gaps_mt)
471
+ proc_mt = complement_gaps(proc_mt, gaps_mt, gaps_ref)
472
+ proc_ref.sort(key=custom_sort_key)
473
+ proc_mt.sort(key=custom_sort_key)
474
+ return proc_ref, proc_mt
475
+
476
+
477
+ def find_common_alignments(src_ref_alignments: list, src_mt_alignments: list) -> list:
478
+ """
479
+ Find common alignments between source-reference and source-MT alignment lists and remove duplicates.
480
+
481
+ Args:
482
+ src_ref_alignments (list): Alignment list for source-reference.
483
+ src_mt_alignments (list): Alignment list for source-MT.
484
+
485
+ Returns:
486
+ list: List of common alignments as (common_src_indices, common_ref_indices, common_mt_indices).
487
+ """
488
+ common_alignments = []
489
+ src_ref_alignments, src_mt_alignments = fill_empty_alignments(src_ref_alignments, src_mt_alignments)
490
+
491
+ for ref_align in src_ref_alignments:
492
+ for mt_align in src_mt_alignments:
493
+ common_src = sorted(list(set(ref_align[0]) & set(mt_align[0])))
494
+ if common_src:
495
+ common_ref = sorted(list(set(ref_align[1]))) if ref_align[1] else [-1]
496
+ common_mt = sorted(list(set(mt_align[1]))) if mt_align[1] else [-1]
497
+ common_alignments.append((common_src, common_ref, common_mt))
498
+
499
+ # Remove duplicate triples
500
+ unique = []
501
+ seen = set()
502
+ for triple in common_alignments:
503
+ key = (tuple(triple[0]), tuple(triple[1]), tuple(triple[2]))
504
+ if key not in seen:
505
+ seen.add(key)
506
+ unique.append(triple)
507
+ print("Unique common alignments:")
508
+ print(unique)
509
+ return unique
510
+
511
+
512
+ def args_to_dict(args: argparse.Namespace, prefix: str, strip_prefix: bool = False) -> dict:
513
+ """
514
+ Convert an argparse Namespace to a dictionary, optionally filtering by a prefix and stripping it.
515
+
516
+ Args:
517
+ args (argparse.Namespace): Input arguments.
518
+ prefix (str): Prefix to filter keys.
519
+ strip_prefix (bool): Whether to remove the prefix from keys (default is False).
520
+
521
+ Returns:
522
+ dict: Filtered dictionary.
523
+ """
524
+ d = vars(args)
525
+ prefix_key = prefix + '_'
526
+ filtered = {k: v for k, v in d.items() if k.startswith(prefix_key)}
527
+ if strip_prefix:
528
+ return {k[len(prefix_key):]: v for k, v in filtered.items()}
529
+ return filtered
530
+
531
+
532
+ # -----------------------------------------------------------------------------
533
+ # Metrics Computation
534
+ # -----------------------------------------------------------------------------
535
+ def compute_metrics(paragraph_src: str, paragraph_ref: str, paragraph_mt: str,
536
+ src_windows: list, ref_windows: list, mt_windows: list,
537
+ qe_src_windows: list, qe_mt_windows: list,
538
+ paragraph_id: int, mt_col: str) -> dict:
539
+ """
540
+ Compute COMET and COMET-QE scores, then save the scores and related window information as a JSON file.
541
+
542
+ Args:
543
+ paragraph_src (str): Source paragraph text.
544
+ paragraph_ref (str): Reference paragraph text.
545
+ paragraph_mt (str): MT paragraph text.
548
+ src_windows (list): Source sliding windows.
549
+ ref_windows (list): Reference sliding windows.
550
+ mt_windows (list): MT sliding windows.
551
+ qe_src_windows (list): QE source sliding windows.
552
+ qe_mt_windows (list): QE MT sliding windows.
553
+ paragraph_id (int): Paragraph ID.
554
+ mt_col (str): MT column name.
555
+
556
+ Returns:
557
+ dict: Dictionary containing various computed scores.
558
+ """
559
+ comet_zero_score_windows = []
560
+ comet_qe_zero_score_windows = []
561
+
562
+ with tempfile.NamedTemporaryFile(mode='w+', delete=True) as src_file, \
563
+ tempfile.NamedTemporaryFile(mode='w+', delete=True) as ref_file, \
564
+ tempfile.NamedTemporaryFile(mode='w+', delete=True) as mt_file, \
565
+ tempfile.NamedTemporaryFile(mode='w+', delete=True) as qe_src_file, \
566
+ tempfile.NamedTemporaryFile(mode='w+', delete=True) as qe_mt_file:
567
+
568
+ # Write each window on a separate line
569
+ for idx, (src_win, ref_win, mt_win) in enumerate(zip(src_windows, ref_windows, mt_windows)):
570
+ src_line = " ".join(src_win)
571
+ ref_line = " ".join(ref_win)
572
+ mt_line = " ".join(mt_win)
573
+ if src_line and mt_line:
574
+ src_file.write(src_line + "\n")
575
+ ref_file.write(ref_line + "\n")
576
+ mt_file.write(mt_line + "\n")
577
+ else:
578
+ comet_zero_score_windows.append(idx)
579
+
580
+ src_file.flush()
581
+ ref_file.flush()
582
+ mt_file.flush()
583
+
584
+ comet_command = [
585
+ "comet-score",
586
+ "-s", src_file.name,
587
+ "-t", mt_file.name,
588
+ "-r", ref_file.name,
589
+ "--model", COMET_MODEL,
590
+ "--enable-context",
591
+ "--gpus", GPU_ID,
592
+ "--quiet",
593
+ ]
594
+ result = subprocess.run(comet_command, stdout=subprocess.PIPE, text=True)
595
+ print(result.stdout)
596
+ comet_scores = [float(s) for s in re.findall(r"score:\s(-?[0-9.]+)", result.stdout.strip())][:-1]
597
+
598
+ for idx, (src_win, mt_win) in enumerate(zip(qe_src_windows, qe_mt_windows)):
599
+ src_line = " ".join(src_win)
600
+ mt_line = " ".join(mt_win)
601
+ if src_line and mt_line:
602
+ qe_src_file.write(src_line + "\n")
603
+ qe_mt_file.write(mt_line + "\n")
604
+ else:
605
+ comet_qe_zero_score_windows.append(idx)
606
+
607
+ qe_src_file.flush()
608
+ qe_mt_file.flush()
609
+
610
+ qe_command = [
611
+ "comet-score",
612
+ "-s", qe_src_file.name,
613
+ "-t", qe_mt_file.name,
614
+ "--model", COMET_QE_MODEL,
615
+ "--enable-context",
616
+ "--gpus", GPU_ID,
617
+ "--quiet",
618
+ ]
619
+ qe_result = subprocess.run(qe_command, stdout=subprocess.PIPE, text=True)
620
+ print(qe_result.stdout)
621
+ comet_qe_scores = [float(s) for s in re.findall(r"score:\s(-?[0-9.]+)", qe_result.stdout.strip())][:-1]
622
+
623
+ # Insert zero scores for windows that had missing scores
624
+ for idx in comet_zero_score_windows:
625
+ comet_scores.insert(idx, 0.0)
626
+ for idx in comet_qe_zero_score_windows:
627
+ comet_qe_scores.insert(idx, 0.0)
628
+
629
+ # Placeholder values for sentence-level metrics
630
+ sentences_length = len(paragraph_mt.splitlines())
631
+ sentences_zero_ratio = 0.0
632
+
633
+ scores_data = {
634
+ 'paragraph_id': paragraph_id,
635
+ 'comet_scores': comet_scores,
636
+ 'comet_qe_scores': comet_qe_scores,
637
+ 'sentences_length': sentences_length,
638
+ 'windows_length': len(comet_scores),
639
+ 'windows_qe_length': len(comet_qe_scores),
640
+ 'sentences_zero_ratio': sentences_zero_ratio,
641
+ 'windows_zero_ratio': len(comet_zero_score_windows) / len(comet_scores) if comet_scores else 0,
642
+ 'windows_qe_zero_ratio': len(comet_qe_zero_score_windows) / len(comet_qe_scores) if comet_qe_scores else 0,
643
+ 'avg_comet': sum(comet_scores) / len(comet_scores) if comet_scores else 0,
644
+ 'avg_comet_qe': sum(comet_qe_scores) / len(comet_qe_scores) if comet_qe_scores else 0
645
+ }
646
+
647
+ scores_file = os.path.join(SAVE_FOLDER, 'scores', f'scores_{paragraph_id}_{mt_col}.json')
648
+ os.makedirs(os.path.dirname(scores_file), exist_ok=True)
649
+ with open(scores_file, 'w', encoding='utf-8') as f:
650
+ json.dump(scores_data, f, ensure_ascii=False, indent=2)
651
+
652
+ return scores_data
653
+
654
+
655
+ def compute_metrics_reference_free(src_windows: list, mt_windows: list,
656
+ qe_src_windows: list, qe_mt_windows: list,
657
+ paragraph_id: int, mt_col: str) -> dict:
658
+ """
659
+ Compute reference-free evaluation metrics (only QE scores) when no reference is provided.
660
+
661
+ Args:
662
+ src_windows (list): (Unused) Source sliding windows.
663
+ mt_windows (list): (Unused) MT sliding windows.
664
+ qe_src_windows (list): QE source sliding windows.
665
+ qe_mt_windows (list): QE MT sliding windows.
666
+ paragraph_id (int): Paragraph ID.
667
+ mt_col (str): MT column name.
668
+
669
+ Returns:
670
+ dict: Dictionary containing computed QE scores.
671
+ """
672
+ comet_qe_zero_score_windows = []
673
+
674
+ with tempfile.NamedTemporaryFile(mode='w+', delete=True) as qe_src_file, \
675
+ tempfile.NamedTemporaryFile(mode='w+', delete=True) as qe_mt_file:
676
+
677
+ for idx, (src_win, mt_win) in enumerate(zip(qe_src_windows, qe_mt_windows)):
678
+ src_line = " ".join(src_win)
679
+ mt_line = " ".join(mt_win)
680
+ if src_line and mt_line:
681
+ qe_src_file.write(src_line + "\n")
682
+ qe_mt_file.write(mt_line + "\n")
683
+ else:
684
+ comet_qe_zero_score_windows.append(idx)
685
+ qe_src_file.flush()
686
+ qe_mt_file.flush()
687
+
688
+ qe_command = [
689
+ "comet-score",
690
+ "-s", qe_src_file.name,
691
+ "-t", qe_mt_file.name,
692
+ "--model", COMET_QE_MODEL,
693
+ "--enable-context",
694
+ "--gpus", GPU_ID,
695
+ "--quiet",
696
+ ]
697
+ qe_result = subprocess.run(qe_command, stdout=subprocess.PIPE, text=True)
698
+ print(qe_result.stdout)
699
+ comet_qe_scores = [float(s) for s in re.findall(r"score:\s(-?[0-9.]+)", qe_result.stdout.strip())][:-1]
700
+
701
+ for idx in comet_qe_zero_score_windows:
702
+ comet_qe_scores.insert(idx, 0.0)
703
+
704
+ scores_data = {
705
+ 'paragraph_id': paragraph_id,
706
+ 'comet_scores': 0.0, # Not computed in reference-free mode.
707
+ 'comet_qe_scores': comet_qe_scores,
708
+ 'windows_length': len(comet_qe_scores),
709
+ 'windows_qe_length': len(comet_qe_scores),
710
+ 'avg_comet': 0.0,
711
+ 'avg_comet_qe': sum(comet_qe_scores) / len(comet_qe_scores) if comet_qe_scores else 0,
712
+ }
713
+ return scores_data
714
+
715
+
716
+ # -----------------------------------------------------------------------------
717
+ # Paragraph-Level Processing
718
+ # -----------------------------------------------------------------------------
719
+ def paragraph_level_score(row: pd.Series, paragraph_id: int, src_col: str = None,
720
+ ref_col: str = None, mt_col: str = None) -> None:
721
+ """
722
+ Process alignment and scoring for a single paragraph. Steps include:
723
+ 1. Sentence segmentation and preprocessing.
724
+ 2. Generating overlaps and embeddings.
725
+ 3. Running vector alignment exploration.
726
+ 4. Computing COMET and COMET-QE scores and saving window information.
727
+
728
+ Args:
729
+ row (pd.Series): A single data row.
730
+ paragraph_id (int): Paragraph identifier.
731
+ src_col (str): Source column name (default is "zh").
732
+ ref_col (str): Reference column name (default is set based on language).
733
+ mt_col (str): MT column name (default is set based on TARGET).
734
+ """
735
+ global mt_nlp, src_nlp
736
+
737
+ # Set default columns if not provided
738
+ if ref_col is None:
739
+ ref_col = LANG
740
+ if mt_col is None:
741
+ mt_col = TARGET
742
+
743
+ # Sentence segmentation and preprocessing
744
+ src_sentences = segment_sentences_by_punctuation(row[src_col], src_col)
745
+ ref_sentences = segment_sentences_by_punctuation(row[ref_col], ref_col)
746
+ mt_sentences = segment_sentences_by_punctuation(row[mt_col], ref_col)
747
+
748
+ src_txt = preprocess_sentences(src_sentences)
749
+ ref_txt = preprocess_sentences(ref_sentences)
750
+ mt_txt = preprocess_sentences(mt_sentences)
751
+
752
+ # Generate overlap and embedding data
753
+ src_overlap, src_embed = generate_overlap_and_embedding(src_txt)
754
+ ref_overlap, ref_embed = generate_overlap_and_embedding(ref_txt)
755
+ mt_overlap, mt_embed = generate_overlap_and_embedding(mt_txt)
756
+
757
+ # Run vector alignment exploration
758
+ src_ref_alignments = run_vecalign_explore(src_txt, ref_txt, src_overlap, ref_overlap, src_embed, ref_embed)
759
+ src_mt_alignments = run_vecalign_explore(src_txt, mt_txt, src_overlap, mt_overlap, src_embed, mt_embed)
760
+
761
+ # For reference-free evaluation: get non-adjusted alignments
762
+ non_adjusted_src = []
763
+ non_adjusted_mt = []
764
+ for src_indices, mt_indices in src_mt_alignments:
765
+ mt_indices = [x for x in mt_indices if x != -1]
766
+ aligned_src = " ".join([src_sentences[i] for i in src_indices]) if src_indices else ""
767
+ aligned_mt = " ".join([mt_sentences[i] for i in mt_indices]) if mt_indices else ""
768
+ non_adjusted_src.append(aligned_src)
769
+ non_adjusted_mt.append(aligned_mt)
770
+
771
+ # Find common alignments between src-ref and src-mt
772
+ common_alignments = find_common_alignments(src_ref_alignments, src_mt_alignments)
773
+
774
+ adjusted_src, adjusted_ref, adjusted_mt = [], [], []
775
+ for src_indices, ref_indices, mt_indices in common_alignments:
776
+ ref_indices = [x for x in ref_indices if x != -1]
777
+ mt_indices = [x for x in mt_indices if x != -1]
778
+ aligned_src = "" if (src_indices and src_indices[0] < 0) else " ".join([src_sentences[i] for i in src_indices])
779
+ aligned_ref = " ".join([ref_sentences[i] for i in ref_indices]) if ref_indices else ""
780
+ aligned_mt = " ".join([mt_sentences[i] for i in mt_indices]) if mt_indices else ""
781
+ adjusted_src.append(aligned_src)
782
+ adjusted_ref.append(aligned_ref)
783
+ adjusted_mt.append(aligned_mt)
784
+
785
+ # Create sliding windows
786
+ src_windows = sliding_windows(adjusted_src, WINDOW_SIZE)
787
+ ref_windows = sliding_windows(adjusted_ref, WINDOW_SIZE)
788
+ mt_windows = sliding_windows(adjusted_mt, WINDOW_SIZE)
789
+ qe_src_windows = sliding_windows(non_adjusted_src, WINDOW_SIZE)
790
+ qe_mt_windows = sliding_windows(non_adjusted_mt, WINDOW_SIZE)
791
+
792
+ # Compute metrics and save window information
793
+ compute_metrics(
794
+ row[src_col], row[ref_col], row[mt_col],
795
+ src_windows, ref_windows, mt_windows,
796
+ qe_src_windows, qe_mt_windows,
797
+ paragraph_id, mt_col
798
+ )
799
+
800
+ output_dir = os.path.join(SAVE_FOLDER, "windows")
801
+ save_windows_to_file(paragraph_id, adjusted_src, adjusted_ref, adjusted_mt,
802
+ src_windows, ref_windows, mt_windows,
803
+ qe_src_windows, qe_mt_windows, output_dir, output_name=mt_col)
804
+
805
+
806
+ def parallel_paragraph_level_score(args: tuple) -> None:
807
+ """
808
+ Process a single paragraph in parallel. If an exception occurs, it prints an error message.
809
+
810
+ Args:
811
+ args (tuple): (row (pd.Series), paragraph_id (int))
812
+ """
813
+ row, paragraph_id = args
814
+ try:
815
+ paragraph_level_score(row, paragraph_id, mt_col=TARGET, src_col=SRC_LANG)
816
+ except Exception as e:
817
+ print(f"Error processing paragraph {paragraph_id}: {e}")
818
+ print(f"{TARGET} result cannot be aligned in paragraph {paragraph_id}\n")
819
+
820
+
821
+ # -----------------------------------------------------------------------------
822
+ # New Function: Flexible Evaluation (Reference-Free or Full Evaluation)
823
+ # -----------------------------------------------------------------------------
824
+ def evaluate_score(src: str, tgt: str, ref: Optional[str] = None) -> dict:
825
+ """
826
+ Evaluate quality scores for given source and target texts.
827
+ If a reference is provided, full evaluation is performed (including src-ref alignment);
828
+ otherwise, reference-free evaluation is conducted using only src and tgt.
829
+
830
+ Args:
831
+ src (str): Source text.
832
+ tgt (str): Target (MT) text.
833
+ ref (Optional[str]): Reference text (if provided).
834
+
835
+ Returns:
836
+ dict: Dictionary of evaluation scores.
837
+ """
838
+ # Full evaluation (with reference)
839
+ if ref is not None:
840
+ src_sentences = segment_sentences_by_punctuation(src, SRC_LANG)
841
+ ref_sentences = segment_sentences_by_punctuation(ref, LANG)
842
+ tgt_sentences = segment_sentences_by_punctuation(tgt, LANG)
843
+
844
+ src_txt = preprocess_sentences(src_sentences)
845
+ ref_txt = preprocess_sentences(ref_sentences)
846
+ tgt_txt = preprocess_sentences(tgt_sentences)
847
+
848
+ src_overlap, src_embed = generate_overlap_and_embedding(src_txt)
849
+ ref_overlap, ref_embed = generate_overlap_and_embedding(ref_txt)
850
+ tgt_overlap, tgt_embed = generate_overlap_and_embedding(tgt_txt)
851
+
852
+ src_ref_alignments = run_vecalign_explore(src_txt, ref_txt, src_overlap, ref_overlap, src_embed, ref_embed)
853
+ src_mt_alignments = run_vecalign_explore(src_txt, tgt_txt, src_overlap, tgt_overlap, src_embed, tgt_embed)
854
+
855
+ non_adjusted_src = []
856
+ non_adjusted_mt = []
857
+ for s_indices, t_indices in src_mt_alignments:
858
+ filtered_t_indices = [x for x in t_indices if x != -1]
859
+ aligned_src = " ".join([src_sentences[i] for i in s_indices]) if s_indices else ""
860
+ aligned_mt = " ".join([tgt_sentences[i] for i in filtered_t_indices]) if filtered_t_indices else ""
861
+ non_adjusted_src.append(aligned_src)
862
+ non_adjusted_mt.append(aligned_mt)
863
+
864
+ common_alignments = find_common_alignments(src_ref_alignments, src_mt_alignments)
865
+ adjusted_src, adjusted_ref, adjusted_mt = [], [], []
866
+ for s_indices, r_indices, t_indices in common_alignments:
867
+ r_indices = [x for x in r_indices if x != -1]
868
+ t_indices = [x for x in t_indices if x != -1]
869
+ aligned_src = "" if (s_indices and s_indices[0] < 0) else " ".join([src_sentences[i] for i in s_indices])
870
+ aligned_ref = " ".join([ref_sentences[i] for i in r_indices]) if r_indices else ""
871
+ aligned_mt = " ".join([tgt_sentences[i] for i in t_indices]) if t_indices else ""
872
+ adjusted_src.append(aligned_src)
873
+ adjusted_ref.append(aligned_ref)
874
+ adjusted_mt.append(aligned_mt)
875
+
876
+ src_windows = sliding_windows(adjusted_src, WINDOW_SIZE)
877
+ ref_windows = sliding_windows(adjusted_ref, WINDOW_SIZE)
878
+ tgt_windows = sliding_windows(adjusted_mt, WINDOW_SIZE)
879
+ qe_src_windows = sliding_windows(non_adjusted_src, WINDOW_SIZE)
880
+ qe_mt_windows = sliding_windows(non_adjusted_mt, WINDOW_SIZE)
881
+
882
+ # Use paragraph_id=0 for single evaluation
883
+ scores_data = compute_metrics(src, ref, tgt,
884
+ src_windows, ref_windows, tgt_windows,
885
+ qe_src_windows, qe_mt_windows,
886
+ paragraph_id=0, mt_col=TARGET)
887
+ return scores_data
888
+
889
+ # Reference-free evaluation
890
+ else:
891
+ src_sentences = segment_sentences_by_punctuation(src, SRC_LANG)
892
+ tgt_sentences = segment_sentences_by_punctuation(tgt, LANG)
893
+
894
+ src_txt = preprocess_sentences(src_sentences)
895
+ tgt_txt = preprocess_sentences(tgt_sentences)
896
+
897
+ src_overlap, src_embed = generate_overlap_and_embedding(src_txt)
898
+ tgt_overlap, tgt_embed = generate_overlap_and_embedding(tgt_txt)
899
+
900
+ src_mt_alignments = run_vecalign_explore(src_txt, tgt_txt, src_overlap, tgt_overlap, src_embed, tgt_embed)
901
+
902
+ non_adjusted_src = []
903
+ non_adjusted_mt = []
904
+ for s_indices, t_indices in src_mt_alignments:
905
+ filtered_t_indices = [x for x in t_indices if x != -1]
906
+ aligned_src = " ".join([src_sentences[i] for i in s_indices]) if s_indices else ""
907
+ aligned_mt = " ".join([tgt_sentences[i] for i in filtered_t_indices]) if filtered_t_indices else ""
908
+ non_adjusted_src.append(aligned_src)
909
+ non_adjusted_mt.append(aligned_mt)
910
+
911
+ # In reference-free mode, only compute QE evaluation.
912
+ qe_src_windows = sliding_windows(non_adjusted_src, WINDOW_SIZE)
913
+ qe_mt_windows = sliding_windows(non_adjusted_mt, WINDOW_SIZE)
914
+
915
+ scores_data = compute_metrics_reference_free(src_windows=[], mt_windows=[],
916
+ qe_src_windows=qe_src_windows, qe_mt_windows=qe_mt_windows,
917
+ paragraph_id=0, mt_col=TARGET)
918
+ return scores_data
919
+
920
+
921
+ def aggregate_scores_and_merge(evaluated_file_path: str, save_folder: str, target: str) -> dict:
922
+ """
923
+ Read scores for each paragraph, aggregate the results, and save them as a CSV.
924
+
925
+ Args:
926
+ evaluated_file_path (str): Path to the original CSV file.
927
+ save_folder (str): Folder where scores are saved.
928
+ target (str): MT target name.
929
+
930
+ Returns:
931
+ dict: Overall average scores for each metric.
932
+ """
933
+ df = pd.read_csv(evaluated_file_path)
934
+ df['comet'] = 0.0
935
+ df['comet_qe'] = 0.0
936
+ df['sentences_zero_ratio'] = 0.0
937
+ df['windows_zero_ratio'] = 0.0
938
+ df['windows_qe_zero_ratio'] = 0.0
939
+
940
+ scores_dir = os.path.join(save_folder, 'scores')
941
+ total_scores = {
942
+ 'comet': [],
943
+ 'comet_qe': [],
944
+ 'sentences_zero_ratio': [],
945
+ 'windows_zero_ratio': [],
946
+ 'windows_qe_zero_ratio': []
947
+ }
948
+
949
+ for idx in df.index:
950
+ score_file = os.path.join(scores_dir, f'scores_{idx}_{target}.json')
951
+ if os.path.exists(score_file):
952
+ with open(score_file, 'r', encoding='utf-8') as f:
953
+ scores = json.load(f)
954
+ df.at[idx, 'comet'] = scores.get('avg_comet', 0)
955
+ df.at[idx, 'comet_qe'] = scores.get('avg_comet_qe', 0)
956
+ df.at[idx, 'sentences_zero_ratio'] = scores.get('sentences_zero_ratio', 0)
957
+ df.at[idx, 'windows_zero_ratio'] = scores.get('windows_zero_ratio', 0)
958
+ df.at[idx, 'windows_qe_zero_ratio'] = scores.get('windows_qe_zero_ratio', 0)
959
+
960
+ total_scores['comet'].append(scores.get('avg_comet', 0))
961
+ total_scores['comet_qe'].append(scores.get('avg_comet_qe', 0))
962
+ total_scores['sentences_zero_ratio'].append(scores.get('sentences_zero_ratio', 0))
963
+ total_scores['windows_zero_ratio'].append(scores.get('windows_zero_ratio', 0))
964
+ total_scores['windows_qe_zero_ratio'].append(scores.get('windows_qe_zero_ratio', 0))
965
+
966
+ overall_scores = {metric: (sum(vals) / len(vals) if vals else 0) for metric, vals in total_scores.items()}
967
+
968
+ output_path = os.path.join(save_folder, f'evaluated_results_{target}.csv')
969
+ df.to_csv(output_path, index=False)
970
+ return overall_scores
971
+
972
+
973
+ # -----------------------------------------------------------------------------
974
+ # Global Parameters
975
+ # -----------------------------------------------------------------------------
976
+ set_seed(42)
977
+
978
+ # Set up argparse with defaults for file, target_column, and save folder.
979
+ parser = argparse.ArgumentParser(description="Set TARGET_FILE, TARGET_COLUMN, and TASK_LANGUAGE")
980
+ parser.add_argument("--file", type=str, default="", help="(Optional) Set the MT target file")
981
+ parser.add_argument("--target_column", type=str, default="", help="(Optional) Set the MT target column")
982
+ parser.add_argument("--save", type=str, default="./", help="(Optional) Set the save folder")
983
+ parser.add_argument("--src_language", type=str, required=True, help="Set the task language (English, Russian, German)")
984
+ parser.add_argument("--task_language", type=str, required=True, help="Set the task language (English, Russian, German)")
985
+
986
+ args = parser.parse_args()
987
+
988
+ TARGET = args.target_column
989
+ TASK_LANGUAGE = args.task_language
990
+ SRC_LANGUAGE = args.src_language
991
+ print(f"TARGET: {TARGET}")
992
+ print(f"TASK_LANGUAGE: {TASK_LANGUAGE}")
993
+
994
+ if TASK_LANGUAGE == "English":
995
+ LANG = 'en'
996
+ elif TASK_LANGUAGE == "Russian":
997
+ LANG = 'ru'
998
+ elif TASK_LANGUAGE == "German":
999
+ LANG = 'de'
1000
+ elif TASK_LANGUAGE == "Japanese":
1001
+ LANG = 'ja'
1002
+ elif TASK_LANGUAGE == "Spanish":
1003
+ LANG = 'es'
1004
+ elif TASK_LANGUAGE == "Chinese":
1005
+ LANG = 'zh'
1006
+ else:
1007
+ raise ValueError("Unsupported TASK_LANGUAGE.")
1008
+
1009
+ if SRC_LANGUAGE == "English":
1010
+ SRC_LANG = 'en'
1011
+ elif SRC_LANGUAGE == "Russian":
1012
+ SRC_LANG = 'ru'
1013
+ elif SRC_LANGUAGE == "German":
1014
+ SRC_LANG = 'de'
1015
+ elif SRC_LANGUAGE == "Japanese":
1016
+ SRC_LANG = 'ja'
1017
+ elif SRC_LANGUAGE == "Spanish":
1018
+ SRC_LANG = 'es'
1019
+ elif SRC_LANGUAGE == "Chinese":
1020
+ SRC_LANG = 'zh'
1021
+ else:
1022
+ raise ValueError("Unsupported TASK_LANGUAGE.")
1023
+
1024
+ # File and folder path settings
1025
+ evaluated_file_path = args.file # May be empty if not provided
1026
+ WINDOW_SIZE = 3
1027
+ SEPARATOR = "</s>"
1028
+ SAVE_FOLDER = args.save
1029
+ GPU_ID = "1"
1030
+ COMET_MODEL = "Unbabel/wmt22-comet-da"
1031
+ COMET_QE_MODEL = "Unbabel/wmt22-cometkiwi-da"
1032
+
1033
+ if not os.path.exists(SAVE_FOLDER):
1034
+ os.makedirs(SAVE_FOLDER)
1035
+ print(f"Folder '{SAVE_FOLDER}' created")
1036
+ else:
1037
+ print(f"Folder '{SAVE_FOLDER}' already exists")
1038
+
1039
+ # Load Spacy models based on task language
1040
+ if TASK_LANGUAGE == "English":
1041
+ mt_nlp = spacy.load("en_core_web_sm")
1042
+ elif TASK_LANGUAGE == "Russian":
1043
+ mt_nlp = spacy.load("ru_core_news_sm")
1044
+ elif TASK_LANGUAGE == "German":
1045
+ mt_nlp = spacy.load("de_core_news_sm")
1046
+ elif TASK_LANGUAGE == "Japanese":
1047
+ mt_nlp = spacy.load("ja_core_news_sm")
1048
+ elif TASK_LANGUAGE == "Spanish":
1049
+ mt_nlp = spacy.load("es_core_news_sm")
1050
+ elif TASK_LANGUAGE == "Chinese":
1051
+ mt_nlp = spacy.load("zh_core_web_sm")
1052
+
1053
+ if SRC_LANGUAGE == "English":
1054
+ src_nlp = spacy.load("en_core_web_sm")
1055
+ elif SRC_LANGUAGE == "Russian":
1056
+ src_nlp = spacy.load("ru_core_news_sm")
1057
+ elif SRC_LANGUAGE == "German":
1058
+ src_nlp = spacy.load("de_core_news_sm")
1059
+ elif SRC_LANGUAGE == "Japanese":
1060
+ src_nlp = spacy.load("ja_core_news_sm")
1061
+ elif SRC_LANGUAGE == "Spanish":
1062
+ src_nlp = spacy.load("es_core_news_sm")
1063
+ elif SRC_LANGUAGE == "Chinese":
1064
+ src_nlp = spacy.load("zh_core_web_sm")
1065
+
1066
+ # -----------------------------------------------------------------------------
1067
+ # Main Process: Parallel processing of paragraphs and score aggregation
1068
+ # Command: export LASER="/path/to/laser/"
1069
+ # Command for evaluate csv:
1070
+ # python long_context_eval.py --file eval_en_ja.csv --target_column mpc --save eval_en_ja --src_language English --task_language Japanese
1071
+ # -----------------------------------------------------------------------------
1072
+ if __name__ == "__main__":
1073
+ data = pd.read_csv(evaluated_file_path)
1074
+ pool_args = [(row, idx) for idx, row in data.iterrows()]
1075
+ with Pool(2) as pool:
1076
+ pool.map(parallel_paragraph_level_score, pool_args)
1077
+ overall_scores = aggregate_scores_and_merge(evaluated_file_path, SAVE_FOLDER, TARGET)
1078
+ timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
1079
+ output_result = f"{TARGET}: {TASK_LANGUAGE} Overall scores: {overall_scores}, time: {timestamp}\n"
1080
+ print(output_result)
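One detail of the scoring above that is easy to miss: windows that could not be aligned get a 0.0 inserted into the score list, and compute_metrics averages over the full list, so unalignable windows pull the paragraph-level COMET figure down. A toy illustration with made-up numbers:

comet_scores = [0.82, 0.0, 0.79, 0.85]                              # 0.0 inserted for an empty window
avg_comet = sum(comet_scores) / len(comet_scores)                   # 0.615
windows_zero_ratio = comet_scores.count(0.0) / len(comet_scores)    # 0.25
print(avg_comet, windows_zero_ratio)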
vecalign/memory2csv.py ADDED
@@ -0,0 +1,64 @@
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ import argparse
5
+
6
+ def process_csv(num, input_csv, output_csv, column_name):
7
+ # Read the input CSV file
8
+ try:
9
+ df = pd.read_csv(input_csv)
10
+ except Exception as e:
11
+ print(f"Error reading {input_csv}: {e}")
12
+ return
13
+
14
+ # Add the target column
15
+ df[column_name] = None
16
+
17
+ # Process each row by using the index to construct the JSON file path
18
+ for idx, row in df.iterrows():
19
+ json_path = os.path.join(os.path.splitext(input_csv)[0], f"metadata_{idx}_iter_{num}.json")
20
+
21
+ # Check if the file exists
22
+ if not os.path.exists(json_path):
23
+ print(f"File not found: {json_path}")
24
+ df.at[idx, column_name] = None
25
+ continue
26
+
27
+ # Open and read the JSON file
28
+ try:
29
+ with open(json_path, 'r', encoding='utf-8') as f:
30
+ data = json.load(f)
31
+ except Exception as e:
32
+ print(f"Error reading {json_path}: {e}")
33
+ df.at[idx, column_name] = None
34
+ continue
35
+
36
+ # Extract the value from final_translations_record
37
+ final_record = data.get("final_translations_record", [])
38
+ if isinstance(final_record, list) and len(final_record) > 0:
39
+ value = final_record[0]
40
+ else:
41
+ value = None
42
+
43
+ # Write the value into the target column
44
+ df.at[idx, column_name] = value
45
+
46
+ # Save the result to output CSV
47
+ try:
48
+ df.to_csv(output_csv, index=False)
49
+ print(f"Saved successfully: {output_csv}")
50
+ except Exception as e:
51
+ print(f"Error saving {output_csv}: {e}")
52
+
53
+
54
+ # Example command: python memory2csv.py --num 5 --input_csv valid_en_ja.csv --output_csv eval_en_ja.csv --column_name mpc
55
+ if __name__ == '__main__':
56
+ parser = argparse.ArgumentParser(description='Process CSV and extract data from JSON files.')
57
+ parser.add_argument('--num', type=int, required=True, help='Iteration number used in JSON filenames')
58
+ parser.add_argument('--input_csv', type=str, required=True, help='Path to input CSV file')
59
+ parser.add_argument('--output_csv', type=str, required=True, help='Path to save the output CSV file')
60
+ parser.add_argument('--column_name', type=str, required=True, help='Column name to store extracted values')
61
+
62
+ args = parser.parse_args()
63
+
64
+ process_csv(args.num, args.input_csv, args.output_csv, args.column_name)
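For reference, process_csv only reads final_translations_record[0] from each metadata_{idx}_iter_{num}.json stored in a folder named after the input CSV stem. A hypothetical minimal file (folder name and content invented for illustration) could be written like this:

import json
import os

os.makedirs("valid_en_ja", exist_ok=True)  # folder name = input CSV stem
metadata = {"final_translations_record": ["This is the final translation."]}  # invented content
with open(os.path.join("valid_en_ja", "metadata_0_iter_5.json"), "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False)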
vecalign/metricx24/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
vecalign/metricx24/evaluate.py ADDED
@@ -0,0 +1,155 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Evaluates the predictions from a MetricX model."""
16
+
17
+ import dataclasses
18
+ import json
19
+ import os
20
+ from typing import Any, Tuple
21
+
22
+ from mt_metrics_eval import data
23
+ from mt_metrics_eval import stats
24
+ from mt_metrics_eval import tau_optimization
25
+ import numpy as np
26
+ import scipy.stats
27
+ import transformers
28
+
29
+
30
+ @dataclasses.dataclass
31
+ class Arguments:
32
+ dataset: str = dataclasses.field(metadata={"help": "The MTME dataset."})
33
+
34
+ lp: str = dataclasses.field(metadata={"help": "The language pair."})
35
+
36
+ input_file: str = dataclasses.field(metadata={"help": "The input file."})
37
+
38
+ output_file: str = dataclasses.field(
39
+ metadata={"help": "The output file with evaluation metrics."},
40
+ )
41
+
42
+
43
+ def _convert_to_matrices(
44
+ instances: list[dict[str, Any]]
45
+ ) -> Tuple[np.ndarray, np.ndarray]:
46
+ """Converts the instances to metric and human score matrices."""
47
+ system_id_to_row = {}
48
+ segment_id_to_col = {}
49
+
50
+ for instance in instances:
51
+ system_id = instance["system_id"]
52
+ segment_id = instance["segment_id"]
53
+ if system_id not in system_id_to_row:
54
+ system_id_to_row[system_id] = len(system_id_to_row)
55
+ if segment_id not in segment_id_to_col:
56
+ segment_id_to_col[segment_id] = len(segment_id_to_col)
57
+
58
+ num_rows = len(system_id_to_row)
59
+ num_cols = len(segment_id_to_col)
60
+ # MTME requires that missing scores must be None, not NaN.
61
+ metric_scores = np.full((num_rows, num_cols), None, dtype=np.dtype(object))
62
+ human_scores = np.full((num_rows, num_cols), None, dtype=np.dtype(object))
63
+
64
+ for instance in instances:
65
+ system_id = instance["system_id"]
66
+ segment_id = instance["segment_id"]
67
+ row = system_id_to_row[system_id]
68
+ col = segment_id_to_col[segment_id]
69
+ metric_scores[row, col] = (
70
+ -1 * instance["prediction"]
71
+ ) # negate so higher is better
72
+ human_scores[row, col] = instance["label"]
73
+
74
+ return metric_scores, human_scores
75
+
76
+
77
+ def main() -> None:
78
+ parser = transformers.HfArgumentParser(Arguments)
79
+ (args,) = parser.parse_args_into_dataclasses()
80
+
81
+ # Download MTME data
82
+ data.Download()
83
+
84
+ # Load the data and filter outliers, the human system corresponding to the
85
+ # references, and any system that doesn't have any MQM scores.
86
+ evs = data.EvalSet(args.dataset, args.lp)
87
+ bad_systems = {evs.std_ref} | evs.outlier_sys_names
88
+ mqm = evs.Scores("seg", "mqm")
89
+ for system_id, scores in mqm.items():
90
+ if not any(score is not None for score in scores):
91
+ bad_systems.add(system_id)
92
+
93
+ instances = []
94
+ with open(args.input_file, "r") as f:
95
+ for line in f:
96
+ instance = json.loads(line)
97
+ if instance["system_id"] in bad_systems:
98
+ continue
99
+ instances.append(instance)
100
+
101
+ metric_seg_scores, human_seg_scores = _convert_to_matrices(instances)
102
+ metric_sys_scores = np.mean(metric_seg_scores, axis=1)
103
+ human_sys_scores = np.apply_along_axis(
104
+ lambda row: np.mean(row[row != None]), 1, human_seg_scores # pylint: disable=singleton-comparison
105
+ )
106
+
107
+ # Segment-level correlations.
108
+ mask = human_seg_scores.reshape(-1) != None # pylint: disable=singleton-comparison
109
+ seg_no_grouping_pearson, _ = scipy.stats.pearsonr(
110
+ metric_seg_scores.reshape(-1)[mask],
111
+ human_seg_scores.reshape(-1)[mask],
112
+ )
113
+ tie_calib_result = tau_optimization.tau_optimization(
114
+ metric_seg_scores.T,
115
+ human_seg_scores.T,
116
+ tau_optimization.TauSufficientStats.acc_23,
117
+ )
118
+
119
+ # System-level correlations.
120
+ sys_pearson, _ = scipy.stats.pearsonr(human_sys_scores, metric_sys_scores)
121
+ agree, num_pairs = stats.Agreement(human_sys_scores, metric_sys_scores)
122
+ sys_accuracy = agree / num_pairs
123
+ sys_spa = stats.PairwiseConfidenceError(
124
+ human_seg_scores.reshape(-1),
125
+ metric_seg_scores.reshape(-1),
126
+ human_seg_scores.shape[0],
127
+ filter_nones=True,
128
+ )[0]
129
+
130
+ metrics = {
131
+ "system_level": {
132
+ "pearson": sys_pearson,
133
+ "accuracy": sys_accuracy,
134
+ "spa": sys_spa,
135
+ },
136
+ "segment_level_no_grouping": {
137
+ "pearson": seg_no_grouping_pearson,
138
+ },
139
+ "segment_level_group_by_item": {
140
+ "accuracy": tie_calib_result.best_tau,
141
+ "epsilon": tie_calib_result.best_threshold,
142
+ },
143
+ }
144
+ print(json.dumps(metrics, indent=2))
145
+
146
+ if args.output_file:
147
+ dirname = os.path.dirname(args.output_file)
148
+ if dirname:
149
+ os.makedirs(dirname, exist_ok=True)
150
+ with open(args.output_file, "w") as out:
151
+ out.write(json.dumps(metrics))
152
+
153
+
154
+ if __name__ == "__main__":
155
+ main()
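
The script above is configured entirely through HfArgumentParser flags. A hedged invocation sketch, with the MTME dataset name, language pair, and file paths as placeholders rather than values taken from this commit:

python -m vecalign.metricx24.evaluate \
    --dataset wmt23 \
    --lp en-de \
    --input_file predictions.jsonl \
    --output_file eval_metrics.json

Each line of the input JSONL is expected to carry "system_id", "segment_id", "prediction", and "label" fields, as read in main().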
vecalign/metricx24/evaluate_wmt24.py ADDED
@@ -0,0 +1,116 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Evaluates the predictions from a MetricX model."""
16
+
17
+ import collections
18
+ import dataclasses
19
+ import json
20
+ import os
21
+
22
+ from mt_metrics_eval import data
23
+ from mt_metrics_eval import tasks
24
+ import numpy as np
25
+ import transformers
26
+
27
+
28
+ @dataclasses.dataclass
29
+ class Arguments:
30
+ en_de: str = dataclasses.field(metadata={"help": "The en-de input file."})
31
+ en_es: str = dataclasses.field(metadata={"help": "The en-es input file."})
32
+ ja_zh: str = dataclasses.field(metadata={"help": "The ja-zh input file."})
33
+
34
+ output_file: str = dataclasses.field(
35
+ metadata={"help": "The output file with evaluation metrics."},
36
+ )
37
+
38
+
39
+ def _load_scores(
40
+ input_file: str, num_segments: int,
41
+ ) -> tuple[dict[str, list[float]], dict[str, list[float]]]:
42
+ """Loads segment and system-level scores."""
43
+ scores = collections.defaultdict(dict)
44
+ with open(input_file, "r") as f:
45
+ for line in f:
46
+ instance = json.loads(line)
47
+ system_id = instance["system_id"]
48
+ segment_id = instance["segment_id"]
49
+ score = -1 * instance["prediction"]
50
+ scores[system_id][segment_id] = score
51
+
52
+ seg_scores = {}
53
+ for system_id in scores:
54
+ seg_scores[system_id] = []
55
+ for segment_id in range(num_segments):
56
+ seg_scores[system_id].append(scores[system_id].get(segment_id, None))
57
+
58
+ sys_scores = {}
59
+ for system_id in seg_scores:
60
+ cur_scores = np.asarray(seg_scores[system_id])
61
+ sys_scores[system_id] = np.mean(cur_scores[cur_scores != None]) # pylint: disable=singleton-comparison
62
+
63
+ return seg_scores, sys_scores
64
+
65
+
66
+ def main() -> None:
67
+ parser = transformers.HfArgumentParser(Arguments)
68
+ (args,) = parser.parse_args_into_dataclasses()
69
+
70
+ # Download MTME data
71
+ data.Download()
72
+
73
+ metric_name = "metricx-24-v2p6"
74
+ wmt24_lps = ["en-de", "en-es", "ja-zh"]
75
+ evs_dict = {
76
+ ("wmt24", lp): data.EvalSet("wmt24", lp, True) for lp in wmt24_lps
77
+ }
78
+
79
+ segment_counts_per_lp = {}
80
+ for lp in wmt24_lps:
81
+ evs = evs_dict[("wmt24", lp)]
82
+ gold_scores = evs.Scores("seg", "mqm")
83
+ for _, scores in gold_scores.items():
84
+ segment_counts_per_lp[lp] = len(scores)
85
+ break  # every system has the same number of segments; one is enough
86
+ scores = {
87
+ "en-de": _load_scores(args.en_de, segment_counts_per_lp["en-de"]),
88
+ "en-es": _load_scores(args.en_es, segment_counts_per_lp["en-es"]),
89
+ "ja-zh": _load_scores(args.ja_zh, segment_counts_per_lp["ja-zh"]),
90
+ }
91
+
92
+ for lp in wmt24_lps:
93
+ evs = evs_dict[("wmt24", lp)]
94
+ seg_scores, sys_scores = scores[lp]
95
+ evs._scores["seg"][f"{metric_name}-{evs.std_ref}"] = seg_scores # pylint: disable=protected-access
96
+ evs._scores["sys"][f"{metric_name}-{evs.std_ref}"] = sys_scores # pylint: disable=protected-access
97
+ evs._metric_names.add(f"{metric_name}-{evs.std_ref}") # pylint: disable=protected-access
98
+ evs._metric_basenames.add(metric_name) # pylint: disable=protected-access
99
+
100
+ for evs in evs_dict.values():
101
+ evs.SetPrimaryMetrics(evs.primary_metrics | {metric_name})
102
+
103
+ wmt24_tasks, wts = tasks.WMT24(wmt24_lps, k=0)
104
+ results = wmt24_tasks.Run(eval_set_dict=evs_dict)
105
+ metrics = {"average_correlation": results.AverageCorrs(wts)[metric_name]}
106
+
107
+ if args.output_file:
108
+ dirname = os.path.dirname(args.output_file)
109
+ if dirname:
110
+ os.makedirs(dirname, exist_ok=True)
111
+ with open(args.output_file, "w") as out:
112
+ out.write(json.dumps(metrics, indent=2))
113
+
114
+
115
+ if __name__ == "__main__":
116
+ main()
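
A similar hedged sketch for the WMT24 variant, which expects one predictions file per language pair (file names below are placeholders):

python -m vecalign.metricx24.evaluate_wmt24 \
    --en_de preds.en-de.jsonl \
    --en_es preds.en-es.jsonl \
    --ja_zh preds.ja-zh.jsonl \
    --output_file wmt24_metrics.json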
vecalign/metricx24/models.py ADDED
@@ -0,0 +1,198 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Model classes for MetricX, modified from the T5 versions in HF."""
16
+
17
+ import copy
18
+ import dataclasses
19
+ from typing import Optional, Tuple, Union
20
+ import warnings
21
+
22
+ import torch
23
+ from torch import nn
24
+ import transformers
25
+ import transformers.modeling_outputs
26
+
27
+ BaseModelOutput = transformers.modeling_outputs.BaseModelOutput
28
+ ModelOutput = transformers.modeling_outputs.ModelOutput
29
+
30
+ MT5Config = transformers.models.mt5.modeling_mt5.MT5Config
31
+ MT5PreTrainedModel = transformers.models.mt5.modeling_mt5.MT5PreTrainedModel
32
+ MT5Stack = transformers.models.mt5.modeling_mt5.MT5Stack
33
+
34
+ __HEAD_MASK_WARNING_MSG = (
35
+ transformers.models.mt5.modeling_mt5.__HEAD_MASK_WARNING_MSG # pylint: disable=protected-access
36
+ )
37
+
38
+
39
+ @dataclasses.dataclass
40
+ class MT5ForRegressionOutput(ModelOutput):
41
+ loss: Optional[torch.FloatTensor] = None
42
+ predictions: torch.FloatTensor = None
43
+
44
+
45
+ class MT5ForRegression(MT5PreTrainedModel):
46
+ """MT5 model for regression."""
47
+
48
+ def __init__(self, config: MT5Config):
49
+ super().__init__(config)
50
+ self.model_dim = config.d_model
51
+
52
+ self.shared = nn.Embedding(config.vocab_size, config.d_model)
53
+
54
+ encoder_config = copy.deepcopy(config)
55
+ encoder_config.is_decoder = False
56
+ encoder_config.use_cache = False
57
+ encoder_config.is_encoder_decoder = False
58
+ self.encoder = MT5Stack(encoder_config, self.shared)
59
+
60
+ decoder_config = copy.deepcopy(config)
61
+ decoder_config.is_decoder = True
62
+ decoder_config.is_encoder_decoder = False
63
+ decoder_config.num_layers = config.num_decoder_layers
64
+ self.decoder = MT5Stack(decoder_config, self.shared)
65
+
66
+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
67
+
68
+ # Initialize weights and apply final processing
69
+ self.post_init()
70
+
71
+ # Model parallel
72
+ self.model_parallel = False
73
+ self.device_map = None
74
+
75
+ def forward(
76
+ self,
77
+ input_ids: Optional[torch.LongTensor] = None,
78
+ attention_mask: Optional[torch.FloatTensor] = None,
79
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
80
+ head_mask: Optional[torch.FloatTensor] = None,
81
+ decoder_head_mask: Optional[torch.FloatTensor] = None,
82
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
83
+ encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
84
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
85
+ inputs_embeds: Optional[torch.FloatTensor] = None,
86
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
87
+ labels: Optional[torch.FloatTensor] = None,
88
+ use_cache: Optional[bool] = None,
89
+ output_attentions: Optional[bool] = None,
90
+ output_hidden_states: Optional[bool] = None,
91
+ return_dict: Optional[bool] = None,
92
+ ) -> Union[Tuple[torch.FloatTensor], MT5ForRegressionOutput]:
93
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
94
+ return_dict = (
95
+ return_dict if return_dict is not None else self.config.use_return_dict
96
+ )
97
+
98
+ # FutureWarning: head_mask was separated into two input args - head_mask,
99
+ # decoder_head_mask
100
+ if head_mask is not None and decoder_head_mask is None:
101
+ if self.config.num_layers == self.config.num_decoder_layers:
102
+ warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
103
+ decoder_head_mask = head_mask
104
+
105
+ # Encode if needed (training, first prediction pass)
106
+ if encoder_outputs is None:
107
+ # Convert encoder inputs in embeddings if needed
108
+ encoder_outputs = self.encoder(
109
+ input_ids=input_ids,
110
+ attention_mask=attention_mask,
111
+ inputs_embeds=inputs_embeds,
112
+ head_mask=head_mask,
113
+ output_attentions=output_attentions,
114
+ output_hidden_states=output_hidden_states,
115
+ return_dict=return_dict,
116
+ )
117
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
118
+ encoder_outputs = BaseModelOutput(
119
+ last_hidden_state=encoder_outputs[0],
120
+ hidden_states=encoder_outputs[1]
121
+ if len(encoder_outputs) > 1
122
+ else None,
123
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
124
+ )
125
+
126
+ hidden_states = encoder_outputs[0]
127
+
128
+ if self.model_parallel:
129
+ torch.cuda.set_device(self.decoder.first_device)
130
+
131
+ # Create 1 step of dummy input for the decoder.
132
+ batch_size = input_ids.size(0)
133
+ decoder_input_ids = torch.LongTensor([0]).repeat(batch_size).reshape(-1, 1)
134
+ if torch.cuda.is_available():
135
+ decoder_input_ids = decoder_input_ids.to(torch.device("cuda"))
136
+
137
+ # Set device for model parallelism
138
+ if self.model_parallel:
139
+ torch.cuda.set_device(self.decoder.first_device)
140
+ hidden_states = hidden_states.to(self.decoder.first_device)
141
+ if decoder_input_ids is not None:
142
+ decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
143
+ if attention_mask is not None:
144
+ attention_mask = attention_mask.to(self.decoder.first_device)
145
+ if decoder_attention_mask is not None:
146
+ decoder_attention_mask = decoder_attention_mask.to(
147
+ self.decoder.first_device
148
+ )
149
+
150
+ # Decode
151
+ decoder_outputs = self.decoder(
152
+ input_ids=decoder_input_ids,
153
+ attention_mask=decoder_attention_mask,
154
+ inputs_embeds=decoder_inputs_embeds,
155
+ past_key_values=past_key_values,
156
+ encoder_hidden_states=hidden_states,
157
+ encoder_attention_mask=attention_mask,
158
+ head_mask=decoder_head_mask,
159
+ cross_attn_head_mask=cross_attn_head_mask,
160
+ use_cache=use_cache,
161
+ output_attentions=output_attentions,
162
+ output_hidden_states=output_hidden_states,
163
+ return_dict=return_dict,
164
+ )
165
+
166
+ sequence_output = decoder_outputs[0]
167
+
168
+ # Set device for model parallelism
169
+ if self.model_parallel:
170
+ torch.cuda.set_device(self.encoder.first_device)
171
+ self.lm_head = self.lm_head.to(self.encoder.first_device)
172
+ sequence_output = sequence_output.to(self.lm_head.weight.device)
173
+
174
+ if self.config.tie_word_embeddings:
175
+ # Rescale output before projecting on vocab
176
+ # See
177
+ # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
178
+ sequence_output = sequence_output * (self.model_dim**-0.5)
179
+
180
+ lm_logits = self.lm_head(sequence_output)
181
+
182
+ # 250089 = <extra_id_10>
183
+ predictions = lm_logits[:, 0, 250089]
184
+
185
+ # Clip to 0 to 25
186
+ predictions = torch.clamp(predictions, 0, 25)
187
+
188
+ loss = None
189
+ if labels is not None:
190
+ loss_fct = nn.MSELoss()
191
+ # move labels to correct device to enable PP
192
+ labels = labels.to(predictions.device)
193
+ loss = loss_fct(predictions.view(-1), labels.view(-1))
194
+
195
+ return MT5ForRegressionOutput(
196
+ loss=loss,
197
+ predictions=predictions,
198
+ )
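
As a minimal sketch of how MT5ForRegression can be exercised on its own (the predict.py added below does the same thing in batch via a Trainer); the tokenizer and checkpoint names are the ones used elsewhere in this commit, and the input string format follows get_dataset() in predict.py:

import torch
import transformers
from vecalign.metricx24.models import MT5ForRegression

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = transformers.AutoTokenizer.from_pretrained("google/mt5-large")
model = MT5ForRegression.from_pretrained("google/metricx-24-hybrid-large-v2p6", torch_dtype="auto")
model.to(device)
model.eval()

text = "source: Guten Morgen. candidate: Good morning."
enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=1536)
enc = {k: v[:, :-1].to(device) for k, v in enc.items()}  # drop the EOS token, as predict.py does
with torch.no_grad():
    out = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])
print(float(out.predictions[0]))  # raw MetricX error score in [0, 25]; lower is better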
vecalign/metricx24/predict.py ADDED
@@ -0,0 +1,191 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Runs inference with a MetricX model."""
16
+
17
+ import dataclasses
18
+ import json
19
+ import os
20
+
21
+ import datasets
22
+ # from metricx24 import models
23
+ from . import models
24
+ import torch
25
+ import transformers
26
+ import time
27
+
28
+
29
+ @dataclasses.dataclass
30
+ class Arguments:
31
+ """Prediction command-line arguments."""
32
+
33
+ tokenizer: str = dataclasses.field(
34
+ metadata={"help": "The name of the tokenizer"},
35
+ )
36
+
37
+ model_name_or_path: str = dataclasses.field(
38
+ metadata={
39
+ "help": (
40
+ "Path to pretrained model or model identifier from"
41
+ " huggingface.co/models"
42
+ )
43
+ },
44
+ )
45
+
46
+ max_input_length: int = dataclasses.field(
47
+ metadata={"help": "The maximum allowable input sequence length."},
48
+ )
49
+
50
+ batch_size: int = dataclasses.field(
51
+ metadata={"help": "The global prediction batch size."},
52
+ )
53
+
54
+ input_file: str = dataclasses.field(metadata={"help": "The input file."})
55
+
56
+ output_file: str = dataclasses.field(
57
+ metadata={"help": "The output file with predictions."},
58
+ )
59
+
60
+ qe: bool = dataclasses.field(
61
+ metadata={"help": "Indicates the metric is a QE metric."},
62
+ default=False,
63
+ )
64
+ device: str = dataclasses.field(
65
+ metadata={"help": "No device."},
66
+ default='0'
67
+ )
68
+
69
+
70
+ def get_dataset(
71
+ input_file: str, tokenizer, max_input_length: int, device, is_qe: bool
72
+ ):
73
+ """Gets the test dataset for prediction.
74
+
75
+ If `is_qe` is true, the input data must have "hypothesis" and "source" fields.
76
+ If it is false, there must be "hypothesis" and "reference" fields.
77
+
78
+ Args:
79
+ input_file: The path to the jsonl input file.
80
+ tokenizer: The tokenizer to use.
81
+ max_input_length: The maximum input sequence length.
82
+ device: The ID of the device to put the PyTorch tensors on.
83
+ is_qe: Indicates whether the metric is a QE metric or not.
84
+
85
+ Returns:
86
+ The dataset.
87
+ """
88
+
89
+ def _make_input(example):
90
+ if is_qe:
91
+ example["input"] = (
92
+ "source: "
93
+ + example["source"]
94
+ + " candidate: "
95
+ + example["hypothesis"]
96
+ )
97
+ else:
98
+ example["input"] = (
99
+ "source: "
100
+ + example["source"]
101
+ + " candidate: "
102
+ + example["hypothesis"]
103
+ + " reference: "
104
+ + example["reference"]
105
+ )
106
+ return example
107
+
108
+ def _tokenize(example):
109
+ return tokenizer(
110
+ example["input"],
111
+ max_length=max_input_length,
112
+ truncation=True,
113
+ padding=False,
114
+ )
115
+
116
+ def _remove_eos(example):
117
+ example["input_ids"] = example["input_ids"][:-1]
118
+ example["attention_mask"] = example["attention_mask"][:-1]
119
+ return example
120
+
121
+ ds = datasets.load_dataset("json", data_files={"test": input_file})
122
+ ds = ds.map(_make_input)
123
+ ds = ds.map(_tokenize)
124
+ ds = ds.map(_remove_eos)
125
+ ds.set_format(
126
+ type="torch",
127
+ columns=["input_ids", "attention_mask"],
128
+ device=device,
129
+ output_all_columns=True,
130
+ )
131
+ return ds
132
+
133
+
134
+ def main() -> None:
135
+ parser = transformers.HfArgumentParser(Arguments)
136
+ (args,) = parser.parse_args_into_dataclasses()
137
+
138
+ os.environ['CUDA_VISIBLE_DEVICES']=args.device
139
+ os.environ['NCCL_P2P_DISABLE'] = "1"
140
+ os.environ['NCCL_IB_DISABLE'] = "1"
141
+
142
+ if torch.cuda.is_available():
143
+ device = torch.device(f"cuda:0")
144
+ per_device_batch_size = args.batch_size // torch.cuda.device_count()
145
+ else:
146
+ device = torch.device("cpu")
147
+ per_device_batch_size = args.batch_size
148
+
149
+ tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer)
150
+
151
+ model = models.MT5ForRegression.from_pretrained(
152
+ args.model_name_or_path, torch_dtype="auto"
153
+ )
154
+
155
+ model.to(device)
156
+ model.eval()
157
+
158
+ ds = get_dataset(
159
+ args.input_file,
160
+ tokenizer,
161
+ args.max_input_length,
162
+ device,
163
+ args.qe,
164
+ )
165
+
166
+ training_args = transformers.TrainingArguments(
167
+ output_dir=os.path.dirname(args.output_file),
168
+ per_device_eval_batch_size=per_device_batch_size,
169
+ dataloader_pin_memory=False,
170
+ )
171
+ trainer = transformers.Trainer(
172
+ model=model,
173
+ args=training_args,
174
+ )
175
+ predictions, _, _ = trainer.predict(test_dataset=ds["test"])
176
+
177
+ dirname = os.path.dirname(args.output_file)
178
+ if dirname:
179
+ os.makedirs(dirname, exist_ok=True)
180
+
181
+ with open(args.output_file, "w") as out:
182
+ for pred, example in zip(predictions, ds["test"]):
183
+ example["prediction"] = float(pred)
184
+ del example["input"]
185
+ del example["input_ids"]
186
+ del example["attention_mask"]
187
+ out.write(json.dumps(example, ensure_ascii=False) + "\n")
188
+
189
+
190
+ if __name__ == "__main__":
191
+ main()
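
For reference, a hedged QE-mode input line (per the get_dataset() contract, "source" and "hypothesis" are required; "reference" is needed only when --qe is not set), followed by a command that mirrors the one plan2align.py builds in metricx_RewardModel.run_command, with placeholder file names:

{"source": "Guten Morgen.", "hypothesis": "Good morning."}

python -m vecalign.metricx24.predict \
    --tokenizer google/mt5-large \
    --model_name_or_path google/metricx-24-hybrid-large-v2p6 \
    --max_input_length 1536 --batch_size 1 \
    --input_file input.jsonl --output_file output.jsonl \
    --device 0 --qe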
vecalign/overlap.py ADDED
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Copyright 2019 Brian Thompson
5
+
6
+ Licensed under the Apache License, Version 2.0 (the "License");
7
+ you may not use this file except in compliance with the License.
8
+ You may obtain a copy of the License at
9
+
10
+ https://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+ """
18
+
19
+
20
+ import argparse
21
+
22
+ from dp_utils import yield_overlaps
23
+
24
+
25
+ def go(output_file, input_files, num_overlaps):
26
+ output = set()
27
+ for fin in input_files:
28
+ lines = open(fin, 'rt', encoding="utf-8").readlines()
29
+ for out_line in yield_overlaps(lines, num_overlaps):
30
+ output.add(out_line)
31
+
32
+ # for reproducibility
33
+ output = list(output)
34
+ output.sort()
35
+
36
+ with open(output_file, 'wt', encoding="utf-8") as fout:
37
+ for line in output:
38
+ fout.write(line + '\n')
39
+
40
+
41
+ def _main():
42
+ parser = argparse.ArgumentParser('Create text file containing overlapping sentences.',
43
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
44
+
45
+ parser.add_argument('-i', '--inputs', type=str, nargs='+',
46
+ help='input text file(s).')
47
+
48
+ parser.add_argument('-o', '--output', type=str,
49
+ help='output text file containing overlapping sentences')
50
+
51
+ parser.add_argument('-n', '--num_overlaps', type=int, default=4,
52
+ help='Maximum number of allowed overlaps.')
53
+
54
+ args = parser.parse_args()
55
+ go(output_file=args.output,
56
+ num_overlaps=args.num_overlaps,
57
+ input_files=args.inputs)
58
+
59
+
60
+ if __name__ == '__main__':
61
+ _main()
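
A usage sketch (plan2align.py drives this script with -n 10; the file name is a placeholder):

python vecalign/overlap.py -i src_sentences.txt -o src_sentences.txt.overlaps -n 10

The resulting .overlaps file is then fed to the LASER embed script before alignment.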
vecalign/plan2align.py ADDED
@@ -0,0 +1,748 @@
1
+ import openai
2
+ from openai import OpenAI
3
+ import spacy
4
+ import pandas as pd
5
+ from collections import defaultdict
6
+ import random
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import MT5Tokenizer, MT5ForConditionalGeneration
10
+ import shutil
11
+ import os
12
+ import subprocess
13
+ import json
14
+ from safetensors.torch import load_file
15
+ from transformers import AutoTokenizer, AutoModelForCausalLM
16
+ from trl import AutoModelForCausalLMWithValueHead
17
+ from huggingface_hub import login
18
+ import logging
19
+ import argparse
20
+
21
+ lang_map = {
22
+ "English": ("en", "en_core_web_sm"),
23
+ "Russian": ("ru", "ru_core_news_sm"),
24
+ "German": ("de", "de_core_news_sm"),
25
+ "Japanese": ("ja", "ja_core_news_sm"),
26
+ "Korean": ("ko", "ko_core_news_sm"),
27
+ "Spanish": ("es", "es_core_news_sm"),
28
+ "Chinese": ("zh", "zh_core_web_sm")
29
+ }
30
+
31
+ openai = OpenAI(
32
+ api_key="",
33
+ base_url="https://api.deepinfra.com/v1/openai",
34
+ )
35
+ MODEL_NAME = "google/gemma-2-9b-it"  # alternative: "meta-llama/Meta-Llama-3.1-8B-Instruct"
36
+
37
+ ################################# folder / file processing #################################
38
+
39
+ def clear_folder(folder_path):
40
+ if os.path.exists(folder_path):
41
+ shutil.rmtree(folder_path)
42
+ os.makedirs(folder_path)
43
+ else:
44
+ os.makedirs(folder_path)
45
+
46
+ def delete_files_with_mt(folder_path):
47
+ if not os.path.exists(folder_path):
48
+ print(f"Folder {folder_path} does not exist.")
49
+ return
50
+ for filename in os.listdir(folder_path):
51
+ if "mt" in filename:
52
+ file_path = os.path.join(folder_path, filename)
53
+ try:
54
+ if os.path.isfile(file_path):
55
+ os.remove(file_path)
56
+ print(f"Deleted file: {file_path}")
57
+ except Exception as e:
58
+ print(f"Failed to delete {file_path}. Reason: {e}")
59
+
60
+ ################################# reward model for ranking #################################
61
+
62
+ class metricx_RewardModel:
63
+ def __init__(self):
64
+ self.device = "cuda:0"
65
+ current_dir = os.path.dirname(os.path.abspath(__file__))
66
+ self.json_path = os.path.join(current_dir, f'json_for_metricx')
67
+ if not os.path.exists(self.json_path):
68
+ os.makedirs(self.json_path)
69
+
70
+ def get_entry(self, src, mt):
71
+ return {"source": src, "hypothesis": mt, "reference": ""}
72
+
73
+ def write_jsonl(self, src_list, mts, session_id):
74
+ with open(os.path.join(self.json_path, f"{session_id}_input.jsonl"), 'w', encoding='utf-8') as output_file:
75
+ for src, mt in zip(src_list, mts):
76
+ entry = self.get_entry(src, mt)
77
+ output_file.write(json.dumps(entry, ensure_ascii=False) + '\n')
78
+
79
+ def run_command(self, session_id):
80
+ devices_map = {'cuda:0':0, 'cuda:1':1, 'cuda:2':2, 'cuda:3':3}
81
+ command = [
82
+ "python", "-m", "vecalign.metricx24.predict",
83
+ "--tokenizer", "google/mt5-large",
84
+ "--model_name_or_path", "google/metricx-24-hybrid-large-v2p6",
85
+ "--max_input_length", "1536",
86
+ "--batch_size", "1",
87
+ "--input_file", os.path.join(self.json_path, f"{session_id}_input.jsonl"),
88
+ "--output_file", os.path.join(self.json_path, f"{session_id}_output.jsonl"),
89
+ "--device", f"{devices_map.get(self.device, 0)}",
90
+ "--qe"
91
+ ]
92
+ subprocess.run(command)
93
+
94
+ def get_predict(self, session_id):
95
+ scores = []
96
+ with open(os.path.join(self.json_path, f"{session_id}_output.jsonl"), 'r', encoding='utf-8') as new_file:
97
+ for line in new_file:
98
+ entry = json.loads(line)
99
+ score = entry.get('prediction', None)
100
+ scores.append(score)
101
+ clear_folder(self.json_path)
102
+ return scores
103
+
104
+ def reward_fn_batch(self, language, src_list, mts, session_id):
105
+ self.write_jsonl(src_list, mts, session_id)
106
+ self.run_command(session_id)
107
+ scores = self.get_predict(session_id)
108
+ rewards = [1 - (score / 25) for score in scores]
109
+ return rewards
110
+
111
+ reward_model = metricx_RewardModel()
112
+
113
+ def batch_rm_find_best_translation(evals, language, session_id):
114
+ """
115
+ evals: list of (src, [translation1, translation2, ...])
116
+ Return the translation with the highest reward in each group that meets the THRESHOLD, along with its score.
117
+ Otherwise, return (None, score), where score is the highest score in that group.
118
+ """
119
+ src_list = []
120
+ mt_list = []
121
+ counts = []
122
+ for src, translations in evals:
123
+ counts.append(len(translations))
124
+ for mt in translations:
125
+ src_list.append(src)
126
+ mt_list.append(mt)
127
+ rewards = reward_model.reward_fn_batch(language, src_list, mt_list, session_id)
128
+ print("rewards: ", rewards)
129
+ best_translations = []
130
+ index = 0
131
+ for (src, translations), count in zip(evals, counts):
132
+ group_rewards = rewards[index: index+count]
133
+ index += count
134
+ if count < 2:
135
+ if translations:
136
+ best_translations.append((translations[0], group_rewards[0]))
137
+ else:
138
+ best_translations.append((None, None))
139
+ else:
140
+ best_index = group_rewards.index(max(group_rewards))
141
+ best_score = group_rewards[best_index]
142
+ if best_score >= THRESHOLD:
143
+ best_translations.append((translations[best_index], best_score))
144
+ else:
145
+ best_translations.append((None, best_score))
146
+ return best_translations
147
+
148
+
149
+ def external_find_best_translation(evals, language, session_id):
150
+ """
151
+ evals: list of (src, [translation1, translation2, ...])
152
+ Return the translation with the highest reward in each group, along with its score.
153
+ Unlike batch_rm_find_best_translation, no THRESHOLD filtering is applied here.
154
+ """
155
+ src_list = []
156
+ mt_list = []
157
+ counts = []
158
+ for src, translations in evals:
159
+ counts.append(len(translations))
160
+ for mt in translations:
161
+ src_list.append(src)
162
+ mt_list.append(mt)
163
+ rewards = reward_model.reward_fn_batch(language, src_list, mt_list, session_id)
164
+ print("rewards: ", rewards)
165
+ best_translations = []
166
+ index = 0
167
+ for (src, translations), count in zip(evals, counts):
168
+ group_rewards = rewards[index: index+count]
169
+ index += count
170
+ if count < 2:
171
+ if translations:
172
+ best_translations.append((translations[0], group_rewards[0]))
173
+ else:
174
+ best_translations.append((None, None))
175
+ else:
176
+ best_index = group_rewards.index(max(group_rewards))
177
+ best_score = group_rewards[best_index]
178
+ best_translations.append((translations[best_index], best_score))
179
+ return best_translations
180
+
181
+ ################################# generating translation #################################
182
+
183
+ def translate_with_deepinfra(source_sentence, buffer, good_sent_size, src_language, tgt_language):
184
+ system_prompts = [
185
+ "You are a meticulous translator. Provide a literal, word-for-word translation that preserves the structure and meaning of each individual word.",
186
+ "You are a professional translator. Deliver a clear, formal, and precise translation that faithfully conveys the original meaning.",
187
+ "You are a creative and expressive translator. Render the text in a vivid and imaginative way, as if narrating a captivating story."
188
+ ]
189
+
190
+ context_prompt = f"Below is a specialized, intermediate translation task. The input text is a mix of {src_language} and partial {tgt_language} translations. "
191
+ context_prompt += f"In the text, some {src_language} sentences are already followed by preliminary {tgt_language} translations enclosed in parentheses. "
192
+ context_prompt += f"These provided translations are rough references – they may be incomplete, inconsistent, or not fully aligned with the original meaning.\n\n"
193
+ context_prompt += f"Your task is to produce an improved {tgt_language} translation according to the following guidelines:\n"
194
+ context_prompt += f"1. **Refinement:** For sections with existing {tgt_language} translations (in parentheses), refine and polish them so that they are fluent, accurate, and coherent, fully capturing the meaning of the corresponding {src_language} text.\n"
195
+ context_prompt += f"2. **Completion:** For sections that remain untranslated, translate the {src_language} text accurately and naturally in the specified style.\n"
196
+ context_prompt += f"3. **Translation Order and Structure Preservation:** Maintain the original order and structure of the text. Every {src_language} sentence must appear in the same sequence as in the source text, with its corresponding {tgt_language} translation (if available) inserted immediately after it. Do not rearrange or reorder any part of the text.\n"
197
+ context_prompt += f"4. **Consistency:** Ensure a uniform tone and style across the entire translation, adhering to the translator role specified.\n"
198
+ context_prompt += f"5. **Final Output:** Provide the final output as a single, well-structured {tgt_language} text. Do not include any extraneous commentary, explanations, annotations, or headers – output only the translation in the correct order.\n\n"
199
+ context_prompt += f"Note: This translation is an intermediate version that may later be merged with other translations. Focus on clarity, coherence, and fidelity to the source text.\n"
200
+
201
+ # Process the buffer to insert relevant target-language translations after their source sentences
202
+ processed_source = source_sentence
203
+ if len(buffer) > 0:
204
+ selected_keys = random.sample(buffer.keys(), min(len(buffer), good_sent_size))
205
+ for key_sentence in selected_keys:
206
+ key_sentence = key_sentence.strip()
207
+ if key_sentence and (key_sentence in source_sentence) :
208
+ translated_sentence = buffer[key_sentence][0][0]
209
+ if f"\n({translated_sentence})\n" not in processed_source:
210
+ processed_source = processed_source.replace(
211
+ key_sentence,
212
+ f"{key_sentence}\n({translated_sentence})\n"
213
+ )
214
+
215
+ context_prompt += f"\nHere is the input data for translation:\n{processed_source}\n\n"
216
+ context_prompt += "Apply the above guidelines to produce an improved, coherent translation that strictly follows the original order of the text.\n"
217
+
218
+ if len(buffer) == 0:
219
+ context_prompt = f"### Translate this from {src_language} to {tgt_language} and only output the result."
220
+ context_prompt += f"\n### {src_language}:\n {source_sentence}"
221
+ context_prompt += f"\n### {tgt_language}:\n"
222
+
223
+ print("--------------------------------------------------------------------------------")
224
+ print("\n context_prompt \n")
225
+ print(context_prompt)
226
+ print("--------------------------------------------------------------------------------")
227
+
228
+ translations = []
229
+ for prompt in system_prompts:
230
+ response = openai.chat.completions.create(
231
+ model=MODEL_NAME,
232
+ messages=[
233
+ {"role": "system", "content": prompt},
234
+ {"role": "user", "content": context_prompt}
235
+ ]
236
+ )
237
+ translation = response.choices[0].message.content.strip()
238
+
239
+ print("--------------------------------------------------------------------------------")
240
+ print("\n rollout translation: \n")
241
+ print(translation)
242
+ print("--------------------------------------------------------------------------------")
243
+
244
+ translations.append(translation)
245
+
246
+ return translations
247
+
248
+ def process_buffer_sentences(source_sentences, buffer):
249
+ translations = []
250
+ translation_map = {}
251
+ for src_key, trans_list in buffer.items():
252
+ if not trans_list or not isinstance(trans_list, list):
253
+ continue
254
+ src_sentences = [src_key]
255
+
256
+ if len(src_sentences) > 0:
257
+ for src_sent in src_sentences:
258
+ if src_sent not in translation_map:
259
+ translation_map[src_sent] = []
260
+ translation_map[src_sent] = trans_list[0]
261
+
262
+ for src_sent in source_sentences:
263
+ if src_sent in translation_map and translation_map[src_sent]:
264
+ translations.append(translation_map[src_sent][0])
265
+ return translations
266
+
267
+ def final_translate_with_deepinfra(source_sentence, source_segments, buffer, src_language, tgt_language):
268
+ translations = process_buffer_sentences(source_segments, buffer)
269
+ initial_translation = "\n".join(translations)
270
+
271
+ rewrite_prompt = (
272
+ f"Below is an initial translation of a {src_language} text into {tgt_language}. "
273
+ f"This translation may include omissions, inaccuracies, or awkward phrasing. "
274
+ f"Your task is to produce a refined version that is fluent, accurate, and coherent, "
275
+ f"while faithfully preserving the full meaning of the original {src_language} text.\n\n"
276
+ f"### Instructions:\n"
277
+ f"1. Ensure that every detail in the original {src_language} text is accurately represented.\n"
278
+ f"2. Correct any grammatical errors, unnatural expressions, or inconsistencies.\n"
279
+ f"3. Improve the natural flow so that the translation reads as if written by a native speaker.\n"
280
+ f"4. Do not add, omit, or change any essential details from the source text.\n"
281
+ f"5. Output only the final refined translation without any additional commentary.\n\n"
282
+ f"### Original {src_language} Text:\n{source_sentence}\n\n"
283
+ f"### Initial {tgt_language} Translation:\n{initial_translation}\n\n"
284
+ f"### Refined Translation:"
285
+ )
286
+
287
+ print("rewrite prompt:")
288
+ print(rewrite_prompt)
289
+
290
+ rewrite_response = openai.chat.completions.create(
291
+ model=MODEL_NAME, # Replace with your actual model name
292
+ messages=[
293
+ {"role": "system", "content": "You are a helpful translator and only output the result."},
294
+ {"role": "user", "content": rewrite_prompt}
295
+ ]
296
+ )
297
+ translation = rewrite_response.choices[0].message.content.strip()
298
+ return translation
299
+
300
+
301
+ ################################# alignment functions #################################
302
+
303
+
304
+ def save_sentences_to_txt(sentences, filename):
305
+ i = 0
306
+ with open(filename, "w", encoding="utf-8") as file:
307
+ for sentence in sentences:
308
+ print(sentence, i)
309
+ file.write(sentence + "\n")
310
+ i += 1
311
+
312
+ def segment_sentences_by_punctuation(text, lang):
313
+ segmented_sentences = []
314
+ paragraphs = text.split('\n')
315
+ for paragraph in paragraphs:
316
+ if paragraph.strip():
317
+ if lang == src_lang:
318
+ doc = src_nlp(paragraph)
319
+ if lang == tgt_lang:
320
+ doc = mt_nlp(paragraph)
321
+ for sent in doc.sents:
322
+ segmented_sentences.append(sent.text.strip())
323
+ return segmented_sentences
324
+
325
+ def generate_overlap_and_embedding(txt_file):
326
+ overlaps_file = txt_file + ".overlaps"
327
+ embed_file = txt_file + ".emb"
328
+ current_dir = os.path.dirname(os.path.abspath(__file__))
329
+ overlap_path = os.path.join(current_dir, "overlap.py")
330
+ subprocess.run(["python", overlap_path, "-i", txt_file, "-o", overlaps_file, "-n", "10"])
331
+ embed_command = [
332
+ "$LASER/tasks/embed/embed.sh",
333
+ overlaps_file,
334
+ embed_file,
335
+ ]
336
+ subprocess.run(" ".join(embed_command), shell=True)
337
+ return overlaps_file, embed_file
338
+
339
+ def run_vecalign(src_txt, tgt_txt, src_embed, tgt_embed):
340
+ current_dir = os.path.dirname(os.path.abspath(__file__))
341
+ vecalign_path = os.path.join(current_dir, "vecalign.py")
342
+ result = subprocess.run(
343
+ [
344
+ "python",
345
+ vecalign_path,
346
+ "--alignment_max_size", "8",
347
+ "--src", src_txt,
348
+ "--tgt", tgt_txt,
349
+ "--src_embed", src_txt + ".overlaps", src_embed,
350
+ "--tgt_embed", tgt_txt + ".overlaps", tgt_embed,
351
+ ],
352
+ stdout=subprocess.PIPE,
353
+ text=True,
354
+ )
355
+ alignments = []
356
+ for line in result.stdout.strip().split("\n"):
357
+ if line:
358
+ src_indices, tgt_indices, _ = line.split(":")
359
+ src_indices = list(map(int, src_indices.strip("[]").split(","))) if src_indices.strip("[]") else []
360
+ tgt_indices = list(map(int, tgt_indices.strip("[]").split(","))) if tgt_indices.strip("[]") else []
361
+ alignments.append((src_indices, tgt_indices))
362
+ return alignments
363
+
364
+ def compute_alignment_stats(alignment_results):
365
+ costs = []
366
+ zero_cost_count = 0
367
+
368
+ for entry in alignment_results:
369
+ try:
370
+ cost = float(entry.split(":")[-1]) # Extract the cost value
371
+ if cost == 0.0:
372
+ zero_cost_count += 1
373
+ else:
374
+ costs.append(cost)
375
+ except ValueError:
376
+ continue # Ignore invalid entries
377
+
378
+ # Compute the average cost, ignoring zero-cost samples
379
+ avg_cost = sum(costs) / len(costs) if costs else 0.0
380
+ zero_cost_ratio = zero_cost_count / len(alignment_results) if alignment_results else 0.0
381
+
382
+ return avg_cost, zero_cost_ratio
383
+
384
+ def run_vecalign_explore(src_txt, tgt_txt, src_embed, tgt_embed):
385
+ """
386
+ Runs vecalign multiple times, exploring the best del_percentile_frac.
387
+ Starts from 0.2 and decreases in 0.005 steps, stopping when zero-cost ratio increases sharply.
388
+
389
+ :param src_txt: Source text file
390
+ :param tgt_txt: Target text file
391
+ :param src_embed: Source embeddings file
392
+ :param tgt_embed: Target embeddings file
393
+ :return: (best_del_percentile_frac, best_avg_cost, best_zero_cost_ratio, best_alignments)
394
+ """
395
+ del_percentile_frac = 0.2 # Starting value
396
+ step_size = 0.005 # Exploration step
397
+ prev_zero_cost_ratio = None
398
+ prev_avg_cost = None
399
+
400
+ best_avg_cost = float('inf')
401
+ best_del_percentile_frac = del_percentile_frac
402
+ best_zero_cost_ratio = 0.0
403
+ best_alignments = []
404
+
405
+ first_flag = True
406
+ first_zero_cost_ratio = 0.0
407
+
408
+ current_dir = os.path.dirname(os.path.abspath(__file__))
409
+ vecalign_path = os.path.join(current_dir, "vecalign.py")
410
+
411
+ while del_percentile_frac > 0:
412
+ result = subprocess.run(
413
+ [
414
+ "python",
415
+ vecalign_path,
416
+ "--alignment_max_size", "8",
417
+ "--del_percentile_frac", str(del_percentile_frac),
418
+ "--src", src_txt,
419
+ "--tgt", tgt_txt,
420
+ "--costs_sample_size", "200000",
421
+ "--search_buffer_size", "20",
422
+ "--src_embed", src_txt + ".overlaps", src_embed,
423
+ "--tgt_embed", tgt_txt + ".overlaps", tgt_embed,
424
+ ],
425
+ stdout=subprocess.PIPE,
426
+ text=True,
427
+ )
428
+
429
+ output_lines = result.stdout.strip().split("\n")
430
+ avg_cost, zero_cost_ratio = compute_alignment_stats(output_lines)
431
+
432
+ print(f"del_percentile_frac: {del_percentile_frac:.3f} | Avg Cost: {avg_cost:.6f} | Zero-Cost Ratio: {zero_cost_ratio:.6%}")
433
+
434
+ if first_flag:
435
+ first_zero_cost_ratio = zero_cost_ratio
436
+ first_flag = False
437
+
438
+ if prev_zero_cost_ratio != 0 and prev_zero_cost_ratio is not None and (zero_cost_ratio / prev_zero_cost_ratio) > 1.5:
439
+ print(f"Stopping exploration: Zero-cost ratio increased sharply at {del_percentile_frac:.3f}")
440
+ break
441
+ elif prev_zero_cost_ratio is not None and (
442
+ (zero_cost_ratio - prev_zero_cost_ratio) > 0.15 or
443
+ avg_cost > prev_avg_cost or
444
+ avg_cost < 0.3 or zero_cost_ratio > 0.7
445
+ ):
446
+ print(f"Stopping exploration: Zero-cost ratio increased sharply at {del_percentile_frac:.3f}")
447
+ break
448
+ else:
449
+ if avg_cost < best_avg_cost:
450
+ best_avg_cost = avg_cost
451
+ best_del_percentile_frac = del_percentile_frac
452
+ best_zero_cost_ratio = zero_cost_ratio
453
+ best_alignments = output_lines
454
+
455
+ prev_zero_cost_ratio = zero_cost_ratio
456
+ prev_avg_cost = avg_cost
457
+ del_percentile_frac -= step_size
458
+
459
+ final_avg_cost = best_avg_cost
460
+ final_zero_cost_ratio = best_zero_cost_ratio
461
+ final_del_percentile_frac = best_del_percentile_frac
462
+ final_alignments = best_alignments.copy()
463
+
464
+ parsed_alignments = []
465
+ for line in final_alignments:
466
+ if line:
467
+ src_indices, tgt_indices, _ = line.split(":")
468
+ src_indices = list(map(int, src_indices.strip("[]").split(","))) if src_indices.strip("[]") else []
469
+ tgt_indices = list(map(int, tgt_indices.strip("[]").split(","))) if tgt_indices.strip("[]") else []
470
+ parsed_alignments.append((src_indices, tgt_indices))
471
+
472
+ print("\nBest Found:")
473
+ print(f"del_percentile_frac: {final_del_percentile_frac:.3f} | Avg Cost: {final_avg_cost:.6f} | Zero-Cost Ratio: {final_zero_cost_ratio:.6%}")
474
+
475
+ return parsed_alignments
476
+
477
+ def standardize_common_alignments(common_alignments_list):
478
+ # Reference alignment for standardization (use the shortest alignment set as baseline)
479
+ reference_alignments = min(common_alignments_list, key=lambda alignments: len(alignments))
480
+
481
+ # Standardized results to return
482
+ standardized_results = []
483
+
484
+ for alignments in common_alignments_list:
485
+ standardized_alignment = []
486
+ mt_idx_map = {tuple(src): mt for src, mt in alignments}
487
+ for src_indices, _ in reference_alignments: # Ignore ref_indices as it no longer exists
488
+ # If src_indices exist in the current alignment, use them directly
489
+ if tuple(src_indices) in mt_idx_map:
490
+ mt_indices = mt_idx_map[tuple(src_indices)]
491
+ else:
492
+ # If not found, merge based on src alignment
493
+ mt_indices = []
494
+ for src in src_indices:
495
+ if (src,) in mt_idx_map:
496
+ mt_indices.extend(mt_idx_map[(src,)])
497
+ # Ensure indices are unique and sorted after merging
498
+ mt_indices = sorted(set(mt_indices))
499
+ standardized_alignment.append((src_indices, mt_indices))
500
+ standardized_results.append(standardized_alignment)
501
+ return standardized_results
502
+
503
+ def generate_windows(source, translations):
504
+ # Segment sentences
505
+ source_segments = segment_sentences_by_punctuation(source, lang=src_lang)
506
+ current_dir = os.path.dirname(os.path.abspath(__file__))
507
+ temp_folder = os.path.join(current_dir, "temp")
508
+ os.makedirs(temp_folder, exist_ok=True)
509
+ # Generate overlaps and embeddings
510
+ src_txt = os.path.join(current_dir, f"temp/{SESSION_ID}_src.txt")
511
+ mt_txt = os.path.join(current_dir, f"temp/{SESSION_ID}_mt.txt")
512
+
513
+ print("\n ----------------- source segmentation --------------------------- ")
514
+ save_sentences_to_txt(source_segments, src_txt)
515
+ print(" ------------------------------------------------------------------- \n")
516
+ _, src_embed = generate_overlap_and_embedding(src_txt)
517
+ mt_segments_list = [segment_sentences_by_punctuation(t, lang=tgt_lang) for t in translations]
518
+ adjusted_mt_list = []
519
+
520
+ common_alignments_list = []
521
+ for mt_segments in mt_segments_list:
522
+ print("\n ----------------- translation segmentation --------------------------- ")
523
+ save_sentences_to_txt(mt_segments, mt_txt)
524
+ print(" ------------------------------------------------------------------------ \n")
525
+ _, mt_embed = generate_overlap_and_embedding(mt_txt)
526
+ src_mt_alignments = run_vecalign_explore(src_txt, mt_txt, src_embed, mt_embed) # run_vecalign_explore, run_vecalign
527
+ common_alignments_list.append(src_mt_alignments.copy())
528
+ delete_files_with_mt(temp_folder)
529
+
530
+ common_alignments_list = standardize_common_alignments(common_alignments_list)
531
+
532
+ mt_index = 0
533
+
534
+ for common_alignments in common_alignments_list:
535
+ adjusted_src = []
536
+ adjusted_mt = []
537
+ for src_indices, mt_indices in common_alignments:
538
+ mt_indices = [x for x in mt_indices if x != -1]
539
+
540
+ if len(src_indices) == 0:
541
+ continue
542
+ else:
543
+ aligned_src = " ".join([source_segments[i] for i in src_indices])
544
+
545
+ if len(mt_indices) > 0:
546
+ aligned_mt = " ".join([mt_segments_list[mt_index][i] for i in mt_indices])
547
+ else:
548
+ aligned_mt = ""
549
+
550
+ adjusted_src.append(aligned_src)
551
+ adjusted_mt.append(aligned_mt)
552
+
553
+ adjusted_mt_list.append(adjusted_mt.copy())
554
+ mt_index += 1
555
+
556
+ clear_folder(temp_folder)
557
+ return adjusted_src, adjusted_mt_list
558
+
559
+ ################################# main function #################################
560
+
561
+ def saving_memory(buffer, index, iteration, final_translations_record):
562
+ """
563
+ Save the buffer, and final_translations_record to the Memory folder.
564
+ """
565
+ current_dir = os.path.dirname(os.path.abspath(__file__))
566
+ memory_folder = os.path.join(current_dir, f"{MEMORY_FOLDER}")
567
+ os.makedirs(memory_folder, exist_ok=True)
568
+ buffer_file_path = f"{MEMORY_FOLDER}/buffer_{index}_iter_{iteration}.json"
569
+ metadata_file_path = f"{MEMORY_FOLDER}/metadata_{index}_iter_{iteration}.json"
570
+
571
+ buffer_to_save = {key: list(value) for key, value in buffer.items()}
572
+ with open(buffer_file_path, "w", encoding="utf-8") as f:
573
+ json.dump(buffer_to_save, f, ensure_ascii=False, indent=4)
574
+
575
+ metadata = {
576
+ "final_translations_record": final_translations_record
577
+ }
578
+ with open(metadata_file_path, "w", encoding="utf-8") as f:
579
+ json.dump(metadata, f, ensure_ascii=False, indent=4)
580
+
581
+ print(f"Buffer saved to {buffer_file_path}")
582
+ print(f"Metadata saved to {metadata_file_path}")
583
+
584
+
585
+ def process_chunk():
586
+
587
+ data = pd.read_csv(csv_path)
588
+ for index, row in data.iterrows():
589
+ print("::::::::::::::::::::::: index :::::::::::::::::::::::", index, " ::::::::::::::::::::::: index :::::::::::::::::::::::", )
590
+ buffer = defaultdict(list)
591
+
592
+ source_sentence = row[src_lang].replace('\n', ' ')
593
+ source_segments = segment_sentences_by_punctuation(source_sentence, lang=src_lang)
594
+
595
+ for iteration in range(max_iterations):
596
+ print(f"\nStarting iteration {iteration + 1}/{max_iterations}...\n")
597
+
598
+ if iteration in stop_memory:
599
+ final_translations = final_translate_with_deepinfra(source_sentence, source_segments, buffer, SRC_LANGUAGE, TASK_LANGUAGE)
600
+ print("Final Translation Method:")
601
+ print(final_translations)
602
+ final_translations_record = [final_translations]
603
+ saving_memory(buffer, index, iteration, final_translations_record)
604
+
605
+ if iteration == max_iterations - 1:
606
+ break
607
+ else:
608
+ translations = translate_with_deepinfra(source_sentence, buffer, good_ref_contexts_num+iteration, SRC_LANGUAGE, TASK_LANGUAGE)
609
+
610
+ src_windows, mt_windows_list = generate_windows(source_sentence, translations)
611
+
612
+ ####################################### Evaluate translations and update buffer #######################################
613
+ print("Evaluate translations and update buffer ..............")
614
+
615
+ # First, store all sources and candidate translations as lists.
616
+ src_context_list = list(src_windows)
617
+ candidates_list = []
618
+ for window_index in range(len(src_windows)):
619
+ candidates = [mt_windows[window_index] for mt_windows in mt_windows_list]
620
+ candidates_list.append(candidates)
621
+
622
+ # Batch evaluate all candidate translations, returning the best translation and score for each source.
623
+ best_candidate_results = batch_rm_find_best_translation(list(zip(src_context_list, candidates_list)), TASK_LANGUAGE, SESSION_ID)
624
+
625
+ print("\n Our best candidate results:")
626
+ print(best_candidate_results)
627
+ print(" ------------------------------------------------------------------------ \n")
628
+
629
+ print("\n===== Initial buffer state =====")
630
+ for src, translations in buffer.items():
631
+ print(f"Source '{src}': {[t[0] for t in translations]}")
632
+
633
+ # Update the buffer for each source.
634
+ for i, src in enumerate(src_context_list):
635
+ best_tuple = best_candidate_results[i] # (translation, score)
636
+ if best_tuple[0] is not None:
637
+ # If the source is not yet in the buffer, initialize it.
638
+ if src not in buffer:
639
+ buffer[src] = [best_tuple]
640
+ print(f"[ADD] New Source '{src}' Add Translation: '{best_tuple[0]}', Score: {best_tuple[1]}")
641
+ else:
642
+ # Directly add the new translation to the buffer.
643
+ buffer[src].append(best_tuple)
644
+ print(f"[ADD] Source '{src}' Add Translation: '{best_tuple[0]}', Score: {best_tuple[1]}")
645
+
646
+ # Sort by score to place the best translation (highest score) at the top.
647
+ buffer[src].sort(key=lambda x: x[1], reverse=True)
648
+ print(f"[UPDATE] Source '{src}' Best Translation: '{buffer[src][0][0]}'")
649
+
650
+ print("\n===== Final buffer state =====")
651
+ for src, translations in buffer.items():
652
+ print(f"Source '{src}': {[t[0] for t in translations]}")
653
+
654
+
655
+ print("Final Translation:")
656
+ print(final_translations)
657
+
658
+
659
+ def get_lang_and_nlp(language):
660
+ if language not in lang_map:
661
+ raise ValueError(f"Unsupported language: {language}")
662
+ lang_code, model_name = lang_map[language]
663
+ return lang_code, spacy.load(model_name)
664
+
665
+ def translate_text(text, session_id,
666
+ src_language="Japanese",
667
+ task_language="English",
668
+ max_iterations_value=3,
669
+ threshold_value=0.7,
670
+ good_ref_contexts_num_value=5,
671
+ reward_model_type='metricx'):
672
+
673
+ global SRC_LANGUAGE, TASK_LANGUAGE, max_iterations, stop_memory
674
+ global THRESHOLD, good_ref_contexts_num, src_lang, src_nlp, tgt_lang, mt_nlp
675
+ global reward_model, MEMORY_FOLDER, SESSION_ID
676
+
677
+ SESSION_ID = session_id
678
+ print("SESSION_ID: ", SESSION_ID)
679
+
680
+ MEMORY_FOLDER = "external_translation_memory"
681
+ SRC_LANGUAGE = src_language
682
+ TASK_LANGUAGE = task_language
683
+ max_iterations = max_iterations_value
684
+ stop_memory = list(range(1, max_iterations))
685
+ THRESHOLD = threshold_value
686
+ good_ref_contexts_num = good_ref_contexts_num_value
687
+
688
+ import torch
689
+ device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")
690
+ src_lang, src_nlp = get_lang_and_nlp(SRC_LANGUAGE)
691
+ tgt_lang, mt_nlp = get_lang_and_nlp(TASK_LANGUAGE)
692
+
693
+ reward_model = metricx_RewardModel()
694
+
695
+ from collections import defaultdict
696
+ buffer = defaultdict(list)
697
+ source_sentence = text.replace("\n", " ")
698
+ source_segments = segment_sentences_by_punctuation(source_sentence, lang=src_lang)
699
+ final_translations = None
700
+
701
+ for iteration in range(max_iterations):
702
+ # print(f"\nStarting iteration {iteration + 1}/{max_iterations}...\n")
703
+ if iteration in stop_memory:
704
+ final_translations = final_translate_with_deepinfra(source_sentence, source_segments, buffer, SRC_LANGUAGE, TASK_LANGUAGE)
705
+ # saving_memory(buffer, 0, iteration, [final_translations])
706
+ if iteration == max_iterations - 1:
707
+ break
708
+ else:
709
+ translations = translate_with_deepinfra(source_sentence, buffer, good_ref_contexts_num + iteration, SRC_LANGUAGE, TASK_LANGUAGE)
710
+
711
+ src_windows, mt_windows_list = generate_windows(source_sentence, translations)
712
+ # print("Evaluate translations and update buffer ..............")
713
+
714
+ src_context_list = list(src_windows)
715
+ candidates_list = []
716
+ for window_index in range(len(src_windows)):
717
+ candidates = [mt_windows[window_index] for mt_windows in mt_windows_list]
718
+ candidates_list.append(candidates)
719
+
720
+ best_candidate_results = batch_rm_find_best_translation(list(zip(src_context_list, candidates_list)), TASK_LANGUAGE, SESSION_ID)
721
+
722
+ # print("\n Best candidate results:")
723
+ # print(best_candidate_results)
724
+ # print(" ------------------------------------------------------------------------\n")
725
+
726
+ for i, src in enumerate(src_context_list):
727
+ best_tuple = best_candidate_results[i]
728
+ if best_tuple[0] is not None:
729
+ if src not in buffer:
730
+ buffer[src] = [best_tuple]
731
+ # print(f"[ADD] New Source '{src}' Add Translation: '{best_tuple[0]}', Score: {best_tuple[1]}")
732
+ else:
733
+ buffer[src].append(best_tuple)
734
+ # print(f"[ADD] Source '{src}' Add Translation: '{best_tuple[0]}', Score: {best_tuple[1]}")
735
+ buffer[src].sort(key=lambda x: x[1], reverse=True)
736
+ # print(f"[UPDATE] Source '{src}' Best Translation: '{buffer[src][0][0]}'")
737
+
738
+ # print("\n===== Buffer state =====")
739
+ for src, translations in buffer.items():
740
+ print(f"Source '{src}': {[t[0] for t in translations]}")
741
+
742
+ # print("Final Translation:")
743
+ # print(final_translations)
744
+ return final_translations
745
+
746
+
747
+ if __name__ == "__main__":
748
+ process_chunk()
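
The buffer used throughout plan2align maps each source window to a list of (translation, score) tuples kept sorted with the highest score first, so buffer[src][0][0] is always the current best translation for that window. A minimal sketch of that update pattern, using hypothetical example data rather than the real reward-model scores:

from collections import defaultdict

# source window -> list of (translation, score), best score first
buffer = defaultdict(list)

def update_buffer(buffer, src, candidate):
    # candidate is a (translation, score) tuple; skip empty candidates
    translation, score = candidate
    if translation is None:
        return
    buffer[src].append((translation, score))
    # keep the best-scoring translation at index 0
    buffer[src].sort(key=lambda x: x[1], reverse=True)

update_buffer(buffer, "source window 1", ("candidate A", 0.41))
update_buffer(buffer, "source window 1", ("candidate B", 0.82))
print(buffer["source window 1"][0][0])  # -> "candidate B", the highest-scoring candidate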
vecalign/score.py ADDED
@@ -0,0 +1,170 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Copyright 2019 Brian Thompson
5
+
6
+ Licensed under the Apache License, Version 2.0 (the "License");
7
+ you may not use this file except in compliance with the License.
8
+ You may obtain a copy of the License at
9
+
10
+ https://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+
18
+ """
19
+
20
+ import argparse
21
+ import sys
22
+ from collections import defaultdict
23
+
24
+ import numpy as np
25
+
26
+ from dp_utils import read_alignments
27
+
28
+ """
29
+ Faster implementation of lax and strict precision and recall, based on
30
+ https://www.aclweb.org/anthology/W11-4624/.
31
+
32
+ """
33
+
34
+
35
+ def _precision(goldalign, testalign):
36
+ """
37
+ Computes tpstrict, fpstrict, tplax, fplax for gold/test alignments
38
+ """
39
+ tpstrict = 0 # true positive strict counter
40
+ tplax = 0 # true positive lax counter
41
+ fpstrict = 0 # false positive strict counter
42
+ fplax = 0 # false positive lax counter
43
+
44
+ # convert to sets, remove alignments empty on both sides
45
+ testalign = set([(tuple(x), tuple(y)) for x, y in testalign if len(x) or len(y)])
46
+ goldalign = set([(tuple(x), tuple(y)) for x, y in goldalign if len(x) or len(y)])
47
+
48
+ # mappings from source test sentence idxs to
49
+ # target gold sentence idxs for which the source test sentence
50
+ # was found in corresponding source gold alignment
51
+ src_id_to_gold_tgt_ids = defaultdict(set)
52
+ for gold_src, gold_tgt in goldalign:
53
+ for gold_src_id in gold_src:
54
+ for gold_tgt_id in gold_tgt:
55
+ src_id_to_gold_tgt_ids[gold_src_id].add(gold_tgt_id)
56
+
57
+ for (test_src, test_target) in testalign:
58
+ if (test_src, test_target) == ((), ()):
59
+ continue
60
+ if (test_src, test_target) in goldalign:
61
+ # strict match
62
+ tpstrict += 1
63
+ tplax += 1
64
+ else:
65
+ # For anything with partial gold/test overlap on the source,
66
+ # see if there is also partial overlap on the gold/test target
67
+ # If so, it's a lax match
68
+ target_ids = set()
69
+ for src_test_id in test_src:
70
+ for tgt_id in src_id_to_gold_tgt_ids[src_test_id]:
71
+ target_ids.add(tgt_id)
72
+ if set(test_target).intersection(target_ids):
73
+ fpstrict += 1
74
+ tplax += 1
75
+ else:
76
+ fpstrict += 1
77
+ fplax += 1
78
+
79
+ return np.array([tpstrict, fpstrict, tplax, fplax], dtype=np.int32)
80
+
81
+
82
+ def score_multiple(gold_list, test_list, value_for_div_by_0=0.0):
83
+ # accumulate counts for all gold/test files
84
+ pcounts = np.array([0, 0, 0, 0], dtype=np.int32)
85
+ rcounts = np.array([0, 0, 0, 0], dtype=np.int32)
86
+ for goldalign, testalign in zip(gold_list, test_list):
87
+ pcounts += _precision(goldalign=goldalign, testalign=testalign)
88
+ # recall is precision with no insertion/deletion and swap args
89
+ test_no_del = [(x, y) for x, y in testalign if len(x) and len(y)]
90
+ gold_no_del = [(x, y) for x, y in goldalign if len(x) and len(y)]
91
+ rcounts += _precision(goldalign=test_no_del, testalign=gold_no_del)
92
+
93
+ # Compute results
94
+ # pcounts: tpstrict,fpstrict,tplax,fplax
95
+ # rcounts: tpstrict,fpstrict,tplax,fplax
96
+
97
+ if pcounts[0] + pcounts[1] == 0:
98
+ pstrict = value_for_div_by_0
99
+ else:
100
+ pstrict = pcounts[0] / float(pcounts[0] + pcounts[1])
101
+
102
+ if pcounts[2] + pcounts[3] == 0:
103
+ plax = value_for_div_by_0
104
+ else:
105
+ plax = pcounts[2] / float(pcounts[2] + pcounts[3])
106
+
107
+ if rcounts[0] + rcounts[1] == 0:
108
+ rstrict = value_for_div_by_0
109
+ else:
110
+ rstrict = rcounts[0] / float(rcounts[0] + rcounts[1])
111
+
112
+ if rcounts[2] + rcounts[3] == 0:
113
+ rlax = value_for_div_by_0
114
+ else:
115
+ rlax = rcounts[2] / float(rcounts[2] + rcounts[3])
116
+
117
+ if (pstrict + rstrict) == 0:
118
+ fstrict = value_for_div_by_0
119
+ else:
120
+ fstrict = 2 * (pstrict * rstrict) / (pstrict + rstrict)
121
+
122
+ if (plax + rlax) == 0:
123
+ flax = value_for_div_by_0
124
+ else:
125
+ flax = 2 * (plax * rlax) / (plax + rlax)
126
+
127
+ result = dict(recall_strict=rstrict,
128
+ recall_lax=rlax,
129
+ precision_strict=pstrict,
130
+ precision_lax=plax,
131
+ f1_strict=fstrict,
132
+ f1_lax=flax)
133
+
134
+ return result
135
+
136
+
137
+ def log_final_scores(res):
138
+ print(' ---------------------------------', file=sys.stderr)
139
+ print('| | Strict | Lax |', file=sys.stderr)
140
+ print('| Precision | {precision_strict:.3f} | {precision_lax:.3f} |'.format(**res), file=sys.stderr)
141
+ print('| Recall | {recall_strict:.3f} | {recall_lax:.3f} |'.format(**res), file=sys.stderr)
142
+ print('| F1 | {f1_strict:.3f} | {f1_lax:.3f} |'.format(**res), file=sys.stderr)
143
+ print(' ---------------------------------', file=sys.stderr)
144
+
145
+
146
+ def main():
147
+ parser = argparse.ArgumentParser(
148
+ 'Compute strict/lax precision and recall for one or more pairs of gold/test alignments',
149
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
150
+
151
+ parser.add_argument('-t', '--test', type=str, nargs='+', required=True,
152
+ help='one or more test alignment files')
153
+
154
+ parser.add_argument('-g', '--gold', type=str, nargs='+', required=True,
155
+ help='one or more gold alignment files')
156
+
157
+ args = parser.parse_args()
158
+
159
+ if len(args.test) != len(args.gold):
160
+ raise Exception('number of gold/test files must be the same')
161
+
162
+ gold_list = [read_alignments(x) for x in args.gold]
163
+ test_list = [read_alignments(x) for x in args.test]
164
+
165
+ res = score_multiple(gold_list=gold_list, test_list=test_list)
166
+ log_final_scores(res)
167
+
168
+
169
+ if __name__ == '__main__':
170
+ main()
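
Besides the command-line entry point, score_multiple and log_final_scores can be called directly. A small usage sketch with hand-made alignments (hypothetical data; assumes it is run from the vecalign/ directory so that the dp_utils import in score.py resolves):

from score import score_multiple, log_final_scores

# Each alignment is a list of (source_indices, target_indices) pairs,
# in the same form that dp_utils.read_alignments produces.
gold = [[((0,), (0,)), ((1, 2), (1,))]]              # source sentences 1-2 merge into target 1
test = [[((0,), (0,)), ((1,), (1,)), ((2,), ())]]    # splits the merge and leaves source 2 unaligned

res = score_multiple(gold_list=gold, test_list=test)
log_final_scores(res)  # prints the strict/lax precision, recall, and F1 table to stderr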
vecalign/standalone_document_embedding_demo.py ADDED
@@ -0,0 +1,100 @@
1
+
2
+ #!/usr/bin/env python3
3
+
4
+ """
5
+ Copyright 2019 Brian Thompson
6
+
7
+ Licensed under the Apache License, Version 2.0 (the "License");
8
+ you may not use this file except in compliance with the License.
9
+ You may obtain a copy of the License at
10
+
11
+ https://www.apache.org/licenses/LICENSE-2.0
12
+
13
+ Unless required by applicable law or agreed to in writing, software
14
+ distributed under the License is distributed on an "AS IS" BASIS,
15
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ See the License for the specific language governing permissions and
17
+ limitations under the License.
18
+
19
+
20
+ This is a standalone example of creating a document vector from sentence vectors
21
+ following https://aclanthology.org/2020.emnlp-main.483
22
+
23
+ """
24
+
25
+
26
+ import numpy as np
27
+
28
+ from mcerp import PERT # pip install mcerp # see https://github.com/tisimst/mcerp/blob/master/mcerp/__init__.py
29
+
30
+
31
+ NUM_TIME_SLOTS = 16
32
+ PERT_G = 20
33
+
34
+
35
+ # PERT is very slow (50ms per distribution) so we cache a bank of PERT distributions
36
+ _num_banks = 100
37
+ _xx = np.linspace(start=0, stop=1, num=NUM_TIME_SLOTS)
38
+ PERT_BANKS = []
39
+ for _pp in np.linspace(0, 1, num=_num_banks):
40
+ if _pp == 0.5: # some special case that makes g do nothing
41
+ _pp += 0.001
42
+ pert = PERT(low=-0.001, peak=_pp, high=1.001, g=PERT_G, tag=None)
43
+ _yy = pert.rv.pdf(_xx)
44
+ _yy = _yy / sum(_yy) # normalize
45
+ PERT_BANKS.append(_yy)
46
+
47
+
48
+ np.set_printoptions(threshold=50, precision=5)
49
+
50
+
51
+ def build_doc_embedding(sent_vecs, sent_counts):
52
+ # ensure sentence counts are >= 1
53
+ sent_counts = np.clip(sent_counts, a_min=1, a_max=None)
54
+
55
+ # scale each sent vec by 1/count
56
+ sent_weights = 1.0/np.array(sent_counts)
57
+
58
+ scaled_sent_vecs = np.multiply(sent_vecs.T, sent_weights).T
59
+
60
+ # equally space sentences
61
+ sent_centers = np.linspace(0, 1, len(scaled_sent_vecs))
62
+
63
+ # find weighting for each sentence, for each time slot
64
+ sentence_loc_weights = np.zeros((len(sent_centers), NUM_TIME_SLOTS))
65
+
66
+ for sent_ii, p in enumerate(sent_centers):
67
+ bank_idx = int(p * (len(PERT_BANKS) - 1)) # find the nearest cached pert distribution
68
+ sentence_loc_weights[sent_ii, :] = PERT_BANKS[bank_idx]
69
+
70
+ # make each chunk vector
71
+ doc_chunk_vec = np.matmul(scaled_sent_vecs.T, sentence_loc_weights).T
72
+
73
+ # concatenate chunk vectors into a single vector for the full document
74
+ doc_vec = doc_chunk_vec.flatten()
75
+
76
+ # normalize document vector
77
+ doc_vec = doc_vec / (np.linalg.norm(doc_vec) + 1e-5)
78
+
79
+ return doc_vec
80
+
81
+
82
+ def demo():
83
+
84
+ # Replace sent_vecs with laser/LaBSE/etc embeddings of each sentence in your document,
85
+ # after projecting the sentence embeddings into a lower-dimensional space using something like PCA (see paper for details).
86
+ sent_emb_size = 32 # Document embedding size will be sent_emb_size * NUM_TIME_SLOTS
87
+ n_sents = 7
88
+ sent_vecs = np.random.rand(n_sents, sent_emb_size)-0.5
89
+
90
+ # Replace sent_counts with the number of times each sentence has been seen in your corpus.
91
+ sent_counts = np.random.randint(low=1, high=50, size=n_sents)
92
+
93
+ doc_emb = build_doc_embedding(sent_vecs, sent_counts)
94
+
95
+ print('Document Embedding:', doc_emb)
96
+ print('Document Embedding Size:', doc_emb.shape)
97
+
98
+
99
+ if __name__ == '__main__':
100
+ demo()
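
In equation form (notation is mine, not from the original file), build_doc_embedding above computes, for sentence vectors s_i with corpus counts c_i and cached PERT weights w_{i,t} over the T = 16 time slots:

\[
v_t = \sum_i w_{i,t}\,\frac{s_i}{c_i},
\qquad
d = \frac{\left[v_1;\, v_2;\, \dots;\, v_T\right]}{\left\lVert \left[v_1;\, \dots;\, v_T\right] \right\rVert + 10^{-5}},
\]

i.e. each time slot gets a count-weighted, position-weighted mixture of sentence vectors, and the concatenated slot vectors are length-normalized into a single document vector.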
vecalign/valid_en_ja.csv ADDED
@@ -0,0 +1,322 @@
1
+ en_url,en_title,en,jp_url,jp_title,ja
2
+ https://developer.nvidia.com/blog/expanding-ai-agent-interface-options-with-2d-and-3d-digital-human-avatars/,Expanding AI Agent Interface Options with 2D and 3D Digital Human Avatars,"When interfacing with
3
+ generative AI
4
+ applications, users have multiple communication options—text, voice, or through digital avatars.
5
+ Traditional chatbot or copilot applications have text interfaces where users type in queries and receive text-based responses. For hands-free communication, speech AI technologies like
6
+ automatic speech recognition
7
+ (ASR) and
8
+ text-to-speech
9
+ (TTS) facilitate verbal interactions, ideal for scenarios like phone-based customer service. Moreover, combining digital avatars with speech capabilities provides a more dynamic interface for users to engage visually with the application. According to Gartner, by 2028, 45% of organizations with more than 500 employees will leverage employee AI avatars to expand the capacity of human capital.
10
+ 1
11
+ Digital avatars can vary widely in style—some use cases benefit from photorealistic 3D or 2D avatars, while other use cases work better with a stylized, or cartoonish avatar.
12
+ 3D Avatars
13
+ offer fully immersive experiences, showcasing lifelike movements and photorealism. Developing these avatars requires specialized software and technical expertise, as they involve intricate body animations and high-quality renderings.
14
+ 2D Avatars
15
+ are quicker to develop and ideal for web-embedded solutions. They offer a streamlined approach to creating interactive AI, often requiring artists for design and animation but less intensive in terms of technical resources.
16
+ To kickstart your creation of a photo-realistic digital human, the
17
+ NVIDIA AI Blueprint on digital humans for customer service
18
+ can be tailored for various use cases. This functionality is now included with support for the NVIDIA Maxine
19
+ Audio2Face-2D
20
+ NIM microservice. ‌Additionally, the blueprint now offers flexibility in rendering for 3D avatar developers to use
21
+ Unreal Engine
22
+ .
23
+ How to add a talking digital avatar to your agent application
24
+ In the AI Blueprint for digital humans, a user interacts with an
25
+ AI agent
26
+ that leverages
27
+ NVIDIA ACE
28
+ technology (Figure 1).
29
+ Figure 1. Architecture diagram for the NVIDIA AI Blueprint for digital humans
30
+ The audio input from the user is sent to the ACE agent which orchestrates the communication between various NIM microservices. The ACE agent uses the
31
+ Riva Parakeet NIM
32
+ to convert the audio to text, which is then processed by a RAG pipeline. The RAG pipeline uses the NVIDIA NeMo Retriever
33
+ embedding
34
+ and
35
+ reranking
36
+ NIM microservices, and an
37
+ LLM NIM
38
+ , to respond with relevant context from stored documents.
39
+ Finally, the response is converted back to speech via Riva TTS, animating the digital human using the Audio2Face-3D NIM or Audio2Face-2D NIM.
40
+ Considerations when designing your AI agent application
41
+ In global enterprises, communication barriers across languages can slow down operations. AI-powered avatars with multilingual capabilities communicate across languages effortlessly. The digital human AI Blueprint provides conversational AI capabilities that simulate human interactions that accommodate users’ speech styles and languages through Riva ASR, neural machine translation (NMT) along with intelligent interruption and barge-in support.
42
+ One of the key benefits of digital human AI agents is their ability to function as “always-on” resources for employees and customers alike. RAG-powered AI agents continuously learn from interactions and improve over time, providing more accurate responses and better user experiences.
43
+ For enterprises considering digital human interfaces, choosing the right avatar and rendering option depends on the use case and customization preferences.
44
+ Use Case
45
+ : 3D avatars are ideal for highly immersive use cases like in physical stores, kiosks or primarily one-to-one interactions, while 2D avatars are effective for web or mobile conversational AI use cases.
46
+ Development and Customization Preferences
47
+ : Teams with 3D and animation expertise can leverage their skillset to create an immersive and ultra-realistic avatar, while teams looking to iterate and customize quickly can benefit from the simplicity of 2D avatars.
48
+ Scaling Considerations:
49
+ Scaling is an important consideration when evaluating avatars and corresponding rendering options. Stream throughput, especially for 3D avatars, is highly dependent on the choice and quality of the character asset used, the desired output resolution and the rendering option of choice (Omniverse Renderer or Unreal Engine) can play a critical role in determining per stream compute footprint.
50
+ NVIDIA Audio2Face-2D allows creation of lifelike 2D avatars from just a portrait image and voice input. Easy and simple configurations allow developers to quickly iterate and produce target avatars and animations for their digital human use cases. With real-time output and cloud-native deployment, 2D digital humans are ideal for interactive use cases and streaming avatars for interactive web-embedded solutions.
51
+ For example, enterprises looking to deploy AI agents across multiple devices and inserting digital humans into web- or mobile-first customer journeys, can benefit from the reduced hardware demands of 2D avatars.
52
+ 3D photorealistic avatars provide an unmatched immersive experience for use cases demanding ‌highly empathetic user engagement. NVIDIA Audio2Face-3D and Animation NIM microservices animate a 3D character by generating blendshapes along with subtle head and body animation to create an immersive, photorealistic avatar. The digital human AI Blueprint now supports two rendering options for 3D avatars, including Omniverse Renderer and Unreal Engine Renderer, providing developers the flexibility to integrate the rendering option of their choice.
53
+ To explore how digital humans can enhance your enterprise, visit the
54
+ NVIDIA API catalog
55
+ to learn about the different avatar options.
56
+ Getting started with digital avatars
57
+ For hands-on development with Audio2Face-2D and Unreal Engine NIM microservices,
58
+ apply for ACE Early Access
59
+ or dive into the digital human AI Blueprint
60
+ technical blog
61
+ to learn how you can add digital human interfaces to personalize chatbot applications.
62
+ 1
63
+ Gartner®, Hype Cycle for the Future of Work, 2024 by Tori Paulman, Emily Rose McRae, etc., July 2024
64
+ GARTNER is a registered trademark and service mark of Gartner, Inc. and/or its affiliates in the U.S. and internationally and is used herein with permission. All rights reserved.",https://developer.nvidia.com/ja-jp/blog/expanding-ai-agent-interface-options-with-2d-and-3d-digital-human-avatars/,2D と 3D のデジタル ヒューマン アバターによる AI エージェント インターフェイス オプションの拡張,"Reading Time:
65
+ 2
66
+ minutes
67
+ ユーザーが
68
+ 生成 AI
69
+ アプリケーションを使ってやり取りする際には、テキスト、音声、デジタル アバターなど複数のコミュニケーション オプションを利用することができます。
70
+ 従来のチャットボットやコパイロット アプリケーションでは、ユーザーが問い合わせを入力し、テキストベースの応答を受信するテキスト インターフェイスを使用しています。ハンズフリーのコミュニケーションでは、
71
+ 自動音声認識
72
+ (ASR: Automatic Speech Recognition) や
73
+ 音声合成
74
+ (TTS: Text-To-Speech) などの音声 AI 技術により、電話を使用したカスタマー サービスなどのシナリオに最適な口頭によるやり取りが容易になります。さらに、デジタル アバターに音声機能を持たせることで、ユーザーがアプリケーションを視覚的に使用できるため、ダイナミックなインターフェイスを提供できます。Gartner によると、2028 年までに、従業員 500 名以上の組織の 45% が、人的資本の能力拡大のために、 AI アバターの従業員を活用するようになるそうです。
75
+ 1
76
+ デジタル アバターのスタイルは様々で、フォトリアリスティックな 3D または 2D のアバターが適しているケースもあれば、定型化されたアバターや漫画のようなアバターの方が適しているケースもあります。
77
+ 3D アバター
78
+ は、リアルな動きと写実性を再現し、完全な没入体験を提供します。このようなアバターの開発には、複雑なボディー アニメーションや高品質のレンダリングが必要となるため、専門的なソフトウェアや技術的な専門知識が必要になります。
79
+ 2D アバター
80
+ は開発が迅速で、Web に組み込みソリューションに最適です。インタラクティブな AI の作成に合理的なアプローチを提供し、デザインやアニメーションにはアーティストが必要になることが多いですが、技術的なリソースの面はそれほど負担になりません。
81
+ フォトリアリスティックなデジタル ヒューマンの作成を始めるにあたり、
82
+ カスタマー サービス向けデジタル ヒューマンの NVIDIA AI Blueprint
83
+ は、さまざまなユース ケースに合わせてカスタマイズすることができます。この機能は現在、NVIDIA Maxine
84
+ Audio2Face-2D
85
+ NIM マイクロサービスのサポートに含まれています。さらに、この Blueprint では、3D アバター開発者が
86
+ Unreal Engine
87
+ を使用できるよう、レンダリングに柔軟性を持たせています。
88
+ エージェント アプリケーションに会話するデジタル アバターを追加する方法
89
+ デジタル ヒューマン向け AI Blueprint では、ユーザーが
90
+ NVIDIA ACE
91
+ 技術を活用した
92
+ AI エージェント
93
+ と対話します (図 1)。
94
+ 図 1. デジタル ヒューマン向け NVIDIA AI Blueprint のアーキテクチャ
95
+ ユーザーによる音声入力は、さまざまな NIM マイクロサービス間の通信を調整する ACE エージェントに送信されます。ACE エージェントは、
96
+ Riva Parakeet NIM
97
+ を使用して音声をテキストに変換し、そのテキストは RAG パイプラインで処理されます。RAG パイプラインでは、NIM マイクロサービスの
98
+ 埋め込み
99
+
100
+ リランク
101
+ を行う NVIDIA NeMo Retriever と
102
+ LLM NIM
103
+ を使用して、保存されたドキュメントから関連するコンテキストを用いて応答します。
104
+ 最後に、Riva TTS を介してこの応答を音声に変換し、Audio2Face-3D NIM または Audio2Face-2D NIM を使用してデジタル ヒューマンをアニメーション化します。
105
+ AI エージェント アプリケーションを設計する際に考慮すべきポイント
106
+ グローバル企業では、言語の壁によるコミュニケーションの障害が業務の妨げとなることがあります。多言語機能を備えた AI 搭載アバターを使用すれば、言語の壁を超えた円滑なコミュニケーションを取ることができます。デジタル ヒューマン AI Blueprint は、Riva ASR やニューラル機械翻訳 (NMT: Neural Machine Translation) に加え、インテリジェントな割り込みやバージイン機能を備え、ユーザーの話し方や言語に柔軟に対応できる、人間らしい対話型 AI を実現します。
107
+ デジタル ヒューマン AI エージェントの主な利点の 1 つは、従業員と顧客の両者にとって「常時稼働する」リソースとして機能できることです。RAG を搭載した AI エージェントは、やりとりから継続的に学習し、時間の経過とともに改善していくため、より正確な対応とより優れたユーザー体験を提供することができます。
108
+ デジタル ヒューマン インターフェイスを検討している企業にとって、適切なアバターとレンダリング オプションの選択は、ユース ケースやカスタマイズ設定に依存します。
109
+ ユース ケース
110
+ : 3D アバターは、実店舗やキオスク (無人端末) など、主に 1対 1 のやりとりのような、非常に没入感の高いユース ケースに最適ですが、2D アバターは、Web やモバイルの対話型 AI ユース ケースに効果的です。
111
+ 開発とカスタマイズの設定
112
+ : 3D やアニメーションの専門知識を持つチームは、そのスキルを活用して没入感のある超リアルなアバターを作成できます。一方、反復作業やカスタマイズを迅速に行いたいチームには、シンプルな 2D アバターが有効です。
113
+ スケーリングの考慮すべきポイント
114
+ : アバターと対応するレンダリング オプションを評価する際に、スケーリングは考慮すべき重要なポイントです。ストリームのスループットは、特に 3D アバターの場合、使用するキャラクター アセットの選択と品質によって大きく異なります。希望する出力解像度や選択するレンダリング オプション (Omniverse Renderer または Unreal Engine) は、ストリームあたりの計算フットプリントを決定する上で重要な役割を果たします。
115
+ NVIDIA Audio2Face-2D では、顔写真と音声入力だけでリアルな 2D アバターを作成できます。簡単でシンプルな構成のため、開発者はデジタル ヒューマンのユース ケースに合わせたアバターやアニメーションを迅速に繰り返し作成できます。リアルタイム出力とクラウド ネイティブのデプロイにより、2D デジタル ヒューマンは、インタラクティブなユース ケースや、インタラクティブな Web 組み込みソリューション向けのストリーミング アバターに最適です。
116
+ たとえば、複数のデバイスに AI エージェントをデプロイし、Web またはモバイル ファーストのカスタマー ジャーニーにデジタル ヒューマンを導入しようとしている企業には、2D アバターはハードウェア要件が軽減するのでメリットがあります。
117
+ 3D のフォトリアリスティックなアバターは、高い共感が要求されるユーザー エンゲージメントを必要とするユース ケースに、比類のない没入体験を提供します。NVIDIA Audio2Face-3D とアニメーション NIM マイクロサービスは、繊細な頭部と身体のアニメーションとともにブレンドシェイプを生成し、没入感のあるフォトリアリスティックなアバターを作成することで、3D キャラクターをアニメーション化します。デジタル ヒューマン AI Blueprint は、3D アバターのレンダリング オプションをとして、Omniverse レンダラーと Unreal-Engine レンダラーをサポートしており、開発者が選択したレンダリング オプションを柔軟に統合できるようになりました。
118
+ デジタル ヒューマンが企業を強化する方法については、
119
+ NVIDIA API カタログ
120
+ にアクセスして、さまざまなアバターのオプションをご覧ください。
121
+ デジタル アバターを始める
122
+ Audio2Face-2D と Unreal Engine NIM マイクロサービスを使用した実践的な開発については、
123
+ ACE 早期アクセスに申し込む
124
+ か、デジタル ヒューマン AI Blueprint の
125
+ 技術ブログ
126
+ にアクセスして、チャットボット アプリケーションをパーソナライズするためにデジタル ヒューマン インターフェイスを追加する方法について学ぶことができます。
127
+ 1
128
+ Gartner®, Hype Cycle for the Future of Work, 2024 by Tori Paulman, Emily Rose McRae, etc., July 2024
129
+ GARTNER is a registered trademark and service mark of Gartner, Inc. and/or its affiliates in the U.S. and internationally and is used herein with permission. All rights reserved.
130
+ 関連情報
131
+ GTC セッション:
132
+ Enhancing the Digital Human Experience with Cloud Microservices Accelerated by Generative AI
133
+ GTC セッション:
134
+ Build a World of Interactive Avatars Based on NVIDIA Omniverse, AIGC, and LLM
135
+ NGC コンテナー:
136
+ ACE エージェント サンプル フロントエンド
137
+ SDK:
138
+ NVIDIA Tokkio
139
+ ウェビナー:
140
+ How Telcos Transform Customer Experiences with Conversational AI"
141
+ https://developer.nvidia.com/blog/5x-faster-time-to-first-token-with-nvidia-tensorrt-llm-kv-cache-early-reuse/,5x Faster Time to First Token with NVIDIA TensorRT-LLM KV Cache Early Reuse,"In our previous
142
+ blog post
143
+ , we demonstrated how reusing the key-value (KV) cache by offloading it to CPU memory can accelerate time to first token (TTFT) by up to 14x on x86-based NVIDIA H100 Tensor Core GPUs and 28x on the NVIDIA GH200 Superchip. In this post, we shed light on KV cache reuse techniques and best practices that can drive even further TTFT speedups.
144
+ Introduction to KV cache
145
+ LLM models are rapidly being adopted for many tasks, including question-answering, and code generation. To generate a response, these models begin by converting the user’s prompt into tokens, which are then transformed into dense vectors. Extensive dot-product operations follow to mathematically model the relationships between the tokens and build a contextual understanding of the user input. The computational cost of generating this contextual understanding increases quadratically with the length of the input sequence.
146
+ This resource-intensive process generates keys and values, which are cached to avoid recomputation when generating subsequent tokens. Reusing the KV cache reduces the computational load and time needed to generate additional tokens—leading to a faster and more efficient user experience.
147
+ When reusing the KV cache, careful attention must be given to how long it remains in memory, which components to evict first when memory is full, and when it can be reused for new incoming prompts. Optimizing these factors can lead to incremental performance improvements in KV cache reuse. NVIDIA TensorRT-LLM offers three key features that specifically address these areas.
148
+ Early KV cache reuse
149
+ Traditional reuse algorithms require the entire KV cache computation to be completed before any portions of it can be reused with new user prompts. In scenarios such as enterprise chatbots, where system prompts—predefined instructions added to user queries—are essential to direct the LLM’s responses in line with enterprise guidelines, this method can be inefficient.
150
+ When a surge of users interacts with the chatbot simultaneously, each user would require a separate computation of the system prompt KV cache. With TensorRT-LLM, we can instead reuse the system prompt as it is being generated in real time, enabling it to be shared across all users during the burst, rather than recalculating it for each user. This can significantly accelerate inference for use cases requiring system prompts by up to 5x.
151
+ Figure 1. TensorRT-LLM KV cache reuse can speed up TTFT by up to 5x
152
+ Flexible KV cache block sizing
153
+ In reuse implementations, only entire cache memory blocks can be allocated for reuse. For example, if the cache memory block size is 64 tokens and KV cache is 80 tokens, only 64 tokens will be stored for reuse, while the remaining 16 tokens will need to be recomputed. However, if the memory block size is reduced to 16 tokens, all 64 tokens can be stored across five memory blocks, eliminating the need for re-computation.
154
+ This effect is most pronounced when the input sequences are short. For long input sequences, larger blocks can be more beneficial.  As is clear, the more granular the control you have over the KV cache, the better you can optimize it for your specific use case.
155
+ TensorRT-LLM provides fine-grained control over KV cache memory blocks, giving developers the ability to chop them into smaller blocks between 64 to 2 tokens. This optimizes the usage of allocated memory, increases reuse rates, and improves TTFT. When running LLAMA70B on NVIDIA H100 Tensor Core GPUs, we can speed up TTFT up to 7% in multi-user environments by reducing KV cache block size from 64 tokens to 8 tokens.
156
+ Figure 2. Impact of changing KV cache block size on inference speedup
157
+ Efficient KV cache eviction protocols
158
+ Partitioning the KV cache into smaller blocks and evicting unused ones can be effective for memory optimization, but it introduces dependency complexities. When a specific block is used to generate a response, and the result is stored as a new block, it can form a tree-like structure of dependencies.
159
+ Over time, the counters tracking the usage of the source blocks (the branches) may become stale as the dependent nodes (the leaves) are reused. Evicting the source block then requires the eviction of all dependent blocks, which would require recalculation of the KV cache for new user prompts, increasing TTFT.
160
+ To address this challenge, TensorRT-LLM includes intelligent eviction algorithms that can trace the dependent nodes from their source nodes and evict dependent nodes first, even if they have more recent reuse counters. This ensures more efficient memory management while preventing unnecessary evictions of dependent blocks.
161
+ Figure 3. A logical representation of KV cache eviction algorithm show how it can reduce the number of evicted blocks, increasing the likelihood of reuse
162
+ Getting started with TensorRT-LLM KV cache reuse
163
+ Generating KV cache during inference requires a lot of compute and memory resources. Using it efficiently is critical to improving model response, accelerating inference, and increasing system throughput. TensorRT-LLM provides advanced reuse features for developers looking to further optimize TTFT response times for peak performance.
164
+ To start using TensorRT-LLM KV cache reuse check out our
165
+ GitHub documentation
166
+ .",https://developer.nvidia.com/ja-jp/blog/5x-faster-time-to-first-token-with-nvidia-tensorrt-llm-kv-cache-early-reuse/,NVIDIA TensorRT-LLM の KV Cache Early Reuseで、Time to First Token を 5 倍高速化,"Reading Time:
167
+ 2
168
+ minutes
169
+ 以前の
170
+ ブログ記事
171
+ では、key-value (KV) キャッシュを CPU メモリにオフロードして再利用することで、最初のトークンが出力されるまでの時間 (TTFT: Time To First Token) を x86 ベースの NVIDIA H100 Tensor コア GPU で最大 14 倍、NVIDIA GH200 Superchip で最大 28 倍に高速化できる方法をご紹介しました。本記事では、KV キャッシュの再利用技術と、TTFT のさらなる高速化を実現するベストプラクティスについて解説します。
172
+ KV キャッシュの概要
173
+ LLM モデルは、質問回答やコード生成など、多くのタスクで急速に採用されています。応答を生成するにあたり、これらのモデルはまず、ユーザーのプロンプトをトークンへ変換し、その後これらのトークンを密ベクトルへと変換します。膨大なドット積演算がその後に続き、その後トークン間の関係性を数学的にモデル化し、ユーザー入力に対する文脈理解を構築します。この文脈理解を生成するためにかかる計算コストは、入力シーケンスの長さの二乗に比例して増加します。
174
+ このリソースを大量に消費するプロセスから key とvalue が生成され、後続のトークンを生成するときに再度計算されないようにキャッシュされます。KV キャッシュを再利用することで、追加のトークンを生成する際に必要となる計算負荷と時間が軽減され、より高速で効率的なユーザー体験を実現します。
175
+ KV キャッシュを再利用するときには、キャッシュがメモリに残る期間、メモリが一杯になったときに最初に削除するコンポーネント、および新しい入力プロンプトに再利用できるタイミングなどの点に細心の注意を払う必要があります。これらの要因を最適化することで、KV キャッシュの再利用におけるパフォーマンスの段階的な増加へとつなげることができます。NVIDIA TensorRT-LLM は、これらの分野に特化した 3 つの主要な機能を提供します。
176
+ Early KV cache reuse
177
+ 従来の再利用アルゴリズムでは、KV キャッシュをその一部であっても新しいユーザー プロンプトで再利用するためには、事前にすべての KV キャッシュの計算を完了させておく必要がありました。この方法は、LLM のレスポンスを企業のガイドラインに沿ったものにするために、システム プロンプト (ユーザーの問い合わせに追加される事前定義の指示) が不可欠となる企業向けチャットボットなどのシナリオでは、非効率的である可能性があります。
178
+ チャットボットと同時にやり取りするユーザーが急増した場合、各ユーザーに対してシステム プロンプト KV キャッシュを個別に計算する必要があります。TensorRT-LLM では、リアルタイムで生成���れるシステム プロンプトを再利用することができるため、急増時にはすべてのユーザーと共有することができ、ユーザーごとに再計算する必要がありません。これにより、システム プロンプトを必要とするユース ケースの推論を最大 5 倍にまで高速化することができます。
179
+ 図 1. TensorRT-LLM KV cache reuse により、TTFT を最大 5 倍高速化
180
+ 柔軟な KV キャッシュ ブロック サイズ
181
+ 再利用を実装する際には、キャッシュ メモリ ブロック全体のみを再利用に割り当てることができます。例えば、キャッシュ メモリ ブロック サイズが 64 トークンで、KV キャッシュが 80 トークンである場合、再利用のために保存できるのは 64 トークンのみであり、残りの 16 トークンは再計算する必要があります。しかしながら、メモリ ブロック サイズを 16 トークンに減らすと、64 トークンすべてを 5 つのメモリ ブロックに格納することができ、再計算の必要性がなくなります。
182
+ この効果は、入力シーケンスが短いときに最も顕著に現れます。長い入力シーケンスの場合は、より大きなブロックの方がより有益です。明らかに、KV キャッシュをより細かく制御できればできるほど、特定のユース ケースに合わせた最適化も向上します。
183
+ TensorRT-LLM では、KV キャッシュ メモリ ブロックをきめ細かく制御できるため、開発者は KV キャッシュ メモリ ブロックを 64 から 2 トークンまで、より小さなブロックに分割することができます。これにより、割り当てられたメモリの使用が最適化され、再利用率が上昇し、TTFT が改善されます。NVIDIA H100 Tensor コア GPU で LLAMA70B を実行する場合、KV キャッシュ ブロックサイズを 64 トークンから 8 トークンへと減らすことで、マルチユーザー環境で TTFT を最大 7% 高速化できます。
184
+ 図 2. KV キャッシュ ブロック サイズの変更による推論の高速化
185
+ 効率的な KV キャッシュの除外 (Eviction) プロトコル
186
+ KV キャッシュをより小さなブロックに分割し、未使用のブロックを除外することは、メモリの最適化に効果的ですが、依存関係に複雑さが生まれます。特定のブロックがレスポンスの生成に使用され、その結果が新しいブロックとして保存されると、依存関係のツリー構造が形成される可能性があります。
187
+ 時間の経過とともに、ソース ブロック (ブランチ) の使用を追跡するカウンターは、従属ノード (リーフ) が再利用されるにつれて古くなる可能性があります。ソース ブロックを除外するには、従属するすべてのブロックを除外する必要があり、新しいユーザ プロンプトの KV キャッシュを再計算する必要が生じて TTFT が増加します。
188
+ この課題に対処するために、TensorRT-LLM には、従属ノードをソース ノードから追跡し、従属ノードがより最近の再利用カウンターを持っている場合でも、最初に従属ノードを除外することができるインテリジェントな除外アルゴリズムが含まれています。これにより、より効率的にメモリを管理できるようになると共に、従属ブロックの不要な除外を回避できます。
189
+ 図 3. KV キャッシュの除外アルゴリズムの論理を表現した図。除外されるブロックの数を減らし、再利用の可能性を高められる様子を示しています。
190
+ TensorRT-LLM KV cache reuse を使い始める
191
+ 推論中に KV キャッシュを生成するには、多くの計算とメモリ ソースが必要になります。効率的に使用することが、モデル応答の改善、推論の高速化、システム スループットの向上には不可欠です。TensorRT-LLM は、ピーク性能のために TTFT 応答時間をさらに最適化しようとする開発者に高度な再利用機能を提供します。
192
+ TensorRT-LLM KV cache reuse を使い始めるには、
193
+ GitHub のドキュメント
194
+ を参照してください。
195
+ 関連情報
196
+ GTC セッション:
197
+ Speeding up LLM Inference With TensorRT-LLM (TensorRT-LLM による LLM 推論の高速化)
198
+ GTC セッション:
199
+ Optimizing and Scaling LLMs With TensorRT-LLM for Text Generation (テキスト生成のための TensorRT-LLM を使用した LLM の最適化とスケーリング)
200
+ SDK:
201
+ Torch-TensorRT
202
+ SDK:
203
+ TensorRT
204
+ SDK:
205
+ TensorFlow-TensorRT"
206
+ https://developer.nvidia.com/blog/state-of-the-art-multimodal-generative-ai-model-development-with-nvidia-nemo/,State-of-the-Art Multimodal Generative AI Model Development with NVIDIA NeMo,"Generative AI
207
+ has rapidly evolved from text-based models to multimodal capabilities. These models perform tasks like image captioning and visual question answering, reflecting a shift toward more human-like AI. The community is now expanding from text and images to video, opening new possibilities across industries.
208
+ Video AI models are poised to revolutionize industries such as robotics, automotive, and retail. In
209
+ robotics
210
+ , they enhance autonomous navigation in complex, ever-changing environments, which is vital for sectors like manufacturing and warehouse management. In the automotive industry, video AI is propelling autonomous driving, boosting vehicle perception, safety, and predictive maintenance to improve efficiency.
211
+ To build image and video foundation models, developers must curate and preprocess a large amount of training data, tokenize the resulting high-quality data at high fidelity, train or customize pretrained models efficiently and at scale, and then generate high-quality images and videos during inference.
212
+ Announcing NVIDIA NeMo for multimodal generative AI
213
+ NVIDIA NeMo
214
+ is an end-to-end platform for developing, customizing, and deploying generative AI models.
215
+ NVIDIA just announced the expansion of NeMo to support the end-to-end pipeline for developing multimodal models. NeMo enables you to easily curate high-quality visual data, accelerate
216
+ training
217
+ and
218
+ customization
219
+ with highly efficient tokenizers and parallelism techniques, and reconstruct high-quality visuals during inference.
220
+ Accelerated video and image data curation
221
+ High-quality training data ensures high-accuracy results from an AI model. However, developers face various challenges in building data processing pipelines, ranging from scaling to data orchestration.
222
+ NeMo Curator
223
+ streamlines the data curation process, making it easier and faster for you to build multimodal generative AI models. Its out-of-the-box experience minimizes the total cost of ownership (TCO) and accelerates time-to-market.
224
+ While working with visuals, organizations can easily reach petabyte-scale data processing. NeMo Curator provides an orchestration pipeline that can load balance on multiple GPUs at each stage of the data curation. As a result, you can reduce video processing time by 7x compared to a naive GPU-based implementation. The scalable pipelines can efficiently process over 100 PB of data, ensuring the seamless handling of large datasets.
225
+ Figure 1. NVIDIA NeMo Curator video processing speed
226
+ NeMo Curator provides reference video curation models optimized for high-throughput filtering, captioning, and embedding stages to enhance dataset quality, empowering you to create more accurate AI models.
227
+ For instance, NeMo Curator uses an optimized captioning model that delivers an order of magnitude throughput improvement compared to unoptimized inference model implementations.
228
+ NVIDIA Cosmos tokenizers
229
+ Tokenizers map redundant and implicit visual data into compact and semantic tokens, enabling efficient training of large-scale generative models and democratizing their inference on limited computational resources.
230
+ Today’s open video and image tokenizers often generate poor data representations, leading to lossy reconstructions, distorted images, and temporally unstable videos and placing a cap on the capability of generative models built on top of the tokenizers. Inefficient tokenization processes also result in slow encoding and decoding and longer training and inference times, negatively impacting both developer productivity and the user experience.
231
+ NVIDIA Cosmos tokenizers are open models that offer superior visual tokenization with exceptionally large compression rates and cutting-edge reconstruction quality across diverse image and video categories.
232
+ Video 1. Efficient Generative AI Tokenizers for Image and Video
233
+ These tokenizers provide ease of use through a suite of tokenizer standardized models that support vision-language models (VLMs) with discrete latent codes, diffusion models with continuous latent embeddings, and various aspect ratios and resolutions, enabling the efficient management of large-resolution images and videos. This provides you with tools for tokenizing a wide variety of visual input data to build image and video AI models.
234
+ Cosmos tokenizer architecture
235
+ A Cosmos tokenizer uses a sophisticated encoder-decoder structure designed for high efficiency and effective learning. At its core, it employs 3D
236
+ causal convolution blocks
237
+ , which are specialized layers that jointly process spatiotemporal information, and uses causal temporal attention that captures long-range dependencies in data.
238
+ The causal structure ensures that the model uses only past and present frames when performing tokenization, avoiding future frames. This is crucial for aligning with the causal nature of many real-world systems, such as those in physical AI or multimodal LLMs.
239
+ Figure 2. NVIDIA Cosmos tokenizer architecture
240
+ The input is downsampled using 3D wavelets, a signal processing technique that represents pixel information more efficiently. After the data is processed, an inverse wavelet transform reconstructs the original input.
241
+ This approach improves learning efficiency, enabling the tokenizer encoder-decoder learnable modules to focus on meaningful features rather than redundant pixel details. The combination of such techniques and its unique training recipe makes the Cosmos tokenizers a cutting-edge architecture for efficient and powerful tokenization.
242
+ During inference, the Cosmos tokenizers significantly reduce the cost of running the model by delivering up to 12x faster reconstruction compared to leading open-weight tokenizers (Figure 3).
243
+ Figure 3. Quantitative comparison of reconstruction quality (left) and runtime performance (right) for video tokenizers
244
+ The Cosmos tokenizers also produce high-fidelity images and videos while compressing more than other tokenizers, demonstrating an unprecedented quality-compression trade-off.
245
+ Figure 4. Continuous tokenizer compression rate compared to reconstruction quality
246
+ Figure 5. Discrete tokenizer compression rate compared to reconstruction quality
247
+ Although the Cosmos tokenizer regenerates from highly compressed tokens, it is capable of creating high-quality images and videos due to an innovative neural network training technique and architecture.
248
+ Figure 6. Reconstructed video frame for continuous video tokenizers
249
+ Build Your Own Multimodal Models with NeMo
250
+ The expansion of the NVIDIA NeMo platform with at-scale data processing using
251
+ NeMo Curator
252
+ and high-quality tokenization and visual reconstruction using the Cosmos tokenizer empowers you to build state-of-the-art multimodal, generative AI models.
253
+ Join the waitlist
254
+ and be notified when NeMo Curator is available. The tokenizer is available now on the
255
+ /NVIDIA/cosmos-tokenizer
256
+ GitHub repo and
257
+ Hugging Face
258
+ .",https://developer.nvidia.com/ja-jp/blog/state-of-the-art-multimodal-generative-ai-model-development-with-nvidia-nemo/,NVIDIA NeMo による最先端のマルチモーダル生成 AI モデル開発,"Reading Time:
259
+ 2
260
+ minutes
261
+ 生成 AI
262
+ は、テキストベースのモデルからマルチモーダル機能へと急速に進化しています。これらのモデルは、画像のキャプション作成や視覚的な質問回答などのタスクを実行し、より人間に近い AI へとシフトしていることを反映しています。このコミュニティは現在、テキストや画像から動画へと拡大しており、さまざまな業界で新たな可能性を切り開かれています。
263
+ 動画 AI モデルは、ロボティクス、自動車、小売などの業界に革命を起こそうとしています。
264
+ ロボティクス
265
+ では、製造業や倉庫管理などの分野に不可欠な、複雑で変化し続ける環境における自律的なナビゲーションを強化しています。自動車業界では、動画 AI が自動運転を推進し、車両の認識、安全性、予知保全を強化し、効率性を高めています。
266
+ 画像や動画の基盤モデルを構築するには、開発者は大量の学習データのキュレーションと事前処理を行い、結果として得られた高品質データを高い忠実度でトークン化し、学習済みモデルを効率的に大規模に学習またはカスタマイズして、推論中に高品質な画像や動画を生成する必要があります。
267
+ マルチモーダル生成 AI 向けの NVIDIA NeMo を発表
268
+ NVIDIA NeMo
269
+ は、生成 AI モデルを開発、カスタマイズ、デプロイするエンドツーエンドのプラットフォームです。
270
+ NVIDIA は、マルチモーダル モデル開発向けのエンドツーエンドのパイプラインをサポートする NeMo の拡張を発表しました。NeMo により、高品質な視覚データを簡単にキュレーションし、高効率なトークナイザーと並列処理技術で
271
+ 学習
272
+
273
+ カスタマイズ
274
+ を加速し、推論中に高品質なビジュアルを再構築することができます。
275
+ 動画と画像データのキュレーションを加速
276
+ 高品質な学習データでは、AI モデルから高精度な結果が得られます。しかし、開発者は、データ処理パイプラインの構築において、スケーリングからデータのオーケストレーションまで、さまざまな課題に直面しています。
277
+ NeMo Curator
278
+ は、データ キュレーション プロセスを合理化することで、マルチモーダル生成 AI モデルをより簡単かつ迅速に構築することができます。すぐに試すことができるため、総保有コスト (TCO) を最小限に抑え、市場投入までの時間を短縮します。
279
+ ビジュアルを扱う際には、組織はペタバイト規模のデータ処理を容易に実行できます。NeMo Curator は、データ キュレーションの各段階で複数の GPU に負荷分散できるオーケストレーション パイプラインを提供します。その結果、単純な GPU ベースの実装と比較して、動画処理時間を 7 分の 1 に短縮できます。スケール可能なパイプラインは、100 PB を超えるデータを効率的に処理でき、大規模なデータセットをシームレスに取り扱うことができます。
280
+ 図 1. NVIDIA NeMo Curator の動画処理速度
281
+ NeMo Curator は、高いスループットのフィルタリング、キャプション作成、埋め込みの各段階に最適化されたリファレンス ビデオ キュレーション モデルを提供し、データセットの品質を向上させ、より正確な AI モデルの作成をサポートします。
282
+ たとえば、NeMo Curator は、最適化されたキャプション モデルを使用し、最適化されていない推論モデルの実装と比較して、桁違いのスループットの向上を実現します。
283
+ NVIDIA Cosmos トークナイザー
284
+ トークナイザーは、冗長的で暗黙的な視覚データをコンパクトで意味のあるトークンにマッピングし、大規模な生成モデルの効率的な学習を実現し、誰もが限られた計算リソースで推論できるようにします。
285
+ 今日のオープンな動画や画像のトークナイザーは、データ表現が不十分なことが多いため、劣化の多い再構築、歪んだ画像、不連続な動画につながり、トークナイザー上に構築された生成モデルの能力に限界をもたらします。トークン化プロセスが非効率なため、エンコードやデコードに時間がかかり、学習や推論の時間が長くなり、開発者の生産性とユーザー体験の両方に悪影響を及ぼします。
286
+ NVIDIA Cosmos トークナイザーは、優れた視覚トークン化を提供するオープンなモデルで、さまざまな画像や動画のカテゴリーで、高い圧縮率と最先端の再構築品質を実現します。
287
+ 離散的な潜在コードを備えた視覚言語モデル (VLM: Vision-language Model)、連続した潜在的埋め込みによる拡散モデル、さまざまなアスペクト比や解像度をサポートする一連のトークナイザー標準化モデルを使用して、これらのトークナイザーを簡単に使用でき、高解像度の画像や動画を効率的に管理することができます。これにより、画像や動画 AI モデルを構築するために、幅広い視覚入力データをトークン化するツールが提供されます。
288
+ Cosmos トークナイザーのアーキテクチャ
289
+ Cosmos トークナイザーは、高効率かつ効果的な学習向けに設計されており、高度なエンコーダー / デコーダー構造を使用しています。その中核には 3D
290
+ Causal Convolution Block
291
+ (因果畳み込みブロック) を採用しています。これは時空間情報を共同処理する特殊なレイヤーで、データの長期的な依存関係を捉える Causal Temporal Attention (因果的時間注意機構) を使用しています。
292
+ この因果構造により、トークン化の実行時にモデルが過去と現在のフレームのみを使用し、未来のフレームは使用しません。これは、物理的なAIやマルチモーダルLLMなどの多くの現実世界のシステムの因果性に合わせるために重要です。
293
+ 図 2. NVIDIA Cosmos トークナイザーのアーキテクチャ
294
+ 入力は、ピクセル情報をより効率的に表す信号処理技術である 3D ウェーブレットを使用してダウンサンプリングされます。データ処理後、逆ウェーベレット変換によって元の入力が再構築されます。
295
+ このアプローチにより、学習効率が向上し、トークナイザーのエンコーダー / デコーダーの学習可能なモジュールは、冗長なピクセルの詳細ではなく、意味のある特徴に焦点を当てることができます。このような技術と独自の学習レシピの組み合わせにより、Cosmos トークナイザーは、効率的かつ強力なトークン化を実現する最先端のアーキテクチャとなっています。
296
+ 推論の際、Cosmos トークナイザーは、主要なオープンウェイトのトークナイザーと比較して最大 12 倍高速な再構築を実現し、モデルの実行コストを大幅に削減しました (図 3)。
297
+ 図 3. Cosmos トークナイザーと主要なオープンウェイトのトークナイザーとの比較
298
+ Cosmos トークナイザーは、他のトークナイザーよりも高い圧縮率を実現しながら、高い忠実度の画像や動画を生成し、前例のない品質と圧縮のトレードオフを実現しています。
299
+ 図 4. 連続トークナイザーの圧縮率と再構築品質の比較
300
+ 図 5. 離散トークナイザーの圧縮率と再構築品質の比較
301
+ Cosmos トークナイザーは、高度に圧縮されたトークンから再生成されますが、革新的なニューラル ネットワークの学習技術とアーキテクチャにより、高品質な画像や動画を作成することができます。
302
+ 図 6. 連続動画トークナイザーで再構築された動画フレーム
303
+ NeMo で独自のマルチモーダル モデルを構築
304
+ NeMo Curator
305
+ を使用した大規模なデータ処理と、Cosmos トークナイザーを使用した高品質なトークン化やビジュアル再構築を備えた、NVIDIA NeMo プラットフォームの拡張により、最先端のマルチモーダル生成 AI モデルを構築することができます。
306
+ 登録
307
+ していただくと、NeMo Curator が利用可能になった際に通知を受け取ることができます。トークナイザーは、現在
308
+ /NVIDIA/cosmos-tokenizer
309
+ GitHub リポジトリおよび
310
+ Hugging Face
311
+ で利用することができます。
312
+ 関連情報
313
+ GTC セッション:
314
+ Large Language Model Fine-Tuning using Parameter Efficient Fine-Tuning (PEFT を使用した大規模言語モデルのファインチューニング)
315
+ GTC セッション:
316
+ Large Language Model Fine-Tuning using NVIDIA NeMo (NVIDIA NeMo を使用した大規模言語モデルのファインチューニング – Domino Data Lab 提供)
317
+ SDK:
318
+ NVIDIA NeMo カスタマイザー
319
+ SDK:
320
+ NeMo LLM サービス
321
+ SDK:
322
+ NeMo Megatron"
vecalign/valid_zh_en.csv ADDED
@@ -0,0 +1,466 @@
1
+ id,seg,en,zh,zh_title,zh_url,llama3,gpt4o
2
+ 1,0,"The financial services industry is reaching an important milestone with AI, as organizations move beyond testing and experimentation to successful AI implementation, driving business results.
3
+ NVIDIA’s fifth annual State of AI in Financial Services report shows how financial institutions have consolidated their AI efforts to focus on core applications, signaling a significant increase in AI capability and proficiency.
4
+ AI Helps Drive Revenue and Save Costs Companies investing in AI are seeing tangible benefits, including increased revenue and cost savings.
5
+ Nearly 70% of respondents report that AI has driven a revenue increase of 5% or more, with a dramatic rise in those seeing a 10-20% revenue boost. In addition, more than 60% of respondents say AI has helped reduce annual costs by 5% or more. Nearly a quarter of respondents are planning to use AI to create new business opportunities and revenue streams.
6
+ The top generative AI use cases in terms of return on investment (ROI) are trading and portfolio optimization, which account for 25% of responses, followed by customer experience and engagement at 21%. These figures highlight the practical, measurable benefits of AI as it transforms key business areas and drives financial gains.
7
+ Overcoming Barriers to AI Success Half of management respondents said they’ve deployed their first generative AI service or application, with an additional 28% planning to do so within the next six months. A 50% decline in the number of respondents reporting a lack of AI budget suggests increasing dedication to AI development and resource allocation.
8
+ The challenges associated with early AI exploration are also diminishing. The survey revealed fewer companies reporting data issues and privacy concerns, as well as reduced concern over insufficient data for model training. These improvements reflect growing expertise and better data management practices within the industry.
9
+ As financial services firms allocate budget and grow more savvy at data management, they can better position themselves to harness AI for enhanced operational efficiency, security and innovation across business functions.","金融服務業在使用人工智慧(AI)方面正邁入一個重要的里程碑,各大組織開始邁出測試與實驗的範疇,成功使用 AI 推動業務成果。
10
+ NVIDIA 的第五份《金融服務業 AI 現況(State of AI in Financial Services)》年度調查報告顯示,金融機構已經整合自身在 AI 方面的各項作為,以專注在核心應用項目上,這標誌著 AI 能力與熟練程度大幅提升。
11
+ AI 有助於增加營收與節省成本
12
+ 投資於 AI 的公司正在看到實質效益,包括增加營收和節省成本等。
13
+ 近七成的受訪者表示,AI 已經帶來 5% 或以上的營收成長,其中營收成長幅度達 10% 至 20% 的受訪者比例更是大幅增加。此外,超過六成的受訪者表示 AI 已協助減少 5% 或以上的年度成本。近四分之一的受訪者正計劃使用 AI 創造新的商機和收入來源。
14
+ 交易與投資組合最佳化是投資報酬率(ROI)最高的
15
+ 生成式 AI 使用案例,佔回應數量的 25%,其次是客戶體驗與參與度,佔 21%。這些數字突顯 AI在改變關鍵業務領域和推動財務收益時,所帶來可衡量的實際效益。
16
+ 克服 AI 成功的關卡
17
+ 半數管理層的受訪者表示,他們已經部署了第一個生成式 AI 服務或應用,另有 28% 的受訪者計劃在未來六個月內部署。回覆缺乏 AI 預算的受訪者人數減少了五成,這顯示對於 AI 開發與資源分配的投入程度日益增加。
18
+ 與早期探索 AI 相關的挑戰同樣在減少。調查顯示,回答有資料問題和隱私疑慮的公司數量減少,對於模型訓練資料不足的疑慮也降低。這些改善反映出業界的專業知識與資料管理實務正在不斷增加。
19
+ 隨著金融服務公司分配預算並更加擅長管理資料,他們可以更好地利用 AI 來提高跨業務單位的營運效率、安全性和進行創新。",AI 帶來亮眼報酬:調查結果揭示金融業最新技術趨勢,https://blogs.nvidia.com.tw/blog/ai-in-financial-services-survey-2025/,"The Financial Services industry is marking an important milestone in its adoption of Artificial Intelligence (AI), with major organizations taking their first steps in testing and experimenting with AI-driven business results.
20
+
21
+ NVIDIA's fifth annual survey of the State of AI in Financial Services found that financial institutions have integrated AI across all functions, focusing on core applications, a sign that AI capabilities and expertise have increased dramatically.
22
+
23
+ AI boosts revenue and saves costs
24
+
25
+ Companies investing in AI are seeing tangible benefits, including increased revenue and cost savings.
26
+
27
+ About 70% of respondents say AI has driven 5% or more revenue growth, while those seeing 10% to 20% revenue growth make up a significantly larger proportion. Similarly, more than 60% of respondents say AI has helped reduce annual costs by 5% or more.
28
+
29
+ A quarter of respondents are planning to use AI to create new business opportunities and revenue streams.
30
+
31
+ Optimizing trading and investment portfolios delivers the highest ROI
32
+
33
+ 24% of respondents cite generative AI use cases as the most prevalent, followed by customer experience and engagement at 21%. These numbers highlight the measurable, real-world benefits AI is bringing to the industry as it transforms critical business areas and drives financial returns.
34
+
35
+ Overcoming the hurdles to AI success
36
+
37
+ About half of management respondents say they have deployed their first generative AI service or application, while 28% plan to deploy within the next six months. The number of respondents answering that they lack an AI budget has decreased by 50%, a sign that investment in AI development and resource allocation is increasing.
38
+
39
+ The challenges associated with early exploration of AI are decreasing as well. The survey shows a decrease in the number of respondents citing data issues and privacy concerns, as well as concerns about insufficient training data. These improvements reflect the growing expertise and data management practices in the industry.
40
+
41
+ As financial services companies allocate budgets and become more adept at managing data, they will be better equipped to leverage AI to improve operational efficiency, security, and innovation across business units.","The financial services industry is reaching a significant milestone in the use of artificial intelligence (AI), as major organizations move beyond testing and experimentation to successfully use AI to drive business outcomes.
42
+
43
+ NVIDIA's fifth annual report on the ""State of AI in Financial Services"" shows that financial institutions have integrated various AI initiatives to focus on core applications, marking a substantial improvement in AI capabilities and proficiency.
44
+
45
+ AI helps increase revenue and reduce costs. Companies investing in AI are seeing tangible benefits, including increased revenue and cost savings. Nearly 70% of respondents reported that AI has led to revenue growth of 5% or more, with a significantly higher percentage of respondents seeing revenue growth of 10% to 20%. Additionally, over 60% of respondents indicated that AI has helped reduce annual costs by 5% or more. Nearly a quarter of respondents are planning to use AI to create new business opportunities and revenue streams.
46
+
47
+ Trading and portfolio optimization is the highest ROI-generating AI use case, accounting for 25% of responses, followed by customer experience and engagement at 21%. These figures highlight AI's measurable real-world benefits in transforming key business areas and driving financial gains.
48
+
49
+ Overcoming barriers to AI success. Half of the management-level respondents stated that they have already deployed at least one generative AI service or application, with another 28% planning to deploy in the next six months. The number of respondents citing lack of AI budget has halved, indicating increased investment in AI development and resource allocation.
50
+
51
+ Challenges associated with early AI exploration are similarly decreasing. The survey shows a reduction in the number of companies reporting data issues and privacy concerns, and worries about insufficient training data for models have also diminished. These improvements reflect the growing expertise and data management practices in the industry.
52
+
53
+ As financial services companies allocate budgets and become more adept at managing data, they can better leverage AI to enhance operational efficiency, security, and innovation across business units."
54
+ 1,1,"Generative AI Powers More Use Cases
55
+ After data analytics, generative AI has emerged as the second-most-used AI workload in the financial services industry. The applications of the technology have expanded significantly, from enhancing customer experience to optimizing trading and portfolio management.
56
+ Notably, the use of generative AI for customer experience, particularly via chatbots and virtual assistants, has more than doubled, rising from 25% to 60%. This surge is driven by the increasing availability, cost efficiency and scalability of generative AI technologies for powering more sophisticated and accurate digital assistants that can enhance customer interactions.
57
+ More than half of the financial professionals surveyed are now using generative AI to enhance the speed and accuracy of critical tasks like document processing and report generation.
58
+ Financial institutions are also poised to benefit from agentic AI systems that harness vast amounts of data from various sources and use sophisticated reasoning to autonomously solve complex, multistep problems. Banks and asset managers can use agentic AI systems to enhance risk management, automate compliance processes, optimize investment strategies and personalize customer services.
59
+ Advanced AI Drives Innovation
60
+ Recognizing the transformative potential of AI, companies are taking proactive steps to build AI factories — specially built accelerated computing platforms equipped with full-stack AI software — through cloud providers or on premises. This strategic focus on implementing high-value AI use cases is crucial to enhancing customer service, boosting revenue and reducing costs.
61
+ By tapping into advanced infrastructure and software, companies can streamline the development and deployment of AI models and position themselves to harness the power of agentic AI.
62
+ With industry leaders predicting at least 2x ROI on AI investments, financial institutions remain highly motivated to implement their highest-value AI use cases to drive efficiency and innovation.
63
+ Download the full report to learn more about how financial services companies are using accelerated computing and AI to transform services and business operations.","生成式 AI 驅動更多使用案例
64
+ 繼資料分析之後,生成式 AI 已經成為金融服務業裡第二大宗的 AI 工作負載。這項技術的應用範圍已大幅擴展,從提升客戶體驗到最佳化交易和投資組合管理。
65
+ 值得注意的是,生成式 AI 在客戶體驗方面的應用,特別是透過聊天機器人和虛擬助理,數量增加了一倍以上,從 25% 上升到 60%。這樣大幅成長的趨勢是基於生成式 AI 技術的可用性、成本效率和可擴展性不斷提高,能夠驅動更複雜、更精準的數位助理,從而提升客戶互動情況。
66
+ 半數以上受訪的金融專業人員現正使用生成式 AI 技術,以提高處理文件和產生報告等重要工作的速度和準確性。
67
+ 金融機構也準備好從代理型 AI 中受惠,代理型 AI 系統是指利用各種來源的大量資料,並使用複雜的推理流程自主解決複雜的多步驟問題。銀行和資產管理公司可以使用代理型 AI 系統來加強管理風險、自動化合規流程、最佳化投資策略,還有提供個人化的客戶服務。
68
+ 先進的 AI 推動創新
69
+ 在意識到 AI 的轉型潛力後,企業正積極採取措施,透過與雲端服務供應商合作或是在地端建立 AI 工廠,這些 AI 工廠是專門打造的加速運算平台,配備全端的 AI 軟體。企業在策略上特別鎖定實施高價值的 AI 使用案例,這對於提升客戶服務、增加收入與降低成本來說至關重要。
70
+ 企業利用先進的基礎設施和軟體,可以簡化 AI 模型的開發和部署,並在善加發揮代理型 AI 力量方面站穩腳步。
71
+ 由於業界領導業者預測 AI 投資的投資報酬率至少為兩倍,因此金融機構仍有很大動力去實現其最高價值的 AI 使用案例,以推動效率和創新。
72
+ 下載完整報告,進一步瞭解金融服務公司如何利用加速運算和 AI 來改變服務和業務運作。",AI 帶來亮眼報酬:調查結果揭示金融業最新技術趨勢,https://blogs.nvidia.com.tw/blog/ai-in-financial-services-survey-2025/,"Generative AI Drives More Use Cases
73
+ Following data analytics, generative AI has become the second largest AI workload in the financial services industry. The scope of this technology's application has expanded significantly, from enhancing customer experience to optimizing transactions and investment portfolio management.
74
+
75
+ What's worth noting is that generative AI's application in customer experience, particularly through chatbots and virtual assistants, has increased by more than double, from 25% to 60%. This significant growth trend is based on the increasing availability, cost-effectiveness, and scalability of generative AI technology, which can drive more complex and precise digital assistants, thereby enhancing customer interactions.
76
+
77
+ More than half of the surveyed financial professionals are currently using generative AI technology to improve the speed and accuracy of important work such as processing documents and generating reports.
78
+
79
+ Financial institutions are also ready to benefit from agent-based AI, which refers to the use of large amounts of data from various sources and complex reasoning processes to autonomously solve complex multi-step problems. Banks and asset management companies can use agent-based AI systems to strengthen risk management, automate compliance processes, optimize investment strategies, and provide personalized customer services.
80
+
81
+ Advanced AI drives innovation
82
+ Following the realization of AI's transformative potential, companies are taking proactive measures by partnering with cloud service providers or establishing AI factories on-site. These AI factories are specialized platforms for accelerated computing, equipped with end-to-end AI software. Companies strategically focus on implementing high-value AI use cases, which are critical for enhancing customer service, increasing revenue, and reducing costs.
83
+
84
+ With advanced infrastructure and software, companies can simplify the development and deployment of AI models and establish a strong footing in leveraging the power of agent-based AI.
85
+
86
+ Due to industry leaders predicting that AI investments will yield at least a 2x return on investment, financial institutions still have significant motivation to achieve their highest-value AI use cases to drive efficiency and innovation.
87
+
88
+ Download the full report to learn more about how financial services companies use accelerated computing and AI to change their services and business operations.","Generative AI Drives More Use Cases
89
+
90
+ Following data analytics, generative AI has become the second largest AI workload in the financial services industry. The scope of this technology has significantly expanded, from enhancing customer experience to optimizing trading and portfolio management.
91
+
92
+ Notably, the application of generative AI in customer experience, particularly through chatbots and virtual assistants, has more than doubled, rising from 25% to 60%. This significant growth trend is based on the increasing availability, cost efficiency, and scalability of generative AI technology, enabling the development of more complex and accurate digital assistants, thereby enhancing customer interactions.
93
+
94
+ More than half of the financial professionals surveyed are currently using generative AI technology to improve the speed and accuracy of critical tasks such as document processing and report generation.
95
+
96
+ Financial institutions are also poised to benefit from agent-based AI. These systems utilize vast amounts of data from various sources and employ complex reasoning processes to autonomously solve multi-step complex problems. Banks and asset management companies can use agent-based AI systems to strengthen risk management, automate compliance processes, optimize investment strategies, and provide personalized customer services.
97
+
98
+ Advanced AI Drives Innovation
99
+
100
+ Recognizing the transformative potential of AI, businesses are actively taking measures by partnering with cloud service providers or establishing local AI factories, which are specialized accelerated computing platforms equipped with end-to-end AI software. Strategically, companies focus particularly on implementing high-value AI use cases that are crucial for improving customer service, increasing revenue, and reducing costs.
101
+
102
+ By leveraging advanced infrastructure and software, businesses can streamline the development and deployment of AI models and firmly establish themselves in utilizing the power of agent-based AI.
103
+
104
+ As industry leaders anticipate a return on investment in AI of at least two times, financial institutions have strong motivation to realize their highest-value AI use cases to drive efficiency and innovation.
105
+
106
+ Download the full report to learn more about how financial services companies are using accelerated computing and AI to transform services and business operations."
107
+ 4,0,"Generative AI and foundation models let autonomous machines generalize beyond the operational design domains on which they’ve been trained. Using new AI techniques such as tokenization and large language and diffusion models, developers and researchers can now address longstanding hurdles to autonomy.
108
+ These larger models require massive amounts of diverse data for training, fine-tuning and validation. But collecting such data — including from rare edge cases and potentially hazardous scenarios, like a pedestrian crossing in front of an autonomous vehicle (AV) at night or a human entering a welding robot work cell — can be incredibly difficult and resource-intensive.
109
+ To help developers fill this gap, NVIDIA Omniverse Cloud Sensor RTX APIs enable physically accurate sensor simulation for generating datasets at scale. The application programming interfaces (APIs) are designed to support sensors commonly used for autonomy — including cameras, radar and lidar — and can integrate seamlessly into existing workflows to accelerate the development of autonomous vehicles and robots of every kind.
110
+ Omniverse Sensor RTX APIs are now available to select developers in early access. Organizations such as Accenture, Foretellix, MITRE and Mcity are integrating these APIs via domain-specific blueprints to provide end customers with the tools they need to deploy the next generation of industrial manufacturing robots and self-driving cars.
111
+ Powering Industrial AI With Omniverse Blueprints
112
+ In complex environments like factories and warehouses, robots must be orchestrated to safely and efficiently work alongside machinery and human workers. All those moving parts present a massive challenge when designing, testing or validating operations while avoiding disruptions.
113
+ Mega is an Omniverse Blueprint that offers enterprises a reference architecture of NVIDIA accelerated computing, AI, NVIDIA Isaac and NVIDIA Omniverse technologies.
114
+ Enterprises can use it to develop digital twins and test AI-powered robot brains that drive robots, cameras, equipment and more to handle enormous complexity and scale.
115
+ Integrating Omniverse Sensor RTX, the blueprint lets robotics developers simultaneously render sensor data from any type of intelligent machine in a factory for high-fidelity, large-scale sensor simulation.
116
+ With the ability to test operations and workflows in simulation, manufacturers can save considerable time and investment, and improve efficiency in entirely new ways.","生成式人工智慧(AI)和基礎模型讓自主機器能夠超越它們所接受訓練的操作設計領域。開發人員和研究人員使用標記化(tokenization)及大型語言和擴散模型等嶄新 AI 技術,現在可以解決一直以來在自主領域方面的各項障礙。
117
+ 需要使用大量相異的資料來訓練、微調與驗證這些大型模型。不過收集這些資料(包括從罕見的邊緣情況和潛在危險情境中收集資料,例如行人在夜間橫越自動駕駛車前方,或是人類進入焊接機器人工作單元)可能非常困難,又得耗費不少資源。
118
+ 為了協助開發人員填補這個缺口,NVIDIA Omniverse Cloud Sensor RTX API 提供了物理精確的感測器模擬,用於大規模生成資料集。這些應用程式介面(API)用於支援常用於自主機器上的感測器,包括攝影機、雷達與光達,且能完美與現有的工作流程進行整合,以加快開發各種自動駕駛車輛與機器人。
119
+ 現已開放部分開發人員搶先體驗
120
+ Omniverse Sensor RTX API。埃森哲(Accenture)、Foretellix、MITRE 和 Mcity等企業正透過特定領域藍圖整合這些 API,為終端客戶提供部署下一代工業製造機器人和自動駕駛車所需的工具。
121
+ 使用 Omniverse Blueprints 為工業 AI 提供動力
122
+ 在工廠和倉庫等複雜環境中,機器人必須被精心協調,才能安全高效率地與機器和人類工作者並肩作業。在設計、測試或驗證操作,又要避免中斷作業時,所有這些移動部件都會帶來巨大的挑戰。
123
+ Mega 是一個 Omniverse Blueprint ,可為企業提供 NVIDIA 加速運算、AI、NVIDIA Isaac 及 NVIDIA Omniverse 技術的參考架構。企業可以用它開發數位孿生模型,測試由 AI 驅動的機器人大腦,而這些大腦驅動著機器人、攝影機、設備等項目,以處理極為複雜又大量的作業。
124
+ 這個整合了 Omniverse Sensor RTX 的藍圖可以讓機器人開發人員同時渲染工廠內任何類型智慧機器的感測器資料,實現高保真、大規模的感測器模擬。
125
+ 隨著能夠在模擬環境裡測試操作和工作流程,製造商可以省下大量時間和投資,以全新方式提高作業效率。",建造更聰明的自主機器:NVIDIA 宣布 Omniverse Sensor RTX 推出搶先體驗活動,https://blogs.nvidia.com.tw/blog/omniverse-sensor-rtx-autonomous-machines/,"Generative artificial intelligence (AI) and foundation models enable autonomous machines to surpass the operational design domains in which they were trained. Developers and researchers are using tokenization and large language and diffusion models and other AI techniques to overcome long-standing challenges in the autonomous domain.
126
+
127
+ Training, fine-tuning, and validating these large models requires an abundance of diverse data. However, collecting this data, including from rare edge cases and potentially hazardous scenarios, such as pedestrians crossing in front of autonomous vehicles at night or humans entering a welding robot workcell, can be extremely difficult and costly.
128
+
129
+ To help fill this gap, NVIDIA Omniverse Cloud Sensor RTX API offers physically accurate sensor simulation for large-scale data generation. These APIs are designed to support sensors commonly found on autonomous machines, including cameras, radar, and lidar, and can be seamlessly integrated into existing workflows to accelerate the development of various autonomous vehicles and robots.
130
+
131
+ Omniverse Sensor RTX API is now available for early adopters. Companies like Accenture, Foretellix, MITRE, and Mcity are integrating these APIs into specific domain blueprints to provide end customers with the tools needed to deploy next-generation industrial manufacturing robots and autonomous vehicles.
132
+
133
+ Harnessing Omniverse Blueprints for Industrial AI
134
+
135
+ In complex environments such as factories and warehouses, robots must be carefully coordinated to operate safely and efficiently alongside machines and human workers. Designing, testing, or verifying operations while avoiding downtime brings a multitude of challenges to these moving parts.
136
+
137
+ Mega is an Omniverse Blueprint that provides a reference architecture with NVIDIA acceleration, AI, NVIDIA Isaac, and NVIDIA Omniverse technologies for enterprises. With it, companies can develop digital twins, test AI-driven robot brains, and deploy these brains to robots, cameras, equipment, and other assets to handle complex and massive workloads.
138
+
139
+ This integrated Blueprint with Omniverse Sensor RTX enables robotics developers to simultaneously render sensor data for any type of intelligent machine within a factory, achieving high-fidelity, large-scale sensor simulation.
140
+
141
+ By testing operations and workflows in a simulated environment, manufacturers can save significant time and investment and improve operational efficiency in new ways.","Generative Artificial Intelligence (AI) and foundational models enable autonomous machines to operate beyond their training design domains. Developers and researchers use innovative AI technologies like tokenization, large language, and diffusion models to overcome long-standing barriers in the field of autonomy.
142
+
143
+ Training, fine-tuning, and validating these large models require a vast amount of diverse data. However, collecting such data, including from rare edge cases and potentially dangerous situations—like pedestrians crossing in front of autonomous vehicles at night or humans entering a welding robot's workspace—can be challenging and resource-intensive.
144
+
145
+ To help developers bridge this gap, the NVIDIA Omniverse Cloud Sensor RTX API offers physically accurate sensor simulation for large-scale dataset generation. These APIs are designed to support sensors commonly used on autonomous machines, including cameras, radars, and LiDARs, and seamlessly integrate with existing workflows to accelerate the development of various autonomous vehicles and robots.
146
+
147
+ The Omniverse Sensor RTX API is now available for early access by selected developers. Companies like Accenture, Foretellix, MITRE, and Mcity are integrating these APIs through domain-specific blueprints to equip end customers with the tools needed to deploy next-generation industrial manufacturing robots and autonomous vehicles.
148
+
149
+ Powering industrial AI with Omniverse Blueprints
150
+ In complex environments like factories and warehouses, robots must be meticulously coordinated to work safely and efficiently alongside machines and human workers. Designing, testing, or validating operations without interrupting ongoing processes presents substantial challenges with these moving parts.
151
+
152
+ Mega is an Omniverse Blueprint that provides enterprises with a reference architecture for NVIDIA accelerated computing, AI, NVIDIA Isaac, and NVIDIA Omniverse technologies. Enterprises can use it to develop digital twin models and test AI-driven robotic brains, which power robots, cameras, devices, and more to handle highly complex and large-scale operations.
153
+
154
+ This blueprint, integrated with Omniverse Sensor RTX, allows robot developers to simultaneously render sensor data from any type of intelligent machine within a factory, achieving high-fidelity, large-scale sensor simulation.
155
+
156
+ By testing operations and workflows in simulated environments, manufacturers can save significant time and investment, enhancing operational efficiency in entirely new ways."
157
+ 4,1,"International supply chain solutions company KION Group and Accenture are using the Mega blueprint to build Omniverse digital twins that serve as virtual training and testing environments for industrial AI’s robot brains, tapping into data from smart cameras, forklifts, robotic equipment and digital humans.
158
+ The robot brains perceive the simulated environment with physically accurate sensor data rendered by the Omniverse Sensor RTX APIs. They use this data to plan and act, with each action precisely tracked with Mega, alongside the state and position of all the assets in the digital twin.
159
+ With these capabilities, developers can continuously build and test new layouts before they’re implemented in the physical world.
160
+ Driving AV Development and Validation
161
+ Autonomous vehicles have been under development for over a decade, but barriers in acquiring the right training and validation data and slow iteration cycles have hindered large-scale deployment.
162
+ To address this need for sensor data, companies are harnessing the NVIDIA Omniverse Blueprint for AV simulation, a reference workflow that enables physically accurate sensor simulation. The workflow uses Omniverse Sensor RTX APIs to render the camera, radar and lidar data necessary for AV development and validation.
163
+ AV toolchain provider Foretellix has integrated the blueprint into its
164
+ Foretify AV development toolchain
165
+ to transform object-level simulation into physically accurate sensor simulation.
166
+ The Foretify toolchain can generate any number of testing scenarios simultaneously. By adding sensor simulation capabilities to these scenarios, Foretify can now enable developers to evaluate the completeness of their AV development, as well as train and test at the levels of fidelity and scale needed to achieve large-scale and safe deployment. In addition, Foretellix will use the newly announced
167
+ NVIDIA Cosmos platform to generate an even greater diversity of scenarios for verification and validation.
168
+ Nuro, an autonomous driving technology provider with one of the largest level 4 deployments in the U.S., is using the Foretify toolchain to train, test and validate its self-driving vehicles before deployment.
169
+ In addition, research organization MITRE is collaborating with the University of Michigan’s Mcity testing facility to build a digital AV validation framework for regulatory use, including a digital twin of Mcity’s 32-acre proving ground for autonomous vehicles. The project uses the AV simulation blueprint to render physically accurate sensor data at scale in the virtual environment, boosting training effectiveness.
170
+ The future of robotics and autonomy is coming into sharp focus, thanks to the power of high-fidelity sensor simulation. Learn more about these solutions at CES by visiting Accenture at Ballroom F at the Venetian and Foretellix booth 4016 in the West Hall of Las Vegas Convention Center.
171
+ Learn more about the latest in automotive and generative AI technologies by joining
172
+ NVIDIA at CES.","國際供應鏈解決方案公司凱傲集團(KION Group)與埃森哲利用來自智慧攝影機、堆高機、機器人設備和數位人類的資料,使用 Mega 藍圖建立 Omniverse 數位孿生,作為工業AI機器人大腦的虛擬訓練和測試環境。
173
+ 機器人大腦透過 Omniverse Sensor RTX API 渲染的物理精確感測器資料來感知模擬環境。機器人使用這些資料來計劃和採取行動,並透過 Mega 精準追蹤每一個動作,以及數位孿生中所有資產的狀態和位置。借助這些功能,開發人員可以在真正部署至實體環境裡之前,不斷建立和測試新配置。
174
+ 推動開發與驗證自動駕駛車
175
+ 自動駕駛車輛已開發超過十多年,但在取得正確的訓練與驗證資料方面所遇到的阻礙,還有緩慢的迭代週期,都阻礙了大規模部署。
176
+ 為了滿足對感測器資料的這種需求,各家公司利用 NVIDIA Omniverse Blueprint for AV simulation,這是一個實現物理精確感測器模擬的參考工作流程。這個工作流程使用 Omniverse Sensor RTX API 來渲染出開發與驗證自動駕駛汽車所需的攝影機、雷達與光達資料。
177
+ 自動駕駛汽車工具鏈供應商 Foretellix 已經把這個藍圖納入該公司的 Foretify 自動駕駛車開發工作鏈,將物件級模擬轉換為物理精準感測器模擬。
178
+ Foretify 工具鏈可以同時產生任意數量的測試情境。Foretify 在這些情境中加入感測器模擬功能,開發人員便能評估自己在開發自動駕駛車方面的完整性,並以實現大規模安全部署所需的保真度和規模水平進行訓練和測試。Foretellix 還將使用最新發表的
179
+ NVIDIA Cosmos 平台,產生更多樣化的情境進行確認與驗證。
180
+ 自動駕駛技術提供商 Nuro 是美國規模最大的 level 4 部署業者之一,使用 Foretify 工具鏈在部署前對其自動駕駛車輛進行訓練、測試和驗證。
181
+ 再者,研究機構 MITRE 與密西根大學的 Mcity 測試設施合作,建立供主管機關使用的數位自動駕駛車驗證框架,包括 Mcity 32 英畝自動駕駛車試驗場的數位孿生模型。這項合作案使用 自動駕駛車 模擬藍圖,在虛擬環境中大規模渲染出物理精確的感測器資料,以提升訓練成效。
182
+ 得益於高保真感測器模擬技術,機器人與自動化的未來正逐漸成為人們關注的焦點。如需更深入瞭解 CES 大會上這些解決方案的資訊,請造訪埃森哲位於拉斯維加斯威尼斯人F展廳的攤位,以及 Foretellix 位於拉斯維加斯展覽中心西館 4016 號的展位。
183
+ 欲了解最新的汽車與生成式 AI 技術,參加 NVIDIA 在 CES 大會的各項活動。",建造更聰明的自主機器:NVIDIA 宣布 Omniverse Sensor RTX 推出搶先體驗活動,https://blogs.nvidia.com.tw/blog/omniverse-sensor-rtx-autonomous-machines/,"KION Group, a leading global intralogistics company, and Accenture have leveraged data from smart cameras, forklifts, robotics equipment, and digital humans to create an Omniverse digital twin, powered by Mega Graph, as a virtual training and testing environment for industrial AI robots. The robot brain perceives the simulated environment through physically accurate sensor data rendered by the Omniverse Sensor RTX API. The robot uses this data to plan and act, and through Mega, it precisely tracks every action and the state and location of all assets within the digital twin. By leveraging these capabilities, developers can continuously build and test new configurations before deploying them to physical environments.
184
+
185
+ Driving Autonomous Vehicle Development
186
+ Autonomous vehicles have been in development for over a decade, but have been hindered by the difficulty in obtaining accurate training and validation data, as well as the slow iteration cycle. To meet this demand for sensor data, companies have turned to NVIDIA Omniverse Blueprint for AV simulation, a reference workflow that realizes physically accurate sensor simulation. This workflow uses the Omniverse Sensor RTX API to render camera, radar, and lidar data needed for developing and validating autonomous vehicles.
187
+
188
+ Autonomous Vehicle Tools and Services Provider Foretellix has integrated this blueprint into its Foretify autonomous vehicle development workflow, converting object-level simulation to physically accurate sensor simulation. The Foretify toolchain can generate an arbitrary number of test scenarios, with sensor simulation added in these scenarios, allowing developers to evaluate their completeness in developing autonomous vehicles, and train and test at the fidelity and scale required for large-scale deployments. Foretellix is also using the newly released NVIDIA Cosmos platform to generate more diverse scenarios for validation and verification.
189
+
190
+ Nuro, a leading provider of autonomous technology and one of the largest Level 4 deployment operators in the US, is using the Foretify toolchain to train, test, and validate its autonomous vehicles before deployment.
191
+
192
+ In addition, research institutions MITRE and the University of Michigan's Mcity test facility are collaborating to establish a digital autonomous vehicle validation framework for government agencies, including a digital twin model of the Mcity 32-acre autonomous vehicle test field. This collaboration is using the autonomous vehicle simulation blueprint to render physically accurate sensor data at scale in a virtual environment, improving training effectiveness.
193
+
194
+ Thanks to high-fidelity sensor simulation technology, the future of robots and automation is becoming a growing focus area. For more information on these solutions showcased at CES, visit Accenture's booth at the Venetian in Las Vegas, as well as Foretellix's booth #4016 at the West Hall of the Las Vegas Convention Center. For the latest on automotive and generative AI technology, attend NVIDIA's various activities at CES.","The international supply chain solutions company KION Group, in collaboration with Accenture, is using data from smart cameras, forklifts, robotics equipment, and digital humans to build an Omniverse digital twin with Mega blueprints. This serves as a virtual training and testing environment for the industrial AI robot brain. The robot brain perceives the simulated environment through physically accurate sensor data rendered by the Omniverse Sensor RTX API. Robots use this data to plan and take actions, with Mega accurately tracking every movement and the status and location of all assets in the digital twin. These features allow developers to continuously build and test new configurations before deploying them in a physical environment.
195
+
196
+ Advancing the Development and Validation of Autonomous Vehicles
197
+ Autonomous vehicles have been in development for over a decade, but obstacles in obtaining the correct training and validation data, along with slow iteration cycles, have hindered large-scale deployment. To meet the demand for sensor data, companies are utilizing the NVIDIA Omniverse Blueprint for AV simulation, a reference workflow that realizes physically accurate sensor simulations. This workflow uses the Omniverse Sensor RTX API to render the camera, radar, and lidar data needed for developing and validating autonomous vehicles.
198
+
199
+ Foretellix, an autonomous vehicle toolchain supplier, has incorporated this blueprint into the company’s Foretify autonomous vehicle development toolchain, transforming object-level simulation into physically precise sensor simulation. The Foretify toolchain can generate any number of test scenarios simultaneously. By incorporating sensor simulation into these scenarios, developers can assess the integrity of their autonomous vehicle development with the fidelity and scale required for large-scale safe deployment. Foretellix will also use the newly released NVIDIA Cosmos platform to generate more diverse scenarios for validation and verification.
200
+
201
+ Nuro, one of the largest level 4 deployment operators in the U.S., uses the Foretify toolchain to train, test, and validate their autonomous vehicles before deployment.
202
+
203
+ Additionally, the research institution MITRE is collaborating with the University of Michigan's Mcity testing facility to create a digital verification framework for autonomous vehicles for regulatory authorities, including a digital twin model of Mcity's 32-acre autonomous vehicle testing grounds. This collaboration uses the autonomous vehicle simulation blueprint to render physically accurate sensor data on a large scale in virtual environments to enhance training effectiveness.
204
+
205
+ Thanks to high-fidelity sensor simulation technology, the future of robotics and automation is becoming a focal point of interest. For more information about these solutions presented at the CES conference, visit Accenture's booth at the Venetian F Hall in Las Vegas and Foretellix's booth at the Las Vegas Convention Center West Hall, booth 4016. To stay updated on the latest automotive and generative AI technologies, participate in NVIDIA's activities at the CES conference."
206
+ 6,0,"Greetings, Traveler. Prepare for adventure.
207
+ Genshin Impact, the popular open-world action role-playing game, is leaving limited beta and launching for all GeForce NOW members next week.
208
+ Gamers can get their game on today with the six total games joining the GeForce NOW library.
209
+ As announced last week, Warhammer 40,000: Darktide is coming to the cloud at launch — with GeForce technology. This September, members will be able to leap thousands of years into the future to the time of the Space Marines, streaming on GeForce NOW with NVIDIA DLSS and more.
210
+ Plus, the 2.0.41 GeForce NOW app update brings a highly requested feature: in-stream copy-and-paste support from the clipboard while streaming from the PC and Mac apps — so there’s no need to enter a long, complex password for the digital store. Get to your games even faster with this new capability.
211
+ GeForce NOW is also giving mobile gamers more options by bringing the perks of RTX 3080 memberships and PC gaming at 120 frames per second to all devices with support for 120Hz phones. The capability is rolling out in the coming weeks.
212
+ Take a Trip to Teyvat
213
+ After the success of a limited beta and receiving great feedback from members, Genshin Impact is coming next week to everyone streaming on GeForce NOW.
214
+ Embark on a journey as a traveler from another world, stranded in the fantastic land of Teyvat. Search for your missing sibling in a vast continent made up of seven nations. Master the art of elemental combat and build a dream team of over 40 uniquely skilled playable characters – like the newest additions of Yelan and Kuki Shinobu – each with their own rich stories, personalities and combat styles.
215
+ Experience the immersive campaign, dive deep into rich quests alongside iconic characters and complete daily challenges. Charge head-on into battles solo or invite friends to join the adventures. The world is constantly expanding, so bring it wherever you go across devices, streaming soon to underpowered PCs, Macs and Chromebooks on GeForce NOW.
216
+ RTX 3080 members can level up their gaming for the best experience by streaming in 4K resolution and 60 frames per second on the PC and Mac apps.
217
+ Let the Gaming Commence
218
+ All of the action this GFN Thursday kicks off with six new games arriving on the cloud. Members can also gear up for Rainbow Six Siege Year 7 Season 2.
219
+ Get ready for a new Operator, Team Deathmatch map and more in “Rainbow Six Siege” Year 7 Season 2.
220
+ Members can look for the following streaming this week: Chivalry 2 (New release on Steam), Starship Troopers – Terran Command (New release on Steam and Epic Games Store), Builder Simulator (Steam), Supraland (Free on Epic Games Store), The Legend of Heroes: Trails of Cold Steel II (Steam), POSTAL: Brain Damaged (Steam)
221
+ Finally, members still have a chance to stream the PC Building Simulator 2 open beta before it ends on Monday, June 20. Experience deeper simulation, an upgraded career mode and powerful new customization features to bring your ultimate PC to life.","旅人你好,準備踏上冒險之旅吧。熱門開放世界動作角色扮演遊戲《原神》即將結束限量公測版,並將於下週推出,供所有 GeForce NOW 會員遊玩。
222
+ 還有六款遊戲現已加入 GeForce NOW 遊戲庫,供玩家即刻暢玩。
223
+ 正如上週公告,《戰鎚 40K:黑潮 (Warhammer 40,000: Darktide)》即將於雲端推出,由 GeForce 技術支援。今年九月,會員將能橫跨數千年後的未來,進入太空海軍陸戰隊時代,遊戲將可於 GeForce NOW 上串流。
224
+ 前往提瓦特《原神》限時公測版大獲成功,得到會員的極佳回饋,並將於下週開始在 GeForce NOW 上開放串流,供所有玩家遊玩。
225
+ 化身來自另一世界的旅人踏上冒險之途,流連於提瓦特的奇幻土地。在由七個國家組成的寬廣大陸尋找失蹤手足。掌握元素戰鬥的藝術,打造一支夢幻團隊,40 多位角色均具備獨一無二的技能,例如最新加入的夜蘭 (Yelan) 和久岐忍 (Kuki Shinobu),他們各自都有豐富的故事、個性和戰鬥風格。
226
+ 在《Chasm》的2.7版「荒夢藏虞淵(Hidden Dreams in the Depths)」更新中,探索故事深處的奧秘。
227
+ 體驗身歷其境的戰役、與經典角色一同深入探索豐富任務並完成每日挑戰。衝鋒陷陣單打獨鬥,或邀請好友加入冒險。世界正在持續擴張,所以無論身處何處都能跨裝置使用,快速在低效能的 PC、Mac和 Chromebook 上透過 GeForce NOW串流遊玩。
228
+ 遊戲開始
229
+ 本週 GFN 以六款於雲端推出的新遊戲揭開序幕。會員也可以準備迎接《虹彩六號:圍攻行動(Rainbow Six Siege)》第 7 年第 2 季。
230
+ 準備好迎接《虹彩六號:圍攻行動 (Rainbow Six Siege) 》第 7 年第 2 季新加入的戰鬥員、團隊殊死戰 (Team Deathmatch) 地圖等更多內容。
231
+ 會員可於本週稍後期待以下遊戲開放串流:《騎士精神 2 (Chivalry 2)》(於 Steam 全新發佈)《星艦戰將:人類總動員(Starship Troopers – Terran Command)》(於 Steam 與 Epic Games Store 全新發佈)《Builder Simulator》(Steam)《Supraland》(Epic Games Store 開放免費遊玩)《英雄傳說閃之軌跡 II (The Legend of Heroes: Trails of Cold Steel II)》(Steam)《喋血街頭:腦損(POSTAL: Brain Damaged) 》(Steam)
232
+ 最後,會員仍有機會在 6 月 20 日星期一結束前,串流遊玩《PC Builder Simulator 2》公測版。體驗更深入的模擬效果、經過升級的生涯模式和強大的全新自訂功能,讓你的終極 PC 栩栩如生。",願望成真:《原神 (Genshin Impact) 》即將於 GeForce NOW 聯盟 Taiwan Mobile 雲端遊戲服務推出,https://blogs.nvidia.com.tw/blog/geforce-now-thursday-june-16/,"A traveler, are you ready to embark on an adventure? The highly-anticipated open-world action role-playing game Genshin Impact is nearing the end of its limited beta test and will be launched next week for all GeForce NOW members to play.
233
+
234
+ Six new games have been added to the GeForce NOW game library, available to play now.
235
+
236
+ As previously announced, Warhammer 40,000: Darktide will be launched on the cloud with GeForce technology support. In September, members will be able to travel to a future where space marines rule, with the game streaming on GeForce NOW.
237
+
238
+ Genshin Impact's limited beta test was a huge success, with excellent feedback from members, and will begin streaming on GeForce NOW next week for all players to play.
239
+
240
+ As a traveler from another world, embark on an adventure to explore the fantastical lands of Teyvat, searching for a missing companion. Master the art of elemental combat, build a dream team, and choose from over 40 characters, each with unique skills, such as the new Yelan and Kuki Shinobu, with their own stories, personalities, and combat styles.
241
+
242
+ In Chasm's 2.7 update ""Hidden Dreams in the Depths,"" explore the secrets of the story.
243
+
244
+ Experience immersive campaigns, explore rich missions with classic characters, and complete daily challenges. Engage in single-player or invite friends to join the adventure. The world is constantly expanding, so you can use it on any device, quickly streaming on low-powered PCs, Macs, and Chromebooks through GeForce NOW.
245
+
246
+ New Games
247
+
248
+ This week, GFN kicks off with six new games being launched on the cloud. Members can also prepare for Rainbow Six Siege's Season 7, Year 2.
249
+
250
+ Get ready for Rainbow Six Siege's Season 7, Year 2, with new operators, Team Deathmatch maps, and more.
251
+
252
+ Members can expect the following games to be streaming on GeForce NOW this week:
253
+
254
+ Chivalry 2 (newly released on Steam)
255
+ Starship Troopers – Terran Command (newly released on Steam and Epic Games Store)
256
+ Builder Simulator (Steam)
257
+ Supraland (Epic Games Store free to play)
258
+ The Legend of Heroes: Trails of Cold Steel II (Steam)
259
+ POSTAL: Brain Damaged (Steam)
260
+
261
+ Lastly, members still have a chance to stream PC Builder Simulator 2's public test before it ends on June 20th. Experience more immersive simulation, upgraded career mode, and powerful new customization features, bringing your ultimate PC to life.","Hello, traveler, get ready to embark on an adventure. The popular open-world action RPG ""Genshin Impact"" is about to conclude its limited beta version and will be launched next week for all GeForce NOW members to play.
262
+
263
+ Additionally, six new games have now joined the GeForce NOW library for players to enjoy immediately. As announced last week, ""Warhammer 40,000: Darktide"" will soon be launched on the cloud, supported by GeForce technology. In September this year, members will be able to enter the era of Space Marines in the far future, with the game available for streaming on GeForce NOW.
264
+
265
+ The limited-time beta test for ""Genshin Impact"" in Teyvat was a great success, receiving excellent feedback from members. Starting next week, the game will be available for streaming on GeForce NOW for all players.
266
+
267
+ Transform into a traveler from another world and embark on an adventure in the fantasy land of Teyvat. Explore the vast continent made up of seven nations in search of your missing sibling. Master the art of elemental combat and build a dream team with over 40 characters, each with unique skills, such as the newly added Yelan and Kuki Shinobu, who each have rich stories, personalities, and combat styles.
268
+
269
+ In the Version 2.7 update ""Hidden Dreams in the Depths"" of ""The Chasm,"" uncover the mysteries deep within the story. Experience immersive campaigns, explore richly immersive quests with classic characters, and complete daily challenges. Charge into battle alone or invite friends to join the adventure. The world is continuously expanding, so you can access it across devices, streaming quickly on low-performance PCs, Macs, and Chromebooks via GeForce NOW.
270
+
271
+ Let the games begin. This week, GFN kicks off with six new cloud-distributed games. Members can also look forward to the 2nd season of the 7th year of ""Rainbow Six Siege.""
272
+
273
+ Get ready to welcome new operatives in ""Rainbow Six Siege"" Year 7 Season 2, new Team Deathmatch maps, and more content. Members can look forward to streaming the following games later this week: ""Chivalry 2"" (newly released on Steam), ""Starship Troopers – Terran Command"" (newly released on Steam and Epic Games Store), ""Builder Simulator"" (Steam), ""Supraland"" (free to play on Epic Games Store), ""The Legend of Heroes: Trails of Cold Steel II"" (Steam), and ""POSTAL: Brain Damaged"" (Steam).
274
+
275
+ Lastly, members still have the chance to stream the beta version of ""PC Builder Simulator 2"" before it ends on Monday, June 20. Experience deeper simulation effects, an upgraded career mode, and powerful new customization features that bring your ultimate PC to life."
276
+ 8,0,"Autonomous vehicle (AV) development is made possible by three distinct computers:
277
+ NVIDIA DGX systems for training the AI-based stack in the data center, NVIDIA Omniverse running on NVIDIA OVX systems for simulation and synthetic data generation, and the NVIDIA AGX in-vehicle computer to process real-time sensor data for safety.
278
+ Together, these purpose-built, full-stack systems enable continuous development cycles, speeding improvements in performance and safety.
279
+ At the CES trade show, NVIDIA today announced a new part of the equation:
280
+ NVIDIA Cosmos, a platform comprising state-of-the-art generative world foundation models (WFMs), advanced tokenizers, guardrails and an accelerated video processing pipeline built to advance the development of physical AI systems such as AVs and robots.
281
+ With Cosmos added to the three-computer solution, developers gain a data flywheel that can turn thousands of human-driven miles into billions of virtually driven miles — amplifying training data quality.
282
+ “The AV data factory flywheel consists of fleet data collection, accurate 4D reconstruction and AI to generate scenes and traffic variations for training and closed-loop evaluation,” said Sanja Fidler, vice president of AI research at NVIDIA. “Using the NVIDIA Omniverse platform, as well as Cosmos and supporting AI models, developers can generate synthetic driving scenarios to amplify training data by orders of magnitude.”
283
+ “Developing physical AI models has traditionally been resource-intensive and costly for developers, requiring acquisition of real-world datasets and filtering, curating and preparing data for training,” said Norm Marks, vice president of automotive at NVIDIA. “Cosmos accelerates this process with generative AI, enabling smarter, faster and more precise AI model development for autonomous vehicles and robotics.”
284
+ Transportation leaders are using Cosmos to build physical AI for AVs, including:
285
+ Waabi
286
+ , a company pioneering generative AI for the physical world, will use Cosmos for the search and curation of video data for AV software development and simulation.
287
+ Wayve
288
+ , which is developing AI foundation models for autonomous driving, is evaluating Cosmos as a tool to search for edge and corner case driving scenarios used for safety and validation.
289
+ AV toolchain provider Foretellix will use Cosmos, alongside NVIDIA Omniverse Sensor RTX APIs, to evaluate and generate high-fidelity testing scenarios and training data at scale.
290
+ In addition, ridesharing giant Uber is partnering with NVIDIA to accelerate autonomous mobility. Rich driving datasets from Uber, combined with the features of the Cosmos platform and
291
+ NVIDIA DGX Cloud, will help AV partners build stronger AI models even more efficiently.
292
+ Availability
293
+ Cosmos WFMs are now available under an open model license on Hugging Face and the NVIDIA NGC catalog.
294
+ Cosmos models will soon be available as fully optimized NVIDIA NIM microservices.
295
+ Get started with Cosmos and join NVIDIA at CES.","自動駕駛的發展以三台不同的電腦實現:
296
+ NVIDIA DGX 系統用於在資料中心訓練以人工智慧(AI)為基礎的堆疊,在 NVIDIA OVX 系統上運行的 NVIDIA Omniverse 用於模擬與產生合成資料,而 NVIDIA AGX 車載電腦則用於即時處理感測器產生出的資料以確保安全。
297
+ 這些專門建置的全堆疊系統共同推動持續性的開發進程,加快提高效能與安全性。
298
+ NVIDIA 今日在 CES 大會宣布此方程式又加入一個新成員:NVIDIA Cosmos。 這個平台包含最先進的生成世界基礎模型(WFM)、先進的標記器、護欄和加速影片處理管道,專為推動開發自駕車輛與機器人等實體 AI 系統而打造。
299
+ 將 Cosmos 加入三台電腦的解決方案,開發人員獲得一個資料飛輪,可以將人類駕駛所累積出的數千哩的里程轉換為數十億哩的虛擬駕駛里程,提高訓練資料的品質。
300
+ NVIDIA AI 研究部門副總裁 Sanja Fidler 表示:「自動駕駛資料工廠的飛輪包括收集車隊資料、精準的 4D 重構與 AI,以產生場景與各種交通路況,用於訓練與閉環評估。開發人員使用 NVIDIA Omniverse 平台以及 Cosmos 和支援的 AI 模型,可以產生合成的行車場景,將訓練資料放大數倍。」
301
+ NVIDIA車用產品副總裁 Norm Marks 表示:「開發人員在開發實體 AI 模型的過程向來是資源密集且成本高昂的工作,需要取得真實世界的資料集,並且篩選、整理和準備訓練資料。Cosmos利用生成式 AI 加快這個過程,更聰明、快速且精確開發用於自動駕駛和機器人的 AI 模型。」
302
+ 交通運輸領域領導業者使用 Cosmos 為自動駕駛建立實體 AI,包括:
303
+ Waabi
304
+ 為實體世界開創生成式 AI,使用 Cosmos 搜尋和整理影片資料,用於開發和模擬自動駕駛軟體。
305
+ Wayve
306
+ 開發適用於自動駕駛的 AI 基礎模型,正在評估 Cosmos,將其作為搜尋用於安全和驗證之邊緣和極端駕駛情況的工具。
307
+ 自駕車工具鏈供應商 Foretellix 使用 Cosmos 與 NVIDIA Omniverse Sensor RTX API,以評估和產生大量高擬真度的測試場景及訓練資料。
308
+ 此外,乘車服務巨擘 Uber 也將與 NVIDIA 合作,加速推動開發自動駕駛移動技術。Uber 提供豐富的駕駛資料集,加上 Cosmos 平台與 NVIDIA DGX Cloud,將協助自駕車合作夥伴更有效率地建立更強大的 AI 模型。
309
+ 上市時間
310
+ Cosmos WFM 現已在 Hugging Face 及 NVIDIA NGC 目錄上以開放模型授權的方式提供。Cosmos模型即將以完全最佳化 NVIDIA NIM 微服務的形式提供。
311
+ 開始使用 Cosmos、觀看示範,並且參加 NVIDIA 在 CES 大會的活動。",NVIDIA以 Cosmos 世界基礎模型增強適用於自動駕駛的三台電腦解決方案,https://blogs.nvidia.com.tw/blog/three-computer-cosmos-ces/,"The development of autonomous driving technology is achieved through three distinct computers: The NVIDIA DGX system, which is used for training AI-based stacks in the data center and runs NVIDIA Omniverse on the NVIDIA OVX system, which is used for simulating and generating synthetic data. The NVIDIA AGX car computer is used to process sensor data in real-time to ensure safety. These purpose-built end-to-end systems drive continuous development and improvement in performance and safety.
312
+
313
+ NVIDIA today announced at the CES event that it has added a new member to the equation: NVIDIA Cosmos. This platform includes the most advanced World Baseline Model (WFM) generation, advanced markers, guards, and acceleration of video processing pipelines, which are specifically designed to drive the development of autonomous vehicles and other physical AI systems.
314
+
315
+ Adding Cosmos to the three computer solution, developers get a data wheel that can turn the miles accumulated by human drivers into tens of billions of virtual driving miles, improving the quality of training data.
316
+
317
+ NVIDIA AI Research Department Vice President Sanja Fidler said: ""The flywheel of the autonomous driving data factory includes collecting fleet data, accurate 4D reconstruction and AI to generate scenarios and various traffic conditions for training and closed-loop evaluation. Using the NVIDIA Omniverse platform and Cosmos and supported AI models, developers can generate synthetic driving scenarios, expanding training data by orders of magnitude.""
318
+
319
+ NVIDIA Vice President of Automotive Products Norm Marks said: ""Developers in the development of physical AI models have traditionally been resource-intensive and costly work that requires real-world data sets and filtering, processing and preparing training data. Cosmos uses generative AI to accelerate this process, developing smarter, faster and more accurate AI models for autonomous driving and robots.""","The development of autonomous driving is achieved with three different computers: The NVIDIA DGX system is used in data centers to train AI-based stacks, NVIDIA Omniverse running on NVIDIA OVX systems is used for simulation and synthetic data generation, and the NVIDIA AGX in-car computer processes sensor data in real time to ensure safety. These specialized full-stack systems collectively drive continuous development progress, accelerating efficiency and safety improvements. NVIDIA announced at CES today the addition of a new member to this formula: NVIDIA Cosmos. This platform includes cutting-edge World Foundational Models (WFM), advanced labelers, guardrails, and accelerated video processing pipelines, designed to advance the development of autonomous vehicles and robotic physical AI systems.
320
+
321
+ By integrating Cosmos into the three-computer solution, developers gain a data flywheel that converts thousands of miles of human driving into billions of miles of virtual driving, improving training data quality. Sanja Fidler, Vice President of NVIDIA AI Research, stated: ""The data flywheel of the autonomous driving data factory includes fleet data collection, precise 4D reconstruction, and AI, to generate scenes and various traffic scenarios for training and closed-loop evaluation. Developers using the NVIDIA Omniverse platform, Cosmos, and supported AI models can generate synthetic driving scenes, amplifying training data manyfold.""
322
+
323
+ Norm Marks, Vice President of NVIDIA Automotive Products, stated: ""Developing physical AI models has always been a resource-intensive and costly task, requiring real-world datasets, sorting, organizing, and preparing training data. Cosmos accelerates this process utilizing generative AI, making the development of AI models for autonomous driving and robotics smarter, faster, and more accurate.""
324
+
325
+ Leaders in the transportation field using Cosmos to build physical AI for autonomous driving include:
326
+ - Waabi: Pioneering generative AI for the physical world, using Cosmos to search and organize video data for developing and simulating autonomous driving software.
327
+ - Wayve: Developing AI foundational models for autonomous driving, evaluating Cosmos as a tool for discovering edge and extreme driving scenarios for safety and verification.
328
+ - Autonomous driving toolchain supplier Foretellix uses Cosmos and NVIDIA Omniverse Sensor RTX API to evaluate and produce large volumes of high-fidelity test scenarios and training data.
329
+
330
+ Additionally, ride-hailing giant Uber will partner with NVIDIA to accelerate the development of autonomous mobility technology. Uber provides rich driving datasets, and combined with the Cosmos platform and NVIDIA DGX Cloud, will help autonomous vehicle partners build stronger AI models more efficiently.
331
+
332
+ Availability:
333
+ Cosmos WFM is now available under an open model license on Hugging Face and NVIDIA NGC catalog. Cosmos models will soon be offered as fully optimized NVIDIA NIM microservices.
334
+
335
+ Start using Cosmos, watch demonstrations, and participate in NVIDIA's activities at CES."
336
+ 10,0,"A new NVIDIA DRIVE AI Systems Inspection Lab will help automotive ecosystem partners navigate evolving industry standards for autonomous vehicle safety.
337
+ The lab, launched today, will focus on inspecting and verifying that automotive partner software and systems on the
338
+ NVIDIA DRIVE AGX
339
+ platform meet the automotive industry’s stringent safety and cybersecurity standards, including AI functional safety.
340
+ The lab has been accredited by the ANSI National Accreditation Board (
341
+ ANAB
342
+ ) according to the ISO/IEC 17020 assessment for standards, including:
343
+ Functional safety (ISO 26262)
344
+ SOTIF (ISO 21448)
345
+ Cybersecurity (ISO 21434)
346
+ UN-R regulations, including UN-R 79, UN-R 13-H, UN-R 152, UN-R 155, UN-R 157 and UN-R 171
347
+ AI functional safety (ISO PAS 8800 and ISO/IEC TR 5469)
348
+ “The launch of this new lab will help partners in the global automotive ecosystem create safe, reliable autonomous driving technology,” said Ali Kani, vice president of automotive at NVIDIA. “With accreditation by ANAB, the lab will carry out an inspection plan that combines functional safety, cybersecurity and AI — bolstering adherence to the industry’s safety standards.”
349
+ “ANAB is proud to be the accreditation body for the NVIDIA DRIVE AI Systems Inspection Lab,” said R. Douglas Leonard Jr., executive director of ANAB. “NVIDIA’s comprehensive evaluation verifies the demonstration of competence and compliance with internationally recognized standards, helping ensure that DRIVE ecosystem partners meet the highest benchmarks for functional safety, cybersecurity and AI integration.”
350
+ The new lab builds on NVIDIA’s ongoing safety compliance work with Mercedes-Benz and JLR. Inaugural participants in the lab include Continental and Sony SSS-America.
351
+ “We are pleased to participate in the newly launched NVIDIA Drive AI Systems Inspection Lab and to further intensify the fruitful, ongoing collaboration between our two companies,” said Nobert Hammerschmidt, head of components business at Continental.
352
+ “Self-driving vehicles have the capability to significantly enhance safety on roads,” said Marius Evensen, head of automotive image sensors at Sony SSS-America. “We look forward to working with NVIDIA’s DRIVE AI Systems Inspection Lab to help us deliver the highest levels of safety to our customers.”","全新啟用的 NVIDIA DRIVE AI 系統檢測實驗室(Systems Inspection Lab)將協助汽車生態系合作夥伴掌握不斷發展的自駕車安全產業標準。
353
+ 於今日啟用的這處實驗室,將側重於檢測與驗證汽車合作夥伴在 NVIDIA DRIVE AGX 平台上的軟體與系統,是否符合汽車產業嚴格的安全與資安,包括人工智慧(AI)功能安全。
354
+ 該實驗室已獲得美國國家標準協會認可委員會(ANAB)根據 ISO/IEC 17020 評估標準的認證,包括:
355
+ 功能安全 (ISO 26262)
356
+ SOTIF (ISO 21448)
357
+ 資安 (ISO 21434)
358
+ UN-R 法規,包括 UN-R 79、UN-R 13-H、UN-R 152、UN-R 155、UN-R 157 和 UN-R 171
359
+ AI 功能安全 (ISO PAS 8800 和 ISO/IEC TR 5469)
360
+ NVIDIA 車用產品部門副總裁 Ali Kani 表示:「NVIDIA 成立這個新的實驗室,將協助全球汽車產業生態系的合作夥伴發展出安全可靠的自動駕駛技術。在獲得 ANAB 認證後,實驗室將執行結合功能安全、資安與 AI 的檢測計畫,強化遵守業界安全標準的程度。」
361
+ ANAB 執行董事 R. Douglas Leonard Jr 表示:「ANAB 很榮幸成為 NVIDIA DRIVE AI 系統檢測實驗室的認證機構。NVIDIA 的綜合評估驗證其所展示出的能力與遵守國際公認標準,有助於確保 DRIVE 生態系統合作夥伴遵守功能安全、資安和 AI 整合的最高標準。」
362
+ 此全新實驗室建立在 NVIDIA 與 Mercedes-Benz 和 JLR 持續進行的安全合規的基礎上。首批加入該實驗室的業者包括大陸集團(Continental)和 Sony SSS-America。
363
+ 大陸集團零組件業務部門負責人 Nobert Hammerschmidt 表示:「我們很高興能加入新成立的 NVIDIA Drive AI 系統檢測實驗室,進一步強化我們雙方一路以來卓越的合作成果。」
364
+ Sony SSS-America 車用影像感測器部門負責人 Marius Evensen 表示:「自駕車能夠大幅提高用路安全。我們期待與 NVIDIA 的 DRIVE AI 系統檢測實驗室合作,協助我們為客戶提供最高等級的安全性。」",NVIDIA 啟用 DRIVE AI 系統檢測實驗室,創下業界全新安全里程碑,https://blogs.nvidia.com.tw/blog/drive-ai-lab-ces/,"NVIDIA today launched its Systems Inspection Lab (SIL) as part of its DRIVE Autonomous System Validation efforts. The lab, which is available today, will help its automotive ecosystem partners navigate the evolving standards for autonomous vehicle safety.
365
+
366
+ The lab focuses on inspecting and verifying the software and systems of automotive partners on the NVIDIA DRIVE AGX platform to ensure compliance with the rigorous safety and security standards of the automotive industry, including artificial intelligence (AI) functionality safety.
367
+
368
+ The lab has been accredited by the American National Standards Institute (ANSI)-accredited ANAB (ANSI National Accreditation Board) based on the ISO/IEC 17020 evaluation standard, covering:
369
+
370
+ Function Safety (ISO 26262)
371
+ SOTIF (ISO 21448)
372
+ Cybersecurity (ISO 21434)
373
+ UN-R regulations, including UN-R 79, UN-R 13-H, UN-R 152, UN-R 155, UN-R 157, and UN-R 171
374
+ AI Function Safety (ISO PAS 8800 and ISO/IEC TR 5469)
375
+
376
+ ""NVIDIA is establishing this new lab to help our global automotive ecosystem partners develop safe and reliable autonomous driving technologies,"" said Ali Kani, vice president and general manager of Automotive at NVIDIA. ""With the ANAB accreditation, the lab will now execute a combined functional safety, cybersecurity, and AI testing plan to further enhance the industry's adherence to safety standards.""
377
+
378
+ ""The ANAB is honored to be the accrediting body for NVIDIA's Systems Inspection Lab,"" said R. Douglas Leonard Jr., ANAB Executive Director. ""NVIDIA's comprehensive evaluation and verification demonstrate its capability and compliance with international recognized standards, helping ensure that DRIVE ecosystem partners adhere to the highest standards of functional safety, cybersecurity and AI integration.""
379
+
380
+ The new lab is based on the ongoing safety-relevant work of NVIDIA and Mercedes-Benz and JLR. The first customers to join the lab are Continental and Sony SSS-America.
381
+
382
+ ""We are excited to join the newly established NVIDIA DRIVE AI Systems Inspection Lab and further strengthen our longstanding collaborative success,"" said Nobert Hammerschmidt, head of Continental's Components division.
383
+
384
+ ""We believe autonomous driving can greatly enhance road safety,"" said Marius Evensen, director of Automotive Imaging Sensors at Sony SSS-America. ""We look forward to collaborating with NVIDIA's DRIVE AI Systems Inspection Lab to help deliver the highest level of safety for our customers.""","The newly launched NVIDIA DRIVE AI Systems Inspection Lab will assist automotive ecosystem partners in mastering the evolving industry standards for autonomous vehicle safety. This lab, which inaugurated today, will focus on testing and verifying whether the software and systems of automotive partners on the NVIDIA DRIVE AGX platform meet the stringent safety and cybersecurity standards of the automotive industry, including AI functional safety.
385
+
386
+ The lab is certified by the National Accreditation Board of the United States (ANAB) according to ISO/IEC 17020 evaluation standards, including:
387
+ - Functional Safety (ISO 26262)
388
+ - SOTIF (ISO 21448)
389
+ - Cybersecurity (ISO 21434)
390
+ - UN-R Regulations, including UN-R 79, UN-R 13-H, UN-R 152, UN-R 155, UN-R 157, and UN-R 171
391
+ - AI Functional Safety (ISO PAS 8800 and ISO/IEC TR 5469)
392
+
393
+ Ali Kani, Vice President of NVIDIA’s Automotive Products Division, stated: ""NVIDIA established this new lab to assist partners within the global automotive industry ecosystem in developing safe and reliable autonomous driving technology. With ANAB certification, the lab will execute testing programs that integrate functional safety, cybersecurity, and AI, enhancing the level of compliance with industry safety standards.""
394
+
395
+ R. Douglas Leonard Jr, Executive Director of ANAB, stated: ""ANAB is honored to be the certifying body for the NVIDIA DRIVE AI Systems Inspection Lab. NVIDIA’s comprehensive evaluation verifies its demonstrated capabilities and compliance with internationally recognized standards, which helps ensure that the DRIVE ecosystem partners adhere to the highest standards of functional safety, cybersecurity, and AI integration.""
396
+
397
+ This new lab is built upon NVIDIA's ongoing safety compliance collaboration with Mercedes-Benz and JLR. The initial participants joining the lab include Continental and Sony SSS-America.
398
+
399
+ Norbert Hammerschmidt, Head of the Component Business Unit at Continental, stated: ""We are delighted to join the newly established NVIDIA DRIVE AI Systems Inspection Lab, further strengthening our already excellent cooperative achievements.""
400
+
401
+ Marius Evensen, Head of Automotive Image Sensor Division at Sony SSS-America, stated: ""Autonomous vehicles can significantly improve road safety. We look forward to collaborating with NVIDIA's DRIVE AI Systems Inspection Lab to help us provide the highest level of safety for our customers."""
402
+ 10,1,"“Compliance with functional safety, SOTIF and cybersecurity is particularly challenging for complex systems such as AI-based autonomous vehicles,” said Riccardo Mariani, head of industry safety at NVIDIA. “Through the DRIVE AI Systems Inspection Lab, the correctness of the integration of our partners’ products with DRIVE safety and cybersecurity requirements can be inspected and verified.”
403
+ Now open to all NVIDIA DRIVE AGX platform partners, the lab is expected to expand to include additional automotive and robotics products and add a testing component.
404
+ Complementing International Automotive Safety Standards
405
+ The NVIDIA DRIVE AI Systems Inspection Lab complements the missions of independent third-party certification bodies, including technical service organizations such as TÜV SÜD, TÜV Rheinland and exida, as well as vehicle certification agencies such as VCA and KBA.
406
+ Today’s announcement dovetails with recent significant safety certifications and assessments of NVIDIA automotive products:
407
+ TÜV SÜD
408
+ granted the ISO 21434 Cybersecurity Process certification to NVIDIA for its automotive system-on-a-chip, platform and software engineering processes. Upon certification release, the
+ NVIDIA DriveOS
+ 6.0 operating system conforms with ISO 26262 Automotive Safety Integrity Level (ASIL) D standards.
+ “Meeting cybersecurity process requirements is of fundamental importance in the autonomous vehicle era,” said Martin Webhofer, CEO of TÜV SÜD Rail GmbH. “NVIDIA has successfully established processes, activities and procedures that fulfill the stringent requirements of ISO 21434. Additionally, NVIDIA DriveOS 6.0 conforms to ISO 26262 ASIL D standards, pending final certification activities.”
+ TÜV Rheinland
+ performed an independent United Nations Economic Commission for Europe safety assessment of NVIDIA DRIVE AV related to safety requirements for complex electronic systems.
+ “NVIDIA has demonstrated thorough, high-quality, safety-oriented processes and technologies in the context of the assessment of the generic, non-OEM-specific parts of the SAE level 2 NVIDIA DRIVE system,” said Dominik Strixner, global lead functional safety automotive mobility at TÜV Rheinland.
+ To learn more about NVIDIA’s work in advancing autonomous driving safety, read the NVIDIA Self-Driving Safety Report.","NVIDIA 產業安全部門負責人Riccardo Mariani 表示:「對於基於 AI 的自動駕駛車這一類複雜系統來說,遵守功能安全、SOTIF 和資安是一件特別挑戰的事。透過 DRIVE AI 系統檢測實驗室,我們可以檢測和驗證合作夥伴的產品是否有正確與 DRIVE 安全和資安要求進行整合。」
+ 該實驗室目前開放全體 NVIDIA DRIVE AGX 平台合作夥伴使用,預計將加入更多汽車和機器人產品,並且將增加測試環節。
+ 與國際汽車安全標準相輔相成
+ NVIDIA DRIVE AI 系統檢測實驗室與獨立第三方認證機構的任務相輔相成,包括 TÜV SÜD、TÜV Rheinland 和 exida 等技術服務機構,以及 VCA 和 KBA 等車輛認證機構。
+ 今天宣布的消息與 NVIDIA 車用產品最近獲得的重要安全認證及評估結果不謀而合:
+ TÜV SÜD
+ 授予 NVIDIA 汽車系統單晶片、平台與軟體工程流程 ISO 21434 網路安全流程認證。認證發布後,
+ NVIDIA DriveOS
+ 6.0 作業系統符合 ISO 26262 車輛安全完整性等級(ASIL)D 標準。
+ TÜV SÜD Rail GmbH 執行長 Martin Webhofer 表示:「在自動駕駛時代,必須要符合網路安全流程的要求。NVIDIA 已經成功建立了符合 ISO 21434 嚴格要求的流程、活動和程序。通過最後的認證活動後,NVIDIA DriveOS 6.0 還符合 ISO 26262 ASIL D 標準。」
+ TÜV Rheinland
+ 對 NVIDIA DRIVE AV 進行了與複雜電子系統安全要求相關的聯合國歐洲經濟委員會獨立安全評估作業。
+ TÜV Rheinland 全球功能安全汽車行動部門主管 Dominik Strixner 表示:「NVIDIA 在 SAE level 2 NVIDIA DRIVE 系統通用、非 OEM 專用零件的評估中,展現出全面、高品質、以安全為導向的流程與技術。」
+ 若要進一步瞭解 NVIDIA 在推動自動駕駛安全方面的各項工作,請閱讀 NVIDIA 自動駕駛安全報告。",NVIDIA 啟用 DRIVE AI 系統檢測實驗室,創下業界全新安全里程碑,https://blogs.nvidia.com.tw/blog/drive-ai-lab-ces/,"“Given the complexity of this type of system, it's especially challenging to meet the requirements for functional safety, SOTIF and cybersecurity,"" said Riccardo Mariani, Head of Industry Security at NVIDIA. ""Through the DRIVE AI Systems Test Lab, we can test and validate our partners' products to ensure they are properly integrated with DRIVE security and cybersecurity requirements.""
+
+ The lab is currently available to all NVIDIA DRIVE AGX platform partners and will be expanded to include additional automotive and robotics products with more testing segments.
+
+ The NVIDIA DRIVE AI Systems Test Lab complements the work of independent third-party certification bodies, including TÜV SÜD, TÜV Rheinland and exida, as well as VCA and KBA, automotive certification bodies.
+
+ The announcement aligns with recent important security certification and evaluation results for NVIDIA's automotive products:
+
+ TÜV SÜD
+
+ * Granted NVIDIA automotive system-on-chip, platform and software engineering process ISO 21434 network security process certification.
+ * Following the publication of the certification, NVIDIA DriveOS 6.0 meets ISO 26262 vehicle safety integrity level (ASIL) D standard.
+ NVIDIA DriveOS 6.0 meets ISO 26262 vehicle safety integrity level (ASIL) D standard.
+
+ TÜV SÜD Rail GmbH CEO Martin Webhofer said: “In the era of autonomous driving, it is necessary to meet the requirements for network security processes. NVIDIA has successfully established a process, activities and procedures that meet the strict requirements of ISO 21434. Following the final certification activities, NVIDIA DriveOS 6.0 meets ISO 26262 ASIL D standard.”
+
+ TÜV Rheinland
+
+ * Conducted an independent safety assessment, related to the requirements for security of complex electronic systems, in accordance with United Nations Economic Commission for Europe.
+ Dominik Strixner, Global Functional Safety Automotive Business Unit Manager at TÜV Rheinland, said: “NVIDIA showed a comprehensive, high-quality and safety-oriented process and technology in the evaluation of its SAE level 2, general-use, non-OEM-specific components of the NVIDIA DRIVE system.""
+
+ To learn more about NVIDIA’s work in advancing the safety of autonomous driving, please read the NVIDIA Autonomous Driving Safety Report.","Riccardo Mariani, head of NVIDIA's industrial safety department, stated: ""For complex systems such as AI-based autonomous vehicles, complying with functional safety, SOTIF, and cybersecurity is a particularly challenging task. Through the DRIVE AI Systems Testing Lab, we can test and verify whether our partners' products are properly integrated with DRIVE safety and cybersecurity requirements.""
+
+ The lab is currently open to all NVIDIA DRIVE AGX platform partners and plans to include more automotive and robotic products, as well as increase the testing scenarios.
+
+ Complementing International Automotive Safety Standards
+ The NVIDIA DRIVE AI Systems Testing Lab complements the work of independent third-party certification bodies, including technical service organizations like TÜV SÜD, TÜV Rheinland, and exida, as well as vehicle certification bodies like VCA and KBA.
+
+ Today's announcement coincides with recent significant safety certifications and evaluation results for NVIDIA automotive products:
+
+ TÜV SÜD
+ Granted the certification for ISO 21434 cybersecurity process for NVIDIA's automotive system-on-chips, platforms, and software engineering processes. After the certification was issued, NVIDIA DriveOS 6.0 complies with the ISO 26262 Automotive Safety Integrity Level (ASIL) D standard.
+ Martin Webhofer, CEO of TÜV SÜD Rail GmbH, stated: ""In the era of autonomous driving, meeting cybersecurity process requirements is essential. NVIDIA has successfully established processes, activities, and procedures that meet the stringent requirements of ISO 21434. After the final certification activities, NVIDIA DriveOS 6.0 also complies with the ISO 26262 ASIL D standard.""
+
+ TÜV Rheinland
+ Conducted an independent safety assessment operation related to the safety requirements of complex electronic systems for NVIDIA DRIVE AV as part of the United Nations Economic Commission for Europe.
+ Dominik Strixner, Global Head of Functional Safety Automotive at TÜV Rheinland, said: ""NVIDIA demonstrated comprehensive, high-quality, safety-oriented processes and technologies in the evaluation of SAE Level 2 NVIDIA DRIVE systems, which are generic, non-OEM specific components.""
+
+ To learn more about NVIDIA's efforts in promoting autonomous driving safety, please read the NVIDIA Autonomous Driving Safety Report."
vecalign/vecalign.py ADDED
@@ -0,0 +1,164 @@
+ #!/usr/bin/env python3
+
+ """
+ Copyright 2019 Brian Thompson
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ import argparse
+ import logging
+ import pickle
+ from math import ceil
+ from random import seed as seed
+
+ import numpy as np
+
+ logger = logging.getLogger('vecalign')
+ logger.setLevel(logging.WARNING)
+ logFormatter = logging.Formatter("%(asctime)s %(levelname)-5.5s %(message)s")
+ consoleHandler = logging.StreamHandler()
+ consoleHandler.setFormatter(logFormatter)
+ logger.addHandler(consoleHandler)
+
+ from dp_utils import make_alignment_types, make_one_to_many_alignment_types, print_alignments, read_alignments, \
+     read_in_embeddings, make_doc_embedding, vecalign
+
+ from score import score_multiple, log_final_scores
+
+
+ def _main():
+     # make runs consistent
+     seed(42)
+     np.random.seed(42)
+
+     parser = argparse.ArgumentParser('Sentence alignment using sentence embeddings and FastDTW',
+                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+     parser.add_argument('-s', '--src', type=str, nargs='+', required=True,
+                         help='preprocessed source file to align')
+
+     parser.add_argument('-t', '--tgt', type=str, nargs='+', required=True,
+                         help='preprocessed target file to align')
+
+     parser.add_argument('-g', '--gold_alignment', type=str, nargs='+', required=False,
+                         help='gold alignment file(s), used for scoring the produced alignments')
+
+     parser.add_argument('--src_embed', type=str, nargs=2, required=True,
+                         help='Source embeddings. Requires two arguments: first is a text file, second is a binary embeddings file. ')
+
+     parser.add_argument('--tgt_embed', type=str, nargs=2, required=True,
+                         help='Target embeddings. Requires two arguments: first is a text file, second is a binary embeddings file. ')
+
+     parser.add_argument('-a', '--alignment_max_size', type=int, default=4,
+                         help='Searches for alignments up to size N-M, where N+M <= this value. Note that the embeddings must support the requested number of overlaps')
+
+     # without flag: one_to_many==default, with flag but no argument: one_to_many==const, with flag and argument: one_to_many==argument
+     parser.add_argument('--one_to_many', type=int, nargs='?', default=None, const=50,
+                         help='Perform one to many (e.g. 1:1, 1:2, ... 1:M) alignment.'
+                              ' Argument specifies M but will default to 50 if flag is set but no argument is provided. Overrides --alignment_max_size (-a).')
+
+     parser.add_argument('-d', '--del_percentile_frac', type=float, default=0.2,
+                         help='Deletion penalty is set to this percentile (as a fraction) of the cost matrix distribution. Should be between 0 and 1.')
+
+     parser.add_argument('-v', '--verbose', help='sets console to logging.DEBUG instead of logging.WARN',
+                         action='store_true')
+
+     parser.add_argument('--max_size_full_dp', type=int, default=300,
+                         help='Maximum size N for which it is acceptable to run full N^2 dynamic programming.')
+
+     parser.add_argument('--costs_sample_size', type=int, default=20000,
+                         help='Sample size to estimate costs distribution, used to set deletion penalty in conjunction with --del_percentile_frac.')
+
+     parser.add_argument('--num_samps_for_norm', type=int, default=100,
+                         help='Number of samples used for normalizing embeddings')
+
+     parser.add_argument('--search_buffer_size', type=int, default=5,
+                         help='Width (one side) of search buffer. Larger values make the search more likely to recover from errors but increase runtime.')
+
+     parser.add_argument('--debug_save_stack', type=str,
+                         help='Write stack to pickle file for debug purposes')
+
+     parser.add_argument('--print_aligned_text', action='store_true',
+                         help='Print aligned text in addition to alignments, for debugging/tuning.')
+
+     args = parser.parse_args()
+
+     if len(args.src) != len(args.tgt):
+         raise Exception('number of source files must match number of target files')
+
+     if args.gold_alignment is not None:
+         if len(args.gold_alignment) != len(args.src):
+             raise Exception('number of gold alignment files, if provided, must match number of source and target files')
+
+     if args.verbose:
+         import logging
+         logger.setLevel(logging.INFO)
+
+     if args.alignment_max_size < 2:
+         logger.warning('Alignment_max_size < 2. Increasing to 2 so that 1-1 alignments will be considered')
+         args.alignment_max_size = 2
+
+     src_sent2line, src_line_embeddings = read_in_embeddings(args.src_embed[0], args.src_embed[1])
+     tgt_sent2line, tgt_line_embeddings = read_in_embeddings(args.tgt_embed[0], args.tgt_embed[1])
+
+     src_max_alignment_size = 1 if args.one_to_many is not None else args.alignment_max_size
+     tgt_max_alignment_size = args.one_to_many if args.one_to_many is not None else args.alignment_max_size
+
+     width_over2 = ceil(max(src_max_alignment_size, tgt_max_alignment_size) / 2.0) + args.search_buffer_size
+
+     test_alignments = []
+     stack_list = []
+     for src_file, tgt_file in zip(args.src, args.tgt):
+         logger.info('Aligning src="%s" to tgt="%s"', src_file, tgt_file)
+
+         src_lines = open(src_file, 'rt', encoding="utf-8").readlines()
+         vecs0 = make_doc_embedding(src_sent2line, src_line_embeddings, src_lines, src_max_alignment_size)
+
+         tgt_lines = open(tgt_file, 'rt', encoding="utf-8").readlines()
+         vecs1 = make_doc_embedding(tgt_sent2line, tgt_line_embeddings, tgt_lines, tgt_max_alignment_size)
+
+         if args.one_to_many is not None:
+             final_alignment_types = make_one_to_many_alignment_types(args.one_to_many)
+         else:
+             final_alignment_types = make_alignment_types(args.alignment_max_size)
+         logger.debug('Considering alignment types %s', final_alignment_types)
+
+         stack = vecalign(vecs0=vecs0,
+                          vecs1=vecs1,
+                          final_alignment_types=final_alignment_types,
+                          del_percentile_frac=args.del_percentile_frac,
+                          width_over2=width_over2,
+                          max_size_full_dp=args.max_size_full_dp,
+                          costs_sample_size=args.costs_sample_size,
+                          num_samps_for_norm=args.num_samps_for_norm)
+
+         # write final alignments to stdout
+         print_alignments(stack[0]['final_alignments'], scores=stack[0]['alignment_scores'],
+                          src_lines=src_lines if args.print_aligned_text else None,
+                          tgt_lines=tgt_lines if args.print_aligned_text else None)
+
+         test_alignments.append(stack[0]['final_alignments'])
+         stack_list.append(stack)
+
+     if args.gold_alignment is not None:
+         gold_list = [read_alignments(x) for x in args.gold_alignment]
+         res = score_multiple(gold_list=gold_list, test_list=test_alignments)
+         log_final_scores(res)
+
+     if args.debug_save_stack:
+         pickle.dump(stack_list, open(args.debug_save_stack, 'wb'))
+
+
+ if __name__ == '__main__':
+     _main()
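
Editor's note: the dp_utils helpers imported in this file can also be driven directly from Python rather than through the CLI above. The following is a minimal sketch that mirrors the call sequence in _main(), using the same default values as the argparse definitions; the file paths are placeholders and the dp_utils signatures are assumed to be exactly those used in this script.

# Minimal sketch: align one source/target document pair programmatically.
# Paths such as 'overlaps.src' and 'doc.src' are placeholders, not files
# shipped with this commit.
from math import ceil

from dp_utils import (make_alignment_types, print_alignments,
                      read_in_embeddings, make_doc_embedding, vecalign)

max_size = 4        # same default as --alignment_max_size
search_buffer = 5   # same default as --search_buffer_size

# overlap text files plus their binary embeddings, as passed to --src_embed/--tgt_embed
src_sent2line, src_embs = read_in_embeddings('overlaps.src', 'overlaps.src.emb')
tgt_sent2line, tgt_embs = read_in_embeddings('overlaps.tgt', 'overlaps.tgt.emb')

src_lines = open('doc.src', 'rt', encoding='utf-8').readlines()
tgt_lines = open('doc.tgt', 'rt', encoding='utf-8').readlines()
vecs0 = make_doc_embedding(src_sent2line, src_embs, src_lines, max_size)
vecs1 = make_doc_embedding(tgt_sent2line, tgt_embs, tgt_lines, max_size)

stack = vecalign(vecs0=vecs0, vecs1=vecs1,
                 final_alignment_types=make_alignment_types(max_size),
                 del_percentile_frac=0.2,
                 width_over2=ceil(max_size / 2.0) + search_buffer,  # same formula as _main()
                 max_size_full_dp=300, costs_sample_size=20000,
                 num_samps_for_norm=100)

print_alignments(stack[0]['final_alignments'], scores=stack[0]['alignment_scores'])

As in _main(), width_over2 is derived from the largest alignment size plus the search buffer, so widening the buffer trades runtime for robustness to alignment errors.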