add chunk size function
app.py CHANGED
@@ -289,62 +289,60 @@ def process_text(text, src_language, target_language, max_iterations_value, thre
     source_segments = segment_sentences_by_punctuation(source_sentence, src_nlp)
 
     if chunk_size == -1:
-        chunks = [' '.join(source_segments)]
-    else:
-        chunks = [' '.join(source_segments[i:i+chunk_size]) for i in range(0, len(source_segments), chunk_size)]
-
-    org_translated_chunks = []
-    p2a_translated_chunks = []
-    bfn_translated_chunks = []
-    mpc_translated_chunks = []
-
-    for chunk in chunks:
         if "Original" in translation_methods:
-            … (2 removed lines not shown in the diff view)
+            orig, best_score = original_translation(text, src_language, target_language, session_id)
+            orig_output = f"{orig}\n\nScore: {best_score:.2f}"
         if "Plan2Align" in translation_methods:
-            … (2 removed lines not shown in the diff view)
+            plan2align_trans, best_score = plan2align_translate_text(
+                text, session_id, model, tokenizer, device, src_language, target_language,
                 max_iterations_value, threshold_value, good_ref_contexts_num_value, "metricx"
             )
-            … (1 removed line not shown in the diff view)
+            plan2align_output = f"{plan2align_trans}\n\nScore: {best_score:.2f}"
         if "Best-of-N" in translation_methods:
-            … (2 removed lines not shown in the diff view)
+            best_candidate, best_score = best_of_n_translation(text, src_language, target_language, max_iterations_value, session_id)
+            best_of_n_output = f"{best_candidate}\n\nScore: {best_score:.2f}"
         if "MPC" in translation_methods:
-            … (33 removed lines not shown in the diff view)
+            mpc_candidate, mpc_score = mpc_translation(text, src_language, target_language,
+                                                       max_iterations_value, session_id)
+            mpc_output = f"{mpc_candidate}\n\nScore: {mpc_score:.2f}"
+    else:
+        chunks = [' '.join(source_segments[i:i+chunk_size]) for i in range(0, len(source_segments), chunk_size)]
+
+        org_translated_chunks = []
+        p2a_translated_chunks = []
+        bfn_translated_chunks = []
+        mpc_translated_chunks = []
+
+        for chunk in chunks:
+            if "Original" in translation_methods:
+                translation, _ = original_translation(chunk, src_language, target_language, session_id)
+                org_translated_chunks.append(translation)
+            if "Plan2Align" in translation_methods:
+                translation, _ = plan2align_translate_text(
+                    chunk, session_id, model, tokenizer, device, src_language, target_language,
+                    max_iterations_value, threshold_value, good_ref_contexts_num_value, "metricx"
+                )
+                p2a_translated_chunks.append(translation)
+            if "Best-of-N" in translation_methods:
+                translation, _ = best_of_n_translation(chunk, src_language, target_language, max_iterations_value, session_id)
+                bfn_translated_chunks.append(translation)
+            if "MPC" in translation_methods:
+                translation, _ = mpc_translation(chunk, src_language, target_language, max_iterations_value, session_id)
+                mpc_translated_chunks.append(translation)
+
+        org_combined_translation = ' '.join(org_translated_chunks)
+        p2a_combined_translation = ' '.join(p2a_translated_chunks)
+        bfn_combined_translation = ' '.join(bfn_translated_chunks)
+        mpc_combined_translation = ' '.join(mpc_translated_chunks)
+
+        orig, best_score = summary_translate(text, org_combined_translation, target_language, session_id)
+        orig_output = f"{orig}\n\nScore: {best_score:.2f}"
+        plan2align_trans, best_score = summary_translate(text, p2a_combined_translation, target_language, session_id)
+        plan2align_output = f"{plan2align_trans}\n\nScore: {best_score:.2f}"
+        best_candidate, best_score = summary_translate(text, bfn_combined_translation, target_language, session_id)
+        best_of_n_output = f"{best_candidate}\n\nScore: {best_score:.2f}"
+        mpc_candidate, best_score = summary_translate(text, mpc_combined_translation, target_language, session_id)
+        mpc_output = f"{mpc_candidate}\n\nScore: {best_score:.2f}"
 
     return orig_output, plan2align_output, best_of_n_output, mpc_output
 
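
Taken together, the hunk changes process_text so that chunk_size == -1 now translates the full input in a single pass, while any other value splits the segmented source into groups of chunk_size sentences, translates each group with the selected methods, and merges the per-chunk outputs through summary_translate. The snippet below is a minimal, standalone sketch of the chunking rule only; split_into_chunks is an illustrative helper name (not a function in app.py), and source_segments stands in for the output of segment_sentences_by_punctuation.

from typing import List

def split_into_chunks(source_segments: List[str], chunk_size: int) -> List[str]:
    """Group consecutive sentences into chunks of `chunk_size`; -1 means no chunking."""
    if chunk_size == -1:
        # The updated process_text skips chunking in this case and translates
        # the full text directly; returning one joined chunk is equivalent here.
        return [' '.join(source_segments)]
    # Same grouping as the list comprehension in the diff: step through the
    # segments chunk_size sentences at a time and join each group.
    return [' '.join(source_segments[i:i + chunk_size])
            for i in range(0, len(source_segments), chunk_size)]

segments = ["A first sentence.", "A second one.", "A third.", "A fourth.", "A fifth."]
print(split_into_chunks(segments, 2))
# ['A first sentence. A second one.', 'A third. A fourth.', 'A fifth.']
print(split_into_chunks(segments, -1))
# ['A first sentence. A second one. A third. A fourth. A fifth.']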