|
import os |
|
import pandas as pd |
|
import logging |
|
from PyPDF2 import PdfReader |
|
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper |
|
|
|
|
|
# Configure root logging once at import time; INFO and above are emitted.
logging.basicConfig(level=logging.INFO)

# Module-scoped logger used by the functions below.
logger = logging.getLogger(__name__)
|
|
|
def pdf_to_text(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        str: The text of all pages concatenated, with a newline appended
        after each page. Empty string for a PDF with no pages.
    """
    page_texts = []
    with open(pdf_path, "rb") as file:
        # Named `pdf` (not `reader`) to avoid shadowing the module-level
        # `reader` imported from lingtrain_aligner.
        pdf = PdfReader(file)
        for page in pdf.pages:
            # extract_text() can return None for pages with no text layer
            # (e.g. scanned images); treat those as empty text.
            page_texts.append(page.extract_text() or "")
    # join is O(n) vs. quadratic `text +=` in a loop.
    return "".join(text + "\n" for text in page_texts)
|
|
|
def align_text(
    txt1: str,
    txt2: str,
    lang1: str,
    lang2: str,
    db_path: str = "bilingualdata.db",
    model_name: str = "sentence_transformer_multilingual",
) -> pd.DataFrame:
    """Sentence-align two texts and return the aligned pairs.

    Runs the full lingtrain_aligner pipeline: paragraph marking, sentence
    splitting, embedding-based alignment into a SQLite database, iterative
    conflict resolution, and extraction of the aligned paragraphs.

    Args:
        txt1: Source-language text (paragraphs separated by newlines).
        txt2: Target-language text (paragraphs separated by newlines).
        lang1: Language code of ``txt1`` (e.g. "en").
        lang2: Language code of ``txt2``.
        db_path: Path of the working SQLite database. Any existing file at
            this path is deleted before alignment.
        model_name: Embedding model understood by lingtrain_aligner
            (e.g. "sentence_transformer_multilingual" or
            "sentence_transformer_multilingual_labse").

    Returns:
        pd.DataFrame: Two columns, "From" and "To", one row per aligned
        sentence pair.
    """
    lines_from = txt1.split("\n")
    lines_to = txt2.split("\n")

    text1_prepared = preprocessor.mark_paragraphs(lines_from)
    text2_prepared = preprocessor.mark_paragraphs(lines_to)
    splitted_from = splitter.split_by_sentences_wrapper(text1_prepared, lang1)
    splitted_to = splitter.split_by_sentences_wrapper(text2_prepared, lang2)

    # Start from a clean database: stale alignments from a previous run
    # would corrupt the result.
    if os.path.isfile(db_path):
        os.unlink(db_path)

    aligner.fill_db(db_path, lang1, lang2, splitted_from, splitted_to)

    aligner.align_db(
        db_path,
        model_name,
        batch_size=100,
        window=40,
        batch_ids=[0, 1],
        save_pic=False,
        embed_batch_size=10,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

    # Initial conflict inventory (get_statistics prints a summary).
    conflicts_to_solve, rest = resolver.get_all_conflicts(
        db_path, min_chain_length=2, max_conflicts_len=6, batch_id=-1
    )
    resolver.get_statistics(conflicts_to_solve)
    resolver.get_statistics(rest)

    # Up to 3 resolution passes with progressively wider search parameters;
    # stop early once no unresolved conflicts remain.
    steps = 3
    batch_id = -1
    for i in range(steps):
        conflicts, rest = resolver.get_all_conflicts(
            db_path,
            min_chain_length=2 + i,
            max_conflicts_len=6 * (i + 1),
            batch_id=batch_id,
        )
        resolver.resolve_all_conflicts(db_path, conflicts, model_name, show_logs=False)
        if not rest:
            break

    paragraphs_dict, par_ids, meta_info, sent_counter_dict = reader.get_paragraphs(db_path)

    # Lazy %-formatting so the message is only built when INFO is enabled.
    logger.info("paragraphs_dict keys: %s", paragraphs_dict.keys())

    paragraphs_from = paragraphs_dict["from"]
    paragraphs_to = paragraphs_dict["to"]

    data = []
    for from_paragraph, to_paragraph in zip(paragraphs_from, paragraphs_to):
        # The reader occasionally yields bare ints (markers) in place of
        # sentence lists; skip such pairs rather than iterating them.
        if isinstance(from_paragraph, int) or isinstance(to_paragraph, int):
            logger.warning(
                "from_paragraph: %s, to_paragraph: %s", from_paragraph, to_paragraph
            )
            continue
        for from_line, to_line in zip(from_paragraph, to_paragraph):
            data.append({"From": from_line, "To": to_line})

    return pd.DataFrame(data)
|
|
|
def save_to_excel(df, file_name: str):
    """Write *df* to an Excel workbook at *file_name*, omitting the row index."""
    df.to_excel(excel_writer=file_name, index=False)