"""Extract text from PDFs and align two texts sentence-by-sentence.

The alignment itself is delegated to ``lingtrain_aligner``; this module only
wires its preprocessing, alignment, conflict-resolution and reading steps
together and exposes the result as a pandas DataFrame.
"""

import logging
import os
from typing import Sequence

import pandas as pd
from PyPDF2 import PdfReader
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def pdf_to_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, one trailing newline per page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, each followed by ``"\\n"``.
    """
    with open(pdf_path, "rb") as file:
        # Named ``pdf`` (not ``reader``) so the imported
        # ``lingtrain_aligner.reader`` module is not shadowed.
        pdf = PdfReader(file)
        # ``or ""`` is defensive: a page that yields no text (None or empty)
        # must not break the concatenation.
        return "".join(f"{page.extract_text() or ''}\n" for page in pdf.pages)


def align_text(
    txt1: str,
    txt2: str,
    lang1: str,
    lang2: str,
    *,
    db_path: str = "bilingualdata.db",
    batch_ids: Sequence[int] = (0, 1),
) -> pd.DataFrame:
    """Align two texts and return the aligned pairs as a DataFrame.

    Both texts are split on newlines, marked up and sentence-split by
    ``lingtrain_aligner``, aligned inside a database file at ``db_path``,
    and then read back paragraph by paragraph.

    Args:
        txt1: Source text; lines are separated by ``"\\n"``.
        txt2: Target text; lines are separated by ``"\\n"``.
        lang1: Language code passed to the aligner for ``txt1``.
        lang2: Language code passed to the aligner for ``txt2``.
        db_path: Path of the alignment database. An existing file at this
            path is deleted before the run.
        batch_ids: Batch indices handed to ``aligner.align_db``. With the
            batch size of 100 used below, the default presumably covers only
            the first two batches of sentences -- widen it for longer texts
            (NOTE(review): confirm against the lingtrain_aligner docs).

    Returns:
        A DataFrame with columns ``"From"`` and ``"To"``, one row per aligned
        pair. The columns are present even when no pairs were produced.
    """
    # The original code also listed "sentence_transformer_multilingual_labse"
    # as an alternative model name; only the first one was ever used.
    model_name = "sentence_transformer_multilingual"

    lines_from = txt1.split("\n")
    lines_to = txt2.split("\n")

    text1_prepared = preprocessor.mark_paragraphs(lines_from)
    text2_prepared = preprocessor.mark_paragraphs(lines_to)

    splitted_from = splitter.split_by_sentences_wrapper(text1_prepared, lang1)
    splitted_to = splitter.split_by_sentences_wrapper(text2_prepared, lang2)

    # Start from a clean database so earlier runs cannot leak into this one.
    if os.path.isfile(db_path):
        os.unlink(db_path)

    aligner.fill_db(db_path, lang1, lang2, splitted_from, splitted_to)

    aligner.align_db(
        db_path,
        model_name,
        batch_size=100,
        window=40,
        batch_ids=list(batch_ids),
        save_pic=False,
        embed_batch_size=10,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

    # Report the conflicts found right after the initial alignment.
    conflicts_to_solve, rest = resolver.get_all_conflicts(
        db_path, min_chain_length=2, max_conflicts_len=6, batch_id=-1
    )
    resolver.get_statistics(conflicts_to_solve)
    resolver.get_statistics(rest)

    # Resolve conflicts in up to three passes, loosening the limits each pass.
    steps = 3
    batch_id = -1
    for step in range(steps):
        conflicts, rest = resolver.get_all_conflicts(
            db_path,
            min_chain_length=2 + step,
            max_conflicts_len=6 * (step + 1),
            batch_id=batch_id,
        )
        resolver.resolve_all_conflicts(db_path, conflicts, model_name, show_logs=False)
        # To inspect the alignment visually, enable:
        # vis_helper.visualize_alignment_by_db(db_path, output_path="img_test1.png", lang_name_from=lang1, lang_name_to=lang2, batch_size=400, size=(600, 600), plt_show=True)
        if not rest:
            break

    paragraphs_dict, par_ids, meta_info, sent_counter_dict = reader.get_paragraphs(db_path)

    # Log the keys of paragraphs_dict
    logger.info("paragraphs_dict keys: %s", paragraphs_dict.keys())

    # NOTE(review): assumes the dict is keyed by "from"/"to"; check the logged
    # keys above if this raises KeyError with your lingtrain_aligner version.
    paragraphs_from = paragraphs_dict["from"]
    paragraphs_to = paragraphs_dict["to"]

    data = []
    for from_paragraph, to_paragraph in zip(paragraphs_from, paragraphs_to):
        # Integer entries are not lists of lines; log and skip them.
        if isinstance(from_paragraph, int) or isinstance(to_paragraph, int):
            logger.warning(
                "from_paragraph: %s, to_paragraph: %s", from_paragraph, to_paragraph
            )
            continue
        data.extend(
            {"From": from_line, "To": to_line}
            for from_line, to_line in zip(from_paragraph, to_paragraph)
        )

    # Explicit columns keep the frame's shape stable when ``data`` is empty.
    return pd.DataFrame(data, columns=["From", "To"])


def save_to_excel(df: pd.DataFrame, file_name: str) -> None:
    """Write ``df`` to the Excel file ``file_name`` without the index column."""
    df.to_excel(file_name, index=False)