# pdf2tmx / app.py
# nelsonjq's picture
# languages
# 5323684 verified
import os
import tempfile

import gradio as gr
import pandas as pd

from utils import pdf_to_text, align_text
def process_files(source_file, target_file, lang1, lang2):
    """Align the text of two uploaded PDFs and export the result to Excel.

    Args:
        source_file: Upload from the source file input. Either an object
            exposing its on-disk path as ``.name`` or a plain path string.
        target_file: Upload from the target file input, same form.
        lang1: Language code selected for the source document.
        lang2: Language code selected for the target document.

    Returns:
        A ``(html, excel_path)`` tuple. On success ``html`` is the aligned
        table rendered as HTML and ``excel_path`` is the path of the
        generated ``.xlsx`` file. When validation fails, ``html`` is a
        user-facing message and ``excel_path`` is ``None``.
    """
    if source_file is None or target_file is None:
        return "Please upload both PDF files.", None
    # A dropdown with no selection can deliver None; reject it here rather
    # than passing it on to the aligner.
    if not lang1 or not lang2:
        return "Please select both languages.", None
    if lang1 == lang2:
        return "Please select different languages.", None

    # Accept both file-like upload objects (path in ``.name``) and plain
    # path strings.
    source_path = getattr(source_file, "name", source_file)
    target_path = getattr(target_file, "name", target_file)

    # Convert PDFs to text
    text_content1 = pdf_to_text(source_path)
    text_content2 = pdf_to_text(target_path)

    # Align the texts; the result is used as a pandas DataFrame below.
    aligned_df = align_text(text_content1, text_content2, lang1, lang2)

    # Convert DataFrame to HTML
    aligned_html = aligned_df.to_html(index=False)

    # Save the workbook in a directory unique to this call so that
    # simultaneous requests cannot overwrite each other's download, while
    # keeping the original file name for the user.
    # NOTE: the directory is not removed here; it must outlive this call so
    # the file can still be served.
    output_dir = tempfile.mkdtemp(prefix="pdf2tmx_")
    excel_path = os.path.join(output_dir, "aligned_data.xlsx")
    aligned_df.to_excel(excel_path, index=False)

    return aligned_html, excel_path
# Language codes offered by both language dropdowns.
_LANGUAGE_CODES = ("en", "es", "fr", "zh", "ar", "ru", "pt")

# Build the Gradio UI: two PDF uploads, two language pickers, a start
# button, and the HTML / Excel outputs produced by process_files.
with gr.Blocks() as interface:
    gr.Markdown(
        "# PDF Text Aligner\n"
        "Upload two PDF files and select languages to align the text."
    )

    # Inputs
    source_file = gr.File(label="Upload Source PDF")
    target_file = gr.File(label="Upload Target PDF")
    lang1 = gr.Dropdown(
        label="Select Language 1",
        choices=list(_LANGUAGE_CODES),
    )
    lang2 = gr.Dropdown(
        label="Select Language 2",
        choices=list(_LANGUAGE_CODES),
        value="es",
    )
    start_button = gr.Button(value="Start")

    # Outputs
    aligned_html = gr.HTML(label="Aligned DataFrame")
    download_button = gr.File(label="Download Aligned Data as Excel")

    # Clicking "Start" runs the alignment and fills both outputs.
    start_button.click(
        process_files,
        [source_file, target_file, lang1, lang2],
        [aligned_html, download_button],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()