import os
import uuid
import tempfile
import re
import requests
import pandas as pd
from tika import parser
from docx import Document
from sentence_transformers import SentenceTransformer, util
import torch
import streamlit as st
from io import BytesIO
# Load the pre-trained embedding model for semantic matching. st.cache_resource
# keeps one copy across Streamlit reruns instead of reloading the model on
# every interaction.
@st.cache_resource
def load_model() -> SentenceTransformer:
    return SentenceTransformer('all-MiniLM-L6-v2')

model = load_model()
# -----------------------------
# Glossary Loader and Enforcement
# -----------------------------
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.
    Expects columns: 'English' and 'CanadianFrench'.
    """
    try:
        # pandas reads directly from the uploaded file object (BytesIO).
        df = pd.read_excel(glossary_file)
        glossary = {
            str(row['English']).strip().lower(): str(row['CanadianFrench']).strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
        return glossary
    except Exception as e:
        raise Exception(f"Error loading glossary: {e}") from e
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace occurrences of glossary terms (exact word match) with the
    preferred Canadian French terms.
    """
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, fr_term, text, flags=re.IGNORECASE)
    return text
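# Illustrative behaviour (hypothetical glossary from the sketch above):
#   >>> apply_glossary("The Invoice is ready.", {"invoice": "facture"})
#   'The facture is ready.'
# Matching is case-insensitive and whole-word, so "invoices" is left alone
# unless it appears in the glossary as its own entry.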
# -----------------------------
# Semantic Glossary Enforcement
# -----------------------------
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute embeddings for the glossary keys.
    """
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings
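# Note: these embeddings are recomputed on every call. For a large glossary it
# may be worth caching them, e.g. (hypothetical sketch, keyed on the terms):
#
#   @st.cache_data
#   def cached_glossary_embeddings(terms: tuple):
#       return model.encode(list(terms), convert_to_tensor=True)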
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.
    Splits text into sentences, computes embeddings, and if a sentence is
    semantically similar to a glossary term (above the threshold), performs
    the replacement.
    """
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    # Naive split: only '.' is treated as a sentence boundary.
    sentences = text.split('.')
    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            # .item() converts the index tensor to a plain int for list indexing.
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    final_text = '. '.join(updated_sentences)
    return final_text
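# Illustrative behaviour (hypothetical glossary and similarity scores):
#   >>> apply_semantic_glossary("Shipping is free. Enjoy!", {"shipping": "expédition"})
#   'expédition is free. Enjoy!'
# Only sentences scoring at or above the threshold against their closest
# glossary term are rewritten; the rest pass through unchanged. Because of the
# '.'-based split, '?' and '!' are not treated as sentence boundaries.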
# -----------------------------
# Translation using Azure Translator API
# -----------------------------
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")
    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-Type": "application/json",
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    # A timeout prevents the Streamlit worker from hanging on a stalled request.
    response = requests.post(endpoint, params=params, headers=headers, json=body, timeout=60)
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
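# For reference, the Translator v3.0 /translate endpoint returns one result
# object per input item; the (abridged) shape this code indexes into is:
#   [{"translations": [{"text": "Bonjour", "to": "fr-CA"}]}]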
# -----------------------------
# Document Parsing & Reconstruction
# -----------------------------
def parse_document(file_path: str) -> str:
    """
    Extract text content from a document using Apache Tika.
    """
    parsed = parser.from_file(file_path)
    text = parsed.get("content", "")
    if not text:
        raise Exception("No text content found in the document.")
    return text
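# parser.from_file returns a dict with "metadata" and "content" keys; only the
# plain-text "content" is used here, so document layout is discarded. Note that
# tika-python talks to a local Tika server (started on first use), which
# requires a Java runtime to be available in the environment.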
def rebuild_document(text: str) -> bytes:
    """
    Rebuild a DOCX document from the provided text.
    Returns the document as bytes.
    """
    document = Document()
    for line in text.split("\n"):
        if line.strip():
            document.add_paragraph(line)
    bio = BytesIO()
    document.save(bio)
    bio.seek(0)
    return bio.getvalue()
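# Because rebuild_document emits one plain paragraph per non-empty line, any
# source formatting (styles, tables, images) is lost. Minimal round-trip sketch:
#   >>> data = rebuild_document("Bonjour\n\nMerci")
#   >>> Document(BytesIO(data)).paragraphs[0].text
#   'Bonjour'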
# -----------------------------
# Processing Pipeline
# -----------------------------
def process_translation(doc_file, glossary_file) -> bytes:
    try:
        # Write the uploaded document to a temporary file, preserving its
        # original extension rather than assuming ".pdf".
        suffix = os.path.splitext(doc_file.name)[1] or ".pdf"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name
        try:
            # Load glossary from the uploaded Excel file
            glossary = load_glossary(glossary_file)
            # Parse document text
            raw_text = parse_document(doc_path)
            # Translate text via Azure Translator
            translated_text = translate_text_azure(raw_text)
            # Apply exact glossary enforcement
            final_text = apply_glossary(translated_text, glossary)
            # Apply semantic glossary enforcement
            final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)
            # Rebuild document to DOCX and get bytes
            return rebuild_document(final_text)
        finally:
            # Clean up the temporary file even if a step above fails
            os.unlink(doc_path)
    except Exception as e:
        st.error(f"Error: {e}")
        return None
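# To run locally (assuming the two Azure environment variables are set):
#   export AZURE_TRANSLATOR_KEY=...     # your Translator resource key
#   export AZURE_TRANSLATOR_REGION=...  # e.g. "canadacentral"
#   streamlit run app.py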
# -----------------------------
# Streamlit App UI
# -----------------------------
def main():
    st.title("English to Canadian French (Québec) Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")
    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])
    if st.button("Translate Document"):
        if doc_file is None or glossary_file is None:
            st.error("Please upload both the document and glossary files.")
        else:
            with st.spinner("Translating..."):
                result = process_translation(doc_file, glossary_file)
            if result is not None:
                st.download_button(
                    label="Download Translated DOCX",
                    data=result,
                    file_name="translated.docx",
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )
if __name__ == "__main__":
    main()