import os
import uuid
import tempfile
import re
import requests
import pandas as pd
from tika import parser
from docx import Document
from sentence_transformers import SentenceTransformer, util
import torch
import streamlit as st
from io import BytesIO
# Load the pre-trained embedding model for semantic matching. st.cache_resource
# keeps one copy across Streamlit reruns instead of reloading the model on
# every interaction.
@st.cache_resource
def load_model() -> SentenceTransformer:
    return SentenceTransformer('all-MiniLM-L6-v2')

model = load_model()
# -----------------------------
# Glossary Loader and Enforcement
# -----------------------------
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.
    Expects columns: 'English' and 'CanadianFrench'.
    """
    try:
        # pandas reads directly from the uploaded file object (BytesIO).
        df = pd.read_excel(glossary_file)
        glossary = {
            str(row['English']).strip().lower(): str(row['CanadianFrench']).strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
        return glossary
    except Exception as e:
        raise Exception(f"Error loading glossary: {e}") from e
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace occurrences of glossary terms (exact word match) with the
    preferred Canadian French terms.
    """
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, fr_term, text, flags=re.IGNORECASE)
    return text
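# Illustrative behaviour (hypothetical glossary from the sketch above):
#   >>> apply_glossary("The Invoice is ready.", {"invoice": "facture"})
#   'The facture is ready.'
# Matching is case-insensitive and whole-word, so "invoices" is left alone
# unless it appears in the glossary as its own entry.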
# -----------------------------
# Semantic Glossary Enforcement
# -----------------------------
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute embeddings for the glossary keys.
    """
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings
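# Note: these embeddings are recomputed on every call. For a large glossary it
# may be worth caching them, e.g. (hypothetical sketch, keyed on the terms):
#
#   @st.cache_data
#   def cached_glossary_embeddings(terms: tuple):
#       return model.encode(list(terms), convert_to_tensor=True)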
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.
    Splits text into sentences, computes embeddings, and if a sentence is
    semantically similar to a glossary term (above the threshold), performs
    the replacement.
    """
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    # Naive split: only '.' is treated as a sentence boundary.
    sentences = text.split('.')
    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            # .item() converts the index tensor to a plain int for list indexing.
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    final_text = '. '.join(updated_sentences)
    return final_text
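# Illustrative behaviour (hypothetical glossary and similarity scores):
#   >>> apply_semantic_glossary("Shipping is free. Enjoy!", {"shipping": "expédition"})
#   'expédition is free. Enjoy!'
# Only sentences scoring at or above the threshold against their closest
# glossary term are rewritten; the rest pass through unchanged. Because of the
# '.'-based split, '?' and '!' are not treated as sentence boundaries.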
# -----------------------------
# Translation using Azure Translator API
# -----------------------------
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")
    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-Type": "application/json",
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    # A timeout prevents the Streamlit worker from hanging on a stalled request.
    response = requests.post(endpoint, params=params, headers=headers, json=body, timeout=60)
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
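# For reference, the Translator v3.0 /translate endpoint returns one result
# object per input item; the (abridged) shape this code indexes into is:
#   [{"translations": [{"text": "Bonjour", "to": "fr-CA"}]}]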
# -----------------------------
# Document Parsing & Reconstruction
# -----------------------------
def parse_document(file_path: str) -> str:
    """
    Extract text content from a document using Apache Tika.
    """
    parsed = parser.from_file(file_path)
    text = parsed.get("content", "")
    if not text:
        raise Exception("No text content found in the document.")
    return text
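# parser.from_file returns a dict with "metadata" and "content" keys; only the
# plain-text "content" is used here, so document layout is discarded. Note that
# tika-python talks to a local Tika server (started on first use), which
# requires a Java runtime to be available in the environment.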
def rebuild_document(text: str) -> bytes:
    """
    Rebuild a DOCX document from the provided text.
    Returns the document as bytes.
    """
    document = Document()
    for line in text.split("\n"):
        if line.strip():
            document.add_paragraph(line)
    bio = BytesIO()
    document.save(bio)
    bio.seek(0)
    return bio.getvalue()
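# Because rebuild_document emits one plain paragraph per non-empty line, any
# source formatting (styles, tables, images) is lost. Minimal round-trip sketch:
#   >>> data = rebuild_document("Bonjour\n\nMerci")
#   >>> Document(BytesIO(data)).paragraphs[0].text
#   'Bonjour'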
# -----------------------------
# Processing Pipeline
# -----------------------------
def process_translation(doc_file, glossary_file) -> bytes:
    try:
        # Write the uploaded document to a temporary file, preserving its
        # original extension rather than assuming ".pdf".
        suffix = os.path.splitext(doc_file.name)[1] or ".pdf"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name
        try:
            # Load glossary from the uploaded Excel file
            glossary = load_glossary(glossary_file)
            # Parse document text
            raw_text = parse_document(doc_path)
            # Translate text via Azure Translator
            translated_text = translate_text_azure(raw_text)
            # Apply exact glossary enforcement
            final_text = apply_glossary(translated_text, glossary)
            # Apply semantic glossary enforcement
            final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)
            # Rebuild document to DOCX and get bytes
            return rebuild_document(final_text)
        finally:
            # Clean up the temporary file even if a step above fails
            os.unlink(doc_path)
    except Exception as e:
        st.error(f"Error: {e}")
        return None
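# To run locally (assuming the two Azure environment variables are set):
#   export AZURE_TRANSLATOR_KEY=...     # your Translator resource key
#   export AZURE_TRANSLATOR_REGION=...  # e.g. "canadacentral"
#   streamlit run app.py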
# -----------------------------
# Streamlit App UI
# -----------------------------
def main():
    st.title("English to Canadian French (Québec) Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")
    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])
    if st.button("Translate Document"):
        if doc_file is None or glossary_file is None:
            st.error("Please upload both the document and glossary files.")
        else:
            with st.spinner("Translating..."):
                result = process_translation(doc_file, glossary_file)
            if result is not None:
                st.download_button(
                    label="Download Translated DOCX",
                    data=result,
                    file_name="translated.docx",
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )
if __name__ == "__main__":
    main()