Spaces:

garyd1
/

text_translator

Sleeping

App Files Files Community

garyd1 committed on Feb 25

Commit

b7c9c63

verified ·

1 Parent(s): 69fbe17

Create app.py

Browse files

Files changed (1) hide show

app.py +117 -0

app.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import os
+import re
+import openai
+import streamlit as st
+import pandas as pd
+import torch
+import nltk
+from langchain.chat_models import ChatOpenAI
+from langchain.schema import SystemMessage, HumanMessage
+from sentence_transformers import SentenceTransformer, util
+# Try to load spaCy for advanced NLP processing
+try:
+    import spacy
+    nlp = spacy.load("en_core_web_sm")
+    use_spacy = True
+except Exception:
+    st.warning("SpaCy model not found, falling back to NLTK for tokenization.")
+    nltk.download("punkt")
+    use_spacy = False
+# Load AI models
+translator = ChatOpenAI(model="gpt-3.5-turbo")
+model = SentenceTransformer('all-MiniLM-L6-v2')
+@st.cache_data
+def load_glossary_from_excel(glossary_file_bytes) -> dict:
+    """Load glossary from an Excel file, applying lemmatization and sorting by length."""
+    df = pd.read_excel(glossary_file_bytes)
+    glossary = {}
+    for _, row in df.iterrows():
+        if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
+            english_term = row['English'].strip().lower()
+            french_term = row['CanadianFrench'].strip()
+            doc = nlp(english_term) if use_spacy else english_term.split()
+            lemmatized_term = " ".join([token.lemma_ for token in doc]) if use_spacy else english_term
+            glossary[lemmatized_term] = french_term
+    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
+@st.cache_data
+def compute_glossary_embeddings_cached(glossary_items: tuple):
+    """Compute cached embeddings for glossary terms."""
+    glossary = dict(glossary_items)
+    glossary_terms = list(glossary.keys())
+    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
+    return glossary_terms, embeddings
+def translate_text(text: str) -> str:
+    """Uses OpenAI's GPT to translate text to Canadian French."""
+    messages = [
+        SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
+        HumanMessage(content=text)
+    ]
+    response = translator(messages)
+    return response.content.strip()
+def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
+    """Applies glossary replacements based on semantic similarity."""
+    glossary_items = tuple(sorted(glossary.items()))
+    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
+    sentences = nltk.tokenize.sent_tokenize(text) if not use_spacy else [sent.text for sent in nlp(text).sents]
+    updated_sentences = []
+    for sentence in sentences:
+        if not sentence.strip():
+            continue
+        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
+        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
+        max_score, max_idx = torch.max(cos_scores, dim=1)
+        if max_score.item() >= threshold:
+            term = glossary_terms[max_idx]
+            replacement = glossary[term]
+            pattern = r'\b' + re.escape(term) + r'\b'
+            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
+        updated_sentences.append(sentence.strip())
+    return " ".join(updated_sentences)
+def validate_translation(original_text, final_text):
+    """Uses GPT to check if the final translation retains the original meaning."""
+    messages = [
+        SystemMessage(content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?"),
+        HumanMessage(content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n")
+    ]
+    response = translator(messages)
+    return response.content.strip()
+# Streamlit UI
+st.title("AI-Powered English to Canadian French Translator")
+st.write("This app uses AI agents for translation, glossary enforcement, and meaning validation.")
+input_text = st.text_area("Enter text to translate:")
+glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
+threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)
+if st.button("Translate"):
+    if not input_text.strip():
+        st.error("Please enter text to translate.")
+    elif glossary_file is None:
+        st.error("Glossary file is required.")
+    else:
+        glossary = load_glossary_from_excel(glossary_file)
+        translated_text = translate_text(input_text)
+        glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
+        validation_result = validate_translation(input_text, glossary_enforced_text)
+        st.subheader("Final Translated Text:")
+        st.write(glossary_enforced_text)
+        st.subheader("Validation Check:")
+        st.write(validation_result)