""" Text Translation Module using NLLB-3.3B model Handles text segmentation and batch translation """ import logging from transformers import AutoTokenizer, AutoModelForSeq2SeqLM logger = logging.getLogger(__name__) def translate_text(text): """ Translate English text to Simplified Chinese Args: text: Input English text Returns: Translated Chinese text """ logger.info(f"Starting translation for text length: {len(text)}") try: # Model initialization with explicit language codes logger.info("Loading NLLB model") tokenizer = AutoTokenizer.from_pretrained( "facebook/nllb-200-3.3B", src_lang="eng_Latn" # Specify source language ) model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B") logger.info("Translation model loaded") # Text processing max_chunk_length = 1000 text_chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)] logger.info(f"Split text into {len(text_chunks)} chunks") translated_chunks = [] for i, chunk in enumerate(text_chunks): logger.info(f"Processing chunk {i+1}/{len(text_chunks)}") # Tokenize with source language specification inputs = tokenizer( chunk, return_tensors="pt", max_length=1024, truncation=True ) # Generate translation with target language specification outputs = model.generate( **inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids("zho_Hans"), max_new_tokens=1024 ) translated = tokenizer.decode(outputs[0], skip_special_tokens=True) translated_chunks.append(translated) logger.info(f"Chunk {i+1} translated successfully") result = "".join(translated_chunks) logger.info(f"Translation completed. Total length: {len(result)}") return result except Exception as e: logger.error(f"Translation failed: {str(e)}", exc_info=True) raise