""" | |
Text Translation Module using NLLB-3.3B model | |
Handles text segmentation and batch translation | |
""" | |
import logging

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

logger = logging.getLogger(__name__)
def translate_text(text):
    """
    Translate English text to Simplified Chinese.

    Args:
        text: Input English text.

    Returns:
        Translated Simplified Chinese text.
    """
    logger.info(f"Starting translation for text length: {len(text)}")
    try:
        # Load the tokenizer and model. NOTE: this reloads the checkpoint on
        # every call; a one-time caching sketch appears after this function.
        logger.info("Loading NLLB model")
        tokenizer = AutoTokenizer.from_pretrained(
            "facebook/nllb-200-3.3B",
            src_lang="eng_Latn",  # NLLB language code for English
        )
        model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
        model.eval()  # inference mode (disables dropout)
        logger.info("Translation model loaded")
        # Split on raw character offsets. NOTE: this can cut a sentence or
        # word in half at a chunk boundary; a sentence-aware splitter is
        # sketched after this function.
        max_chunk_length = 1000
        text_chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
        logger.info(f"Split text into {len(text_chunks)} chunks")

        translated_chunks = []
        for i, chunk in enumerate(text_chunks):
            logger.info(f"Processing chunk {i + 1}/{len(text_chunks)}")
            # Tokenize; src_lang set at construction tags the input with the
            # eng_Latn language token.
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                max_length=1024,
                truncation=True,
            )
            # Force the target language as the first generated token.
            with torch.no_grad():  # inference only; skip gradient tracking
                outputs = model.generate(
                    **inputs,
                    forced_bos_token_id=tokenizer.convert_tokens_to_ids("zho_Hans"),
                    max_new_tokens=1024,
                )
            translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
            translated_chunks.append(translated)
            logger.info(f"Chunk {i + 1} translated successfully")
        # Chinese takes no separator, so join the chunks directly.
        result = "".join(translated_chunks)
        logger.info(f"Translation completed. Total length: {len(result)}")
        return result
    except Exception as e:
        logger.error(f"Translation failed: {e}", exc_info=True)
        raise
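

# --- Illustrative sketch (not part of the original module) ------------------
# The character slicing in translate_text can cut a sentence, or even a word,
# in half at a chunk boundary. One sentence-aware alternative is sketched
# below; it assumes sentences end with '.', '!' or '?', which is an
# approximation, not a guarantee. An over-long single sentence still yields
# one oversized chunk.
import re


def split_into_sentence_chunks(text, max_chunk_length=1000):
    """Group whole sentences into chunks of at most max_chunk_length chars."""
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks = []
    current = ""
    for sentence in sentences:
        # Start a new chunk once appending this sentence would overflow.
        if current and len(current) + len(sentence) + 1 > max_chunk_length:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks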
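

# --- Illustrative sketch (not part of the original module) ------------------
# translate_text reloads the 3.3B-parameter checkpoint on every call, which
# dominates its runtime. If the function is called repeatedly (an assumption
# about usage), loading once and caching the pair is the usual fix:
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_nllb():
    """Load the tokenizer and model once; later calls return the cached pair."""
    tokenizer = AutoTokenizer.from_pretrained(
        "facebook/nllb-200-3.3B",
        src_lang="eng_Latn",
    )
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
    model.eval()
    return tokenizer, model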
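

if __name__ == "__main__":
    # Minimal smoke test. The first run downloads roughly 13 GB of weights,
    # so this assumes a machine with the disk space and memory to hold them.
    logging.basicConfig(level=logging.INFO)
    sample = "Machine translation converts text from one language into another."
    print(translate_text(sample))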