Jerich committed
Commit 0cce76f · verified · 1 Parent(s): eb35d5b

Update app.py

Files changed (1)
  1. app.py +14 -3
app.py CHANGED
@@ -75,8 +75,11 @@ NLLB_LANGUAGE_CODES = {
 
 # Define a list of inappropriate words for content filtering
 INAPPROPRIATE_WORDS = [
-    "profanity", "obscenity", "obscene", "offensive", "vulgar", "explicit",
-    # Add more words as needed or load from a separate file
+    "fuck", "shit", "bitch", "asshole", "damn", "cunt", "whore", "bastard",
+    "son of a bitch", "dick", "pussy", "motherfucker", "agka baboy",
+    "puta", "putang ina", "gago", "tanga", "hayop", "ulol", "lintik", "animal ka",
+    "paki", "pakyu", "yawa", "bungol", "gingan", "yawa ka", "peste", "irig",
+    "pakit", "ayat", "pua", "kayat mo ti agsardeng", "hinampak", "iring ka"
 ]
 
 # Function to check if text contains inappropriate content
@@ -604,7 +607,15 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
         inputs = processor(waveform.numpy()[0], sampling_rate=16000, return_tensors="pt").to(device)
         with torch.no_grad():
             language = "en" if source_code == "eng" else "tl" if source_code == "tgl" else None
-            generated_ids = model.generate(**inputs, language=language, task="transcribe")
+            # Explicitly avoid forced_decoder_ids conflict
+            generation_config = model.generation_config
+            generation_config.task = "transcribe"
+            generation_config.language = f"<|{language}|>" if language else None
+            generated_ids = model.generate(
+                **inputs,
+                generation_config=generation_config,
+                max_length=448
+            )
             transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     else:
         processor = stt_mms_processor
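For context, the new block in the second hunk sets the task and language on the model's generation_config instead of passing them as keywords to generate(), which sidesteps the conflict transformers reports when forced_decoder_ids are already preset on the checkpoint's config. Below is a minimal standalone sketch of the same pattern; the openai/whisper-small checkpoint and the silent dummy waveform are placeholders for illustration, not values taken from app.py.

```python
import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)

# One second of silence at 16 kHz stands in for the decoded audio upload.
waveform = np.zeros(16000, dtype=np.float32)
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").to(device)

language = "en"  # app.py picks "en" or "tl" from the source code, or None otherwise
with torch.no_grad():
    # Configure task/language on the generation config rather than as generate()
    # kwargs, so they do not clash with any preset forced_decoder_ids.
    generation_config = model.generation_config
    generation_config.task = "transcribe"
    generation_config.language = f"<|{language}|>" if language else None
    generated_ids = model.generate(
        **inputs,
        generation_config=generation_config,
        max_length=448,
    )

transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(transcription)
```

Depending on the installed transformers version, another common way to silence the same warning is to clear the preset ids explicitly with model.generation_config.forced_decoder_ids = None before calling generate().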
 
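The check that consumes INAPPROPRIATE_WORDS sits behind the "# Function to check if text contains inappropriate content" comment and is not part of this diff. A plausible sketch of such a check is shown below; the name contains_inappropriate_content and the whole-word matching strategy are illustrative assumptions, not code from app.py.

```python
import re

# Abbreviated sample of the list added in this commit; app.py holds the full list.
INAPPROPRIATE_WORDS = ["fuck", "putang ina", "gago", "yawa ka"]

def contains_inappropriate_content(text: str) -> bool:
    """Hypothetical helper: flag text containing any listed word or phrase.

    Matching is case-insensitive and anchored on word boundaries, so short
    entries do not fire inside longer, innocent words.
    """
    lowered = text.lower()
    return any(
        re.search(rf"\b{re.escape(phrase)}\b", lowered)
        for phrase in INAPPROPRIATE_WORDS
    )

print(contains_inappropriate_content("putang ina, the bus is late again"))  # True
print(contains_inappropriate_content("good morning, everyone"))             # False
```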