Spaces:
Sleeping
Sleeping
File size: 3,203 Bytes
f55c9dd 530ecef f55c9dd 7082b8a 8f90ee5 338a103 f55c9dd 627e7f5 f55c9dd 627e7f5 f55c9dd d391be7 f55c9dd d391be7 f55c9dd 627e7f5 f55c9dd f6ee992 f55c9dd 8f90ee5 625be5d 8f90ee5 625be5d f55c9dd 8f90ee5 625be5d f55c9dd 8f90ee5 2d97810 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
from googletrans import Translator
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from pathlib import Path
import numpy as np
# FastAPI application exposing the /voice_recognition endpoint below.
app = FastAPI()
# Prefer GPU with half precision when CUDA is available; otherwise CPU/float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# Whisper large-v3 checkpoint used for both transcription and JP->EN translation.
model_id = "openai/whisper-large-v3"
# Load once at import time so every request reuses the same weights.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
# ASR pipeline with 30 s chunking and timestamps; shared by all requests.
pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
max_new_tokens=256,
chunk_length_s=30,
batch_size=16,
return_timestamps=True,
torch_dtype=torch_dtype,
device=device,
)
# NOTE(review): `dataset` is never referenced anywhere in this file — it looks
# like leftover from the model-card example and costs startup time/bandwidth.
# TODO confirm nothing external imports it before removing.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
@app.post("/voice_recognition")
async def process_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded (Japanese) audio file and return JP/EN/ID text.

    Pipeline: save the upload to disk -> load with torchaudio -> Whisper
    transcription (original language) -> Whisper "translate" task (English)
    -> Google Translate to Indonesian -> word substitutions via modify_text.

    Returns:
        JSONResponse with {"response": {"jp_text", "en_text", "id_text"}}.

    Raises:
        HTTPException(500) on any processing failure.
    """
    try:
        save_directory = Path("/home/user")
        save_directory.mkdir(parents=True, exist_ok=True)
        # Use only the basename of the client-supplied filename to prevent
        # path traversal (e.g. "../../etc/passwd").
        file_location = save_directory / Path(file.filename).name
        with open(file_location, "wb") as saved_file:
            # await the async read instead of blocking on file.file.read()
            saved_file.write(await file.read())
        # Load audio; keep channel 0 as a float ndarray. Pass the true sample
        # rate so the pipeline can resample to Whisper's expected 16 kHz
        # (the raw array alone would be assumed to already be 16 kHz).
        waveform, sampling_rate = torchaudio.load(file_location, normalize=True)
        audio_input = {"raw": waveform[0].numpy(), "sampling_rate": sampling_rate}
        # Original-language (JP) transcription.
        original_version = pipe(audio_input)["text"]
        # English via Whisper's built-in translate task.
        hasil = pipe(audio_input, generate_kwargs={"task": "translate"})["text"]
        # Indonesian via Google Translate; fall back to auto-detection when
        # detect_google fails (it returns None, which would otherwise become
        # the literal source-language string "None").
        detect = detect_google(hasil)
        id_ver = translate_google(hasil, detect if detect else "auto", "ID")
        # translate_google returns None on failure; don't crash modify_text.
        if id_ver is not None:
            id_ver = modify_text(id_ver)
        return JSONResponse(
            content={"response": {"jp_text": original_version, "en_text": hasil, "id_text": id_ver}},
            status_code=200,
        )
    except Exception as e:
        # raise (not return) so FastAPI actually produces a 500 response;
        # returning the HTTPException would serialize it as a normal body.
        raise HTTPException(status_code=500, detail=f"Error: {e}") from e
def detect_google(text):
    """Detect the language of *text* with Google Translate.

    Returns the upper-cased language code (e.g. "EN"), or None if the
    detection call fails for any reason (best-effort; the error is logged).
    """
    try:
        detection = Translator().detect(text)
    except Exception as e:
        print(f"Error detect: {e}")
        return None
    return detection.lang.upper()
def translate_google(text, source, target):
    """Translate *text* from *source* to *target* via Google Translate.

    Returns the translated string, or None if the call fails for any
    reason (best-effort; the error is logged to stdout).
    """
    try:
        outcome = Translator().translate(text, src=source, dest=target)
    except Exception as e:
        print(f"Error translate: {e}")
        return None
    return outcome.text
def modify_text(text):
    """Apply fixed word substitutions to *text* and return the result.

    Maps "Tuan"/"tuan"/"Guru"/"guru" to "Master" and "Monica"/"monica" to
    "Monika". Substitutions are plain substring replacements applied in
    order, covering both capitalizations explicitly.
    """
    substitutions = (
        ("Tuan", "Master"),
        ("tuan", "Master"),
        ("Guru", "Master"),
        ("guru", "Master"),
        ("Monica", "Monika"),
        ("monica", "Monika"),
    )
    result = text
    for needle, substitute in substitutions:
        result = result.replace(needle, substitute)
    return result
|