marigen_api / parser /audio_parser.py
jameszokah's picture
Synced repo using 'sync_with_huggingface' Github Action
1897f56 verified
raw
history blame
3.04 kB
import io
import logging
from typing import Iterator

import speech_recognition as sr
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_core.documents import Document
from pydub import AudioSegment
logger = logging.getLogger(__name__)


class AudioParser(BaseBlobParser):
    """Parse audio files from a blob and convert them to text.

    The audio is transcribed with ``recognizer.recognize_google`` from the
    ``speech_recognition`` package. Formats that ``sr.AudioFile`` cannot read
    directly are first transcoded to WAV with pydub.
    """

    # Supported MIME types mapped to the format name handed to pydub.
    # The MIME subtype is not always a valid format name ("mpeg" is not
    # "mp3", "x-aiff" is not "aiff"), so the mapping is spelled out.
    _MIME_TO_FORMAT = {
        "audio/wav": "wav",  # .wav
        "audio/mpeg": "mp3",  # .mp3
        "audio/ogg": "ogg",  # .ogg
        "audio/flac": "flac",  # .flac
        "audio/x-aiff": "aiff",  # .aiff
    }

    # Formats passed straight to sr.AudioFile without transcoding.
    _DIRECT_FORMATS = frozenset({"wav", "flac", "aiff"})

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse an audio file into the Document iterator.

        Args:
            blob: The blob to parse. Its ``mimetype`` selects the decoding
                path and its ``source`` is copied into the metadata.

        Returns:
            An iterator yielding a single Document whose ``page_content`` is
            the recognized text and whose metadata is
            ``{"source": blob.source}``.

        Raises:
            ValueError: If ``blob.mimetype`` is not a supported audio type.
            sr.UnknownValueError: If the speech could not be understood.
            sr.RequestError: If the recognition service request failed.
            Exception: Any other decoding/transcoding error is logged and
                re-raised unchanged.
        """
        logger.debug("Blob MIME type: %s", blob.mimetype)

        audio_format = self._MIME_TO_FORMAT.get(blob.mimetype)
        if audio_format is None:
            supported_mime_types = list(self._MIME_TO_FORMAT)
            raise ValueError(
                f"This blob type is not supported for this parser. Supported types are: {supported_mime_types}"
            )

        recognizer = sr.Recognizer()
        try:
            # Read everything up front: the stream handed out by
            # as_bytes_io() may no longer be usable once its context exits,
            # so all later work is done on private in-memory copies.
            with blob.as_bytes_io() as audio_file:
                audio_bytes = audio_file.read()

            logger.debug("Attempting to process audio format: %s", audio_format)

            if audio_format in self._DIRECT_FORMATS:
                audio_stream = io.BytesIO(audio_bytes)
            else:
                # Convert to PCM WAV using pydub.
                audio_segment = AudioSegment.from_file(
                    io.BytesIO(audio_bytes), format=audio_format
                )
                audio_stream = io.BytesIO()
                audio_segment.export(audio_stream, format="wav")
                audio_stream.seek(0)

            with sr.AudioFile(audio_stream) as source:
                audio_data = recognizer.record(source)

            text = recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            logger.error(
                "Google Speech Recognition could not understand the audio."
            )
            raise
        except sr.RequestError as e:
            logger.error(
                "Could not request results from Google Speech Recognition service; %s",
                e,
            )
            raise
        except Exception:
            logger.exception("Error processing audio file")
            raise

        # Yield outside the try block so that exceptions raised by (or thrown
        # in from) the consumer are not misreported as audio-processing errors.
        yield Document(page_content=text, metadata={"source": blob.source})