Spaces:

jameszokah
/

marigen_api

Sleeping

File size: 3,038 Bytes

1897f56

from typing import Iterator
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
import speech_recognition as sr
from pydub import AudioSegment
import io


class AudioParser(BaseBlobParser):
    """Parse audio blobs into text using Google Speech Recognition.

    The blob's MIME type selects the decoding path: WAV, FLAC and AIFF are
    handed straight to ``sr.AudioFile``; everything else is first transcoded
    to WAV in memory with pydub.
    """

    # Accepted MIME type -> audio format name. The format name (not the raw
    # MIME subtype) is what gets passed to pydub: the subtype of "audio/mpeg"
    # is "mpeg" and that of "audio/x-aiff" is "x-aiff", neither of which names
    # the mp3 / aiff format. The extra entries are common aliases for the
    # same formats.
    _MIME_TO_FORMAT = {
        "audio/wav": "wav",
        "audio/x-wav": "wav",
        "audio/wave": "wav",
        "audio/mpeg": "mp3",
        "audio/mp3": "mp3",
        "audio/ogg": "ogg",
        "audio/flac": "flac",
        "audio/x-flac": "flac",
        "audio/x-aiff": "aiff",
        "audio/aiff": "aiff",
    }

    # Formats that are passed to sr.AudioFile without transcoding.
    _NATIVE_FORMATS = frozenset({"wav", "flac", "aiff"})

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Transcribe an audio blob and yield the result as one Document.

        Args:
            blob: The audio blob to parse. Its ``mimetype`` must be one of
                the keys of ``_MIME_TO_FORMAT``.

        Returns:
            An iterator yielding a single Document whose ``page_content`` is
            the recognized text and whose metadata holds the blob's source.

        Raises:
            ValueError: If the blob's MIME type is missing or unsupported.
            sr.UnknownValueError: If the speech could not be understood.
            sr.RequestError: If the recognition service request failed.
            Exception: Any decoding/transcoding error is reported and
                re-raised unchanged.
        """
        # Debugging: Print MIME type
        print(f"Blob MIME type: {blob.mimetype}")

        audio_format = self._MIME_TO_FORMAT.get(blob.mimetype)
        if audio_format is None:
            raise ValueError(
                f"This blob type is not supported for this parser. Supported types are: {list(self._MIME_TO_FORMAT)}"
            )

        recognizer = sr.Recognizer()

        try:
            with blob.as_bytes_io() as audio_file:
                print(f"Attempting to process audio format: {audio_format}")

                if audio_format in self._NATIVE_FORMATS:
                    # Directly use AudioFile for these formats; the stream is
                    # still at its start because nothing has read from it.
                    audio_stream = audio_file
                else:
                    # Convert to PCM WAV using pydub, entirely in memory.
                    audio_segment = AudioSegment.from_file(
                        io.BytesIO(audio_file.read()), format=audio_format)
                    audio_stream = io.BytesIO()
                    audio_segment.export(audio_stream, format="wav")
                    audio_stream.seek(0)

                with sr.AudioFile(audio_stream) as source:
                    audio_data = recognizer.record(source)

            # The recorded audio data is self-contained, so the recognition
            # request runs after the file handles have been released.
            text = recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            print(
                "Google Speech Recognition could not understand the audio.")
            raise
        except sr.RequestError as e:
            print(
                f"Could not request results from Google Speech Recognition service; {e}")
            raise
        except Exception as e:
            print(f"Error processing audio file: {e}")
            raise

        # Yield outside the try/with blocks: no resources stay open while the
        # generator is suspended, and an exception raised by the consumer is
        # not misreported as an audio-processing error.
        yield Document(page_content=text, metadata={"source": blob.source})