# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import base64
import os
from typing import List, Optional
from urllib.parse import urlparse

import openai
import requests
from loguru import logger

from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool


class AudioAnalysisToolkit(BaseToolkit):
    r"""A class representing a toolkit for audio operations.

    This class provides methods for processing and understanding audio data.
    """

    def __init__(
        self, cache_dir: Optional[str] = None, reasoning: bool = False
    ):
        r"""Initialize the toolkit.

        Args:
            cache_dir (Optional[str]): Directory for cached files.
                Defaults to `'tmp/'`.
            reasoning (bool): If `True`, transcribe the audio first and
                answer the question with a text reasoning model; otherwise
                send the audio directly to a multimodal audio model.
                Defaults to `False`.
        """
        self.cache_dir = cache_dir or 'tmp/'
        self.client = openai.OpenAI()
        self.reasoning = reasoning

    def ask_question_about_audio(self, audio_path: str, question: str) -> str:
        r"""Ask any question about the audio and get the answer using a
        multimodal model.

        Args:
            audio_path (str): A local path or URL to the audio file.
            question (str): The question to ask about the audio.

        Returns:
            str: The answer to the question.
        """
        logger.debug(
            f"Calling ask_question_about_audio method for audio file "
            f"`{audio_path}` and question `{question}`."
        )

        parsed_url = urlparse(audio_path)
        is_url = all([parsed_url.scheme, parsed_url.netloc])

        # Load the raw audio bytes from a URL or the local filesystem.
        if is_url:
            res = requests.get(audio_path)
            res.raise_for_status()
            audio_data = res.content
        else:
            with open(audio_path, "rb") as audio_file:
                audio_data = audio_file.read()

        encoded_string = base64.b64encode(audio_data).decode('utf-8')
        file_suffix = os.path.splitext(audio_path)[1]
        file_format = file_suffix[1:]

        if self.reasoning:
            # Transcribe the audio first, then reason over the transcript
            # with a text-only model. Passing a (filename, bytes) tuple
            # avoids leaking a file handle and also works when the audio
            # was fetched from a URL.
            transcription = self.client.audio.transcriptions.create(
                model="whisper-1",
                file=(f"audio{file_suffix}", audio_data),
            )
            transcript = transcription.text

            reasoning_prompt = f"""
{transcript}

Please answer the following question based on the speech transcription
result above:
{question}
"""
            reasoning_completion = self.client.chat.completions.create(
                model="o3-mini",
                messages=[
                    {
                        "role": "user",
                        "content": reasoning_prompt,
                    }
                ],
            )
            reasoning_result = reasoning_completion.choices[0].message.content
            return str(reasoning_result)

        else:
            text_prompt = (
                "Answer the following question based on the given "
                f"audio information:\n\n{question}"
            )
            completion = self.client.chat.completions.create(
                model="gpt-4o-mini-audio-preview",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant "
                        "specializing in audio analysis.",
                    },
                    {  # type: ignore[list-item, misc]
                        "role": "user",
                        "content": [
                            {"type": "text", "text": text_prompt},
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": encoded_string,
                                    "format": file_format,
                                },
                            },
                        ],
                    },
                ],
            )  # type: ignore[misc]

            response: str = str(completion.choices[0].message.content)
            logger.debug(f"Response: {response}")
            return response

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing
                the functions in the toolkit.
        """
        return [FunctionTool(self.ask_question_about_audio)]
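

# Minimal usage sketch, not part of the toolkit itself. It assumes the
# OPENAI_API_KEY environment variable is set and that a local file named
# `example.wav` exists; both are illustrative assumptions.
if __name__ == "__main__":
    toolkit = AudioAnalysisToolkit(reasoning=False)
    answer = toolkit.ask_question_about_audio(
        "example.wav", "How many speakers are in this recording?"
    )
    print(answer)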