# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import base64
import os
from typing import List, Optional
from urllib.parse import urlparse

import openai
import requests
from loguru import logger

from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool


class AudioAnalysisToolkit(BaseToolkit):
    r"""A class representing a toolkit for audio operations.

    This class provides methods for processing and understanding audio data.
    """

    def __init__(
        self, cache_dir: Optional[str] = None, reasoning: bool = False
    ):
        r"""Initialize the toolkit.

        Args:
            cache_dir (Optional[str]): Directory for cached files.
                (default: :obj:`'tmp/'`)
            reasoning (bool): Whether to transcribe the audio first and
                reason over the transcript with a text model instead of
                querying an audio-capable chat model directly.
                (default: :obj:`False`)
        """
        self.cache_dir = cache_dir or 'tmp/'
        self.client = openai.OpenAI()
        self.reasoning = reasoning

    def ask_question_about_audio(
        self, audio_path: str, question: str
    ) -> str:
        r"""Ask a question about the audio and get the answer using a
        multimodal model.

        Args:
            audio_path (str): A local path or URL pointing to the audio
                file.
            question (str): The question to ask about the audio.

        Returns:
            str: The answer to the question.
        """
        logger.debug(
            "Calling ask_question_about_audio method for audio file "
            f"`{audio_path}` and question `{question}`."
        )
        parsed_url = urlparse(audio_path)
        is_url = all([parsed_url.scheme, parsed_url.netloc])
        if is_url:
            # Download the remote audio so it can be encoded below.
            res = requests.get(audio_path)
            res.raise_for_status()
            audio_data = res.content
        else:
            with open(audio_path, "rb") as audio_file:
                audio_data = audio_file.read()
        encoded_string = base64.b64encode(audio_data).decode('utf-8')
        # Infer the audio format (e.g. `wav`, `mp3`) from the file suffix.
        file_suffix = os.path.splitext(audio_path)[1]
        file_format = file_suffix[1:]
        if self.reasoning:
            # Reasoning mode: transcribe with Whisper first, then answer
            # the question with a text-only reasoning model.
            transcription = self.client.audio.transcriptions.create(
                model="whisper-1",
                # A `(filename, bytes)` tuple lets the SDK infer the audio
                # format and also covers audio downloaded from a URL.
                file=(os.path.basename(audio_path), audio_data),
            )
            transcript = transcription.text
            reasoning_prompt = (
                f"{transcript}\n\n"
                "Please answer the following question based on the speech "
                f"transcription result above:\n{question}"
            )
            reasoning_completion = self.client.chat.completions.create(
                model="o3-mini",
                messages=[
                    {
                        "role": "user",
                        "content": reasoning_prompt,
                    }
                ],
            )
            reasoning_result = reasoning_completion.choices[0].message.content
            return str(reasoning_result)
        else:
            text_prompt = (
                "Answer the following question based on the given "
                f"audio information:\n\n{question}"
            )
            completion = self.client.chat.completions.create(
                model="gpt-4o-mini-audio-preview",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant "
                        "specializing in audio analysis.",
                    },
                    {  # type: ignore[list-item, misc]
                        "role": "user",
                        "content": [
                            {"type": "text", "text": text_prompt},
                            {
                                # Inline base64 audio plus its format.
                                "type": "input_audio",
                                "input_audio": {
                                    "data": encoded_string,
                                    "format": file_format,
                                },
                            },
                        ],
                    },
                ],
            )  # type: ignore[misc]
            response: str = str(completion.choices[0].message.content)
            logger.debug(f"Response: {response}")
            return response

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [FunctionTool(self.ask_question_about_audio)]
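

if __name__ == "__main__":
    # A minimal usage sketch, assuming `OPENAI_API_KEY` is set in the
    # environment and that `sample.wav` (a hypothetical local file)
    # exists; in practice the toolkit is handed to an agent via
    # `get_tools()` rather than called directly like this.
    toolkit = AudioAnalysisToolkit(reasoning=False)
    answer = toolkit.ask_question_about_audio(
        "sample.wav", "How many speakers can be heard in this recording?"
    )
    print(answer)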