|
import asyncio |
|
|
|
from src.file_handler.handlers import ( |
|
convert_docx_to_markdown, |
|
convert_excel_bytes_to_llm_format, |
|
convert_file_to_string, |
|
convert_image_to_pillow, |
|
convert_pdf_to_markdown, |
|
) |
|
|
|
|
|
async def aparse_file(task_id: str, file_name: str, api_base_url: str) -> str:
    """
    Parse a task's file and return its content in a format suitable for LLMs.

    The converter is selected from the extension of ``file_name`` (the text
    after the last dot). The extension is compared case-insensitively, so
    ``report.PDF`` and ``report.pdf`` are handled the same way.

    Args:
        task_id (str): The ID of the task; forwarded to the converter.
        file_name (str): The name of the file; only its extension is used.
        api_base_url (str): The base URL of the API; forwarded to the converter.

    Returns:
        str: Whatever the selected converter returns. ``None`` is returned
        for ``.mp3`` files, which are not converted here.
        NOTE(review): the image branch presumably yields a Pillow image
        rather than a string -- confirm against the handler.
    """
    # rpartition keeps the whole name when there is no dot, matching
    # split(".")[-1]; lower() makes "PDF", "Xlsx", ... hit the right branch.
    extension = file_name.rpartition(".")[2].lower()

    if extension == "xlsx":
        return await convert_excel_bytes_to_llm_format(task_id, api_base_url)
    if extension == "docx":
        return await convert_docx_to_markdown(task_id, api_base_url)
    if extension in ("jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp"):
        return await convert_image_to_pillow(task_id, api_base_url)
    if extension == "pdf":
        return await convert_pdf_to_markdown(task_id, api_base_url)
    if extension == "mp3":
        # Audio is deliberately left unparsed in this function.
        return None

    # Any other extension (or none at all) goes to the generic converter.
    return await convert_file_to_string(task_id, api_base_url)
|
|
|
|
|
def parse_file(task_id: str, file_name: str, api_base_url: str) -> str:
    """
    Synchronous entry point for parsing a file into LLM-friendly content.

    Builds the ``aparse_file`` coroutine for the given arguments and runs it
    to completion with ``asyncio.run``.

    Args:
        task_id (str): The ID of the task.
        file_name (str): The name of the file.
        api_base_url (str): The base URL of the API.

    Returns:
        str: The result produced by ``aparse_file`` for these arguments.
    """
    parse_coroutine = aparse_file(task_id, file_name, api_base_url)
    return asyncio.run(parse_coroutine)
|
|