File size: 1,732 Bytes
912f746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import asyncio

from src.file_handler.handlers import (
    convert_docx_to_markdown,
    convert_excel_bytes_to_llm_format,
    convert_file_to_string,
    convert_image_to_pillow,
    convert_pdf_to_markdown,
)


# Lower-case extensions that are routed to the image handler.
_IMAGE_EXTENSIONS = frozenset(
    {"jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp"}
)


async def aparse_file(task_id: str, file_name: str, api_base_url: str) -> str:
    """
    Parses a file and returns its content in a format suitable for LLMs.

    The handler is selected from the extension of ``file_name`` (the text
    after the last dot), compared case-insensitively. A name without any dot
    is treated as having no extension and goes to the generic handler.

    Args:
        task_id (str): The ID of the task.
        file_name (str): The name of the file; only its extension is used.
        api_base_url (str): The base URL of the API.

    Returns:
        str: Whatever the selected handler returns. ``None`` is returned for
        "mp3" files, for which no handler is called.
        NOTE(review): the image handler's name suggests it may not return a
        ``str`` either -- confirm and widen the annotation if so.
    """
    # rpartition yields an empty separator when the name contains no dot, so
    # a file literally named "pdf" is not mistaken for a PDF document.
    _, dot, suffix = file_name.rpartition(".")
    # Normalise case so "REPORT.PDF" and "report.pdf" pick the same handler.
    file_extension = suffix.lower() if dot else ""

    if file_extension == "xlsx":
        return await convert_excel_bytes_to_llm_format(task_id, api_base_url)
    if file_extension == "docx":
        return await convert_docx_to_markdown(task_id, api_base_url)
    if file_extension in _IMAGE_EXTENSIONS:
        return await convert_image_to_pillow(task_id, api_base_url)
    if file_extension == "pdf":
        return await convert_pdf_to_markdown(task_id, api_base_url)
    if file_extension == "mp3":
        # Audio is deliberately not converted here; callers receive None.
        return None
    # Any other (or missing) extension falls back to the generic handler.
    return await convert_file_to_string(task_id, api_base_url)


def parse_file(task_id: str, file_name: str, api_base_url: str) -> str:
    """
    Synchronous entry point that runs ``aparse_file`` to completion.

    The coroutine is driven with ``asyncio.run``, which cannot be called
    while another event loop is already running in the same thread; from
    async code, await ``aparse_file`` directly instead.

    Args:
        task_id (str): The ID of the task.
        file_name (str): The name of the file.
        api_base_url (str): The base URL of the API.

    Returns:
        str: The result produced by ``aparse_file`` for the same arguments.
    """
    pending = aparse_file(
        task_id,
        file_name,
        api_base_url,
    )
    return asyncio.run(pending)