|
import io |
|
import json |
|
from typing import Optional |
|
|
|
import pandas as pd |
|
import pdfminer.high_level |
|
import PIL.Image |
|
from docx import Document |
|
from docx.opc.exceptions import PackageNotFoundError |
|
from pdfminer.pdfparser import PDFSyntaxError |
|
|
|
from src.file_handler.get_file import download_file_for_task |
|
|
|
|
|
async def convert_excel_bytes_to_llm_format(task_id: str, url: str) -> Optional[str]:

    """

    Downloads an Excel file using download_file_for_task, removes empty rows

    from each sheet, and converts its content to an LLM-friendly JSON string.



    Args:

        task_id (str): The identifier for the task, used by download_file_for_task.

        url (str): The URL of the Excel file to download and process.



    Returns:

        Optional[str]: A JSON string whose keys are sheet names and whose

            values are lists of dictionaries (each dictionary representing

            a row, with column headers as keys).

            Returns None if a critical error occurs (e.g., download failure,

            file unparseable).

            Returns the JSON string "{}" if the Excel file is valid but

            contains no sheets.

    """

    try:

        file_bytes = await download_file_for_task(task_id, url)



        if not file_bytes:

            print(f"Info [{task_id}]: No content downloaded from URL '{url}'.")

            return None



        excel_buffer = io.BytesIO(file_bytes)



        xls = pd.ExcelFile(excel_buffer)



        if not xls.sheet_names:

            print(

                f"Info [{task_id}]: Excel file from URL '{url}' has no sheets."

            )

            # Keep the contract consistent with the success path: always

            # return a JSON *string*, never a raw dict.

            return json.dumps({})



        all_sheets_data = {}

        for sheet_name in xls.sheet_names:

            df = xls.parse(sheet_name)



            # Drop rows where every cell is empty.

            df = df.dropna(how="all")



            # Replace remaining NaN cells with None so json.dumps emits the

            # valid JSON token 'null' instead of the non-standard 'NaN'.

            df = df.astype(object).where(df.notna(), None)



            all_sheets_data[sheet_name] = df.to_dict(orient="records")



        return json.dumps(all_sheets_data, ensure_ascii=False)



    except pd.errors.ParserError as e:

        print(

            f"Error [{task_id}]: Pandas parsing error for Excel file from '{url}': {e}"

        )

        return None

    except ValueError as e:

        print(

            f"Error [{task_id}]: Value error processing Excel file from '{url}': {e}"

        )

        return None

    except Exception as e:

        print(

            f"Error [{task_id}]: Unexpected error processing Excel file from '{url}': {e}"

        )

        return None
|
|
|
|
|
|
|
|
|
|
|
async def convert_image_to_pillow(

    task_id: str, url: str

) -> Optional[PIL.Image.Image]:

    """

    Downloads an image file and returns a PIL Image object.

    Returns None on failure.



    Args:

        task_id (str): The ID of the task.

        url (str): The URL of the image file.



    Returns:

        Optional[PIL.Image.Image]: The PIL Image object or None on failure.

    """

    try:

        raw = await download_file_for_task(task_id, url)

        if not raw:

            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")

            return None



        # PIL reads the image lazily; a corrupt payload may raise here or

        # later when the pixel data is first accessed.

        return PIL.Image.open(io.BytesIO(raw))



    except Exception as e:

        # Message fixed: this helper returns a PIL Image, not base64 — the

        # old "→ base64" text was a copy-paste leftover.

        print(f"Error [{task_id}]: converting image from '{url}' to PIL Image: {e}")

        return None
|
|
|
|
|
|
|
|
|
|
|
async def convert_file_to_string(task_id: str, url: str) -> Optional[str]:

    """

    Download a file and return its contents as text, decoded as UTF-8

    with undecodable bytes replaced. Returns None when nothing was

    downloaded or an error occurred.

    """

    try:

        payload = await download_file_for_task(task_id, url)

        if payload:

            return payload.decode("utf-8", errors="replace")



        print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")

        return None



    except Exception as e:

        print(f"Error [{task_id}]: decoding file from '{url}': {e}")

        return None
|
|
|
|
|
|
|
|
|
|
|
def _runs_to_md(runs): |
|
"""Helper – convert a list of runs to markdown inline‑text.""" |
|
out = [] |
|
for run in runs: |
|
text = run.text.replace("\n", " ") |
|
if not text: |
|
continue |
|
if run.bold: |
|
text = f"**{text}**" |
|
if run.italic: |
|
text = f"*{text}*" |
|
out.append(text) |
|
return "".join(out) |
|
|
|
|
|
async def convert_docx_to_markdown(task_id: str, url: str) -> Optional[str]:

    """

    Download a Word document and render it as *simple* Markdown.

    Paragraphs whose style name contains "heading" become '#'-prefixed

    lines (level taken from the digits in the style name, defaulting

    to 1); every other non-empty paragraph is emitted as plain text.

    Returns None on download failure or if the file is not a valid DOCX.

    """

    try:

        payload = await download_file_for_task(task_id, url)

        if not payload:

            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")

            return None



        document = Document(io.BytesIO(payload))



        rendered = []

        for paragraph in document.paragraphs:

            style_name = (paragraph.style.name or "").lower()

            body = _runs_to_md(paragraph.runs).strip()

            if not body:

                continue



            if "heading" not in style_name:

                rendered.append(body)

                continue



            # Style names like "Heading 2" carry the level as digits.

            digits = "".join(ch for ch in style_name if ch.isdigit())

            level = int(digits or 1)

            rendered.append(f"{'#' * level} {body}")



        return "\n\n".join(rendered)



    except PackageNotFoundError:

        print(f"Error [{task_id}]: file from '{url}' is not a valid DOCX.")

        return None

    except Exception as e:

        print(f"Error [{task_id}]: DOCX→MD conversion failed for '{url}': {e}")

        return None
|
|
|
|
|
|
|
|
|
|
|
async def convert_pdf_to_markdown(task_id: str, url: str) -> Optional[str]:

    """

    Download a PDF and return its extracted text as Markdown-style

    plain paragraphs (one per non-blank extracted line).

    Returns "" when the PDF yields no text and None on failure.

    """

    try:

        payload = await download_file_for_task(task_id, url)

        if not payload:

            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")

            return None



        extracted = pdfminer.high_level.extract_text(io.BytesIO(payload))

        if not extracted.strip():

            print(f"Info [{task_id}]: PDF at '{url}' produced no text.")

            return ""



        # Treat each non-blank line of extractor output as its own paragraph.

        return "\n\n".join(

            line.strip() for line in extracted.splitlines() if line.strip()

        )



    except (PDFSyntaxError, ValueError) as e:

        print(f"Error [{task_id}]: PDF syntax error for '{url}': {e}")

        return None

    except Exception as e:

        print(f"Error [{task_id}]: PDF→MD conversion failed for '{url}': {e}")

        return None
|
|