multidoc_chat / utils.py
akashshahade's picture
Upload 7 files
6363d82 verified
raw
history blame contribute delete
972 Bytes
import pandas as pd
import io
from pypdf import PdfReader
def process_uploaded_file(uploaded_file):
    """Return the text content of an uploaded PDF or Excel (.xlsx) file.

    The kind of file is decided solely by the ``type`` attribute of
    *uploaded_file*, which is compared against the PDF and .xlsx MIME types.
    Any other value yields the string ``"Unsupported file format."`` instead
    of raising.
    """
    mime_type = uploaded_file.type

    if mime_type == "application/pdf":
        return extract_text_from_pdf(uploaded_file)

    if mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        return extract_text_from_excel(uploaded_file)

    # Unrecognised kinds are reported as a message, not as an exception.
    return "Unsupported file format."
def extract_text_from_pdf(pdf_file) -> str:
    """Extract the text of every page of a PDF, joined by newlines.

    Args:
        pdf_file: The PDF source; it is handed unchanged to ``PdfReader``.

    Returns:
        The text of each page, in page order, separated by a newline. Pages
        whose ``extract_text()`` result is falsy (empty or ``None``) are
        skipped, so a PDF with no extractable text yields an empty string.
    """
    reader = PdfReader(pdf_file)
    # Call extract_text() only once per page and filter on its result,
    # instead of extracting each page a second time just to test whether
    # it produced any text.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(page_text for page_text in page_texts if page_text)
def extract_text_from_excel(excel_file) -> str:
    """Render every sheet of an Excel workbook as plain text.

    Args:
        excel_file: The workbook source; it is handed unchanged to
            ``pandas.read_excel``.

    Returns:
        For each sheet, in the order pandas returns them, a header line
        ``Sheet: <name>`` (preceded and followed by a newline) and then the
        sheet's table as produced by ``DataFrame.to_string(index=False)``,
        all concatenated. An empty string if no sheets are returned.
    """
    # sheet_name=None makes read_excel return a {sheet name: DataFrame}
    # mapping covering all sheets rather than a single DataFrame.
    sheets = pd.read_excel(excel_file, sheet_name=None)
    # Build the result with one join rather than repeated string "+=".
    return "".join(
        f"\nSheet: {sheet_name}\n" + frame.to_string(index=False)
        for sheet_name, frame in sheets.items()
    )