# Spaces: Sleeping
import pandas as pd | |
import io | |
from pypdf import PdfReader | |
def process_uploaded_file(uploaded_file):
    """Extract text from an uploaded PDF or Excel (.xlsx) file.

    The file is routed to an extractor based on its MIME type, which is read
    from ``uploaded_file.type``.

    Args:
        uploaded_file: Object exposing a ``type`` attribute with the file's
            MIME type. It is handed unchanged to the matching extractor.

    Returns:
        The text produced by ``extract_text_from_pdf`` or
        ``extract_text_from_excel``, or the literal string
        ``"Unsupported file format."`` when the MIME type matches neither.
    """
    declared_type = uploaded_file.type
    # Media type names are case-insensitive, so normalise before comparing.
    # A missing or non-string type simply falls through to "unsupported".
    if isinstance(declared_type, str):
        mime_type = declared_type.strip().lower()
    else:
        mime_type = ""

    if mime_type == "application/pdf":
        return extract_text_from_pdf(uploaded_file)
    if mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        return extract_text_from_excel(uploaded_file)
    return "Unsupported file format."
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF, joined by newlines.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source; it is passed
            straight through.

    Returns:
        The text of each page joined with ``"\n"``. Pages whose
        ``extract_text()`` result is empty or ``None`` are skipped, so the
        result is ``""`` when no page yields text.
    """
    reader = PdfReader(pdf_file)
    # Extract each page exactly once; the result is used both to filter out
    # empty pages and as the value that gets joined.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(page_text for page_text in page_texts if page_text)
def extract_text_from_excel(excel_file):
    """Render every sheet of an Excel workbook as plain text.

    Args:
        excel_file: Whatever ``pd.read_excel`` accepts as its source; it is
            passed straight through.

    Returns:
        One section per sheet, concatenated in workbook order. Each section
        is ``"\nSheet: <sheet name>\n"`` followed by the sheet's
        ``DataFrame.to_string(index=False)`` rendering.
    """
    # sheet_name=None makes pandas load all sheets, returned as a dict that
    # maps each sheet name to its DataFrame.
    sheets = pd.read_excel(excel_file, sheet_name=None)
    return "".join(
        f"\nSheet: {sheet_name}\n" + frame.to_string(index=False)
        for sheet_name, frame in sheets.items()
    )