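# Spaces: Running  -- NOTE(review): looks like hosting-page banner text captured with the file, not module code; confirm and remove.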
import io
import json
import re
import uuid

import pandas as pd
import pdfplumber
from docx import Document
from openpyxl import load_workbook
class FileReader:
    """Extract cleaned plain text from uploaded files and enforce size limits.

    Every reader funnels its output through ``_clean_text``, which collapses
    whitespace and drops all non-ASCII characters, so the character counts
    reported by ``calc_chars`` refer to the cleaned text, not the raw bytes.
    """

    def __init__(self):
        # Lower-case extensions (without the dot) accepted by calc_chars.
        self.allowed_files = ["txt", "pdf", "docx", "md", "json", "csv", "xlsx", "xls"]
        # Upper bound on the cleaned length of any single file.
        self.max_chars_per_file = 5000000

    def calc_chars(self, files, allowed_chars):
        """Read every file, clean its text and check the character budgets.

        Args:
            files: Iterable of upload objects. Each must expose a
                ``filename`` attribute and be readable as a binary file-like
                object (presumably web-framework upload objects -- confirm
                against the caller).
            allowed_chars: Maximum combined number of cleaned characters
                across all files; anything accepted by ``int()``.

        Returns:
            A ``(payload, status)`` tuple. On success the payload is
            ``{"total_chars": int, "clean_contents": [...]}`` with status
            200, where each entry holds ``type``, ``content``, ``name``, a
            freshly generated ``id`` and ``total_chars``. On failure the
            payload is ``{"error": message}`` with status 400 (unsupported
            type or a limit exceeded) or 500 (any exception raised while
            processing a file). Processing stops at the first failure.
        """
        readers = self._readers()
        total_chars = 0
        clean_contents = []
        for file in files:
            file_extension = file.filename.split('.')[-1].lower()
            reader = readers.get(file_extension)
            # An extension that is allowed but has no reader is rejected
            # explicitly instead of falling through with no (or stale) text.
            if file_extension not in self.allowed_files or reader is None:
                return {"error": "unsupported file type uploaded"}, 400
            try:
                text = reader(file)
                if len(text) > self.max_chars_per_file:
                    return {"error": "max 5 million characters per file allowed."}, 400
                clean_contents.append({
                    "type": file_extension,
                    "content": text,
                    "name": file.filename,
                    "id": str(uuid.uuid4()),
                    "total_chars": len(text),
                })
                total_chars += len(text)
                # int() stays inside the try so a malformed limit is reported
                # through the same error tuple as any other failure.
                if total_chars > int(allowed_chars):
                    return {"error": "Total allowed characters limit reached"}, 400
            except Exception as e:
                # Boundary handler: callers receive an error tuple, never an
                # exception, whatever the underlying parser raised.
                return {"error": f"Error reading file {file.filename}: {e}"}, 500
        return {"total_chars": total_chars, "clean_contents": clean_contents}, 200

    def _readers(self):
        """Return the mapping from file extension to its reader method."""
        return {
            "txt": self._read_txt,
            "md": self._read_txt,
            "pdf": self._read_pdf,
            "docx": self._read_docx,
            "json": self._read_json,
            "csv": self._read_csv,
            "xlsx": self._read_excel,
            # NOTE(review): openpyxl documents support for the Office Open XML
            # formats; legacy .xls uploads will probably fail in the reader
            # and surface as the 500 error -- confirm.
            "xls": self._read_excel,
        }

    def _read_txt(self, file):
        """Return the cleaned text of a UTF-8 encoded text/markdown file."""
        file_content = file.read().decode("utf-8")
        return self._clean_text(file_content)

    def _read_pdf(self, file):
        """Return the cleaned text of every page of a PDF.

        Pages are joined with a newline so the last word of one page is not
        fused with the first word of the next; ``_clean_text`` then reduces
        that newline to a single space. Pages without extractable text
        contribute an empty string.
        """
        with pdfplumber.open(file) as pdf:
            text = "\n".join(page.extract_text() or '' for page in pdf.pages)
        return self._clean_text(text)

    def _read_docx(self, file):
        """Return the cleaned text of all body paragraphs of a .docx file."""
        doc = Document(file)
        text = "\n".join(para.text for para in doc.paragraphs)
        return self._clean_text(text)

    def _read_json(self, file):
        """Parse a JSON file and return its cleaned re-serialised form."""
        content = json.load(file)
        text = json.dumps(content, ensure_ascii=False)
        return self._clean_text(text)

    def _read_csv(self, file):
        """Return the cleaned tabular rendering of a CSV file (no index)."""
        df = pd.read_csv(file)
        text = df.to_string(index=False)
        return self._clean_text(text)

    def _read_excel(self, file):
        """Return the cleaned cell values of every sheet in a workbook.

        Cells of a row are joined with ``' | '``; empty cells become empty
        strings. Rows (across all sheets, in sheet order) are separated by
        newlines before cleaning.
        """
        wb = load_workbook(file)
        rows = []
        for sheet in wb.sheetnames:
            ws = wb[sheet]
            for row in ws.iter_rows(values_only=True):
                rows.append(
                    ' | '.join(str(cell) if cell is not None else '' for cell in row)
                )
        return self._clean_text("\n".join(rows))

    def _clean_text(self, text):
        """Collapse whitespace runs, drop non-ASCII characters and strip."""
        text = re.sub(r'\s+', ' ', text)
        # Removes every character outside the 7-bit ASCII range, so accented
        # letters and non-Latin scripts are discarded, not transliterated.
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        return text.strip()
# Module-level FileReader instance, created when this module is imported.
file_reader = FileReader()