Spaces:

yasirme
/

RAG-retrieval

Running

App Files Community

yasirme committed 24 days ago

Commit

54155f3

1 Parent(s): 24055cd

init

Browse files

Files changed (4) hide show

app.py +10 -18
src/index.html +1 -1
utils/file_reader.py +100 -0
utils/handle_file.py +23 -157

app.py CHANGED Viewed

@@ -11,30 +11,22 @@ app.config['MAX_CONTENT_LENGTH'] = 11 * 1024 * 1024
 def index():
     return send_file('src/index.html')
-@app.route('/upload', methods=['POST'])
 def upload():
     try:
-        allowed_chars = request.args.get('allowed_size')
-        print(allowed_chars)
-        if 'file' not in request.files and 'files' not in request.files:
             return jsonify({"error": "No files uploaded"}), 400
-        if 'files' in request.files:
-            files = request.files.getlist('files')
-        else:
-            files = request.files.getlist('file')
-        if not files or not files[0].filename:
-            return jsonify({"error": "No files selected"}), 400
-        if len(files) == 1:
-            return file_handler.process_file(files[0], allowed_chars)
-        else:
-            return file_handler.process_files(files, allowed_chars)
     except Exception as e:
-        return jsonify({"error": f"An error occurred: {e}"}), 500
-@app.route('/embedding')
 def embedding():
     return rag.generate_embedding(
         text=request.json.get("text"),

 def index():
     return send_file('src/index.html')
+@app.route('/upload', methods=["POST"])
 def upload():
     try:
+        allowed_chars = request.args.get("allowed_chars")
+        if 'files' not in request.files:
             return jsonify({"error": "No files uploaded"}), 400
+        files = request.files.getlist('files')
+        return file_handler.handle_files(files=files,allowed_chars=allowed_chars)
     except Exception as e:
+        return jsonify({"error": f"An error occurred: {e} "}), 500
+@app.route('/embedding', methods=['POST'])
 def embedding():
     return rag.generate_embedding(
         text=request.json.get("text"),

src/index.html CHANGED Viewed

@@ -7,7 +7,7 @@
   </head>
   <body>
-    <h1>API key is not set</h1>
     <h1>Clone this space and use your own gemini api key</h1>
   </body>
 </html>

   </head>
   <body>
+    <h1>API key is not configured</h1>
     <h1>Clone this space and use your own gemini api key</h1>
   </body>
 </html>

utils/file_reader.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import io
+import pdfplumber
+import pandas as pd
+import json
+from docx import Document
+from openpyxl import load_workbook
+import re
+import uuid
+class FileReader:
+    def __init__(self):
+        self.allowed_files = ["txt", "pdf", "docx", "md", "json", "csv", "xlsx", "xls"]
+        self.max_chars_per_file = 5000000
+    def calc_chars(self, files, allowed_chars):
+        total_chars = 0
+        clean_contents = []
+        for file in files:
+            file_extension = file.filename.split('.')[-1].lower()
+            if file_extension not in self.allowed_files:
+                return {"error": "unsupported file type uploaded"}, 400
+            try:
+                if file_extension == 'txt' or file_extension=="md":
+                    text = self._read_txt(file)
+                elif file_extension == 'pdf':
+                    text = self._read_pdf(file)
+                elif file_extension == 'docx':
+                    text = self._read_docx(file)
+                elif file_extension == 'json':
+                    text = self._read_json(file)
+                elif file_extension == 'csv':
+                    text = self._read_csv(file)
+                elif file_extension in ['xlsx', 'xls']:
+                    text = self._read_excel(file)
+                if(len(text)>self.max_chars_per_file):
+                    return {"error": "max 5 million characters per file allowed."} , 400
+                clean_contents.append({
+                    "type": file_extension,
+                    "content": text,
+                    "name": file.filename,
+                    "id": str(uuid.uuid4()),
+                    "total_chars": len(text)
+                })
+                total_chars += len(text)
+                if(total_chars>int(allowed_chars)):
+                    return {"error": "Total allowed characters limit reached"}, 400
+            except Exception as e:
+                return {"error": f"Error reading file {file.filename}: {e}"}, 500
+        return {"total_chars": total_chars, "clean_contents": clean_contents}, 200
+    def _read_txt(self, file):
+        file_content = file.read().decode("utf-8")
+        return self._clean_text(file_content)
+    def _read_pdf(self, file):
+        with pdfplumber.open(file) as pdf:
+            text = ''
+            for page in pdf.pages:
+                text += page.extract_text() or ''
+        return self._clean_text(text)
+    def _read_docx(self, file):
+        doc = Document(file)
+        text = ''
+        for para in doc.paragraphs:
+            text += para.text + "\n"
+        return self._clean_text(text)
+    def _read_json(self, file):
+        content = json.load(file)
+        text = json.dumps(content, ensure_ascii=False)
+        return self._clean_text(text)
+    def _read_csv(self, file):
+        df = pd.read_csv(file)
+        text = df.to_string(index=False)
+        return self._clean_text(text)
+    def _read_excel(self, file):
+        wb = load_workbook(file)
+        text = ''
+        for sheet in wb.sheetnames:
+            ws = wb[sheet]
+            for row in ws.iter_rows(values_only=True):
+                text += ' | '.join(str(cell) if cell is not None else '' for cell in row) + "\n"
+        return self._clean_text(text)
+    def _clean_text(self, text):
+        text = re.sub(r'\s+', ' ', text)
+        text = re.sub(r'[^\x00-\x7F]+', '', text)
+        text = text.strip()
+        return text
+file_reader = FileReader()

utils/handle_file.py CHANGED Viewed

@@ -1,166 +1,32 @@
-import io
-import pdfplumber
-import pandas as pd
-import json
-from docx import Document
 from rag.RAG import rag
-from openpyxl import load_workbook
-ALLOWED_EXTENSIONS = {'pdf', 'txt', 'docx', 'csv', 'xlsx', 'xls', 'json'}
-MAX_CHARS_PER_FILE = 5000000  # 5 million characters per file limit
-class FileHandler:
     def __init__(self):
-        self.file_handlers = {
-            'pdf': self._read_pdf,
-            'txt': self._read_txt,
-            'docx': self._read_docx,
-            'csv': self._read_csv,
-            'xlsx': self._read_excel,
-            'xls': self._read_excel,
-            'json': self._read_json
-        }
-    def _validate_params(self, allowed_chars):
-        if not allowed_chars:
-            return None
-        try:
-            return int(allowed_chars)
-        except ValueError:
-            raise ValueError("allowed_size parameter must be an integer")
-    def _validate_file(self, file):
-        if not file or file.filename == '':
-            raise ValueError("No file selected")
-        extension = file.filename.rsplit('.', 1)[1].lower() if '.' in file.filename else ''
-        if extension not in ALLOWED_EXTENSIONS:
-            raise ValueError(f"File type not allowed. Supported types: {', '.join(ALLOWED_EXTENSIONS)}")
-        return extension
-    def _check_char_limit(self, text):
-        if len(text.strip()) > MAX_CHARS_PER_FILE:
-            raise ValueError(f"File exceeds the maximum character limit of {MAX_CHARS_PER_FILE} characters")
-        return text.strip()
-    def _read_pdf(self, file):
-        try:
-            text = ""
-            with pdfplumber.open(file) as pdf:
-                for page in pdf.pages:
-                    page_text = page.extract_text(layout=True)
-                    if page_text:
-                        text += page_text.strip()
-            return self._check_char_limit(text)
-        except Exception as e:
-            raise ValueError(f"Error reading PDF: {e}")
-    def _read_txt(self, file):
-        try:
-            text = file.read().decode("utf-8")
-            return self._check_char_limit(text)
-        except Exception as e:
-            raise ValueError(f"Error reading TXT: {e}")
-    def _read_docx(self, file):
-        try:
-            doc = Document(file)
-            text = "\n".join(paragraph.text.strip() for paragraph in doc.paragraphs)
-            return self._check_char_limit(text)
-        except Exception as e:
-            raise ValueError(f"Error reading DOCX: {e}")
-    def _read_csv(self, file):
         try:
-            df = pd.read_csv(file)
-            text = df.to_string(index=False)
-            return self._check_char_limit(text)
-        except Exception as e:
-            raise ValueError(f"Error reading CSV: {e}")
-    def _read_excel(self, file):
-        try:
-            all_text = []
-            workbook = load_workbook(filename=file)
-            for sheet_name in workbook.sheetnames:
-                sheet = workbook[sheet_name]
-                sheet_text = f"Sheet: {sheet_name}\n"
-                for row in sheet.iter_rows(values_only=True):
-                    row_text = " | ".join([str(cell) if cell is not None else "" for cell in row])
-                    sheet_text += row_text + "\n"
-                all_text.append(sheet_text)
-            text = "\n\n".join(all_text)
-            return self._check_char_limit(text)
-        except Exception as e:
-            raise ValueError(f"Error reading Excel: {e}")
-    def _read_json(self, file):
-        try:
-            data = json.load(file)
-            text = json.dumps(data, indent=2)
-            return self._check_char_limit(text)
-        except Exception as e:
-            raise ValueError(f"Error reading JSON: {e}")
-    def read_file(self, file):
-        extension = self._validate_file(file)
-        return self.file_handlers[extension](file)
-    def process_file(self, file, allowed_chars):
-        try:
-            allowed_limit = self._validate_params(allowed_chars)
-            content = self.read_file(file)
-            if len(content) > allowed_limit:
-                return {"error": f"Character count ({len(content)}) exceeds the allowed limit ({allowed_limit})"}, 400
-            return rag.generate_embedding(content)
-        except ValueError as e:
-            return {"error": str(e)}, 400
-        except Exception as e:
-            return {"error": f"Unexpected error: {e}"}, 500
-    def process_files(self, files, allowed_chars):
-        try:
-            allowed_limit = self._validate_params(allowed_chars)
-            file_contents = []
-            total_chars = 0
-            for file in files:
-                try:
-                    content = self.read_file(file)
-                    file_contents.append((file.filename, content))
-                    total_chars += len(content)
-                except ValueError as e:
-                    return {"error": f"Error with file '{file.filename}': {str(e)}"}, 400
-            if total_chars > allowed_limit:
-                return {"error": f"Total character count ({total_chars}) exceeds the allowed limit ({allowed_limit})"}, 400
-            results = []
-            for filename, content in file_contents:
-                embedding_result, status_code = rag.generate_embedding(content)
-                if status_code != 200:
-                    return embedding_result, status_code
-                results.append({
-                    "filename": filename,
-                    "char_count": len(content),
-                    "embeddings": embedding_result
-                })
-            return {
-                "total_char_count": total_chars,
-                "file_count": len(files),
-                "results": results
-            }, 200
-        except ValueError as e:
-            return {"error": str(e)}, 400
         except Exception as e:
-            return {"error": f"Unexpected error: {e}"}, 500
-file_handler = FileHandler()

+from utils.file_reader import file_reader
 from rag.RAG import rag
+class HandleFiles:
     def __init__(self):
+        pass
+    def handle_files(self,files,allowed_chars):
         try:
+            result = {}
+            content,status_code = file_reader.calc_chars(files,allowed_chars)
+            if(status_code!=200):
+                return content,status_code
+            for text_dict in content['clean_contents']:
+                embedding, status_code = rag.generate_embedding(text_dict['content'])
+                if(status_code!=200):
+                    return embedding,status_code
+                result[str(text_dict['id'])] = {
+                    "name": text_dict['name'],
+                    "type": text_dict['type'],
+                    "total_chars": text_dict['total_chars'],
+                    "embedding": embedding['embeddings'],
+                    "chunks": embedding['chunks']
+                }
+            return result
         except Exception as e:
+            return {"error": f"an error occured: {e}"}, 500
+file_handler = HandleFiles()