Spaces:

yasirme
/

RAG-retrieval

Running

App Files Files Community

yasirme committed 26 days ago

Commit

44870e3

1 Parent(s): dbc8b25

push

Browse files

Files changed (6) hide show

.gitignore +14 -0
app.py +28 -0
rag/RAG.py +49 -0
requirements.txt +61 -0
src/index.html +12 -0
utils/handle_file.py +120 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,14 @@

+# Ignore Python bytecode
+__pycache__/
+# Ignore environment files
+.env
+# Ignore index files
+.idx
+# Ignore Visual Studio Code configuration
+.vscode/
+# Ignore virtual environment
+.venv/

app.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import os
+from flask import Flask, send_file, request, jsonify
+from utils.handle_file import file_handler
+from werkzeug.utils import secure_filename
+app = Flask(__name__)  # Flask application object that the routes below attach to
+app.config['MAX_CONTENT_LENGTH'] = 11 * 1024 * 1024  # cap request bodies at 11 MiB
+@app.route("/")
+def index():
+    return send_file('src/index.html')
+@app.route('/upload', methods=['POST'])
+def upload():
+    try:
+        if 'file' not in request.files:
+            return jsonify({"error": "No filet"}), 400
+        file = request.files['file']
+        if file.filename == '':
+            return jsonify({"error": "No selected file"}), 400
+        return file_handler.process_file(file)
+    except Exception as e:
+        return {"error": f"an error occured: {e}"}, 500
+def main():
+    app.run(host='0.0.0.0', port=7860, debug=True)
+if __name__ == "__main__":
+    main()

rag/RAG.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from google import genai
+from google.genai import types
+import numpy as np
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import os
+from dotenv import load_dotenv
+load_dotenv()  # load environment variables before the API key is read on the next line
+client = genai.Client(api_key=os.getenv("api_key"))  # module-level client; key comes from the "api_key" env var
+class RAG:
+    def __init__(self):
+        self.CHUNK_SIZE = 800;
+        self.CHUNK_OVERLAP = 75;
+        self.MAX_BATCH_SIZE = 100;
+        self.MODEL = "text-embedding-004";
+        self.TASK_TYPE = "SEMANTIC_SIMILARITY";
+    def split_text(self,text):
+        try:
+            return RecursiveCharacterTextSplitter(
+                chunk_size=512,
+                chunk_overlap=75,
+                separators=["\n\n", "\n", ".", "!", "?", "。", " ", ""]
+            ).split_text(text)
+        except Exception as e:
+            raise ValueError(f"an error occured: {e}")
+    def generate_embedding(self,text,task_type=None):
+        try:
+            if(not task_type):
+                task_type = self.TASK_TYPE
+            embeddings = []
+            chunks = self.split_text(text)
+            for i in range(0,len(chunks),self.MAX_BATCH_SIZE):
+                response = client.models.embed_content(
+                    model=self.MODEL,
+                    contents=chunks[i:i + self.MAX_BATCH_SIZE],
+                    config=types.EmbedContentConfig(task_type=task_type)
+                )
+                for chunk_embedding in response.embeddings:
+                    embeddings.append(chunk_embedding.values)
+            return {"embeddings": embeddings, "chunks": chunks}, 200
+        except Exception as e:
+            return {"an error occured": f"{e}"}, 500
+rag = RAG()

requirements.txt ADDED Viewed

	@@ -0,0 +1,61 @@

+annotated-types==0.7.0
+anyio==4.9.0
+blinker==1.9.0
+cachetools==5.5.2
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+click==8.1.8
+cryptography==44.0.2
+et_xmlfile==2.0.0
+Flask==3.1.0
+google-auth==2.38.0
+google-genai==1.10.0
+greenlet==3.1.1
+h11==0.14.0
+httpcore==1.0.8
+httpx==0.28.1
+idna==3.10
+itsdangerous==2.2.0
+Jinja2==3.1.6
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==0.3.23
+langchain-core==0.3.51
+langchain-text-splitters==0.3.8
+langsmith==0.3.30
+lxml==5.3.2
+MarkupSafe==3.0.2
+numpy==2.2.4
+openpyxl==3.1.5
+orjson==3.10.16
+packaging==24.2
+pandas==2.2.3
+pdfminer.six==20250327
+pdfplumber==0.11.6
+pillow==11.1.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.2
+pycparser==2.22
+pydantic==2.11.3
+pydantic_core==2.33.1
+pypdfium2==4.30.1
+python-dateutil==2.9.0.post0
+python-docx==1.1.2
+python-dotenv==1.1.0
+pytz==2025.2
+PyYAML==6.0.2
+requests==2.32.3
+requests-toolbelt==1.0.0
+rsa==4.9
+six==1.17.0
+sniffio==1.3.1
+SQLAlchemy==2.0.40
+tenacity==9.1.2
+typing-inspection==0.4.0
+typing_extensions==4.13.2
+tzdata==2025.2
+urllib3==2.4.0
+websockets==15.0.1
+Werkzeug==3.1.3
+zstandard==0.23.0

src/index.html ADDED Viewed

	@@ -0,0 +1,12 @@

+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Hello World</title>
+  </head>
+  <body>
+    <h1>Hello World</h1>
+  </body>
+</html>

utils/handle_file.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import io
+import pdfplumber
+import pandas as pd
+import json
+from docx import Document
+from rag.RAG import rag
+from openpyxl import load_workbook
+ALLOWED_EXTENSIONS = {'pdf', 'txt', 'docx', 'csv', 'xlsx', 'xls', 'json'}
+MAX_CHARS = 5000000
+class FileHandler:
+    def __init__(self):
+        pass
+    def allowed_file(self, filename):
+        return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+    def check_char_limit(self, text):
+        """Check if text exceeds the character limit"""
+        if len(text.strip()) > MAX_CHARS:
+            raise ValueError(f"File exceeds the maximum character limit of {MAX_CHARS} characters")
+        return text
+    def read_pdf(self, file):
+        text = ""
+        try:
+            with pdfplumber.open(file) as pdf:
+                for page in pdf.pages:
+                    page_text = page.extract_text(layout=True)
+                    if page_text:
+                        text += page_text.strip()
+            text = self.check_char_limit(text)
+            return rag.generate_embedding(text.strip())
+        except Exception as e:
+            raise ValueError(f"An error occurred while reading the PDF: {e}")
+    def read_txt(self, file):
+        try:
+            text = file.read().decode("utf-8")
+            text = self.check_char_limit(text)
+            return rag.generate_embedding(text.strip())
+        except Exception as e:
+            raise ValueError(f"An error occurred while reading the TXT file: {e}")
+    def read_docx(self, file):
+        try:
+            doc = Document(file)
+            text = "\n".join(paragraph.text.strip() for paragraph in doc.paragraphs)
+            text = self.check_char_limit(text)
+            return rag.generate_embedding(text.strip())
+        except Exception as e:
+            raise ValueError(f"An error occurred while reading the DOCX file: {e}")
+    def read_csv(self, file):
+        try:
+            df = pd.read_csv(file)
+            text = df.to_string(index=False)
+            text = self.check_char_limit(text)
+            return rag.generate_embedding(text.strip())
+        except Exception as e:
+            raise ValueError(f"An error occurred while reading the CSV file: {e}")
+    def read_excel(self, file):
+        try:
+            all_text = []
+            workbook = load_workbook(filename=file)
+            for sheet_name in workbook.sheetnames:
+                sheet = workbook[sheet_name]
+                sheet_text = f"Sheet: {sheet_name}\n"
+                for row in sheet.iter_rows(values_only=True):
+                    row_text = " | ".join([str(cell) if cell is not None else "" for cell in row])
+                    sheet_text += row_text + "\n"
+                all_text.append(sheet_text)
+            text = "\n\n".join(all_text)
+            text = self.check_char_limit(text)
+            return rag.generate_embedding(text.strip())
+        except Exception as e:
+            raise ValueError(f"An error occurred while reading the Excel file: {e}")
+    def read_json(self, file):
+        try:
+            data = json.load(file)
+            text = json.dumps(data, indent=2)
+            text = self.check_char_limit(text)
+            return rag.generate_embedding(text.strip())
+        except Exception as e:
+            raise ValueError(f"An error occurred while reading the JSON file: {e}")
+    def handle_file(self, file):
+        filename = file.filename.lower()
+        if filename.endswith('.pdf'):
+            return self.read_pdf(file)
+        elif filename.endswith('.txt'):
+            return self.read_txt(file)
+        elif filename.endswith('.docx'):
+            return self.read_docx(file)
+        elif filename.endswith('.csv'):
+            return self.read_csv(file)
+        elif filename.endswith(('.xlsx', '.xls')):
+            return self.read_excel(file)
+        elif filename.endswith('.json'):
+            return self.read_json(file)
+        else:
+            raise ValueError(f"Unsupported file type: {filename}")
+    def process_file(self, file):
+        try:
+            if not self.allowed_file(file.filename):
+                return {"error": f"File type not allowed. Supported types: {', '.join(ALLOWED_EXTENSIONS)}"}, 400
+            return self.handle_file(file)
+        except Exception as e:
+            return {"error": f"Error processing file: {e}"}, 400
+file_handler = FileHandler()