yasirme committed on
Commit
44870e3
·
1 Parent(s): dbc8b25
Files changed (6) hide show
  1. .gitignore +14 -0
  2. app.py +28 -0
  3. rag/RAG.py +49 -0
  4. requirements.txt +61 -0
  5. src/index.html +12 -0
  6. utils/handle_file.py +120 -0
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore Python bytecode
2
+ __pycache__/
3
+
4
+ # Ignore environment files
5
+ .env
6
+
7
+ # Ignore index files
8
+ .idx
9
+
10
+ # Ignore Visual Studio Code configuration
11
+ .vscode/
12
+
13
+ # Ignore virtual environment
14
+ .venv/
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from flask import Flask, send_file, request, jsonify
3
+ from utils.handle_file import file_handler
4
+ from werkzeug.utils import secure_filename
5
# Application object used by the route decorators in this module.
app = Flask(__name__)
# Upload size setting: 11 * 1024 * 1024 bytes (11 MiB).
app.config.update(MAX_CONTENT_LENGTH=11 * 1024 * 1024)
7
+
8
@app.route("/")
def index():
    """Serve the landing page stored at ``src/index.html``."""
    landing_page = 'src/index.html'
    return send_file(landing_page)
11
+
12
@app.route('/upload', methods=['POST'])
def upload():
    """Accept a multipart upload and hand the file to ``file_handler``.

    The uploaded file is expected under the form field ``file``.

    Returns:
        The result of ``file_handler.process_file`` on success; a JSON
        ``{"error": ...}`` body with status 400 when the field is missing or
        no file was selected; a JSON error body with status 500 on any other
        unexpected failure.

    Raises:
        werkzeug.exceptions.HTTPException: re-raised untouched so the
            framework can answer with the exception's own status code.
    """
    # Local import keeps this handler self-contained; werkzeug is already a
    # dependency of this module.
    from werkzeug.exceptions import HTTPException

    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400
        uploaded = request.files['file']
        if uploaded.filename == '':
            return jsonify({"error": "No selected file"}), 400
        return file_handler.process_file(uploaded)
    except HTTPException:
        # HTTP errors raised while parsing the request (for example a body
        # larger than MAX_CONTENT_LENGTH) must keep their own status code
        # rather than being reported as a generic 500.
        raise
    except Exception as e:
        return jsonify({"error": f"an error occured: {e}"}), 500
23
+
24
def main():
    """Run the Flask development server on all interfaces, port 7860.

    Debug mode is opt-in: it is enabled only when the ``FLASK_DEBUG``
    environment variable is set to ``1``, ``true``, ``yes`` or ``on``
    (case-insensitive). The server binds to ``0.0.0.0``, and the interactive
    debugger must not be reachable from other hosts by default.
    """
    debug_flag = os.getenv("FLASK_DEBUG", "").strip().lower()
    debug = debug_flag in {"1", "true", "yes", "on"}
    app.run(host='0.0.0.0', port=7860, debug=debug)


if __name__ == "__main__":
    main()
rag/RAG.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from google import genai
2
+ from google.genai import types
3
+ import numpy as np
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ import os
6
+ from dotenv import load_dotenv
7
+ load_dotenv()
8
+
9
# Module-wide google-genai client; the key comes from the "api_key"
# environment variable.
client = genai.Client(api_key=os.environ.get("api_key"))
10
+
11
class RAG:
    """Split text into overlapping chunks and embed them in batches."""

    def __init__(self):
        # Splitter settings. These now drive split_text(); the values match
        # what the splitter was actually using (512 / 75), so chunking
        # behaviour is unchanged.
        # NOTE(review): CHUNK_SIZE was declared as 800 but never used -
        # confirm 512 is the intended size.
        self.CHUNK_SIZE = 512
        self.CHUNK_OVERLAP = 75
        # Maximum number of chunks sent in a single embed_content request.
        self.MAX_BATCH_SIZE = 100
        self.MODEL = "text-embedding-004"
        self.TASK_TYPE = "SEMANTIC_SIMILARITY"

    def split_text(self, text):
        """Split ``text`` into chunks using the configured size and overlap.

        Args:
            text: The text to split.

        Returns:
            The list of chunks produced by the recursive character splitter.

        Raises:
            ValueError: If splitting fails; the original exception is chained.
        """
        try:
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.CHUNK_SIZE,
                chunk_overlap=self.CHUNK_OVERLAP,
                separators=["\n\n", "\n", ".", "!", "?", "。", " ", ""],
            )
            return splitter.split_text(text)
        except Exception as e:
            raise ValueError(f"an error occured: {e}") from e

    def generate_embedding(self, text, task_type=None):
        """Chunk ``text`` and embed every chunk.

        Args:
            text: The text to embed.
            task_type: Embedding task type; falls back to ``self.TASK_TYPE``
                when falsy.

        Returns:
            ``({"embeddings": [...], "chunks": [...]}, 200)`` on success, with
            one embedding per chunk in chunk order, or
            ``({"error": message}, 500)`` when splitting or embedding fails.
        """
        try:
            task_type = task_type or self.TASK_TYPE
            chunks = self.split_text(text)
            embeddings = []
            # Send the chunks in slices of at most MAX_BATCH_SIZE per request.
            for start in range(0, len(chunks), self.MAX_BATCH_SIZE):
                response = client.models.embed_content(
                    model=self.MODEL,
                    contents=chunks[start:start + self.MAX_BATCH_SIZE],
                    config=types.EmbedContentConfig(task_type=task_type),
                )
                embeddings.extend(item.values for item in response.embeddings)
            return {"embeddings": embeddings, "chunks": chunks}, 200
        except Exception as e:
            # Use the same "error" key as every other error payload in the
            # app so clients can detect failures uniformly.
            return {"error": f"An error occurred while generating embeddings: {e}"}, 500


rag = RAG()
requirements.txt ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.9.0
3
+ blinker==1.9.0
4
+ cachetools==5.5.2
5
+ certifi==2025.1.31
6
+ cffi==1.17.1
7
+ charset-normalizer==3.4.1
8
+ click==8.1.8
9
+ cryptography==44.0.2
10
+ et_xmlfile==2.0.0
11
+ Flask==3.1.0
12
+ google-auth==2.38.0
13
+ google-genai==1.10.0
14
+ greenlet==3.1.1
15
+ h11==0.14.0
16
+ httpcore==1.0.8
17
+ httpx==0.28.1
18
+ idna==3.10
19
+ itsdangerous==2.2.0
20
+ Jinja2==3.1.6
21
+ jsonpatch==1.33
22
+ jsonpointer==3.0.0
23
+ langchain==0.3.23
24
+ langchain-core==0.3.51
25
+ langchain-text-splitters==0.3.8
26
+ langsmith==0.3.30
27
+ lxml==5.3.2
28
+ MarkupSafe==3.0.2
29
+ numpy==2.2.4
30
+ openpyxl==3.1.5
31
+ orjson==3.10.16
32
+ packaging==24.2
33
+ pandas==2.2.3
34
+ pdfminer.six==20250327
35
+ pdfplumber==0.11.6
36
+ pillow==11.1.0
37
+ pyasn1==0.6.1
38
+ pyasn1_modules==0.4.2
39
+ pycparser==2.22
40
+ pydantic==2.11.3
41
+ pydantic_core==2.33.1
42
+ pypdfium2==4.30.1
43
+ python-dateutil==2.9.0.post0
44
+ python-docx==1.1.2
45
+ python-dotenv==1.1.0
46
+ pytz==2025.2
47
+ PyYAML==6.0.2
48
+ requests==2.32.3
49
+ requests-toolbelt==1.0.0
50
+ rsa==4.9
51
+ six==1.17.0
52
+ sniffio==1.3.1
53
+ SQLAlchemy==2.0.40
54
+ tenacity==9.1.2
55
+ typing-inspection==0.4.0
56
+ typing_extensions==4.13.2
57
+ tzdata==2025.2
58
+ urllib3==2.4.0
59
+ websockets==15.0.1
60
+ Werkzeug==3.1.3
61
+ zstandard==0.23.0
src/index.html ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Hello World</title>
7
+ </head>
8
+
9
+ <body>
10
+ <h1>Hello World</h1>
11
+ </body>
12
+ </html>
utils/handle_file.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import pdfplumber
3
+ import pandas as pd
4
+ import json
5
+ from docx import Document
6
+ from rag.RAG import rag
7
+ from openpyxl import load_workbook
8
+
9
ALLOWED_EXTENSIONS = {'pdf', 'txt', 'docx', 'csv', 'xlsx', 'xls', 'json'}
MAX_CHARS = 5000000


class FileHandler:
    """Extract text from an uploaded file and pass it to ``rag.generate_embedding``."""

    def __init__(self):
        pass

    def allowed_file(self, filename):
        """Return True when ``filename`` has an extension in ALLOWED_EXTENSIONS."""
        return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

    def check_char_limit(self, text):
        """Return ``text`` unchanged, or raise if it is longer than MAX_CHARS.

        The length is measured after stripping surrounding whitespace.

        Raises:
            ValueError: If the stripped text exceeds MAX_CHARS characters.
        """
        if len(text.strip()) > MAX_CHARS:
            raise ValueError(f"File exceeds the maximum character limit of {MAX_CHARS} characters")
        return text

    def _embed(self, text):
        """Enforce the character limit, then embed the stripped text."""
        text = self.check_char_limit(text)
        return rag.generate_embedding(text.strip())

    def read_pdf(self, file):
        """Extract the text of every page of a PDF and embed it.

        Raises:
            ValueError: If the PDF cannot be read or exceeds the size limit.
        """
        try:
            pages = []
            with pdfplumber.open(file) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text(layout=True)
                    if page_text and page_text.strip():
                        pages.append(page_text.strip())
            # Separate pages with a newline so the last word of one page does
            # not fuse with the first word of the next.
            return self._embed("\n".join(pages))
        except Exception as e:
            raise ValueError(f"An error occurred while reading the PDF: {e}") from e

    def read_txt(self, file):
        """Decode a UTF-8 text file and embed it.

        Raises:
            ValueError: If the file cannot be decoded or exceeds the size limit.
        """
        try:
            # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
            # which would otherwise end up inside the first chunk.
            text = file.read().decode("utf-8-sig")
            return self._embed(text)
        except Exception as e:
            raise ValueError(f"An error occurred while reading the TXT file: {e}") from e

    def read_docx(self, file):
        """Join the paragraphs of a DOCX document with newlines and embed them.

        Raises:
            ValueError: If the document cannot be read or exceeds the size limit.
        """
        try:
            doc = Document(file)
            text = "\n".join(paragraph.text.strip() for paragraph in doc.paragraphs)
            return self._embed(text)
        except Exception as e:
            raise ValueError(f"An error occurred while reading the DOCX file: {e}") from e

    def read_csv(self, file):
        """Render a CSV file as a table string (without the index) and embed it.

        Raises:
            ValueError: If the CSV cannot be parsed or exceeds the size limit.
        """
        try:
            df = pd.read_csv(file)
            return self._embed(df.to_string(index=False))
        except Exception as e:
            raise ValueError(f"An error occurred while reading the CSV file: {e}") from e

    def read_excel(self, file):
        """Render every sheet of a workbook as ``a | b | c`` rows and embed it.

        Each sheet starts with a ``Sheet: <name>`` line; empty cells become
        empty strings.

        Raises:
            ValueError: If the workbook cannot be read or exceeds the size limit.
        """
        # NOTE(review): '.xls' uploads are routed here as well - confirm that
        # openpyxl's load_workbook can open the legacy format.
        try:
            workbook = load_workbook(filename=file)
            sheets = []
            for sheet_name in workbook.sheetnames:
                lines = [f"Sheet: {sheet_name}"]
                for row in workbook[sheet_name].iter_rows(values_only=True):
                    lines.append(" | ".join("" if cell is None else str(cell) for cell in row))
                # Every line, including the last, is newline-terminated.
                sheets.append("\n".join(lines) + "\n")
            return self._embed("\n\n".join(sheets))
        except Exception as e:
            raise ValueError(f"An error occurred while reading the Excel file: {e}") from e

    def read_json(self, file):
        """Parse a JSON file, pretty-print it and embed the result.

        Raises:
            ValueError: If the JSON is invalid or exceeds the size limit.
        """
        try:
            data = json.load(file)
            # Keep non-ASCII characters as-is so the embedded text is the real
            # content rather than "\uXXXX" escape sequences.
            text = json.dumps(data, indent=2, ensure_ascii=False)
            return self._embed(text)
        except Exception as e:
            raise ValueError(f"An error occurred while reading the JSON file: {e}") from e

    def handle_file(self, file):
        """Dispatch ``file`` to the reader matching its filename suffix.

        Raises:
            ValueError: If no reader handles the suffix, or the reader fails.
        """
        filename = file.filename.lower()
        readers = (
            ('.pdf', self.read_pdf),
            ('.txt', self.read_txt),
            ('.docx', self.read_docx),
            ('.csv', self.read_csv),
            (('.xlsx', '.xls'), self.read_excel),
            ('.json', self.read_json),
        )
        for suffix, reader in readers:
            if filename.endswith(suffix):
                return reader(file)
        raise ValueError(f"Unsupported file type: {filename}")

    def process_file(self, file):
        """Validate the file type and process the upload.

        Returns:
            The reader's result on success, or ``({"error": message}, 400)``
            when the type is not allowed or processing raises.
        """
        try:
            if not self.allowed_file(file.filename):
                # Sort so the message is stable; set iteration order is not.
                supported = ', '.join(sorted(ALLOWED_EXTENSIONS))
                return {"error": f"File type not allowed. Supported types: {supported}"}, 400
            return self.handle_file(file)
        except Exception as e:
            return {"error": f"Error processing file: {e}"}, 400


file_handler = FileHandler()