yasirme committed on
Commit
54155f3
·
1 Parent(s): 24055cd
Files changed (4) hide show
  1. app.py +10 -18
  2. src/index.html +1 -1
  3. utils/file_reader.py +100 -0
  4. utils/handle_file.py +23 -157
app.py CHANGED
@@ -11,30 +11,22 @@ app.config['MAX_CONTENT_LENGTH'] = 11 * 1024 * 1024
11
  def index():
12
  return send_file('src/index.html')
13
 
14
-
15
- @app.route('/upload', methods=['POST'])
16
  def upload():
17
  try:
18
- allowed_chars = request.args.get('allowed_size')
19
- print(allowed_chars)
20
- if 'file' not in request.files and 'files' not in request.files:
21
  return jsonify({"error": "No files uploaded"}), 400
22
- if 'files' in request.files:
23
- files = request.files.getlist('files')
24
- else:
25
- files = request.files.getlist('file')
26
- if not files or not files[0].filename:
27
- return jsonify({"error": "No files selected"}), 400
28
- if len(files) == 1:
29
- return file_handler.process_file(files[0], allowed_chars)
30
- else:
31
- return file_handler.process_files(files, allowed_chars)
32
-
33
  except Exception as e:
34
- return jsonify({"error": f"An error occurred: {e}"}), 500
 
35
 
36
 
37
- @app.route('/embedding')
38
  def embedding():
39
  return rag.generate_embedding(
40
  text=request.json.get("text"),
 
11
  def index():
12
  return send_file('src/index.html')
13
 
14
@app.route('/upload', methods=["POST"])
def upload():
    """Handle a multipart upload and hand the files to ``file_handler``.

    Reads the uploaded files from the ``files`` form field and the character
    budget from the ``allowed_chars`` query parameter, then returns whatever
    ``file_handler.handle_files`` returns.

    Returns:
        A 400 JSON error when the ``files`` field is absent, a 500 JSON error
        when anything raises, otherwise the handler's own response.
    """
    try:
        if 'files' in request.files:
            return file_handler.handle_files(
                files=request.files.getlist('files'),
                allowed_chars=request.args.get("allowed_chars"),
            )
        return jsonify({"error": "No files uploaded"}), 400
    except Exception as e:
        # Top-level boundary: report the failure to the client instead of
        # letting the request crash.
        return jsonify({"error": f"An error occurred: {e} "}), 500
26
+
27
 
28
 
29
+ @app.route('/embedding', methods=['POST'])
30
  def embedding():
31
  return rag.generate_embedding(
32
  text=request.json.get("text"),
src/index.html CHANGED
@@ -7,7 +7,7 @@
7
  </head>
8
 
9
  <body>
10
- <h1>API key is not set</h1>
11
  <h1>Clone this space and use your own gemini api key</h1>
12
  </body>
13
  </html>
 
7
  </head>
8
 
9
  <body>
10
+ <h1>API key is not configured</h1>
11
  <h1>Clone this space and use your own gemini api key</h1>
12
  </body>
13
  </html>
utils/file_reader.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import pdfplumber
3
+ import pandas as pd
4
+ import json
5
+ from docx import Document
6
+ from openpyxl import load_workbook
7
+ import re
8
+ import uuid
9
+
10
+
11
class FileReader:
    """Extract cleaned plain text from uploaded files and enforce size limits.

    Each reader takes a file-like upload object (something exposing
    ``filename`` and ``read()``) and returns its text after ``_clean_text``.
    """

    def __init__(self):
        # Extensions accepted by calc_chars (compared lower-cased).
        self.allowed_files = ["txt", "pdf", "docx", "md", "json", "csv", "xlsx", "xls"]
        # Upper bound on the cleaned text length of any single file.
        self.max_chars_per_file = 5000000

    def calc_chars(self, files, allowed_chars):
        """Read every file, clean its text and total the character counts.

        Args:
            files: Iterable of upload objects exposing ``filename`` and
                ``read()``.
            allowed_chars: Maximum total number of cleaned characters across
                all files; anything ``int()`` accepts (e.g. a query-string
                value).

        Returns:
            A ``(body, status)`` tuple. On success ``body`` holds
            ``total_chars`` and ``clean_contents`` (one dict per file with
            ``type``, ``content``, ``name``, ``id`` and ``total_chars``) and
            ``status`` is 200. Otherwise ``body`` is ``{"error": ...}`` with
            status 400 (bad budget, unsupported type, limit exceeded) or 500
            (the file could not be read).
        """
        # Validate the budget up front so a missing or non-numeric value is
        # reported as a client error rather than as a file-read failure.
        try:
            char_budget = int(allowed_chars)
        except (TypeError, ValueError):
            return {"error": "allowed_chars must be an integer"}, 400

        readers = {
            "txt": self._read_txt,
            "md": self._read_txt,
            "pdf": self._read_pdf,
            "docx": self._read_docx,
            "json": self._read_json,
            "csv": self._read_csv,
            "xlsx": self._read_excel,
            "xls": self._read_excel,
        }

        total_chars = 0
        clean_contents = []
        for file in files:
            # A name without a dot has no extension; do not mistake a file
            # literally called "pdf" for a PDF. A missing filename is treated
            # the same way.
            _, dot, suffix = (file.filename or "").rpartition(".")
            file_extension = suffix.lower() if dot else ""
            if file_extension not in self.allowed_files:
                return {"error": "unsupported file type uploaded"}, 400

            try:
                text = readers[file_extension](file)
            except Exception as e:
                return {"error": f"Error reading file {file.filename}: {e}"}, 500

            if len(text) > self.max_chars_per_file:
                return {"error": "max 5 million characters per file allowed."}, 400

            clean_contents.append({
                "type": file_extension,
                "content": text,
                "name": file.filename,
                "id": str(uuid.uuid4()),
                "total_chars": len(text),
            })
            total_chars += len(text)
            if total_chars > char_budget:
                return {"error": "Total allowed characters limit reached"}, 400

        return {"total_chars": total_chars, "clean_contents": clean_contents}, 200

    def _read_txt(self, file):
        """Decode the upload as UTF-8 text (used for ``txt`` and ``md``)."""
        return self._clean_text(file.read().decode("utf-8"))

    def _read_pdf(self, file):
        """Concatenate the extracted text of every PDF page."""
        with pdfplumber.open(file) as pdf:
            # extract_text() may yield None for a page; treat that as empty.
            text = "".join(page.extract_text() or "" for page in pdf.pages)
        return self._clean_text(text)

    def _read_docx(self, file):
        """Join the text of every paragraph in a Word document."""
        doc = Document(file)
        text = "".join(para.text + "\n" for para in doc.paragraphs)
        return self._clean_text(text)

    def _read_json(self, file):
        """Parse the upload as JSON and re-serialise it to a single string."""
        content = json.load(file)
        return self._clean_text(json.dumps(content, ensure_ascii=False))

    def _read_csv(self, file):
        """Render the CSV as a pandas table string without the index."""
        df = pd.read_csv(file)
        return self._clean_text(df.to_string(index=False))

    def _read_excel(self, file):
        """Flatten every sheet into ``cell | cell`` rows, one row per line."""
        # NOTE(review): openpyxl documents support for OOXML workbooks only;
        # a legacy binary ".xls" upload is expected to fail here and surface
        # as a read error. Confirm, then drop "xls" or add a dedicated reader.
        wb = load_workbook(file)
        rows = []
        for sheet in wb.sheetnames:
            for row in wb[sheet].iter_rows(values_only=True):
                rows.append(
                    " | ".join(str(cell) if cell is not None else "" for cell in row) + "\n"
                )
        return self._clean_text("".join(rows))

    def _clean_text(self, text):
        """Collapse whitespace runs, drop non-ASCII characters and trim."""
        text = re.sub(r'\s+', ' ', text)
        # Everything outside 7-bit ASCII is removed, not transliterated.
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        return text.strip()


file_reader = FileReader()
100
+
utils/handle_file.py CHANGED
@@ -1,166 +1,32 @@
1
- import io
2
- import pdfplumber
3
- import pandas as pd
4
- import json
5
- from docx import Document
6
  from rag.RAG import rag
7
- from openpyxl import load_workbook
8
 
9
- ALLOWED_EXTENSIONS = {'pdf', 'txt', 'docx', 'csv', 'xlsx', 'xls', 'json'}
10
- MAX_CHARS_PER_FILE = 5000000 # 5 million characters per file limit
11
-
12
- class FileHandler:
13
  def __init__(self):
14
- self.file_handlers = {
15
- 'pdf': self._read_pdf,
16
- 'txt': self._read_txt,
17
- 'docx': self._read_docx,
18
- 'csv': self._read_csv,
19
- 'xlsx': self._read_excel,
20
- 'xls': self._read_excel,
21
- 'json': self._read_json
22
- }
23
-
24
- def _validate_params(self, allowed_chars):
25
- if not allowed_chars:
26
- return None
27
-
28
- try:
29
- return int(allowed_chars)
30
- except ValueError:
31
- raise ValueError("allowed_size parameter must be an integer")
32
-
33
- def _validate_file(self, file):
34
- if not file or file.filename == '':
35
- raise ValueError("No file selected")
36
-
37
- extension = file.filename.rsplit('.', 1)[1].lower() if '.' in file.filename else ''
38
- if extension not in ALLOWED_EXTENSIONS:
39
- raise ValueError(f"File type not allowed. Supported types: {', '.join(ALLOWED_EXTENSIONS)}")
40
- return extension
41
-
42
- def _check_char_limit(self, text):
43
- if len(text.strip()) > MAX_CHARS_PER_FILE:
44
- raise ValueError(f"File exceeds the maximum character limit of {MAX_CHARS_PER_FILE} characters")
45
- return text.strip()
46
-
47
- def _read_pdf(self, file):
48
- try:
49
- text = ""
50
- with pdfplumber.open(file) as pdf:
51
- for page in pdf.pages:
52
- page_text = page.extract_text(layout=True)
53
- if page_text:
54
- text += page_text.strip()
55
- return self._check_char_limit(text)
56
- except Exception as e:
57
- raise ValueError(f"Error reading PDF: {e}")
58
-
59
- def _read_txt(self, file):
60
- try:
61
- text = file.read().decode("utf-8")
62
- return self._check_char_limit(text)
63
- except Exception as e:
64
- raise ValueError(f"Error reading TXT: {e}")
65
-
66
- def _read_docx(self, file):
67
- try:
68
- doc = Document(file)
69
- text = "\n".join(paragraph.text.strip() for paragraph in doc.paragraphs)
70
- return self._check_char_limit(text)
71
- except Exception as e:
72
- raise ValueError(f"Error reading DOCX: {e}")
73
 
74
- def _read_csv(self, file):
75
  try:
76
- df = pd.read_csv(file)
77
- text = df.to_string(index=False)
78
- return self._check_char_limit(text)
79
- except Exception as e:
80
- raise ValueError(f"Error reading CSV: {e}")
81
-
82
- def _read_excel(self, file):
83
- try:
84
- all_text = []
85
- workbook = load_workbook(filename=file)
86
-
87
- for sheet_name in workbook.sheetnames:
88
- sheet = workbook[sheet_name]
89
- sheet_text = f"Sheet: {sheet_name}\n"
90
-
91
- for row in sheet.iter_rows(values_only=True):
92
- row_text = " | ".join([str(cell) if cell is not None else "" for cell in row])
93
- sheet_text += row_text + "\n"
94
-
95
- all_text.append(sheet_text)
96
-
97
- text = "\n\n".join(all_text)
98
- return self._check_char_limit(text)
99
- except Exception as e:
100
- raise ValueError(f"Error reading Excel: {e}")
101
-
102
- def _read_json(self, file):
103
- try:
104
- data = json.load(file)
105
- text = json.dumps(data, indent=2)
106
- return self._check_char_limit(text)
107
- except Exception as e:
108
- raise ValueError(f"Error reading JSON: {e}")
109
-
110
- def read_file(self, file):
111
- extension = self._validate_file(file)
112
- return self.file_handlers[extension](file)
113
-
114
- def process_file(self, file, allowed_chars):
115
- try:
116
- allowed_limit = self._validate_params(allowed_chars)
117
- content = self.read_file(file)
118
 
119
- if len(content) > allowed_limit:
120
- return {"error": f"Character count ({len(content)}) exceeds the allowed limit ({allowed_limit})"}, 400
121
- return rag.generate_embedding(content)
122
-
123
- except ValueError as e:
124
- return {"error": str(e)}, 400
125
- except Exception as e:
126
- return {"error": f"Unexpected error: {e}"}, 500
127
-
128
- def process_files(self, files, allowed_chars):
129
- try:
130
- allowed_limit = self._validate_params(allowed_chars)
131
- file_contents = []
132
- total_chars = 0
133
- for file in files:
134
- try:
135
- content = self.read_file(file)
136
- file_contents.append((file.filename, content))
137
- total_chars += len(content)
138
- except ValueError as e:
139
- return {"error": f"Error with file '{file.filename}': {str(e)}"}, 400
140
- if total_chars > allowed_limit:
141
- return {"error": f"Total character count ({total_chars}) exceeds the allowed limit ({allowed_limit})"}, 400
142
-
143
- results = []
144
- for filename, content in file_contents:
145
- embedding_result, status_code = rag.generate_embedding(content)
146
- if status_code != 200:
147
- return embedding_result, status_code
148
 
149
- results.append({
150
- "filename": filename,
151
- "char_count": len(content),
152
- "embeddings": embedding_result
153
- })
154
-
155
- return {
156
- "total_char_count": total_chars,
157
- "file_count": len(files),
158
- "results": results
159
- }, 200
160
-
161
- except ValueError as e:
162
- return {"error": str(e)}, 400
163
  except Exception as e:
164
- return {"error": f"Unexpected error: {e}"}, 500
 
165
 
166
- file_handler = FileHandler()
 
1
+ from utils.file_reader import file_reader
 
 
 
 
2
  from rag.RAG import rag
 
3
 
4
class HandleFiles:
    """Turn uploaded files into per-file embedding records."""

    def __init__(self):
        """No state is kept; the work is done by ``file_reader`` and ``rag``."""

    def handle_files(self, files, allowed_chars):
        """Read the uploads, embed each file's text and collect the results.

        Args:
            files: Uploaded file objects, passed straight to
                ``file_reader.calc_chars``.
            allowed_chars: Total character budget, passed straight to
                ``file_reader.calc_chars``.

        Returns:
            On success, a dict keyed by each file's id whose values hold
            ``name``, ``type``, ``total_chars``, ``embedding`` and ``chunks``
            (returned without a status code). If reading or embedding reports
            a non-200 status, that ``(body, status)`` pair is returned as-is.
            Any exception yields ``({"error": ...}, 500)``.
        """
        try:
            payload, status = file_reader.calc_chars(files, allowed_chars)
            if status != 200:
                return payload, status

            records = {}
            for entry in payload['clean_contents']:
                vectors, status = rag.generate_embedding(entry['content'])
                # Stop at the first file that fails to embed.
                if status != 200:
                    return vectors, status

                records[str(entry['id'])] = {
                    "name": entry['name'],
                    "type": entry['type'],
                    "total_chars": entry['total_chars'],
                    "embedding": vectors['embeddings'],
                    "chunks": vectors['chunks'],
                }
            return records
        except Exception as err:
            return {"error": f"an error occured: {err}"}, 500


file_handler = HandleFiles()