vishalsh13 committed
Commit 4ad299d · 1 Parent(s): c7e2b56
Dockerfile CHANGED
@@ -15,7 +15,7 @@ RUN apt-get install -y \
     python3.11 python3.11-distutils python3.11-dev wget git unzip && \
     rm -rf /var/lib/apt/lists/*
 
-# Install pip using get-pip.py
+# Install pip using get-pip.py (bypasses any outdated or broken system pip)
 RUN wget https://bootstrap.pypa.io/get-pip.py && \
     python3.11 get-pip.py && \
     rm get-pip.py
@@ -25,12 +25,11 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
     update-alternatives --config python3
 
 # Set Hugging Face cache directory
-ENV TRANSFORMERS_CACHE=/app/cache
+ENV HF_HOME=/app/cache
 RUN mkdir -p /app/cache && chmod -R 777 /app/cache
 
 # Create directories with proper read/write permissions
-RUN mkdir -p /app/uploads/vectors && \
-    chmod -R 777 /app/uploads
+RUN mkdir -p /app/uploads/vectors && chmod -R 777 /app/uploads
 
 # Set the working directory
 WORKDIR /app
@@ -40,6 +39,7 @@ COPY . .
 
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
+RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder='/app/cache')"
 
 # Expose the Flask app's port
 EXPOSE 7860
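The new `RUN python3 -c "..."` step downloads the embedding model at build time so the container does not fetch it on the first request. A minimal sanity check of that baked-in cache, assuming the image is built as above (the snippet and its filename are illustrative, not part of this commit):

```python
# verify_cache.py -- hypothetical smoke test for the pre-downloaded model.
import os
from sentence_transformers import SentenceTransformer

# Matches ENV HF_HOME=/app/cache from the Dockerfile.
os.environ.setdefault("HF_HOME", "/app/cache")

# Reusing the same cache_folder as the build step should load the local
# weights instead of downloading them again.
obj_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2", cache_folder="/app/cache"
)
print(obj_model.encode(["cache smoke test"]).shape)  # expected: (1, 384)
```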
app/routes.py CHANGED
@@ -1,7 +1,8 @@
 import os
 from flask import Blueprint, render_template, request, send_file, jsonify
-from .utils.zip_handler import handle_zip_upload
 from .utils.vector_db import process_files_to_vectors
+from .utils.zip_handler import handle_zip_upload  # We'll create this utility
+import zipfile
 
 v_bp = Blueprint('routes', __name__)
 
@@ -9,21 +10,29 @@ v_bp = Blueprint('routes', __name__)
 def home():
     if request.method == 'POST':
         v_uploaded_file = request.files.get('file')
-        if v_uploaded_file and v_uploaded_file.filename.endswith('.zip'):
-            v_upload_path = os.path.join('app/uploads', v_uploaded_file.filename)
-            v_uploaded_file.save(v_upload_path)
-
-            # Process the zip file
-            v_output_path = handle_zip_upload(v_upload_path)
-
-            # Create or update vector database
-            v_result_path = process_files_to_vectors(v_output_path)
-
-            # Compress and send the result
-            result_zip = os.path.join('app/uploads/vectors/vector_db.zip')
-            os.system(f'zip -r {result_zip} {v_result_path}')
-            return send_file(result_zip, as_attachment=True)
-
-        return jsonify({'error': 'Please upload a valid zip file.'})
+        if not v_uploaded_file or not v_uploaded_file.filename.endswith('.zip'):
+            return jsonify({'error': 'Please upload a valid zip file.'}), 400
+
+        # Save uploaded ZIP
+        v_upload_path = os.path.join('app/uploads', v_uploaded_file.filename)
+        v_uploaded_file.save(v_upload_path)
+
+        # Extract the ZIP
+        v_extracted_folder = handle_zip_upload(v_upload_path)
+
+        # Process to create or update vector DB
+        v_result_folder = process_files_to_vectors(v_extracted_folder)
+
+        # Zip the resulting vectors folder for download
+        v_result_zip_path = os.path.join('app/uploads', 'vector_db_result.zip')
+        obj_zip = zipfile.ZipFile(v_result_zip_path, 'w', zipfile.ZIP_DEFLATED)
+        for v_root, _, v_files in os.walk(v_result_folder):
+            for v_file in v_files:
+                v_full_path = os.path.join(v_root, v_file)
+                v_arcname = os.path.relpath(v_full_path, start=v_result_folder)
+                obj_zip.write(v_full_path, arcname=v_arcname)
+        obj_zip.close()
+
+        return send_file(v_result_zip_path, as_attachment=True)
 
     return render_template('index.html')
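With the rewritten handler, a failed validation now returns a 400 JSON error and a successful upload streams back a ZIP of the vectors folder. A hedged client-side sketch (the host, file names, and content type are assumptions; port 7860 matches the Dockerfile):

```python
# Hypothetical client for the upload route; assumes the app is running
# locally on port 7860 and that data.zip exists in the current directory.
import requests

with open("data.zip", "rb") as f_zip:
    resp = requests.post(
        "http://localhost:7860/",
        files={"file": ("data.zip", f_zip, "application/zip")},
    )

resp.raise_for_status()  # a non-ZIP upload would come back as a 400 JSON error
with open("vector_db_result.zip", "wb") as f_out:
    f_out.write(resp.content)  # the zipped vectors folder from send_file
```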
app/templates/index.html CHANGED
@@ -1,23 +1,52 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <meta charset="UTF-8" />
     <title>Vector DB Creator</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 40px;
+        }
+        header {
+            display: flex;
+            align-items: center;
+        }
+        header img {
+            margin-right: 20px;
+            width: 50px;
+        }
+        h1 {
+            margin: 0;
+        }
+        .content {
+            margin-top: 20px;
+        }
+        form {
+            margin-top: 20px;
+        }
+        label {
+            display: inline-block;
+            width: 120px;
+        }
+    </style>
 </head>
 <body>
     <header>
-        <img src="{{ url_for('static', filename='logo.png') }}" alt="Logo" style="float:left; width:50px;">
+        <img src="{{ url_for('static', filename='logo.png') }}" alt="Logo" />
         <h1>Change your PDF, PPT, and CSV data to Vector DB</h1>
     </header>
-    <main>
-        <p>Upload your data files or an existing vector database to create or update a vector DB.</p>
+    <div class="content">
+        <p>This application allows you to upload a .zip containing your data files (PDF, PPTX, or CSV)
+           and convert them into a vector database. Then you can download the processed vector DB as a .zip.</p>
+
     <form action="/" method="POST" enctype="multipart/form-data">
         <label for="file">Upload ZIP File:</label>
-        <input type="file" name="file" id="file" accept=".zip" required>
-        <button type="submit">Upload</button>
+        <input type="file" name="file" id="file" accept=".zip" required />
+        <button type="submit">Upload & Convert</button>
     </form>
-    <p>Your vector DB will be available for download after processing.</p>
-    </main>
+
+        <p>After processing, you will be prompted to download the resulting vector database.</p>
+    </div>
 </body>
 </html>
app/utils/file_handler.py CHANGED
@@ -1,28 +1,31 @@
 import os
-import fitz  # PyMuPDF for PDF handling
+import fitz  # PyMuPDF
 import pandas as pd
 from pptx import Presentation
 
 def extract_text_from_file(v_file_path):
     """
-    Extracts text content from a given file (PDF, PPT, CSV).
+    Extracts text from PDF, PPTX, or CSV files.
     """
     v_text = ""
 
-    if v_file_path.endswith('.pdf'):
+    # PDF
+    if v_file_path.lower().endswith('.pdf'):
         obj_pdf = fitz.open(v_file_path)
         for obj_page in obj_pdf:
             v_text += obj_page.get_text()
         obj_pdf.close()
 
-    elif v_file_path.endswith('.pptx'):
+    # PPTX
+    elif v_file_path.lower().endswith('.pptx'):
         obj_ppt = Presentation(v_file_path)
         for obj_slide in obj_ppt.slides:
            for obj_shape in obj_slide.shapes:
                if obj_shape.has_text_frame:
                    v_text += obj_shape.text_frame.text + "\n"
 
-    elif v_file_path.endswith('.csv'):
+    # CSV
+    elif v_file_path.lower().endswith('.csv'):
         v_data = pd.read_csv(v_file_path)
         v_text += v_data.to_string()
 
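A minimal usage sketch of the updated extractor (the sample paths are placeholders, and it assumes the function ends by returning `v_text` as in the full file):

```python
# Hypothetical quick check of extract_text_from_file on a few local files.
from app.utils.file_handler import extract_text_from_file

for v_path in ("sample.pdf", "slides.pptx", "table.csv"):
    v_text = extract_text_from_file(v_path)
    print(v_path, "->", len(v_text), "characters extracted")
```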
app/utils/vector_db.py CHANGED
@@ -3,54 +3,54 @@ import faiss
 import numpy as np
 import torch
 from sentence_transformers import SentenceTransformer
-from app.utils.file_handler import extract_text_from_file
+from .file_handler import extract_text_from_file
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-#obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
-# Load a pre-trained embedding model with GPU support
-obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device="cuda")
+# Determine if GPU is available
+v_device = "cuda" if torch.cuda.is_available() else "cpu"
+obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=v_device)
 
-def process_files_to_vectors(v_folder_path, v_update=False, v_existing_index_path=None):
+def process_files_to_vectors(v_folder_path):
     """
-    Processes files to create or update a FAISS vector database.
-
-    Args:
-        v_folder_path (str): Path to the folder containing input files.
-        v_update (bool): Whether to update an existing vector database.
-        v_existing_index_path (str): Path to the existing FAISS index file (if updating).
-
-    Returns:
-        str: Path to the folder containing the updated vector database.
+    Processes files (PDF, PPTX, CSV) to create/update a FAISS vector database.
+    Returns the path to the folder containing the FAISS index and metadata.
     """
     v_vector_folder = os.path.join(v_folder_path, 'vectors')
     os.makedirs(v_vector_folder, exist_ok=True)
 
-    # Initialize FAISS index
-    if v_update and v_existing_index_path:
-        v_index = faiss.read_index(v_existing_index_path)
-        with open(os.path.join(v_vector_folder, 'metadata.json'), 'r') as obj_meta:
-            import json
-            v_metadata = json.load(obj_meta)
-    else:
-        v_index = faiss.IndexFlatL2(384)  # Embedding dimensions = 384
-        v_metadata = {}
-
-    # Process files and update the vector database
+    # Create a brand-new FAISS index
+    # For sentence-transformers/all-MiniLM-L6-v2, embedding dimension is 384
+    v_index = faiss.IndexFlatL2(384)
+    v_metadata = {}
+
+    # Iterate over extracted files
+    v_doc_counter = 0
     for v_root, _, v_files in os.walk(v_folder_path):
         for v_file in v_files:
             v_file_path = os.path.join(v_root, v_file)
 
-            if v_file.endswith(('.pdf', '.pptx', '.csv')):
+            if v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
                 v_text = extract_text_from_file(v_file_path)
+                if not v_text.strip():
+                    continue  # Skip empty content
+
+                # Encode text into embeddings
+                # convert_to_tensor=True yields a PyTorch tensor, so convert to numpy
                 v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
+
+                # Add to FAISS index
                 v_index.add(v_embeddings)
-                v_metadata[len(v_index) - 1] = v_file_path
 
-    # Save the updated index and metadata
+                # Map index ID to filename
+                v_metadata[v_doc_counter] = v_file_path
+                v_doc_counter += 1
+
+    # Save FAISS index
     v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
     faiss.write_index(v_index, v_index_path)
-    with open(os.path.join(v_vector_folder, 'metadata.json'), 'w') as obj_meta:
-        import json
+
+    # Save metadata
+    import json
+    with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
         json.dump(v_metadata, obj_meta, indent=4)
 
     return v_vector_folder
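The function now always builds a fresh `IndexFlatL2` and writes `vector_index.faiss` plus a `metadata.json` keyed by a document counter. A hedged sketch of how the resulting artifacts could be queried (the paths, script name, and query string are placeholders; not part of this commit):

```python
# query_vectors.py -- hypothetical consumer of the saved index and metadata.
import json

import faiss
from sentence_transformers import SentenceTransformer

v_vector_folder = "app/uploads/my_data/vectors"  # placeholder path
v_index = faiss.read_index(f"{v_vector_folder}/vector_index.faiss")
with open(f"{v_vector_folder}/metadata.json", encoding="utf-8") as obj_meta:
    v_metadata = json.load(obj_meta)  # JSON keys come back as strings

obj_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
v_query = obj_model.encode(["example query text"])  # shape (1, 384), float32
v_distances, v_ids = v_index.search(v_query, 3)  # top-3 nearest documents
for v_id, v_dist in zip(v_ids[0], v_distances[0]):
    if v_id != -1:  # FAISS pads with -1 when fewer results exist
        print(v_metadata[str(int(v_id))], v_dist)
```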
app/utils/zip_handler.py CHANGED
@@ -2,7 +2,10 @@ import os
 import zipfile
 
 def handle_zip_upload(v_zip_path):
-    v_extract_path = os.path.splitext(v_zip_path)[0]
+    """
+    Extracts ZIP file contents into a subfolder of app/uploads.
+    """
+    v_extracted_path = os.path.splitext(v_zip_path)[0]
     with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
-        obj_zip.extractall(v_extract_path)
-    return v_extract_path
+        obj_zip.extractall(v_extracted_path)
+    return v_extracted_path
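For clarity, the extraction folder is simply the upload path with its `.zip` suffix stripped; for example (the path is illustrative):

```python
# Hypothetical call showing where handle_zip_upload extracts to.
from app.utils.zip_handler import handle_zip_upload

v_folder = handle_zip_upload("app/uploads/data.zip")
print(v_folder)  # "app/uploads/data" -- the ZIP contents live here
```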
requirements.txt CHANGED
@@ -1,11 +1,11 @@
-Flask
-transformers
-sentence-transformers
-torch
-torchvision
-faiss-cpu
-PyMuPDF
-python-pptx
-pandas
-numpy
-#huggingface-hub==0.15.1  # Ensure compatibility
+Flask==2.3.2
+transformers==4.31.0
+sentence-transformers==2.2.2
+torch==2.0.1
+torchvision==0.15.2
+faiss-cpu==1.7.2
+PyMuPDF==1.22.5
+python-pptx==0.6.21
+pandas==1.5.3
+numpy==1.24.2
+huggingface-hub==0.15.1
run.py CHANGED
@@ -1,8 +1,6 @@
 from app import create_app
-import os
-os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
-
 
 if __name__ == "__main__":
     obj_app = create_app()
-    obj_app.run(host='0.0.0.0', port=7860, debug=True)
+    # Run the Flask app on 0.0.0.0:7860 for Hugging Face Spaces or local Docker
+    obj_app.run(host='0.0.0.0', port=7860)