vishalsh13 committed
Commit 4ad299d · 1 Parent(s): c7e2b56
Dockerfile CHANGED
@@ -15,7 +15,7 @@ RUN apt-get install -y \
     python3.11 python3.11-distutils python3.11-dev wget git unzip && \
     rm -rf /var/lib/apt/lists/*
 
-# Install pip using get-pip.py
+# Install pip using get-pip.py (bypasses any outdated or broken system pip)
 RUN wget https://bootstrap.pypa.io/get-pip.py && \
     python3.11 get-pip.py && \
     rm get-pip.py
@@ -25,12 +25,11 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
     update-alternatives --config python3
 
 # Set Hugging Face cache directory
-ENV TRANSFORMERS_CACHE=/app/cache
+ENV HF_HOME=/app/cache
 RUN mkdir -p /app/cache && chmod -R 777 /app/cache
 
 # Create directories with proper read/write permissions
-RUN mkdir -p /app/uploads/vectors && \
-    chmod -R 777 /app/uploads
+RUN mkdir -p /app/uploads/vectors && chmod -R 777 /app/uploads
 
 # Set the working directory
 WORKDIR /app
@@ -40,6 +39,7 @@ COPY . .
 
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
+RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder='/app/cache')"
 
 # Expose the Flask app's port
 EXPOSE 7860
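The new `RUN python3 -c "..."` step downloads the embedding model at build time so the container does not fetch it on the first request. A minimal sanity check of that baked-in cache, assuming the image is built as above (the snippet and its filename are illustrative, not part of this commit):

```python
# verify_cache.py -- hypothetical smoke test for the pre-downloaded model.
import os
from sentence_transformers import SentenceTransformer

# Matches ENV HF_HOME=/app/cache from the Dockerfile.
os.environ.setdefault("HF_HOME", "/app/cache")

# Reusing the same cache_folder as the build step should load the local
# weights instead of downloading them again.
obj_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2", cache_folder="/app/cache"
)
print(obj_model.encode(["cache smoke test"]).shape)  # expected: (1, 384)
```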
app/routes.py CHANGED
@@ -1,7 +1,8 @@
 import os
 from flask import Blueprint, render_template, request, send_file, jsonify
-from .utils.zip_handler import handle_zip_upload
 from .utils.vector_db import process_files_to_vectors
+from .utils.zip_handler import handle_zip_upload  # We'll create this utility
+import zipfile
 
 v_bp = Blueprint('routes', __name__)
 
@@ -9,21 +10,29 @@ v_bp = Blueprint('routes', __name__)
 def home():
     if request.method == 'POST':
         v_uploaded_file = request.files.get('file')
-        if v_uploaded_file and v_uploaded_file.filename.endswith('.zip'):
-            v_upload_path = os.path.join('app/uploads', v_uploaded_file.filename)
-            v_uploaded_file.save(v_upload_path)
-
-            # Process the zip file
-            v_output_path = handle_zip_upload(v_upload_path)
-
-            # Create or update vector database
-            v_result_path = process_files_to_vectors(v_output_path)
-
-            # Compress and send the result
-            result_zip = os.path.join('app/uploads/vectors/vector_db.zip')
-            os.system(f'zip -r {result_zip} {v_result_path}')
-            return send_file(result_zip, as_attachment=True)
-
-        return jsonify({'error': 'Please upload a valid zip file.'})
+        if not v_uploaded_file or not v_uploaded_file.filename.endswith('.zip'):
+            return jsonify({'error': 'Please upload a valid zip file.'}), 400
+
+        # Save uploaded ZIP
+        v_upload_path = os.path.join('app/uploads', v_uploaded_file.filename)
+        v_uploaded_file.save(v_upload_path)
+
+        # Extract the ZIP
+        v_extracted_folder = handle_zip_upload(v_upload_path)
+
+        # Process to create or update vector DB
+        v_result_folder = process_files_to_vectors(v_extracted_folder)
+
+        # Zip the resulting vectors folder for download
+        v_result_zip_path = os.path.join('app/uploads', 'vector_db_result.zip')
+        obj_zip = zipfile.ZipFile(v_result_zip_path, 'w', zipfile.ZIP_DEFLATED)
+        for v_root, _, v_files in os.walk(v_result_folder):
+            for v_file in v_files:
+                v_full_path = os.path.join(v_root, v_file)
+                v_arcname = os.path.relpath(v_full_path, start=v_result_folder)
+                obj_zip.write(v_full_path, arcname=v_arcname)
+        obj_zip.close()
+
+        return send_file(v_result_zip_path, as_attachment=True)
 
     return render_template('index.html')
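With the rewritten handler, a failed validation now returns a 400 JSON error and a successful upload streams back a ZIP of the vectors folder. A hedged client-side sketch (the host, file names, and content type are assumptions; port 7860 matches the Dockerfile):

```python
# Hypothetical client for the upload route; assumes the app is running
# locally on port 7860 and that data.zip exists in the current directory.
import requests

with open("data.zip", "rb") as f_zip:
    resp = requests.post(
        "http://localhost:7860/",
        files={"file": ("data.zip", f_zip, "application/zip")},
    )

resp.raise_for_status()  # a non-ZIP upload would come back as a 400 JSON error
with open("vector_db_result.zip", "wb") as f_out:
    f_out.write(resp.content)  # the zipped vectors folder from send_file
```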
app/templates/index.html CHANGED
@@ -1,23 +1,52 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <meta charset="UTF-8" />
     <title>Vector DB Creator</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 40px;
+        }
+        header {
+            display: flex;
+            align-items: center;
+        }
+        header img {
+            margin-right: 20px;
+            width: 50px;
+        }
+        h1 {
+            margin: 0;
+        }
+        .content {
+            margin-top: 20px;
+        }
+        form {
+            margin-top: 20px;
+        }
+        label {
+            display: inline-block;
+            width: 120px;
+        }
+    </style>
 </head>
 <body>
     <header>
-        <img src="{{ url_for('static', filename='logo.png') }}" alt="Logo" style="float:left; width:50px;">
+        <img src="{{ url_for('static', filename='logo.png') }}" alt="Logo" />
         <h1>Change your PDF, PPT, and CSV data to Vector DB</h1>
     </header>
-    <main>
-        <p>Upload your data files or an existing vector database to create or update a vector DB.</p>
+    <div class="content">
+        <p>This application allows you to upload a .zip containing your data files (PDF, PPTX, or CSV)
+           and convert them into a vector database. Then you can download the processed vector DB as a .zip.</p>
+
     <form action="/" method="POST" enctype="multipart/form-data">
         <label for="file">Upload ZIP File:</label>
-        <input type="file" name="file" id="file" accept=".zip" required>
-        <button type="submit">Upload</button>
+        <input type="file" name="file" id="file" accept=".zip" required />
+        <button type="submit">Upload & Convert</button>
     </form>
-    <p>Your vector DB will be available for download after processing.</p>
-    </main>
+
+        <p>After processing, you will be prompted to download the resulting vector database.</p>
+    </div>
 </body>
 </html>
app/utils/file_handler.py CHANGED
@@ -1,28 +1,31 @@
 import os
-import fitz  # PyMuPDF for PDF handling
+import fitz  # PyMuPDF
 import pandas as pd
 from pptx import Presentation
 
 def extract_text_from_file(v_file_path):
     """
-    Extracts text content from a given file (PDF, PPT, CSV).
+    Extracts text from PDF, PPTX, or CSV files.
     """
     v_text = ""
 
-    if v_file_path.endswith('.pdf'):
+    # PDF
+    if v_file_path.lower().endswith('.pdf'):
         obj_pdf = fitz.open(v_file_path)
         for obj_page in obj_pdf:
             v_text += obj_page.get_text()
         obj_pdf.close()
 
-    elif v_file_path.endswith('.pptx'):
+    # PPTX
+    elif v_file_path.lower().endswith('.pptx'):
         obj_ppt = Presentation(v_file_path)
         for obj_slide in obj_ppt.slides:
            for obj_shape in obj_slide.shapes:
                if obj_shape.has_text_frame:
                    v_text += obj_shape.text_frame.text + "\n"
 
-    elif v_file_path.endswith('.csv'):
+    # CSV
+    elif v_file_path.lower().endswith('.csv'):
         v_data = pd.read_csv(v_file_path)
         v_text += v_data.to_string()
 
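A minimal usage sketch of the updated extractor (the sample paths are placeholders, and it assumes the function ends by returning `v_text` as in the full file):

```python
# Hypothetical quick check of extract_text_from_file on a few local files.
from app.utils.file_handler import extract_text_from_file

for v_path in ("sample.pdf", "slides.pptx", "table.csv"):
    v_text = extract_text_from_file(v_path)
    print(v_path, "->", len(v_text), "characters extracted")
```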
app/utils/vector_db.py CHANGED
@@ -3,54 +3,54 @@ import faiss
 import numpy as np
 import torch
 from sentence_transformers import SentenceTransformer
-from app.utils.file_handler import extract_text_from_file
+from .file_handler import extract_text_from_file
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-#obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
-# Load a pre-trained embedding model with GPU support
-obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device="cuda")
+# Determine if GPU is available
+v_device = "cuda" if torch.cuda.is_available() else "cpu"
+obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=v_device)
 
-def process_files_to_vectors(v_folder_path, v_update=False, v_existing_index_path=None):
+def process_files_to_vectors(v_folder_path):
     """
-    Processes files to create or update a FAISS vector database.
-
-    Args:
-        v_folder_path (str): Path to the folder containing input files.
-        v_update (bool): Whether to update an existing vector database.
-        v_existing_index_path (str): Path to the existing FAISS index file (if updating).
-
-    Returns:
-        str: Path to the folder containing the updated vector database.
+    Processes files (PDF, PPTX, CSV) to create/update a FAISS vector database.
+    Returns the path to the folder containing the FAISS index and metadata.
     """
     v_vector_folder = os.path.join(v_folder_path, 'vectors')
     os.makedirs(v_vector_folder, exist_ok=True)
 
-    # Initialize FAISS index
-    if v_update and v_existing_index_path:
-        v_index = faiss.read_index(v_existing_index_path)
-        with open(os.path.join(v_vector_folder, 'metadata.json'), 'r') as obj_meta:
-            import json
-            v_metadata = json.load(obj_meta)
-    else:
-        v_index = faiss.IndexFlatL2(384)  # Embedding dimensions = 384
-        v_metadata = {}
-
-    # Process files and update the vector database
+    # Create a brand-new FAISS index
+    # For sentence-transformers/all-MiniLM-L6-v2, embedding dimension is 384
+    v_index = faiss.IndexFlatL2(384)
+    v_metadata = {}
+
+    # Iterate over extracted files
+    v_doc_counter = 0
     for v_root, _, v_files in os.walk(v_folder_path):
         for v_file in v_files:
             v_file_path = os.path.join(v_root, v_file)
 
-            if v_file.endswith(('.pdf', '.pptx', '.csv')):
+            if v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
                 v_text = extract_text_from_file(v_file_path)
+                if not v_text.strip():
+                    continue  # Skip empty content
+
+                # Encode text into embeddings
+                # convert_to_tensor=True yields a PyTorch tensor, so convert to numpy
                 v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
+
+                # Add to FAISS index
                 v_index.add(v_embeddings)
-                v_metadata[len(v_index) - 1] = v_file_path
 
-    # Save the updated index and metadata
+                # Map index ID to filename
+                v_metadata[v_doc_counter] = v_file_path
+                v_doc_counter += 1
+
+    # Save FAISS index
     v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
     faiss.write_index(v_index, v_index_path)
-    with open(os.path.join(v_vector_folder, 'metadata.json'), 'w') as obj_meta:
-        import json
+
+    # Save metadata
+    import json
+    with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
         json.dump(v_metadata, obj_meta, indent=4)
 
     return v_vector_folder
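The function now always builds a fresh `IndexFlatL2` and writes `vector_index.faiss` plus a `metadata.json` keyed by a document counter. A hedged sketch of how the resulting artifacts could be queried (the paths, script name, and query string are placeholders; not part of this commit):

```python
# query_vectors.py -- hypothetical consumer of the saved index and metadata.
import json

import faiss
from sentence_transformers import SentenceTransformer

v_vector_folder = "app/uploads/my_data/vectors"  # placeholder path
v_index = faiss.read_index(f"{v_vector_folder}/vector_index.faiss")
with open(f"{v_vector_folder}/metadata.json", encoding="utf-8") as obj_meta:
    v_metadata = json.load(obj_meta)  # JSON keys come back as strings

obj_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
v_query = obj_model.encode(["example query text"])  # shape (1, 384), float32
v_distances, v_ids = v_index.search(v_query, 3)  # top-3 nearest documents
for v_id, v_dist in zip(v_ids[0], v_distances[0]):
    if v_id != -1:  # FAISS pads with -1 when fewer results exist
        print(v_metadata[str(int(v_id))], v_dist)
```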
app/utils/zip_handler.py CHANGED
@@ -2,7 +2,10 @@ import os
 import zipfile
 
 def handle_zip_upload(v_zip_path):
-    v_extract_path = os.path.splitext(v_zip_path)[0]
+    """
+    Extracts ZIP file contents into a subfolder of app/uploads.
+    """
+    v_extracted_path = os.path.splitext(v_zip_path)[0]
     with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
-        obj_zip.extractall(v_extract_path)
-    return v_extract_path
+        obj_zip.extractall(v_extracted_path)
+    return v_extracted_path
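For clarity, the extraction folder is simply the upload path with its `.zip` suffix stripped; for example (the path is illustrative):

```python
# Hypothetical call showing where handle_zip_upload extracts to.
from app.utils.zip_handler import handle_zip_upload

v_folder = handle_zip_upload("app/uploads/data.zip")
print(v_folder)  # "app/uploads/data" -- the ZIP contents live here
```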
requirements.txt CHANGED
@@ -1,11 +1,11 @@
-Flask
-transformers
-sentence-transformers
-torch
-torchvision
-faiss-cpu
-PyMuPDF
-python-pptx
-pandas
-numpy
-#huggingface-hub==0.15.1  # Ensure compatibility
+Flask==2.3.2
+transformers==4.31.0
+sentence-transformers==2.2.2
+torch==2.0.1
+torchvision==0.15.2
+faiss-cpu==1.7.2
+PyMuPDF==1.22.5
+python-pptx==0.6.21
+pandas==1.5.3
+numpy==1.24.2
+huggingface-hub==0.15.1
run.py CHANGED
@@ -1,8 +1,6 @@
 from app import create_app
-import os
-os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
-
 
 if __name__ == "__main__":
     obj_app = create_app()
-    obj_app.run(host='0.0.0.0', port=7860, debug=True)
+    # Run the Flask app on 0.0.0.0:7860 for Hugging Face Spaces or local Docker
+    obj_app.run(host='0.0.0.0', port=7860)