vishalsh13 committed
Commit 9cb30e2 · 1 Parent(s): 975d1b2

update missing files

Dockerfile CHANGED
@@ -1,10 +1,11 @@
-# Base image with GPU support for Hugging Face Spaces
+# CUDA-enabled base image
 FROM nvidia/cuda:11.8.0-base-ubuntu20.04
 
 # Set environment variables
 ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
 
-# Install Python and dependencies
+# Install required system packages
 RUN apt-get update && apt-get install -y \
     python3 python3-pip git wget unzip && \
     rm -rf /var/lib/apt/lists/*
@@ -15,10 +16,14 @@ WORKDIR /app
 # Copy the application files
 COPY . .
 
+# Create directories with proper read/write permissions
+RUN mkdir -p /app/uploads/vectors && \
+    chmod -R 777 /app/uploads
+
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Expose the default port used by Spaces
+# Expose the Flask app's port
 EXPOSE 7860
 
 # Start the Flask app
app/utils/file_handler.py CHANGED
@@ -0,0 +1,29 @@
+import os
+import fitz  # PyMuPDF for PDF handling
+import pandas as pd
+from pptx import Presentation
+
+def extract_text_from_file(v_file_path):
+    """
+    Extracts text content from a given file (PDF, PPTX, CSV).
+    """
+    v_text = ""
+
+    if v_file_path.endswith('.pdf'):
+        obj_pdf = fitz.open(v_file_path)
+        for obj_page in obj_pdf:
+            v_text += obj_page.get_text()
+        obj_pdf.close()
+
+    elif v_file_path.endswith('.pptx'):
+        obj_ppt = Presentation(v_file_path)
+        for obj_slide in obj_ppt.slides:
+            for obj_shape in obj_slide.shapes:
+                if obj_shape.has_text_frame:
+                    v_text += obj_shape.text_frame.text + "\n"
+
+    elif v_file_path.endswith('.csv'):
+        v_data = pd.read_csv(v_file_path)
+        v_text += v_data.to_string()
+
+    return v_text
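
A minimal usage sketch of the new helper, assuming a hypothetical file under the upload folder created by the Dockerfile (unsupported extensions simply return an empty string):

    from app.utils.file_handler import extract_text_from_file

    # Hypothetical path; any .pdf, .pptx or .csv file is handled the same way.
    v_sample_path = "/app/uploads/example.pdf"
    v_text = extract_text_from_file(v_sample_path)
    print(f"Extracted {len(v_text)} characters from {v_sample_path}")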
app/utils/vector_db.py CHANGED
@@ -1,18 +1,53 @@
 import os
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from app.utils.file_handler import extract_text_from_file
 
-def process_files_to_vectors(v_folder_path):
-    # Logic to create or update the vector DB
+# Load a pre-trained embedding model with GPU support
+obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device="cuda")
+
+def process_files_to_vectors(v_folder_path, v_update=False, v_existing_index_path=None):
+    """
+    Processes files to create or update a FAISS vector database.
+
+    Args:
+        v_folder_path (str): Path to the folder containing input files.
+        v_update (bool): Whether to update an existing vector database.
+        v_existing_index_path (str): Path to the existing FAISS index file (if updating).
+
+    Returns:
+        str: Path to the folder containing the updated vector database.
+    """
     v_vector_folder = os.path.join(v_folder_path, 'vectors')
     os.makedirs(v_vector_folder, exist_ok=True)
 
-    # Placeholder: Iterate over files and create vector representations
+    # Initialize FAISS index
+    if v_update and v_existing_index_path:
+        v_index = faiss.read_index(v_existing_index_path)
+        with open(os.path.join(v_vector_folder, 'metadata.json'), 'r') as obj_meta:
+            import json
+            v_metadata = json.load(obj_meta)
+    else:
+        v_index = faiss.IndexFlatL2(384)  # Embedding dimensions = 384
+        v_metadata = {}
+
+    # Process files and update the vector database
     for v_root, _, v_files in os.walk(v_folder_path):
         for v_file in v_files:
             v_file_path = os.path.join(v_root, v_file)
-            if v_file.endswith(('.pdf', '.ppt', '.csv')):
-                # Process the files here (convert to vectors)
-                v_vector_file = os.path.join(v_vector_folder, v_file + '.vec')
-                with open(v_vector_file, 'w') as obj_out:
-                    obj_out.write(f'Vector representation of {v_file}')
+
+            if v_file.endswith(('.pdf', '.pptx', '.csv')):
+                v_text = extract_text_from_file(v_file_path)
+                v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
+                v_index.add(v_embeddings)
+                v_metadata[v_index.ntotal - 1] = v_file_path  # ntotal is the vector count; FAISS indexes do not support len()
+
+    # Save the updated index and metadata
+    v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
+    faiss.write_index(v_index, v_index_path)
+    with open(os.path.join(v_vector_folder, 'metadata.json'), 'w') as obj_meta:
+        import json
+        json.dump(v_metadata, obj_meta, indent=4)
 
     return v_vector_folder
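
A rough sketch of how the index written by process_files_to_vectors might be searched; the upload path, query string, and top-k value are illustrative, and the lookup relies on json.dump having stored the integer vector positions as string keys:

    import json
    import os

    import faiss

    from app.utils.vector_db import obj_embedding_model, process_files_to_vectors

    # Build (or rebuild) the index for a hypothetical upload folder.
    v_vector_folder = process_files_to_vectors("/app/uploads")

    # Load the FAISS index and the id -> file path mapping saved by the function.
    v_index = faiss.read_index(os.path.join(v_vector_folder, 'vector_index.faiss'))
    with open(os.path.join(v_vector_folder, 'metadata.json')) as obj_meta:
        v_metadata = json.load(obj_meta)

    # Embed an illustrative query and fetch the 3 nearest files.
    v_query = obj_embedding_model.encode(["example question"], convert_to_tensor=True).cpu().numpy()
    v_distances, v_ids = v_index.search(v_query, 3)
    for v_id in v_ids[0]:
        if v_id != -1:  # FAISS pads missing results with -1
            print(v_metadata[str(v_id)])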
requirements.txt CHANGED
@@ -1,9 +1,10 @@
 Flask==2.3.2
 transformers==4.27.4
 sentence-transformers==2.2.2
-torch==2.0.1
-torchvision==0.15.2
-numpy==1.24.2
+torch==2.0.1+cu118  # GPU-enabled PyTorch (CUDA 11.8 compatible)
+torchvision==0.15.2+cu118  # GPU-enabled TorchVision
+faiss-gpu==1.7.4  # GPU-enabled FAISS
 PyMuPDF==1.22.5
 python-pptx==0.6.21
 pandas==1.5.3
+numpy==1.24.2
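
Note that the pinned +cu118 builds of torch and torchvision are published on PyTorch's own package index rather than PyPI, so installing this requirements.txt as-is typically requires pointing pip at an extra index such as https://download.pytorch.org/whl/cu118 (for example via an --extra-index-url entry). Likewise, if the faiss-gpu==1.7.4 wheel is not available for the target platform, FAISS may need to come from conda or another wheel source.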