vishalsh13 committed
Commit 9cb30e2 · 1 Parent(s): 975d1b2

update missing files

Dockerfile CHANGED
@@ -1,10 +1,11 @@
-# Base image with GPU support for Hugging Face Spaces
+# CUDA-enabled base image
 FROM nvidia/cuda:11.8.0-base-ubuntu20.04
 
 # Set environment variables
 ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
 
-# Install Python and dependencies
+# Install required system packages
 RUN apt-get update && apt-get install -y \
     python3 python3-pip git wget unzip && \
     rm -rf /var/lib/apt/lists/*
@@ -15,10 +16,14 @@ WORKDIR /app
 # Copy the application files
 COPY . .
 
+# Create directories with proper read/write permissions
+RUN mkdir -p /app/uploads/vectors && \
+    chmod -R 777 /app/uploads
+
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Expose the default port used by Spaces
+# Expose the Flask app's port
 EXPOSE 7860
 
 # Start the Flask app
app/utils/file_handler.py CHANGED
@@ -0,0 +1,29 @@
+import os
+import fitz  # PyMuPDF for PDF handling
+import pandas as pd
+from pptx import Presentation
+
+def extract_text_from_file(v_file_path):
+    """
+    Extracts text content from a given file (PDF, PPTX, CSV).
+    """
+    v_text = ""
+
+    if v_file_path.endswith('.pdf'):
+        obj_pdf = fitz.open(v_file_path)
+        for obj_page in obj_pdf:
+            v_text += obj_page.get_text()
+        obj_pdf.close()
+
+    elif v_file_path.endswith('.pptx'):
+        obj_ppt = Presentation(v_file_path)
+        for obj_slide in obj_ppt.slides:
+            for obj_shape in obj_slide.shapes:
+                if obj_shape.has_text_frame:
+                    v_text += obj_shape.text_frame.text + "\n"
+
+    elif v_file_path.endswith('.csv'):
+        v_data = pd.read_csv(v_file_path)
+        v_text += v_data.to_string()
+
+    return v_text
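
A minimal usage sketch of the new helper, assuming a hypothetical file under the upload folder created by the Dockerfile (unsupported extensions simply return an empty string):

    from app.utils.file_handler import extract_text_from_file

    # Hypothetical path; any .pdf, .pptx or .csv file is handled the same way.
    v_sample_path = "/app/uploads/example.pdf"
    v_text = extract_text_from_file(v_sample_path)
    print(f"Extracted {len(v_text)} characters from {v_sample_path}")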
app/utils/vector_db.py CHANGED
@@ -1,18 +1,53 @@
 import os
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from app.utils.file_handler import extract_text_from_file
 
-def process_files_to_vectors(v_folder_path):
-    # Logic to create or update the vector DB
+# Load a pre-trained embedding model with GPU support
+obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device="cuda")
+
+def process_files_to_vectors(v_folder_path, v_update=False, v_existing_index_path=None):
+    """
+    Processes files to create or update a FAISS vector database.
+
+    Args:
+        v_folder_path (str): Path to the folder containing input files.
+        v_update (bool): Whether to update an existing vector database.
+        v_existing_index_path (str): Path to the existing FAISS index file (if updating).
+
+    Returns:
+        str: Path to the folder containing the updated vector database.
+    """
     v_vector_folder = os.path.join(v_folder_path, 'vectors')
     os.makedirs(v_vector_folder, exist_ok=True)
 
-    # Placeholder: Iterate over files and create vector representations
+    # Initialize FAISS index
+    if v_update and v_existing_index_path:
+        v_index = faiss.read_index(v_existing_index_path)
+        with open(os.path.join(v_vector_folder, 'metadata.json'), 'r') as obj_meta:
+            import json
+            v_metadata = json.load(obj_meta)
+    else:
+        v_index = faiss.IndexFlatL2(384)  # Embedding dimensions = 384
+        v_metadata = {}
+
+    # Process files and update the vector database
     for v_root, _, v_files in os.walk(v_folder_path):
         for v_file in v_files:
             v_file_path = os.path.join(v_root, v_file)
-            if v_file.endswith(('.pdf', '.ppt', '.csv')):
-                # Process the files here (convert to vectors)
-                v_vector_file = os.path.join(v_vector_folder, v_file + '.vec')
-                with open(v_vector_file, 'w') as obj_out:
-                    obj_out.write(f'Vector representation of {v_file}')
+
+            if v_file.endswith(('.pdf', '.pptx', '.csv')):
+                v_text = extract_text_from_file(v_file_path)
+                v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
+                v_index.add(v_embeddings)
+                v_metadata[v_index.ntotal - 1] = v_file_path  # ntotal is the vector count; FAISS indexes do not support len()
+
+    # Save the updated index and metadata
+    v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
+    faiss.write_index(v_index, v_index_path)
+    with open(os.path.join(v_vector_folder, 'metadata.json'), 'w') as obj_meta:
+        import json
+        json.dump(v_metadata, obj_meta, indent=4)
 
     return v_vector_folder
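
A rough sketch of how the index written by process_files_to_vectors might be searched; the upload path, query string, and top-k value are illustrative, and the lookup relies on json.dump having stored the integer vector positions as string keys:

    import json
    import os

    import faiss

    from app.utils.vector_db import obj_embedding_model, process_files_to_vectors

    # Build (or rebuild) the index for a hypothetical upload folder.
    v_vector_folder = process_files_to_vectors("/app/uploads")

    # Load the FAISS index and the id -> file path mapping saved by the function.
    v_index = faiss.read_index(os.path.join(v_vector_folder, 'vector_index.faiss'))
    with open(os.path.join(v_vector_folder, 'metadata.json')) as obj_meta:
        v_metadata = json.load(obj_meta)

    # Embed an illustrative query and fetch the 3 nearest files.
    v_query = obj_embedding_model.encode(["example question"], convert_to_tensor=True).cpu().numpy()
    v_distances, v_ids = v_index.search(v_query, 3)
    for v_id in v_ids[0]:
        if v_id != -1:  # FAISS pads missing results with -1
            print(v_metadata[str(v_id)])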
requirements.txt CHANGED
@@ -1,9 +1,10 @@
 Flask==2.3.2
 transformers==4.27.4
 sentence-transformers==2.2.2
-torch==2.0.1
-torchvision==0.15.2
-numpy==1.24.2
+torch==2.0.1+cu118  # GPU-enabled PyTorch (CUDA 11.8 compatible)
+torchvision==0.15.2+cu118  # GPU-enabled TorchVision
+faiss-gpu==1.7.4  # GPU-enabled FAISS
 PyMuPDF==1.22.5
 python-pptx==0.6.21
 pandas==1.5.3
+numpy==1.24.2
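
Note that the pinned +cu118 builds of torch and torchvision are published on PyTorch's own package index rather than PyPI, so installing this requirements.txt as-is typically requires pointing pip at an extra index such as https://download.pytorch.org/whl/cu118 (for example via an --extra-index-url entry). Likewise, if the faiss-gpu==1.7.4 wheel is not available for the target platform, FAISS may need to come from conda or another wheel source.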