Commit 4ad299d (parent: c7e2b56)

Files changed:
- Dockerfile +4 -4
- app/routes.py +22 -13
- app/templates/index.html +38 -9
- app/utils/file_handler.py +8 -5
- app/utils/vector_db.py +31 -31
- app/utils/zip_handler.py +6 -3
- requirements.txt +11 -11
- run.py +2 -4
Dockerfile
CHANGED
@@ -15,7 +15,7 @@ RUN apt-get install -y \
     python3.11 python3.11-distutils python3.11-dev wget git unzip && \
     rm -rf /var/lib/apt/lists/*
 
-# Install pip using get-pip.py
+# Install pip using get-pip.py (bypasses any outdated or broken system pip)
 RUN wget https://bootstrap.pypa.io/get-pip.py && \
     python3.11 get-pip.py && \
     rm get-pip.py
@@ -25,12 +25,11 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
     update-alternatives --config python3
 
 # Set Hugging Face cache directory
-ENV
+ENV HF_HOME=/app/cache
 RUN mkdir -p /app/cache && chmod -R 777 /app/cache
 
 # Create directories with proper read/write permissions
-RUN mkdir -p /app/uploads/vectors && \
-    chmod -R 777 /app/uploads
+RUN mkdir -p /app/uploads/vectors && chmod -R 777 /app/uploads
 
 # Set the working directory
 WORKDIR /app
@@ -40,6 +39,7 @@ COPY . .
 
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
+RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder='/app/cache')"
 
 # Expose the Flask app's port
 EXPOSE 7860
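The extra RUN step above pre-downloads the all-MiniLM-L6-v2 model at build time, so the container does not have to reach the Hugging Face Hub on first request. A minimal sketch of what that step effectively runs, which can also be used to verify the cache outside the image (the cache path is the Dockerfile's; substitute any writable local path when testing elsewhere):

# Sketch: pre-download / verify the embedding model in the build cache.
# /app/cache mirrors ENV HF_HOME in the Dockerfile above.
from sentence_transformers import SentenceTransformer

obj_model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    cache_folder='/app/cache',
)
print(obj_model.encode(["warm-up sentence"]).shape)  # expected: (1, 384)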
app/routes.py
CHANGED
@@ -1,7 +1,8 @@
 import os
 from flask import Blueprint, render_template, request, send_file, jsonify
-from .utils.zip_handler import handle_zip_upload
 from .utils.vector_db import process_files_to_vectors
+from .utils.zip_handler import handle_zip_upload  # We'll create this utility
+import zipfile
 
 v_bp = Blueprint('routes', __name__)
 
@@ -9,21 +10,29 @@ v_bp = Blueprint('routes', __name__)
 def home():
     if request.method == 'POST':
         v_uploaded_file = request.files.get('file')
-        if v_uploaded_file
-            v_uploaded_file.save(v_upload_path)
-        os.system(f'zip -r {result_zip} {v_result_path}')
-        return send_file(result_zip, as_attachment=True)
+        if not v_uploaded_file or not v_uploaded_file.filename.endswith('.zip'):
+            return jsonify({'error': 'Please upload a valid zip file.'}), 400
+
+        # Save uploaded ZIP
+        v_upload_path = os.path.join('app/uploads', v_uploaded_file.filename)
+        v_uploaded_file.save(v_upload_path)
+
+        # Extract the ZIP
+        v_extracted_folder = handle_zip_upload(v_upload_path)
+
+        # Process to create or update vector DB
+        v_result_folder = process_files_to_vectors(v_extracted_folder)
+
+        # Zip the resulting vectors folder for download
+        v_result_zip_path = os.path.join('app/uploads', 'vector_db_result.zip')
+        obj_zip = zipfile.ZipFile(v_result_zip_path, 'w', zipfile.ZIP_DEFLATED)
+        for v_root, _, v_files in os.walk(v_result_folder):
+            for v_file in v_files:
+                v_full_path = os.path.join(v_root, v_file)
+                v_arcname = os.path.relpath(v_full_path, start=v_result_folder)
+                obj_zip.write(v_full_path, arcname=v_arcname)
+        obj_zip.close()
+
+        return send_file(v_result_zip_path, as_attachment=True)
 
     return render_template('index.html')
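With this change the POST handler validates the upload, extracts it, builds the vector DB, and streams back vector_db_result.zip. A minimal client sketch for exercising the route; the local URL and the data.zip filename are assumptions for illustration, and it presumes the app is already running (e.g. via run.py or the Docker image):

# Sketch: upload a zip of documents and save the returned vector DB archive.
import requests

with open('data.zip', 'rb') as obj_file:
    v_response = requests.post(
        'http://localhost:7860/',
        files={'file': ('data.zip', obj_file, 'application/zip')},
    )

v_response.raise_for_status()
with open('vector_db_result.zip', 'wb') as obj_out:
    obj_out.write(v_response.content)  # contains vector_index.faiss and metadata.json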
app/templates/index.html
CHANGED
@@ -1,23 +1,52 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
-    <meta charset="UTF-8"
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <meta charset="UTF-8" />
     <title>Vector DB Creator</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 40px;
+        }
+        header {
+            display: flex;
+            align-items: center;
+        }
+        header img {
+            margin-right: 20px;
+            width: 50px;
+        }
+        h1 {
+            margin: 0;
+        }
+        .content {
+            margin-top: 20px;
+        }
+        form {
+            margin-top: 20px;
+        }
+        label {
+            display: inline-block;
+            width: 120px;
+        }
+    </style>
 </head>
 <body>
     <header>
-        <img src="{{ url_for('static', filename='logo.png') }}" alt="Logo"
+        <img src="{{ url_for('static', filename='logo.png') }}" alt="Logo" />
        <h1>Change your PDF, PPT, and CSV data to Vector DB</h1>
     </header>
+    <div class="content">
+        <p>This application allows you to upload a .zip containing your data files (PDF, PPTX, or CSV)
+        and convert them into a vector database. Then you can download the processed vector DB as a .zip.</p>
+
     <form action="/" method="POST" enctype="multipart/form-data">
         <label for="file">Upload ZIP File:</label>
-        <input type="file" name="file" id="file" accept=".zip" required
-        <button type="submit">Upload</button>
+        <input type="file" name="file" id="file" accept=".zip" required />
+        <button type="submit">Upload & Convert</button>
     </form>
+
+    <p>After processing, you will be prompted to download the resulting vector database.</p>
+    </div>
 </body>
 </html>
app/utils/file_handler.py
CHANGED
@@ -1,28 +1,31 @@
 import os
-import fitz  # PyMuPDF
+import fitz  # PyMuPDF
 import pandas as pd
 from pptx import Presentation
 
 def extract_text_from_file(v_file_path):
     """
-    Extracts text
+    Extracts text from PDF, PPTX, or CSV files.
     """
     v_text = ""
 
+    # PDF
+    if v_file_path.lower().endswith('.pdf'):
         obj_pdf = fitz.open(v_file_path)
         for obj_page in obj_pdf:
             v_text += obj_page.get_text()
         obj_pdf.close()
 
+    # PPTX
+    elif v_file_path.lower().endswith('.pptx'):
         obj_ppt = Presentation(v_file_path)
         for obj_slide in obj_ppt.slides:
             for obj_shape in obj_slide.shapes:
                 if obj_shape.has_text_frame:
                     v_text += obj_shape.text_frame.text + "\n"
 
+    # CSV
+    elif v_file_path.lower().endswith('.csv'):
         v_data = pd.read_csv(v_file_path)
         v_text += v_data.to_string()
 
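A short usage sketch for the extractor above. The sample CSV is generated on the fly; the import assumes the snippet runs from the project root, and the function is assumed to return the accumulated text (its return statement falls outside the hunk shown):

# Sketch: run a small CSV through extract_text_from_file.
import pandas as pd
from app.utils.file_handler import extract_text_from_file

pd.DataFrame({'name': ['a', 'b'], 'value': [1, 2]}).to_csv('sample.csv', index=False)
v_text = extract_text_from_file('sample.csv')
print(v_text)  # the CSV rendered via DataFrame.to_string()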
app/utils/vector_db.py
CHANGED
@@ -3,54 +3,54 @@ import faiss
 import numpy as np
 import torch
 from sentence_transformers import SentenceTransformer
-from
+from .file_handler import extract_text_from_file
 
-obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device="cuda")
+# Determine if GPU is available
+v_device = "cuda" if torch.cuda.is_available() else "cpu"
+obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=v_device)
 
-def process_files_to_vectors(v_folder_path
+def process_files_to_vectors(v_folder_path):
     """
-    Processes files to create
-
-    Args:
-        v_folder_path (str): Path to the folder containing input files.
-        v_update (bool): Whether to update an existing vector database.
-        v_existing_index_path (str): Path to the existing FAISS index file (if updating).
-
-    Returns:
-        str: Path to the folder containing the updated vector database.
+    Processes files (PDF, PPTX, CSV) to create/update a FAISS vector database.
+    Returns the path to the folder containing the FAISS index and metadata.
     """
     v_vector_folder = os.path.join(v_folder_path, 'vectors')
     os.makedirs(v_vector_folder, exist_ok=True)
 
-    v_index = faiss.IndexFlatL2(384)  # Embedding dimensions = 384
-    v_metadata = {}
-
-    # Process files and update the vector database
+    # Create a brand-new FAISS index
+    # For sentence-transformers/all-MiniLM-L6-v2, embedding dimension is 384
+    v_index = faiss.IndexFlatL2(384)
+    v_metadata = {}
+
+    # Iterate over extracted files
+    v_doc_counter = 0
     for v_root, _, v_files in os.walk(v_folder_path):
         for v_file in v_files:
             v_file_path = os.path.join(v_root, v_file)
 
-            if
-            v_metadata[len(v_index) - 1] = v_file_path
+            if v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
                 v_text = extract_text_from_file(v_file_path)
+                if not v_text.strip():
+                    continue  # Skip empty content
+
+                # Encode text into embeddings
+                # convert_to_tensor=True yields a PyTorch tensor, so convert to numpy
                 v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
+
+                # Add to FAISS index
                 v_index.add(v_embeddings)
+
+                # Map index ID to filename
+                v_metadata[v_doc_counter] = v_file_path
+                v_doc_counter += 1
 
+    # Save FAISS index
     v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
     faiss.write_index(v_index, v_index_path)
+
+    # Save metadata
+    import json
+    with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
         json.dump(v_metadata, obj_meta, indent=4)
 
     return v_vector_folder
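The function writes a flat L2 FAISS index plus a metadata.json mapping insertion order to source file paths. A minimal query sketch over that output; the folder path and query string are assumptions, and note that JSON serialization turns the integer metadata keys into strings:

# Sketch: load the generated index and metadata, then run a nearest-neighbour search.
import json
import os
import faiss
from sentence_transformers import SentenceTransformer

v_vector_folder = 'app/uploads/data/vectors'  # hypothetical extraction output
v_index = faiss.read_index(os.path.join(v_vector_folder, 'vector_index.faiss'))
with open(os.path.join(v_vector_folder, 'metadata.json'), encoding='utf-8') as obj_meta:
    v_metadata = json.load(obj_meta)

obj_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
v_query = obj_model.encode(['quarterly revenue figures'])  # shape (1, 384), float32
v_distances, v_ids = v_index.search(v_query, 3)
for v_rank, v_id in enumerate(v_ids[0]):
    if v_id == -1:
        continue  # fewer than 3 vectors in the index
    print(v_rank, float(v_distances[0][v_rank]), v_metadata[str(v_id)])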
app/utils/zip_handler.py
CHANGED
@@ -2,7 +2,10 @@ import os
 import zipfile
 
 def handle_zip_upload(v_zip_path):
+    """
+    Extracts ZIP file contents into a subfolder of app/uploads.
+    """
+    v_extracted_path = os.path.splitext(v_zip_path)[0]
     with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
-        obj_zip.extractall(
-    return
+        obj_zip.extractall(v_extracted_path)
+    return v_extracted_path
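A standalone sketch of the extraction behaviour, showing that the archive is unpacked into a folder named after the zip itself (paths here are illustrative temporaries):

# Sketch: build a throwaway zip and extract it the same way handle_zip_upload does.
import os
import zipfile

v_zip_path = '/tmp/demo_upload.zip'  # hypothetical path
with zipfile.ZipFile(v_zip_path, 'w') as obj_zip:
    obj_zip.writestr('docs/readme.txt', 'hello')

v_extracted_path = os.path.splitext(v_zip_path)[0]  # -> /tmp/demo_upload
with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
    obj_zip.extractall(v_extracted_path)

print(os.listdir(v_extracted_path))  # ['docs']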
requirements.txt
CHANGED
@@ -1,11 +1,11 @@
-Flask
-transformers
-sentence-transformers
-torch
-torchvision
-faiss-cpu
-PyMuPDF
-python-pptx
-pandas
-numpy
+Flask==2.3.2
+transformers==4.31.0
+sentence-transformers==2.2.2
+torch==2.0.1
+torchvision==0.15.2
+faiss-cpu==1.7.2
+PyMuPDF==1.22.5
+python-pptx==0.6.21
+pandas==1.5.3
+numpy==1.24.2
+huggingface-hub==0.15.1
run.py
CHANGED
@@ -1,8 +1,6 @@
 from app import create_app
-import os
-os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
-
 
 if __name__ == "__main__":
     obj_app = create_app()
+    # Run the Flask app on 0.0.0.0:7860 for Hugging Face Spaces or local Docker
+    obj_app.run(host='0.0.0.0', port=7860)