Spaces:
Runtime error
Runtime error
File size: 2,111 Bytes
975d1b2 9cb30e2 9dc951f 9cb30e2 4ad299d 975d1b2 4ad299d 9cb30e2 4ad299d 9cb30e2 e4dba65 9cb30e2 975d1b2 e4dba65 4ad299d e4dba65 975d1b2 e4dba65 4ad299d 9cb30e2 4ad299d e4dba65 4ad299d e4dba65 9cb30e2 e4dba65 4ad299d e4dba65 9cb30e2 4ad299d e4dba65 4ad299d 9cb30e2 975d1b2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import os
import faiss
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from .file_handler import extract_text_from_file
# Pick the compute device once at import time: use the GPU when torch reports
# CUDA as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    v_device = "cuda"
else:
    v_device = "cpu"

# Shared sentence-embedding model, loaded once for the whole module and placed
# on the device selected above.
obj_embedding_model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=v_device,
)
def process_files_to_vectors(v_folder_path):
    """
    Build a FAISS vector DB from the files (PDF, PPTX, CSV) under v_folder_path.

    The folder is walked recursively. Each matching file is converted to text,
    embedded as a single vector and appended to a flat L2 index. The index is
    written to ``<v_folder_path>/vectors/vector_index.faiss`` and a
    ``metadata.json`` next to it maps each document's position in the index to
    the file's path relative to v_folder_path (no absolute path is stored).

    Args:
        v_folder_path: Root folder to scan. The ``vectors`` output folder is
            created inside it when it does not exist yet.

    Returns:
        The path of the ``vectors`` folder containing the index and metadata.
    """
    import json

    v_vector_folder = os.path.join(v_folder_path, 'vectors')
    os.makedirs(v_vector_folder, exist_ok=True)

    # Create a FAISS index (384 dimensions for all-MiniLM-L6-v2)
    v_index = faiss.IndexFlatL2(384)
    v_metadata = {}
    v_doc_counter = 0
    v_supported_extensions = ('.pdf', '.pptx', '.csv')

    for v_root, _, v_files in os.walk(v_folder_path):
        for v_file in v_files:
            v_file_path = os.path.join(v_root, v_file)

            # Filter files by extension (case-insensitive)
            if not v_file_path.lower().endswith(v_supported_extensions):
                continue

            v_text = extract_text_from_file(v_file_path)
            # Skip files that yield no text. The falsy check also covers an
            # extractor that returns None, which would otherwise crash on
            # .strip() and abort the whole run.
            if not v_text or not v_text.strip():
                continue

            # Convert text to embeddings: one vector per file.
            # NOTE(review): the whole document is encoded in a single call, so
            # long files are presumably truncated to the model's maximum
            # sequence length -- confirm whether chunking is needed.
            v_embeddings = obj_embedding_model.encode(
                [v_text], convert_to_tensor=True
            ).cpu().numpy()
            # FAISS expects a C-contiguous float32 matrix.
            v_embeddings = np.ascontiguousarray(v_embeddings, dtype=np.float32)
            v_index.add(v_embeddings)

            # Store only the path relative to the scanned folder. The key is
            # the vector's position in the index; json.dump writes it out as
            # a string key.
            v_metadata[v_doc_counter] = os.path.relpath(v_file_path, start=v_folder_path)
            v_doc_counter += 1

    # Save the FAISS index
    v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
    faiss.write_index(v_index, v_index_path)

    # Save metadata (containing only reference paths)
    v_metadata_path = os.path.join(v_vector_folder, 'metadata.json')
    with open(v_metadata_path, 'w', encoding='utf-8') as obj_meta:
        json.dump(v_metadata, obj_meta, indent=4)

    return v_vector_folder
|