File size: 2,111 Bytes
975d1b2
9cb30e2
 
9dc951f
9cb30e2
4ad299d
975d1b2
4ad299d
 
9cb30e2
4ad299d
9cb30e2
e4dba65
 
9cb30e2
975d1b2
 
 
e4dba65
4ad299d
 
 
e4dba65
975d1b2
 
 
e4dba65
 
4ad299d
9cb30e2
4ad299d
e4dba65
4ad299d
e4dba65
9cb30e2
 
 
e4dba65
 
 
 
 
4ad299d
 
e4dba65
9cb30e2
 
4ad299d
e4dba65
4ad299d
 
9cb30e2
975d1b2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import faiss
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from .file_handler import extract_text_from_file

# Pick the compute device once at import time: use the GPU when CUDA is
# available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    v_device = "cuda"
else:
    v_device = "cpu"

# Module-wide sentence-embedding model, placed on the device chosen above.
obj_embedding_model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=v_device,
)

def process_files_to_vectors(v_folder_path):
    """
    Build a FAISS vector DB from the PDF, PPTX and CSV files under v_folder_path.

    The folder is walked recursively in sorted order. Every supported file
    whose extracted text is non-empty is embedded as a single vector (the
    whole text goes through one encode call) and appended to a flat L2 index.
    Two files are written to ``<v_folder_path>/vectors``:

    * ``vector_index.faiss`` - the FAISS index.
    * ``metadata.json`` - maps each document id (0, 1, 2, ... in insertion
      order, i.e. the row of the vector in the index) to the file's path
      relative to v_folder_path. The relative path includes the file name;
      no absolute path is stored. JSON object keys are strings, so the ids
      are written as "0", "1", ...

    Args:
        v_folder_path: Root folder to scan for documents.

    Returns:
        Path of the ``vectors`` folder that holds the two output files.
    """
    import json

    v_supported_extensions = ('.pdf', '.pptx', '.csv')

    v_vector_folder = os.path.join(v_folder_path, 'vectors')
    os.makedirs(v_vector_folder, exist_ok=True)

    # Flat L2 index; 384 is the embedding size of all-MiniLM-L6-v2.
    v_index = faiss.IndexFlatL2(384)
    v_metadata = {}
    v_doc_counter = 0

    for v_root, v_dirs, v_files in os.walk(v_folder_path):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # directory list in place (honoured by the top-down walk) and the file
        # list makes document ids reproducible across runs and machines.
        v_dirs.sort()
        for v_file in sorted(v_files):
            v_file_path = os.path.join(v_root, v_file)

            # Only the supported document types are indexed.
            if not v_file_path.lower().endswith(v_supported_extensions):
                continue

            v_text = extract_text_from_file(v_file_path)
            # Skip files that produced no text. The None check guards against
            # an extractor that returns None instead of an empty string.
            if not v_text or not v_text.strip():
                continue

            # One embedding per document: shape (1, dim) after conversion.
            v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
            v_index.add(v_embeddings)

            # The id equals the vector's row in the index, so a search hit can
            # be mapped back to its source file through metadata.json.
            v_metadata[v_doc_counter] = os.path.relpath(v_file_path, start=v_folder_path)
            v_doc_counter += 1

    # Persist the index next to its metadata.
    v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
    faiss.write_index(v_index, v_index_path)

    with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
        json.dump(v_metadata, obj_meta, indent=4)

    return v_vector_folder