Update file_loader.py
file_loader.py  CHANGED  (+50 -28)
@@ -2,6 +2,7 @@ import os
 from tqdm import tqdm
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
+from tqdm import tqdm
 # from langchain_google_genai import GoogleGenerativeAIEmbeddings
 
 # Import from helpers
@@ -10,49 +11,70 @@ from helpers import (
     get_splits,            # turn .docx files into splits
     get_json_splits_only,  # handle JSON (FAQ) files
     get_web_documents,     # handle data from the web
+    define_metadata,
+    update_documents_metadata
 )
 
 import json
 
 def get_vectorstore():
-    ### Process all the documents and load them into the database
-    folder_path = "syllabus_nct_word_format/"
-    docx_files = list_docx_files(folder_path)
+    # ### Process all the documents and load them into the database
+    # folder_path = "syllabus_nct_word_format/"
+    # docx_files = list_docx_files(folder_path)
 
-    all_splits = [] # initialize the list that stores the results
-    # print("Feeding relevant websites' contents")
-    # #
-    # with open('syllabus_nct_word_format/urls.txt', 'r') as f:
-    #     base_urls = [line.strip() for line in f]
-    # # urls_list
-    # # base_urls =['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
-    # # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
+    # all_splits = [] # initialize the list that stores the results
+    # # print("Feeding relevant websites' contents")
+    # # #
+    # # with open('syllabus_nct_word_format/urls.txt', 'r') as f:
+    # #     base_urls = [line.strip() for line in f]
+    # # # urls_list
+    # # # base_urls =['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
+    # # # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
 
 
-    # website_contents = get_web_documents(base_urls=base_urls)
-    # all_splits += website_contents
+    # # website_contents = get_web_documents(base_urls=base_urls)
+    # # all_splits += website_contents
+
+    # print('Feeding .docx files')
+    # for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
+    #     output_json_path = f"output_{i}.json"
+    #     splits = get_splits(file_path, output_json_path)
+    #     all_splits += splits
+
+    # print('Feeding .json files')
+    # # Process the FAQ
+    # FAQ_path = "syllabus_nct_word_format/FAQ.json"
+    # FAQ_splits = get_json_splits_only(FAQ_path)
+    # all_splits += FAQ_splits
+
+    # FAQ_path = "syllabus_nct_word_format/FAQ2.json"
+    # FAQ_splits = get_json_splits_only(FAQ_path)
+    # all_splits += FAQ_splits
 
-
+    # # Save to the vectorstore with Google GenAI embeddings
+    # # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
+    # print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
+    # embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+    # print('Set vectorstore FAISS')
+    # vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
+    # print('Vectorstore ready!')
+    # return vectorstore
+
+    folder_path = '/content/chatbot4nct_test2/syllabus_nct_word_format'
+    docx_files = list_docx_files(folder_path)
+    all_splits = [] # initialize the list that stores the results
     for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
         output_json_path = f"output_{i}.json"
+        metadata = define_metadata(file_path)
        splits = get_splits(file_path, output_json_path)
-        all_splits += splits
-
-
-    # Process the FAQ
-    FAQ_path = "syllabus_nct_word_format/FAQ.json"
-    FAQ_splits = get_json_splits_only(FAQ_path)
-    all_splits += FAQ_splits
-
-    FAQ_path = "syllabus_nct_word_format/FAQ2.json"
-    FAQ_splits = get_json_splits_only(FAQ_path)
-    all_splits += FAQ_splits
+        splits_with_metadata = update_documents_metadata(splits, metadata)
+        all_splits += splits_with_metadata
+        if i == 1: break
 
-    # Save to the vectorstore with Google GenAI embeddings
-    # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
     print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
-    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") # "VoVanPhuc/sup-SimCSE-VietNamese-phobert-base"
     print('Set vectorstore FAISS')
     vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
+
     print('Vectorstore ready!')
     return vectorstore
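
The two helpers added to the import block, define_metadata and update_documents_metadata, live in helpers and are not shown in this diff. A minimal sketch of what they might look like, assuming metadata is derived from the .docx path and merged into every split; the field names here are illustrative, not the actual helper code:

import os

def define_metadata(file_path):
    # Hypothetical: derive simple metadata from the syllabus file's name and path.
    return {
        "source": file_path,
        "course": os.path.splitext(os.path.basename(file_path))[0],
    }

def update_documents_metadata(splits, metadata):
    # Merge the shared metadata into each LangChain Document's metadata dict.
    for doc in splits:
        doc.metadata.update(metadata)
    return splits

Tagging splits this way lets a retriever report which syllabus a hit came from via doc.metadata.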
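
Note that the rewritten loop stops after the second file (if i == 1: break), so only the first two .docx files are indexed; that reads as a temporary test cap. For a quick smoke test of the returned store, something along these lines should work with the FAISS vectorstore API (the query string is only an example):

vectorstore = get_vectorstore()

# Top-3 nearest splits for a sample Vietnamese query about course credits.
hits = vectorstore.similarity_search("số tín chỉ của học phần", k=3)
for doc in hits:
    print(doc.metadata.get("source"), "->", doc.page_content[:100])

# Optionally persist the index so later runs can skip re-embedding.
vectorstore.save_local("faiss_index")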