quoc-khanh committed on
Commit 46ee881 · verified · 1 Parent(s): 611e958

Update file_loader.py

Files changed (1)
  1. file_loader.py +50 -28
file_loader.py CHANGED
@@ -2,6 +2,7 @@ import os
  from tqdm import tqdm
  from langchain_community.vectorstores import FAISS
  from langchain_huggingface import HuggingFaceEmbeddings
  # from langchain_google_genai import GoogleGenerativeAIEmbeddings

  # Import from helpers
@@ -10,49 +11,70 @@ from helpers import (
  get_splits, # Process .docx files into splits
  get_json_splits_only, # Process JSON (FAQ) files
  get_web_documents, # Process data from the web
  )

  import json

  def get_vectorstore():
- ### Process all documents and load them into the database
- folder_path = "syllabus_nct_word_format/"
- docx_files = list_docx_files(folder_path)

- all_splits = [] # Initialize the list that stores the results
- # print("Feeding relevant websites' contents")
- # #
- # with open('syllabus_nct_word_format/urls.txt', 'r') as f:
- # base_urls = [line.strip() for line in f]
- # # urls_list
- # # base_urls =['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
- # # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']

- # website_contents = get_web_documents(base_urls=base_urls)
- # all_splits += website_contents

- print('Feeding .docx files')
  for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
  output_json_path = f"output_{i}.json"
  splits = get_splits(file_path, output_json_path)
- all_splits += splits
-
- print('Feeding .json files')
- # Process the FAQ
- FAQ_path = "syllabus_nct_word_format/FAQ.json"
- FAQ_splits = get_json_splits_only(FAQ_path)
- all_splits += FAQ_splits
-
- FAQ_path = "syllabus_nct_word_format/FAQ2.json"
- FAQ_splits = get_json_splits_only(FAQ_path)
- all_splits += FAQ_splits

- # Save into the vectorstore with Google GenAI embeddings
- # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
  print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
- embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
  print('Set vectorstore FAISS')
  vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
  print('Vectorstore ready!')
  return vectorstore
 
  from tqdm import tqdm
  from langchain_community.vectorstores import FAISS
  from langchain_huggingface import HuggingFaceEmbeddings
+ from tqdm import tqdm
  # from langchain_google_genai import GoogleGenerativeAIEmbeddings

  # Import from helpers

  get_splits, # Process .docx files into splits
  get_json_splits_only, # Process JSON (FAQ) files
  get_web_documents, # Process data from the web
+ define_metadata,
+ update_documents_metadata
  )

  import json

  def get_vectorstore():
+ # ### Process all documents and load them into the database
+ # folder_path = "syllabus_nct_word_format/"
+ # docx_files = list_docx_files(folder_path)

+ # all_splits = [] # Initialize the list that stores the results
+ # # print("Feeding relevant websites' contents")
+ # # #
+ # # with open('syllabus_nct_word_format/urls.txt', 'r') as f:
+ # # base_urls = [line.strip() for line in f]
+ # # # urls_list
+ # # # base_urls =['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
+ # # # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']

+ # # website_contents = get_web_documents(base_urls=base_urls)
+ # # all_splits += website_contents
+
+ # print('Feeding .docx files')
+ # for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
+ # output_json_path = f"output_{i}.json"
+ # splits = get_splits(file_path, output_json_path)
+ # all_splits += splits
+
+ # print('Feeding .json files')
+ # # Process the FAQ
+ # FAQ_path = "syllabus_nct_word_format/FAQ.json"
+ # FAQ_splits = get_json_splits_only(FAQ_path)
+ # all_splits += FAQ_splits
+
+ # FAQ_path = "syllabus_nct_word_format/FAQ2.json"
+ # FAQ_splits = get_json_splits_only(FAQ_path)
+ # all_splits += FAQ_splits

+ # # Save into the vectorstore with Google GenAI embeddings
+ # # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
+ # print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
+ # embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+ # print('Set vectorstore FAISS')
+ # vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
+ # print('Vectorstore ready!')
+ # return vectorstore
+
+ folder_path = '/content/chatbot4nct_test2/syllabus_nct_word_format'
+ docx_files = list_docx_files(folder_path)
+ all_splits = [] # Initialize the list that stores the results
  for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
  output_json_path = f"output_{i}.json"
+ metadata = define_metadata(file_path)
  splits = get_splits(file_path, output_json_path)
+ splits_with_metadata = update_documents_metadata(splits, metadata)
+ all_splits += splits_with_metadata
+ if i == 1: break

  print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
+ embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") #"VoVanPhuc/sup-SimCSE-VietNamese-phobert-base")
  print('Set vectorstore FAISS')
  vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
+
  print('Vectorstore ready!')
  return vectorstore
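
Note: the updated loop attaches per-file metadata to each split before indexing via define_metadata and update_documents_metadata, which are imported from helpers and are not part of this commit. A minimal sketch of what such helpers might look like, assuming the splits are LangChain Document objects and the metadata is a flat dict derived from the file path (names and fields below are illustrative, not the actual helpers.py implementation):

import os
from langchain_core.documents import Document

def define_metadata(file_path: str) -> dict:
    # Hypothetical sketch: derive simple metadata from the .docx path.
    # The real helpers.define_metadata may extract richer fields.
    return {
        "source": file_path,
        "file_name": os.path.basename(file_path),
    }

def update_documents_metadata(splits: list[Document], metadata: dict) -> list[Document]:
    # Hypothetical sketch: merge the per-file metadata into every split's metadata dict.
    for doc in splits:
        doc.metadata.update(metadata)
    return splits

The "if i == 1: break" line appears to cap the loop at the first two .docx files, presumably to keep test runs on the Colab path (/content/chatbot4nct_test2/...) fast; removing it would index the whole folder. Once built, the returned index can be queried the usual way, e.g. vectorstore.similarity_search(query, k=3) or wrapped with vectorstore.as_retriever().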