File size: 2,782 Bytes
b77c0a2
 
892f887
01ae535
28bdc3c
b77c0a2
 
 
887cb19
632ec54
01ae535
b77c0a2
 
 
 
 
887cb19
b77c0a2
 
 
 
 
 
 
 
 
 
28bdc3c
 
 
 
 
 
b77c0a2
 
 
df40595
 
887cb19
892f887
 
df40595
 
632ec54
 
 
 
 
df40595
 
 
 
 
 
632ec54
df40595
 
 
 
632ec54
b77c0a2
 
 
df40595
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pickle
from config import (
    MODEL_NAME, MODEL_TYPE, DEVICE_TYPE,
    SENTENCE_EMBEDDING_FILE,  
    STANDARD_NAME_MAP_DATA_FILE, SUBJECT_DATA_FILE, DATA_DIR, QINT8
)
from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
from data_lib.subject_data import SubjectData
from data_lib.standard_name_map_data import StandardNameMapData
import os


class SentenceTransformerService:
    """Own the sentence-transformer model and the reference data used with it.

    A freshly constructed instance holds no model and no data; everything is
    populated by :meth:`load_model_data`. All public attributes are ``None``
    until that call has completed successfully.
    """

    def __init__(self):
        # Every attribute is declared here so that reading one before
        # load_model_data() has run yields None instead of AttributeError.
        self.sentenceTransformerHelper = None
        self.dic_standard_subject = None
        self.standardNameMapData = None
        self.anchor_name_sentence_embeddings = None
        # Not populated anywhere in this class; kept for existing callers.
        self.sampleData = None

    def load_model_data(self):
        """Load model and data only once at startup.

        Builds the sentence-transformer helper, the standard subject
        dictionary, the standard-name map data and the anchor-name sentence
        embeddings (read from ``SENTENCE_EMBEDDING_FILE`` when it exists,
        otherwise computed and written to that file).

        The call is a no-op when a model is already loaded. State is only
        published on ``self`` after every step has succeeded, so a failed
        load leaves the instance untouched and the call can simply be
        retried.

        Raises:
            Whatever the underlying helpers, file I/O, or ``pickle`` raise;
            nothing is caught here.
        """
        if self.sentenceTransformerHelper is not None:
            print("Model already loaded. Skipping reload.")
            return  # Do not reload when a model is already present

        print("Loading models and data...")

        # Load sentence transformer model
        print(f"Loading model {MODEL_NAME} with type {MODEL_TYPE} and qint8={QINT8}")
        helper = SentenceTransformerHelper(
            model_name=MODEL_NAME,
            model_type=MODEL_TYPE,
            qint8=QINT8,
        )

        # Load standard subject dictionary
        dic_standard_subject = SubjectData.create_standard_subject_dic_from_file(
            SUBJECT_DATA_FILE
        )

        # Initialize StandardNameMapData without embeddings first; the
        # processed data is needed to compute embeddings on a cache miss.
        name_map_data = StandardNameMapData(None)
        name_map_data.load_data_from_csv(STANDARD_NAME_MAP_DATA_FILE)
        name_map_data.process_data()

        # Load or create embeddings, then attach them to the name map data
        embeddings = self._load_or_create_embeddings(helper, name_map_data)
        name_map_data.update_embeddings(embeddings)

        # Publish everything at the very end. The helper is assigned last
        # because it doubles as the "already loaded" guard above: setting it
        # earlier would make a partially failed load look complete.
        self.dic_standard_subject = dic_standard_subject
        self.standardNameMapData = name_map_data
        self.anchor_name_sentence_embeddings = embeddings
        self.sentenceTransformerHelper = helper

        print("Models and data loaded successfully")

    def _load_or_create_embeddings(self, helper, name_map_data):
        """Return anchor-name sentence embeddings, using the on-disk cache.

        Args:
            helper: Object exposing ``create_embeddings(sentences)``.
            name_map_data: Object whose ``processed_data`` mapping contains
                the ``"anchor_name_sentences"`` entry.

        Returns:
            The embeddings unpickled from ``SENTENCE_EMBEDDING_FILE`` if that
            file exists; otherwise freshly created embeddings, which are also
            pickled to that file.
        """
        if os.path.exists(SENTENCE_EMBEDDING_FILE):
            # NOTE(security): pickle.load can execute arbitrary code embedded
            # in the file. SENTENCE_EMBEDDING_FILE must only ever be a cache
            # written by this service, never externally supplied data.
            # NOTE(review): the cache is not checked against the current
            # anchor sentences, so a stale file is used as-is - confirm that
            # the file is regenerated whenever the CSV or model changes.
            with open(SENTENCE_EMBEDDING_FILE, "rb") as f:
                embeddings = pickle.load(f)
            print(f"Loaded anchor name sentence embeddings shape: {embeddings.shape}")
            return embeddings

        list_anchor_name_sentence = name_map_data.processed_data["anchor_name_sentences"]
        embeddings = helper.create_embeddings(list_anchor_name_sentence)
        with open(SENTENCE_EMBEDDING_FILE, "wb") as f:
            pickle.dump(embeddings, f)
        print(f"Saved anchor name sentence embeddings to {SENTENCE_EMBEDDING_FILE}")
        return embeddings

# Module-level shared instance, created at import time and used as a
# singleton. Construction only sets attributes to None; nothing is loaded
# until load_model_data() is called on it.
sentence_transformer_service = SentenceTransformerService()