vumichien committed
Commit 887cb19 · Parent: b6b3214

change service

config.py CHANGED

@@ -12,6 +12,7 @@ UPLOAD_DIR = os.path.join(BASE_DIR, "uploads")
 OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
 SUBJECT_DATA_FILE = os.path.join(DATA_DIR, "subjectData.csv")
 SAMPLE_DATA_FILE = os.path.join(DATA_DIR, "sampleData.csv")
+STANDARD_NAME_MAP_DATA_FILE = os.path.join(DATA_DIR, "standardNameMapData.csv")
 # Model Names
 MODEL_NAME = "Detomo/cl-nagoya-sup-simcse-ja-for-standard-name-v0_9_10"
 SENTENCE_EMBEDDING_FILE = os.path.join(
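For context, the directory roots these constants build on are defined earlier in config.py and are not part of the hunk; a minimal sketch, assuming BASE_DIR and DATA_DIR follow the usual pattern:

import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # assumed; defined above the hunk
DATA_DIR = os.path.join(BASE_DIR, "data")              # assumed
STANDARD_NAME_MAP_DATA_FILE = os.path.join(DATA_DIR, "standardNameMapData.csv")

# Failing fast on a missing CSV at import time is one way to surface a bad deploy:
assert os.path.exists(STANDARD_NAME_MAP_DATA_FILE), STANDARD_NAME_MAP_DATA_FILE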
data/anchor_name_sentence_sentence_embeddings(cl-nagoya-sup-simcse-ja-for-standard-name-v0_9_10).pkl CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:062effa64e2fdd7b5b7253b596d777dea75a892b0b011592657ea51c3861ce62
-size 21565604
+oid sha256:e25d41cf2b9ab5b90f2c0e7e0f5d0ec31499f7dcb252de64d7af20ab63e91750
+size 12073124
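The pointer swap means this pickle now resolves to a different Git LFS object (the anchor embeddings were presumably regenerated for the new standard-name map; the file shrank from ~21.6 MB to ~12.1 MB). Since the pointer's oid is the SHA-256 of the file contents and size is its byte length, a quick integrity check of the fetched file looks like this:

import hashlib
import os

PKL = ("data/anchor_name_sentence_sentence_embeddings"
       "(cl-nagoya-sup-simcse-ja-for-standard-name-v0_9_10).pkl")

# Hash the real file contents and compare against the new pointer values.
with open(PKL, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

assert digest == "e25d41cf2b9ab5b90f2c0e7e0f5d0ec31499f7dcb252de64d7af20ab63e91750"
assert os.path.getsize(PKL) == 12073124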
data/standardNameMapData.csv ADDED
The diff for this file is too large to render. See raw diff
 
routes/predict.py CHANGED

@@ -8,7 +8,7 @@ from auth import get_current_user
 from services.sentence_transformer_service import SentenceTransformerService, sentence_transformer_service
 from data_lib.input_name_data import InputNameData
 from data_lib.base_name_data import COL_NAME_SENTENCE
-from mapping_lib.name_mapping_helper import NameMappingHelper
+from mapping_lib.name_mapper import NameMapper
 from config import UPLOAD_DIR, OUTPUT_DIR
 
 router = APIRouter()
@@ -39,36 +39,32 @@ async def predict(
     try:
         # Process input data
         start_time = time.time()
-        inputData = InputNameData(sentence_service.dic_standard_subject)
-        inputData.load_data_from_csv(input_file_path)
-        inputData.process_data(sentence_service.sentenceTransformerHelper)
-        input_name_sentences = inputData.dataframe[COL_NAME_SENTENCE]
-        input_name_sentence_embeddings = sentence_service.sentenceTransformerHelper.create_embeddings(input_name_sentences)
-
-        # Create similarity matrix
-        similarity_matrix = sentence_service.sentenceTransformerHelper.create_similarity_matrix_from_embeddings(
-            sentence_service.sample_name_sentence_embeddings,
-            input_name_sentence_embeddings
-        )
-
+        try:
+            inputData = InputNameData(sentence_service.dic_standard_subject)
+            inputData.load_data_from_csv(input_file_path)
+            inputData.process_data(sentence_service.sentenceTransformerHelper)
+        except Exception as e:
+            print(f"Error processing input data: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
         # Map standard names
-        nameMappingHelper = NameMappingHelper(
-            sentence_service.sentenceTransformerHelper,
-            inputData,
-            sentence_service.sampleData,
-            input_name_sentence_embeddings,
-            sentence_service.sample_name_sentence_embeddings,
-            similarity_matrix,
-        )
-        df_predicted = nameMappingHelper.map_standard_names()
+        try:
+            nameMapper = NameMapper(
+                sentence_service.sentenceTransformerHelper,
+                sentence_service.standardNameMapData,
+                top_count=3
+            )
+            df_predicted = nameMapper.predict(inputData)
+        except Exception as e:
+            print(f"Error mapping standard names: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
 
         # Create output dataframe and save to CSV
         column_to_keep = ['ファイル名', 'シート名', '行', '科目', '中科目', '分類', '名称', '摘要', '備考']
         output_df = inputData.dataframe[column_to_keep].copy()
         output_df.reset_index(drop=False, inplace=True)
-        output_df.loc[:, "出力_科目"] = df_predicted["出力_科目"]
-        output_df.loc[:, "出力_項目名"] = df_predicted["出力_項目名"]
-        output_df.loc[:, "出力_確率度"] = df_predicted["出力_確率度"]
+        output_df.loc[:, "出力_科目"] = df_predicted["標準科目"]
+        output_df.loc[:, "出力_項目名"] = df_predicted["標準項目名"]
+        output_df.loc[:, "出力_確率度"] = df_predicted["基準名称類似度"]
 
         # Save with utf_8_sig encoding for Japanese Excel compatibility
         output_df.to_csv(output_file_path, index=False, encoding="utf_8_sig")
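mapping_lib/name_mapper.py itself is not part of this diff, so NameMapper's internals are not visible here. A minimal sketch of the contract the route now relies on, assuming cosine similarity between input embeddings and the precomputed anchor embeddings (the class name, signature, and internals below are guesses from the call site; the real class takes the transformer helper plus standardNameMapData and presumably embeds the input itself):

import numpy as np
import pandas as pd

class NameMapperSketch:
    def __init__(self, anchor_embeddings: np.ndarray, anchor_df: pd.DataFrame, top_count: int = 3):
        # Normalize once so the dot product below is cosine similarity.
        self.anchors = anchor_embeddings / np.linalg.norm(anchor_embeddings, axis=1, keepdims=True)
        self.anchor_df = anchor_df      # one row per standard name (標準科目, 標準項目名, ...)
        self.top_count = top_count      # the route passes 3, suggesting top-N candidates are kept

    def predict(self, input_embeddings: np.ndarray) -> pd.DataFrame:
        queries = input_embeddings / np.linalg.norm(input_embeddings, axis=1, keepdims=True)
        sims = queries @ self.anchors.T                # shape: (n_inputs, n_anchors)
        best = sims.argmax(axis=1)                     # best-matching anchor per input row
        out = self.anchor_df.iloc[best][["標準科目", "標準項目名"]].reset_index(drop=True)
        out["基準名称類似度"] = sims[np.arange(len(best)), best]
        return out

The returned column names match what the route copies into 出力_科目, 出力_項目名, and 出力_確率度; for brevity this sketch returns only the single best match and ignores top_count.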
services/sentence_transformer_service.py CHANGED

@@ -2,18 +2,18 @@ import pickle
 from config import (
     MODEL_NAME,
     SENTENCE_EMBEDDING_FILE,
-    SAMPLE_DATA_FILE, SUBJECT_DATA_FILE
+    STANDARD_NAME_MAP_DATA_FILE, SUBJECT_DATA_FILE
 )
 from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
 from data_lib.subject_data import SubjectData
-from data_lib.sample_name_data import SampleNameData
+from data_lib.standard_name_map_data import StandardNameMapData
 
 
 class SentenceTransformerService:
     def __init__(self):
         self.sentenceTransformerHelper = None
         self.dic_standard_subject = None
-        self.sample_name_sentence_embeddings = None
+        self.anchor_name_sentence_embeddings = None
         self.sampleData = None
 
     def load_model_data(self):
@@ -34,14 +34,13 @@ class SentenceTransformerService:
 
         # Load pre-computed embeddings and similarities
        with open(SENTENCE_EMBEDDING_FILE, "rb") as f:
-            self.sample_name_sentence_embeddings = pickle.load(f)
+            self.anchor_name_sentence_embeddings = pickle.load(f)
 
         # Load and process sample data
-        self.sampleData = SampleNameData()
-        self.sampleData.load_data_from_csv(SAMPLE_DATA_FILE)
-        self.sampleData.process_data()
-        self.sampleData.build_search_tree()
-
+        self.standardNameMapData = StandardNameMapData()
+        self.standardNameMapData.load_data_from_csv(STANDARD_NAME_MAP_DATA_FILE)
+        self.standardNameMapData.process_data(self.anchor_name_sentence_embeddings)
+
         print("Models and data loaded successfully")
 
 # Global instance (singleton)
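load_model_data() is still invoked once per process through the module-level singleton; a sketch of the assumed wiring in the FastAPI entry point (the startup hook and main-module layout are assumptions, since only the singleton and the router appear in this commit):

from fastapi import FastAPI

from routes.predict import router
from services.sentence_transformer_service import sentence_transformer_service

app = FastAPI()
app.include_router(router)

@app.on_event("startup")
def load_models() -> None:
    # Loads the SentenceTransformer model, the pickled anchor embeddings,
    # and standardNameMapData once, before the first /predict request.
    sentence_transformer_service.load_model_data()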