Spaces:
Running
Running
change service
Browse files
config.py
CHANGED
@@ -12,6 +12,7 @@ UPLOAD_DIR = os.path.join(BASE_DIR, "uploads")
|
|
12 |
OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
|
13 |
SUBJECT_DATA_FILE = os.path.join(DATA_DIR, "subjectData.csv")
|
14 |
SAMPLE_DATA_FILE = os.path.join(DATA_DIR, "sampleData.csv")
|
|
|
15 |
# Model Names
|
16 |
MODEL_NAME = "Detomo/cl-nagoya-sup-simcse-ja-for-standard-name-v0_9_10"
|
17 |
SENTENCE_EMBEDDING_FILE = os.path.join(
|
|
|
12 |
OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
|
13 |
SUBJECT_DATA_FILE = os.path.join(DATA_DIR, "subjectData.csv")
|
14 |
SAMPLE_DATA_FILE = os.path.join(DATA_DIR, "sampleData.csv")
|
15 |
+
STANDARD_NAME_MAP_DATA_FILE = os.path.join(DATA_DIR, "standardNameMapData.csv")
|
16 |
# Model Names
|
17 |
MODEL_NAME = "Detomo/cl-nagoya-sup-simcse-ja-for-standard-name-v0_9_10"
|
18 |
SENTENCE_EMBEDDING_FILE = os.path.join(
|
data/anchor_name_sentence_sentence_embeddings(cl-nagoya-sup-simcse-ja-for-standard-name-v0_9_10).pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e25d41cf2b9ab5b90f2c0e7e0f5d0ec31499f7dcb252de64d7af20ab63e91750
|
3 |
+
size 12073124
|
data/standardNameMapData.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
routes/predict.py
CHANGED
@@ -8,7 +8,7 @@ from auth import get_current_user
|
|
8 |
from services.sentence_transformer_service import SentenceTransformerService, sentence_transformer_service
|
9 |
from data_lib.input_name_data import InputNameData
|
10 |
from data_lib.base_name_data import COL_NAME_SENTENCE
|
11 |
-
from mapping_lib.
|
12 |
from config import UPLOAD_DIR, OUTPUT_DIR
|
13 |
|
14 |
router = APIRouter()
|
@@ -39,36 +39,32 @@ async def predict(
|
|
39 |
try:
|
40 |
# Process input data
|
41 |
start_time = time.time()
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
similarity_matrix = sentence_service.sentenceTransformerHelper.create_similarity_matrix_from_embeddings(
|
50 |
-
sentence_service.sample_name_sentence_embeddings,
|
51 |
-
input_name_sentence_embeddings
|
52 |
-
)
|
53 |
-
|
54 |
# Map standard names
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
64 |
|
65 |
# Create output dataframe and save to CSV
|
66 |
column_to_keep = ['ファイル名', 'シート名', '行', '科目', '中科目', '分類', '名称', '摘要', '備考']
|
67 |
output_df = inputData.dataframe[column_to_keep].copy()
|
68 |
output_df.reset_index(drop=False, inplace=True)
|
69 |
-
output_df.loc[:, "出力_科目"] = df_predicted["
|
70 |
-
output_df.loc[:, "出力_項目名"] = df_predicted["
|
71 |
-
output_df.loc[:, "出力_確率度"] = df_predicted["
|
72 |
|
73 |
# Save with utf_8_sig encoding for Japanese Excel compatibility
|
74 |
output_df.to_csv(output_file_path, index=False, encoding="utf_8_sig")
|
|
|
8 |
from services.sentence_transformer_service import SentenceTransformerService, sentence_transformer_service
|
9 |
from data_lib.input_name_data import InputNameData
|
10 |
from data_lib.base_name_data import COL_NAME_SENTENCE
|
11 |
+
from mapping_lib.name_mapper import NameMapper
|
12 |
from config import UPLOAD_DIR, OUTPUT_DIR
|
13 |
|
14 |
router = APIRouter()
|
|
|
39 |
try:
|
40 |
# Process input data
|
41 |
start_time = time.time()
|
42 |
+
try:
|
43 |
+
inputData = InputNameData(sentence_service.dic_standard_subject)
|
44 |
+
inputData.load_data_from_csv(input_file_path)
|
45 |
+
inputData.process_data(sentence_service.sentenceTransformerHelper)
|
46 |
+
except Exception as e:
|
47 |
+
print(f"Error processing input data: {e}")
|
48 |
+
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
|
|
|
|
|
49 |
# Map standard names
|
50 |
+
try:
|
51 |
+
nameMapper = NameMapper(
|
52 |
+
sentence_service.sentenceTransformerHelper,
|
53 |
+
sentence_service.standardNameMapData,
|
54 |
+
top_count=3
|
55 |
+
)
|
56 |
+
df_predicted = nameMapper.predict(inputData)
|
57 |
+
except Exception as e:
|
58 |
+
print(f"Error mapping standard names: {e}")
|
59 |
+
raise HTTPException(status_code=500, detail=str(e))
|
60 |
|
61 |
# Create output dataframe and save to CSV
|
62 |
column_to_keep = ['ファイル名', 'シート名', '行', '科目', '中科目', '分類', '名称', '摘要', '備考']
|
63 |
output_df = inputData.dataframe[column_to_keep].copy()
|
64 |
output_df.reset_index(drop=False, inplace=True)
|
65 |
+
output_df.loc[:, "出力_科目"] = df_predicted["標準科目"]
|
66 |
+
output_df.loc[:, "出力_項目名"] = df_predicted["標準項目名"]
|
67 |
+
output_df.loc[:, "出力_確率度"] = df_predicted["基準名称類似度"]
|
68 |
|
69 |
# Save with utf_8_sig encoding for Japanese Excel compatibility
|
70 |
output_df.to_csv(output_file_path, index=False, encoding="utf_8_sig")
|
services/sentence_transformer_service.py
CHANGED
@@ -2,18 +2,18 @@ import pickle
|
|
2 |
from config import (
|
3 |
MODEL_NAME,
|
4 |
SENTENCE_EMBEDDING_FILE,
|
5 |
-
|
6 |
)
|
7 |
from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
|
8 |
from data_lib.subject_data import SubjectData
|
9 |
-
from data_lib.
|
10 |
|
11 |
|
12 |
class SentenceTransformerService:
|
13 |
def __init__(self):
|
14 |
self.sentenceTransformerHelper = None
|
15 |
self.dic_standard_subject = None
|
16 |
-
self.sample_name_sentence_embeddings = None
|
17 |
self.sampleData = None
|
18 |
|
19 |
def load_model_data(self):
|
@@ -34,14 +34,13 @@ class SentenceTransformerService:
|
|
34 |
|
35 |
# Load pre-computed embeddings and similarities
|
36 |
with open(SENTENCE_EMBEDDING_FILE, "rb") as f:
|
37 |
-
self.sample_name_sentence_embeddings = pickle.load(f)
|
38 |
|
39 |
# Load and process sample data
|
40 |
-
self.
|
41 |
-
self.
|
42 |
-
self.
|
43 |
-
|
44 |
-
|
45 |
print("Models and data loaded successfully")
|
46 |
|
47 |
# Global instance (singleton)
|
|
|
2 |
from config import (
|
3 |
MODEL_NAME,
|
4 |
SENTENCE_EMBEDDING_FILE,
|
5 |
+
STANDARD_NAME_MAP_DATA_FILE, SUBJECT_DATA_FILE
|
6 |
)
|
7 |
from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
|
8 |
from data_lib.subject_data import SubjectData
|
9 |
+
from data_lib.standard_name_map_data import StandardNameMapData
|
10 |
|
11 |
|
12 |
class SentenceTransformerService:
|
13 |
def __init__(self):
|
14 |
self.sentenceTransformerHelper = None
|
15 |
self.dic_standard_subject = None
|
16 |
+
self.anchor_name_sentence_embeddings = None
|
17 |
self.sampleData = None
|
18 |
|
19 |
def load_model_data(self):
|
|
|
34 |
|
35 |
# Load pre-computed embeddings and similarities
|
36 |
with open(SENTENCE_EMBEDDING_FILE, "rb") as f:
|
37 |
+
self.anchor_name_sentence_embeddings = pickle.load(f)
|
38 |
|
39 |
# Load and process sample data
|
40 |
+
self.standardNameMapData = StandardNameMapData()
|
41 |
+
self.standardNameMapData.load_data_from_csv(STANDARD_NAME_MAP_DATA_FILE)
|
42 |
+
self.standardNameMapData.process_data(self.anchor_name_sentence_embeddings)
|
43 |
+
|
|
|
44 |
print("Models and data loaded successfully")
|
45 |
|
46 |
# Global instance (singleton)
|