Upload 19 files
- README.md +2 -13
- checkpoint-92/config.json +195 -0
- checkpoint-92/model.safetensors +3 -0
- checkpoint-92/optimizer.pt +3 -0
- checkpoint-92/rng_state.pth +3 -0
- checkpoint-92/scheduler.pt +3 -0
- checkpoint-92/special_tokens_map.json +37 -0
- checkpoint-92/tokenizer.json +0 -0
- checkpoint-92/tokenizer_config.json +60 -0
- checkpoint-92/trainer_state.json +81 -0
- checkpoint-92/training_args.bin +3 -0
- checkpoint-92/vocab.txt +0 -0
- images/info_24dp_1F1F1F_FILL0_wght400_GRAD0_opsz24.png +0 -0
- images/medical_information_24dp_1F1F1F_FILL0_wght400_GRAD0_opsz24.png +0 -0
- pages/about.py +13 -0
- pages/home.py +53 -0
- pages/type_text.py +71 -0
- pages/upload_file.py +76 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -1,13 +1,2 @@
----
-
-emoji: π
-colorFrom: green
-colorTo: gray
-sdk: streamlit
-sdk_version: 1.43.1
-app_file: app.py
-pinned: false
-short_description: Performs named entity recognition for medical entities.
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# medNER_V2
+This app performs Named Entity Recognition of medical entities.
checkpoint-92/config.json
ADDED
@@ -0,0 +1,195 @@
+{
+  "_name_or_path": "dslim/distilbert-NER",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForTokenClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "id2label": {
+    "0": "O",
+    "1": "B-ACTIVITY",
+    "2": "I-ACTIVITY",
+    "3": "I-ADMINISTRATION",
+    "4": "B-ADMINISTRATION",
+    "5": "B-AGE",
+    "6": "I-AGE",
+    "7": "I-AREA",
+    "8": "B-AREA",
+    "9": "B-BIOLOGICAL_ATTRIBUTE",
+    "10": "I-BIOLOGICAL_ATTRIBUTE",
+    "11": "I-BIOLOGICAL_STRUCTURE",
+    "12": "B-BIOLOGICAL_STRUCTURE",
+    "13": "B-CLINICAL_EVENT",
+    "14": "I-CLINICAL_EVENT",
+    "15": "B-COLOR",
+    "16": "I-COLOR",
+    "17": "I-COREFERENCE",
+    "18": "B-COREFERENCE",
+    "19": "B-DATE",
+    "20": "I-DATE",
+    "21": "I-DETAILED_DESCRIPTION",
+    "22": "B-DETAILED_DESCRIPTION",
+    "23": "I-DIAGNOSTIC_PROCEDURE",
+    "24": "B-DIAGNOSTIC_PROCEDURE",
+    "25": "I-DISEASE_DISORDER",
+    "26": "B-DISEASE_DISORDER",
+    "27": "B-DISTANCE",
+    "28": "I-DISTANCE",
+    "29": "B-DOSAGE",
+    "30": "I-DOSAGE",
+    "31": "I-DURATION",
+    "32": "B-DURATION",
+    "33": "I-FAMILY_HISTORY",
+    "34": "B-FAMILY_HISTORY",
+    "35": "B-FREQUENCY",
+    "36": "I-FREQUENCY",
+    "37": "I-HEIGHT",
+    "38": "B-HEIGHT",
+    "39": "B-HISTORY",
+    "40": "I-HISTORY",
+    "41": "I-LAB_VALUE",
+    "42": "B-LAB_VALUE",
+    "43": "I-MASS",
+    "44": "B-MASS",
+    "45": "I-MEDICATION",
+    "46": "B-MEDICATION",
+    "47": "I-NONBIOLOGICAL_LOCATION",
+    "48": "B-NONBIOLOGICAL_LOCATION",
+    "49": "I-OCCUPATION",
+    "50": "B-OCCUPATION",
+    "51": "B-OTHER_ENTITY",
+    "52": "I-OTHER_ENTITY",
+    "53": "B-OTHER_EVENT",
+    "54": "I-OTHER_EVENT",
+    "55": "I-OUTCOME",
+    "56": "B-OUTCOME",
+    "57": "I-PERSONAL_BACKGROUND",
+    "58": "B-PERSONAL_BACKGROUND",
+    "59": "B-QUALITATIVE_CONCEPT",
+    "60": "I-QUALITATIVE_CONCEPT",
+    "61": "I-QUANTITATIVE_CONCEPT",
+    "62": "B-QUANTITATIVE_CONCEPT",
+    "63": "B-SEVERITY",
+    "64": "I-SEVERITY",
+    "65": "B-SEX",
+    "66": "I-SEX",
+    "67": "B-SHAPE",
+    "68": "I-SHAPE",
+    "69": "B-SIGN_SYMPTOM",
+    "70": "I-SIGN_SYMPTOM",
+    "71": "B-SUBJECT",
+    "72": "I-SUBJECT",
+    "73": "B-TEXTURE",
+    "74": "I-TEXTURE",
+    "75": "B-THERAPEUTIC_PROCEDURE",
+    "76": "I-THERAPEUTIC_PROCEDURE",
+    "77": "I-TIME",
+    "78": "B-TIME",
+    "79": "B-VOLUME",
+    "80": "I-VOLUME",
+    "81": "I-WEIGHT",
+    "82": "B-WEIGHT"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "B-ACTIVITY": 1,
+    "B-ADMINISTRATION": 4,
+    "B-AGE": 5,
+    "B-AREA": 8,
+    "B-BIOLOGICAL_ATTRIBUTE": 9,
+    "B-BIOLOGICAL_STRUCTURE": 12,
+    "B-CLINICAL_EVENT": 13,
+    "B-COLOR": 15,
+    "B-COREFERENCE": 18,
+    "B-DATE": 19,
+    "B-DETAILED_DESCRIPTION": 22,
+    "B-DIAGNOSTIC_PROCEDURE": 24,
+    "B-DISEASE_DISORDER": 26,
+    "B-DISTANCE": 27,
+    "B-DOSAGE": 29,
+    "B-DURATION": 32,
+    "B-FAMILY_HISTORY": 34,
+    "B-FREQUENCY": 35,
+    "B-HEIGHT": 38,
+    "B-HISTORY": 39,
+    "B-LAB_VALUE": 42,
+    "B-MASS": 44,
+    "B-MEDICATION": 46,
+    "B-NONBIOLOGICAL_LOCATION": 48,
+    "B-OCCUPATION": 50,
+    "B-OTHER_ENTITY": 51,
+    "B-OTHER_EVENT": 53,
+    "B-OUTCOME": 56,
+    "B-PERSONAL_BACKGROUND": 58,
+    "B-QUALITATIVE_CONCEPT": 59,
+    "B-QUANTITATIVE_CONCEPT": 62,
+    "B-SEVERITY": 63,
+    "B-SEX": 65,
+    "B-SHAPE": 67,
+    "B-SIGN_SYMPTOM": 69,
+    "B-SUBJECT": 71,
+    "B-TEXTURE": 73,
+    "B-THERAPEUTIC_PROCEDURE": 75,
+    "B-TIME": 78,
+    "B-VOLUME": 79,
+    "B-WEIGHT": 82,
+    "I-ACTIVITY": 2,
+    "I-ADMINISTRATION": 3,
+    "I-AGE": 6,
+    "I-AREA": 7,
+    "I-BIOLOGICAL_ATTRIBUTE": 10,
+    "I-BIOLOGICAL_STRUCTURE": 11,
+    "I-CLINICAL_EVENT": 14,
+    "I-COLOR": 16,
+    "I-COREFERENCE": 17,
+    "I-DATE": 20,
+    "I-DETAILED_DESCRIPTION": 21,
+    "I-DIAGNOSTIC_PROCEDURE": 23,
+    "I-DISEASE_DISORDER": 25,
+    "I-DISTANCE": 28,
+    "I-DOSAGE": 30,
+    "I-DURATION": 31,
+    "I-FAMILY_HISTORY": 33,
+    "I-FREQUENCY": 36,
+    "I-HEIGHT": 37,
+    "I-HISTORY": 40,
+    "I-LAB_VALUE": 41,
+    "I-MASS": 43,
+    "I-MEDICATION": 45,
+    "I-NONBIOLOGICAL_LOCATION": 47,
+    "I-OCCUPATION": 49,
+    "I-OTHER_ENTITY": 52,
+    "I-OTHER_EVENT": 54,
+    "I-OUTCOME": 55,
+    "I-PERSONAL_BACKGROUND": 57,
+    "I-QUALITATIVE_CONCEPT": 60,
+    "I-QUANTITATIVE_CONCEPT": 61,
+    "I-SEVERITY": 64,
+    "I-SEX": 66,
+    "I-SHAPE": 68,
+    "I-SIGN_SYMPTOM": 70,
+    "I-SUBJECT": 72,
+    "I-TEXTURE": 74,
+    "I-THERAPEUTIC_PROCEDURE": 76,
+    "I-TIME": 77,
+    "I-VOLUME": 80,
+    "I-WEIGHT": 81,
+    "O": 0
+  },
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "output_past": true,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "vocab_size": 28996
+}
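The config above carries the full BIO label inventory (83 ids covering 41 entity types plus `O`) that the fine-tuned checkpoint predicts. As a quick sanity check, a minimal sketch of loading this checkpoint with `transformers`; the local path `checkpoint-92` is the directory committed here, and the example sentence is made up:

```python
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the fine-tuned checkpoint committed in this repo (config, weights, tokenizer).
tokenizer = AutoTokenizer.from_pretrained("checkpoint-92")
model = AutoModelForTokenClassification.from_pretrained("checkpoint-92")

# id2label from config.json determines the entity strings the pipeline returns.
ner = pipeline("ner", model=model, tokenizer=tokenizer)
print(ner("Patient reports severe headache for 3 days."))
```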
checkpoint-92/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:691568d1730026b95fe5a7d1005a616ca010681b8ec88899b99314c510979a77
+size 261031300
checkpoint-92/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bde04177a1dff0cc8be3fc543d4f5a3b313a0501cff485928d58f0838b00f3da
+size 522123450
checkpoint-92/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c96f5a47b163e72e28fbde7afe1320d6bcc042926cd5cb52bdc6f70d90c6d4d
+size 14244
checkpoint-92/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b2c3a566d1aa24c99821b905999ebcfce96b1faae6ee23e3f85708be8e34a3d
+size 1064
checkpoint-92/special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
checkpoint-92/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
checkpoint-92/tokenizer_config.json
ADDED
@@ -0,0 +1,60 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "max_length": 512,
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}
checkpoint-92/trainer_state.json
ADDED
@@ -0,0 +1,81 @@
+{
+  "best_metric": 0.76803058385849,
+  "best_model_checkpoint": "/content/drive/MyDrive/Files/checkpoint-92",
+  "epoch": 4.0,
+  "eval_steps": 500,
+  "global_step": 92,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.7553191489361702,
+      "eval_f1": 0.5810435319543069,
+      "eval_loss": 0.9222098588943481,
+      "eval_precision": 0.5859277708592777,
+      "eval_recall": 0.5762400489895897,
+      "eval_runtime": 0.369,
+      "eval_samples_per_second": 54.199,
+      "eval_steps_per_second": 8.13,
+      "step": 23
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.7700754975978037,
+      "eval_f1": 0.6098380690498014,
+      "eval_loss": 0.8292961120605469,
+      "eval_precision": 0.6085365853658536,
+      "eval_recall": 0.6111451316595223,
+      "eval_runtime": 0.3909,
+      "eval_samples_per_second": 51.158,
+      "eval_steps_per_second": 7.674,
+      "step": 46
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.7865477007549759,
+      "eval_f1": 0.6364756623536661,
+      "eval_loss": 0.782767653465271,
+      "eval_precision": 0.6404215747055176,
+      "eval_recall": 0.6325780771586038,
+      "eval_runtime": 0.3839,
+      "eval_samples_per_second": 52.099,
+      "eval_steps_per_second": 7.815,
+      "step": 69
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.789293067947838,
+      "eval_f1": 0.641846153846154,
+      "eval_loss": 0.76803058385849,
+      "eval_precision": 0.645021645021645,
+      "eval_recall": 0.6387017758726271,
+      "eval_runtime": 0.4404,
+      "eval_samples_per_second": 45.415,
+      "eval_steps_per_second": 6.812,
+      "step": 92
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 92,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 94208005324800.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
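The trainer state shows the evaluation metrics improving monotonically over the four epochs (F1 0.581 to 0.642, eval loss 0.922 to 0.768; `best_metric` here matches the eval loss of this final checkpoint). A minimal sketch for pulling that history back out of the JSON, assuming the checkpoint directory layout committed above:

```python
import json

# Read the Trainer's saved state from the committed checkpoint directory.
with open("checkpoint-92/trainer_state.json") as f:
    state = json.load(f)

# Print the per-epoch evaluation metrics logged during fine-tuning.
for entry in state["log_history"]:
    print(f"epoch {entry['epoch']:.0f}: "
          f"f1={entry['eval_f1']:.3f} loss={entry['eval_loss']:.3f}")
```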
checkpoint-92/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce032d36c503c8ec4cc7b6976897685f8c25accd569b89148aacf93a4d442372
+size 5368
checkpoint-92/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
images/info_24dp_1F1F1F_FILL0_wght400_GRAD0_opsz24.png
ADDED
images/medical_information_24dp_1F1F1F_FILL0_wght400_GRAD0_opsz24.png
ADDED
pages/about.py
ADDED
@@ -0,0 +1,13 @@
+import streamlit as st
+
+st.title("Info")
+
+with st.expander("ℹ️ - About this app", expanded=True):
+
+    st.write(
+        """
+    - This app performs named entity recognition for medical entities.
+    - The myDemo model was developed from dslim/distilbert-NER (a general-purpose NER model with 66M parameters) on Hugging Face and fine-tuned on singh-aditya/MACCROBAT_biomedical_ner (a dataset annotated with medical entity labels in 41 categories).
+    - The model uses the default pretrained tokenizer of dslim/distilbert-NER.
+    """
+    )
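The fine-tuning recipe the about page describes is not part of this upload. A hedged reconstruction of what it plausibly looked like follows: start from dslim/distilbert-NER, swap in the 83-label head, align BIO tags to subwords, and train for 4 epochs at batch size 8 (both values taken from trainer_state.json in this commit). The dataset column names (`tokens`, `ner_tags`) and the 90/10 split are assumptions, not the author's script:

```python
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                          DataCollatorForTokenClassification,
                          Trainer, TrainingArguments)

# Clinical case reports annotated with 41 medical entity types (BIO-tagged).
raw = load_dataset("singh-aditya/MACCROBAT_biomedical_ner")

# Assumed column names ("tokens", "ner_tags"); check the dataset card.
labels = raw["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(labels))
label2id = {name: i for i, name in enumerate(labels)}

tokenizer = AutoTokenizer.from_pretrained("dslim/distilbert-NER")
model = AutoModelForTokenClassification.from_pretrained(
    "dslim/distilbert-NER",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # the base model ships a different label head
)

def tokenize_and_align(batch):
    # Align word-level BIO tags to subword tokens; special tokens and
    # word continuations get -100 so the loss ignores them.
    enc = tokenizer(batch["tokens"], is_split_into_words=True, truncation=True)
    enc["labels"] = []
    for i, tags in enumerate(batch["ner_tags"]):
        prev, aligned = None, []
        for wid in enc.word_ids(batch_index=i):
            aligned.append(-100 if wid is None or wid == prev else tags[wid])
            prev = wid
        enc["labels"].append(aligned)
    return enc

splits = raw["train"].train_test_split(test_size=0.1, seed=42)  # assumed split
tokenized = splits.map(tokenize_and_align, batched=True)

args = TrainingArguments(
    output_dir="medner-out",
    per_device_train_batch_size=8,  # train_batch_size in trainer_state.json
    num_train_epochs=4,             # num_train_epochs in trainer_state.json
    eval_strategy="epoch",
    save_strategy="epoch",
)
trainer = Trainer(model=model, args=args,
                  train_dataset=tokenized["train"],
                  eval_dataset=tokenized["test"],
                  data_collator=DataCollatorForTokenClassification(tokenizer))
trainer.train()
```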
pages/home.py
ADDED
@@ -0,0 +1,53 @@
+import streamlit as st
+
+st.title("πNamed Entity Recognition")
+
+st.header("Tags the 41 medical entities below")
+
+'ACTIVITY'
+'ADMINISTRATION'
+'AGE'
+'AREA'
+'BIOLOGICAL_ATTRIBUTE'
+'BIOLOGICAL_STRUCTURE'
+'CLINICAL_EVENT'
+'COLOR'
+'COREFERENCE'
+'DATE'
+'DETAILED_DESCRIPTION'
+'DIAGNOSTIC_PROCEDURE'
+'DISEASE_DISORDER'
+'DISTANCE'
+'DOSAGE'
+'DURATION'
+'FAMILY_HISTORY'
+'FREQUENCY'
+'HEIGHT'
+'HISTORY'
+'LAB_VALUE'
+'MASS'
+'MEDICATION'
+'NONBIOLOGICAL_LOCATION'
+'OCCUPATION'
+'OTHER_ENTITY'
+'OUTCOME'
+'PERSONAL_BACKGROUND'
+'QUALITATIVE_CONCEPT'
+'QUANTITATIVE_CONCEPT'
+'SEVERITY'
+'SEX'
+'SHAPE'
+'SIGN_SYMPTOM'
+'SUBJECT'
+'TEXTURE'
+'THERAPEUTIC_PROCEDURE'
+'TIME'
+'VOLUME'
+'WEIGHT'
+
+
+
+
+
+
+
pages/type_text.py
ADDED
@@ -0,0 +1,71 @@
+import streamlit as st
+import pandas as pd
+from io import StringIO
+import json
+from transformers import pipeline
+#from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+def on_click():
+    st.session_state.user_input = ""
+
+#@st.cache
+def convert_df(df:pd.DataFrame):
+    return df.to_csv(index=False).encode('utf-8')
+
+#@st.cache
+def convert_json(df:pd.DataFrame):
+    result = df.to_json(orient="index")
+    parsed = json.loads(result)
+    json_string = json.dumps(parsed)
+    #st.json(json_string, expanded=True)
+    return json_string
+
+#st.title("πmedical Named Entity Recognition Tagger")
+
+text_input = st.text_input("Type input text and hit Enter", key="user_input")
+st.button("Clear text", on_click=on_click)
+
+my_model_results = pipeline("ner", model= "checkpoint-92")
+HuggingFace_model_results = pipeline("ner", model = "blaze999/Medical-NER")
+
+createNER_button = st.button("Create NER tags")
+
+col1, col2 = st.columns([1,1.5])
+col1.subheader("myDemo Model")
+col2.subheader("blaze999/Medical-NER")
+
+
+dictA = {"word": [], "entity": []}
+dictB = {"word": [], "entity": []}
+
+if text_input is not None and createNER_button == True:
+
+    with col1:
+        #st.write(my_model_results(text_input))
+        #col1.subheader("myDemo Model")
+        for result in my_model_results(text_input):
+            st.write(result['word'], result['entity'])
+            dictA["word"].append(result['word']), dictA["entity"].append(result['entity'])
+        dfA = pd.DataFrame.from_dict(dictA)
+        #st.write(dfA)
+    with col2:
+        #st.write(HuggingFace_model_results(text_input))
+        #col2.subheader("Hugging Face Model")
+        for result in HuggingFace_model_results(text_input):
+            st.write(result['word'], result['entity'])
+            dictB["word"].append(result['word']), dictB["entity"].append(result['entity'])
+        dfB = pd.DataFrame.from_dict(dictB)
+        #st.write(dfB)
+
+    bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
+    with b1:
+        #csvbutton = download_button(results, "results.csv", "📥 Download .csv")
+        csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(dfA), file_name= "results.csv", mime='text/csv', key='csv_b')
+    with b2:
+        #textbutton = download_button(results, "results.txt", "📥 Download .txt")
+        textbutton = st.download_button(label="📥 Download .txt", data=convert_df(dfA), file_name= "results.text", mime='text/plain', key='text_b')
+    with b3:
+        #jsonbutton = download_button(results, "results.json", "📥 Download .json")
+        jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfA), file_name= "results.json", mime='application/json', key='json_b')
+
+
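A note on the raw pipeline output used above: `pipeline("ner")` without options returns one row per subword token, so words split by the WordPiece tokenizer (e.g. `head` / `##ache`) appear as separate entries. A small variant, assuming the same checkpoint path, that merges subwords into whole entity spans via the pipeline's `aggregation_strategy` parameter:

```python
from transformers import pipeline

# "simple" merges consecutive subword tokens into one span per entity,
# returning 'entity_group' instead of 'entity' in each result dict.
ner = pipeline("ner", model="checkpoint-92", aggregation_strategy="simple")
for r in ner("Patient reports severe headache for 3 days."):
    print(r["word"], r["entity_group"], round(r["score"], 3))
```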
pages/upload_file.py
ADDED
@@ -0,0 +1,76 @@
+import streamlit as st
+import pandas as pd
+from io import StringIO
+import json
+from transformers import pipeline
+#from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+def on_click():
+    st.session_state.user_input = ""
+
+#@st.cache
+def convert_df(df:pd.DataFrame):
+    return df.to_csv(index=False).encode('utf-8')
+
+#@st.cache
+def convert_json(df:pd.DataFrame):
+    result = df.to_json(orient="index")
+    parsed = json.loads(result)
+    json_string = json.dumps(parsed)
+    #st.json(json_string, expanded=True)
+    return json_string
+
+#st.title("πmedical Named Entity Recognition Tagger")
+
+uploaded_file = st.file_uploader(label = "Upload single text file")
+if uploaded_file is not None:
+    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
+    string_data = stringio.read()
+    st.success('Your file input is: '+ string_data, icon="✅")
+
+
+my_model_results = pipeline("ner", model= "checkpoint-92")
+HuggingFace_model_results = pipeline("ner", model = "blaze999/Medical-NER")
+
+
+createNER_button = st.button("Create NER tags")
+
+col1, col2 = st.columns([1,1.5])
+col1.subheader("myDemo Model")
+col2.subheader("blaze999/Medical-NER")
+
+if uploaded_file is not None and createNER_button == True:
+    dict1 = {"word": [], "entity": []}
+    dict2 = {"word": [], "entity": []}
+    #stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
+    #string_data = stringio.read()
+    #st.write("Your input is: ", string_data)
+    with col1:
+        #st.write(my_model_results(string_data))
+        #col1.subheader("myDemo Model")
+        for result in my_model_results(string_data):
+            st.write(result['word'], result['entity'])
+            dict1["word"].append(result['word']), dict1["entity"].append(result['entity'])
+        df1 = pd.DataFrame.from_dict(dict1)
+        #st.write(df1)
+    with col2:
+        #st.write(HuggingFace_model_results(string_data))
+        #col2.subheader("Hugging Face Model")
+        for result in HuggingFace_model_results(string_data):
+            st.write(result['word'], result['entity'])
+            dict2["word"].append(result['word']), dict2["entity"].append(result['entity'])
+        df2 = pd.DataFrame.from_dict(dict2)
+        #st.write(df2)
+
+
+    cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
+    with c1:
+        #csvbutton = download_button(results, "results.csv", "📥 Download .csv")
+        csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(df1), file_name= "results.csv", mime='text/csv', key='csv')
+    with c2:
+        #textbutton = download_button(results, "results.txt", "📥 Download .txt")
+        textbutton = st.download_button(label="📥 Download .txt", data=convert_df(df1), file_name= "results.text", mime='text/plain', key='text')
+    with c3:
+        #jsonbutton = download_button(results, "results.json", "📥 Download .json")
+        jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(df1), file_name= "results.json", mime='application/json', key='json')
+
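Both page scripts construct the two pipelines at the top level, so Streamlit reloads both models on every rerun triggered by a widget interaction. A possible refinement, in the spirit of the commented-out `#@st.cache` markers: wrap the loading in `st.cache_resource` (Streamlit's cache for unserializable objects such as models) so each pipeline is created once per process. A minimal sketch:

```python
import streamlit as st
from transformers import pipeline

@st.cache_resource  # cache the loaded models across Streamlit reruns
def load_pipelines():
    my_model = pipeline("ner", model="checkpoint-92")
    hf_model = pipeline("ner", model="blaze999/Medical-NER")
    return my_model, hf_model

my_model_results, HuggingFace_model_results = load_pipelines()
```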
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+streamlit==1.41.1
+pandas==2.2.2
+torch==2.4.0
+transformers==4.44.2