Shunfeng Zheng committed on
Commit
da5875f
·
verified ·
1 Parent(s): 851ada5

Update 1_SpatialParse.py

Browse files
Files changed (1) hide show
  1. 1_SpatialParse.py +398 -388
1_SpatialParse.py CHANGED
@@ -1,416 +1,426 @@
1
- import subprocess
2
- import importlib.util
3
- import os
4
-
5
- # 只在 geospacy 没有被安装时执行安装(避免重复装)
6
- if importlib.util.find_spec("geospacy") is None:
7
- subprocess.run(
8
- ["pip", "install", "--no-deps", "-r", "requirements_geospacy.txt"],
9
- check=True
10
- )
11
-
12
-
13
- import streamlit as st
14
- from spacy import displacy
15
- import spacy
16
- import geospacy
17
- from PIL import Image
18
- import base64
19
- import sys
20
- import pandas as pd
21
- import en_core_web_md
22
- from spacy.tokens import Span, Doc, Token
23
- from utils import geoutil
24
- import llm_coding
25
- import urllib.parse
26
-
27
-
28
- colors = {'GPE': "#43c6fc", "LOC": "#fd9720", "RSE":"#a6e22d"}
29
- options = {"ents": ['GPE', 'LOC', "RSE"], "colors": colors}
30
-
31
- HTML_WRAPPER = """<div style="overflow-x: auto; border: none solid #a6e22d; border-radius: 0.25rem; padding: 1rem">{}</div>"""
32
- model = ""
33
-
34
- gpe_selected = "GPE"
35
- loc_selected = "LOC"
36
- rse_selected = "RSE"
37
-
38
- types = ""
39
-
40
- #BASE_URL = "http://localhost:8080/"
41
- BASE_URL = ""
42
-
43
-
44
-
45
- def set_header():
46
- LOGO_IMAGE = "tetis-1.png"
47
-
48
- st.markdown(
49
- """
50
- <style>
51
- .container {
52
- display: flex;
53
- }
54
- .logo-text {
55
- font-weight:700 !important;
56
- font-size:50px !important;
57
- color: #f9a01b !important;
58
- padding-left: 10px !important;
59
- }
60
- .logo-img {
61
- float:right;
62
- width: 28%;
63
- height: 28%;
64
- }
65
- </style>
66
- """,
67
- unsafe_allow_html=True
68
- )
69
- st.markdown(
70
- f"""
71
- <div class="container">
72
- <img class="logo-img" src="data:image/png;base64,{base64.b64encode(open(LOGO_IMAGE, "rb").read()).decode()}">
73
- <p class="logo-text">GeOspaCy</p>
74
- </div>
75
- """,
76
- unsafe_allow_html=True
77
- )
78
-
79
-
80
-
81
- def set_side_menu():
82
-
83
- global gpe_selected, loc_selected, rse_selected, model, types
84
- types =""
85
- params = st.experimental_get_query_params()
86
- # params = st.query_params
87
- # print(params, 777)
88
-
89
- st.sidebar.markdown("## Spacy Model")
90
- st.sidebar.markdown("You can **select** the values of the *spacy model* from Dropdown.")
91
- models = ['en_core_web_sm', 'en_core_web_md', 'en_core_web_lg', 'en_core_web_trf']
92
- if "model" in params:
93
- default_ix = models.index(params["model"][0])
94
- else:
95
- default_ix = models.index('en_core_web_sm')
96
- model = st.sidebar.selectbox('Spacy Model',models, index=default_ix)
97
-
98
- st.sidebar.markdown("## Spatial Entity Labels")
99
- st.sidebar.markdown("**Mark** the Spatial Entities you want to extract?")
100
- tpes = ""
101
- if "type" in params:
102
- tpes = params['type'][0]
103
-
104
- if "g" in tpes:
105
- gpe = st.sidebar.checkbox('GPE', value = True)
106
- else:
107
- gpe = st.sidebar.checkbox('GPE')
108
-
109
- if "l" in tpes:
110
- loc = st.sidebar.checkbox('LOC', value = True)
111
- else:
112
- loc = st.sidebar.checkbox('LOC')
113
- if "r" in tpes:
114
- rse = st.sidebar.checkbox('RSE', value = True)
115
- else:
116
- rse = st.sidebar.checkbox('RSE')
117
- if(gpe):
118
- gpe_selected ="GPE"
119
- types+="g"
120
-
121
- if(loc):
122
- loc_selected ="LOC"
123
- types+="l"
124
-
125
- if(rse):
126
- rse_selected ="RSE"
127
- types+="r"
128
-
129
-
130
-
131
- def set_input():
132
- params = st.experimental_get_query_params()
133
- # params = st.query_params
134
-
135
- if "text" not in params:
136
- text = st.text_area("Input unstructured text:", "")
137
- else:
138
- text = st.text_area("Enter the text to extract {Spatial Entities}", params["text"][0])
139
- if(st.button("Extract")):
140
-
141
- # return 'France has detected a highly pathogenic strain of bird flu in a pet shop near Paris, days after an identical outbreak in one of Corsica’s main cities.'
142
-
143
-
144
- return 'I would like to know where is the area between Burwood and Glebe. Pyrmont.'
145
- return '5 km east of Burwood. 3 km south of Glebe. Between Pyrmont and Glebe.'
146
- # return 'Between Burwood and Pyrmont.'
147
- # return 'Between Burwood and Glebe.'
148
- # return 'Between Burwood and Darling Harbour.'
149
- # return 'Between China and USA.'
150
- # return 'The Burwood city.'
151
- # text = "New York is north of Washington. Between Burwood and Pyrmont city."
152
- return text
153
-
154
- def set_selected_entities(doc):
155
- global gpe_selected, loc_selected, rse_selected, model
156
- ents = [ent for ent in doc.ents if ent.label_ == gpe_selected or ent.label_ == loc_selected or ent.label_ == rse_selected]
157
-
158
- doc.ents = ents
159
- return doc
160
-
161
- def extract_spatial_entities(text):
162
- # nlp = en_core_web_md.load()
163
-
164
- # nlp = spacy.load("en_core_web_md")
165
- # nlp.add_pipe("spatial_pipeline", after="ner")
166
- # doc = nlp(text)
167
- # doc = set_selected_entities(doc)
168
- # html = displacy.render(doc, style="ent", options=options)
169
- # html = html.replace("\n", "")
170
- # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
171
- # show_spatial_ent_table(doc, text)
172
-
173
- nlp = spacy.load("en_core_web_md") #####
174
- nlp.add_pipe("spatial_pipeline", after="ner")
175
- doc = nlp(text)
176
-
177
- # 分句处理
178
- sent_ents = []
179
- sent_texts = []
180
- sent_rse_id = []
181
- offset = 0 # 记录当前 token 偏移量
182
- sent_start_positions = [0] # 记录句子信息
183
- doc_copy = doc.copy() # 用于展示方程组合
184
- for sent in doc.sents:
185
-
186
- sent_doc = nlp(sent.text) # 逐句处理
187
- sent_doc = set_selected_entities(sent_doc) # 这里处理实体
188
- sent_texts.append(sent_doc.text)
189
-
190
- for ent in sent_doc.ents:
191
- sent_rse_id.append(ent._.rse_id)
192
- # **调整每个实体的索引,使其匹配完整文本**
193
- for ent in sent_doc.ents:
194
- new_ent = Span(doc, ent.start + offset, ent.end + offset, label=ent.label_)
195
- sent_ents.append(new_ent)
196
-
197
- offset += len(sent) # 更新偏移量
198
- sent_start_positions.append(sent_start_positions[-1] + len(sent)) # 记录句子起点
199
- # **创建新 Doc**
200
- final_doc = Doc(nlp.vocab, words=[token.text for token in doc], spaces=[token.whitespace_ for token in doc])
201
- for i in sent_start_positions: # 手动标记句子起始点
202
- if i < len(final_doc):
203
- final_doc[i].is_sent_start = True
204
- # **设置实体**
205
- final_doc.set_ents(sent_ents)
206
-
207
- for i in range(len(sent_rse_id)):
208
- final_doc.ents[i]._.rse_id = sent_rse_id[i]
209
- print(doc.ents[0].sent, '原始')
210
- doc = final_doc
211
- print(doc.ents[0].sent, '新')
212
- # 分句处理完毕
213
-
214
- # doc = set_selected_entities(doc)
215
- doc.to_disk("saved_doc.spacy")
216
-
217
-
218
-
219
-
220
- html = displacy.render(doc,style="ent", options = options)
221
- html = html.replace("\n","")
222
- st.write(HTML_WRAPPER.format(html),unsafe_allow_html=True)
223
- show_spatial_ent_table(doc, text)
224
-
225
- st.markdown("123123")
226
-
227
- show_sentence_selector_table(doc_copy)
228
-
229
- def show_sentence_selector_table(doc_copy):
230
- st.markdown("**______________________________________________________________________________________**")
231
- st.markdown("**Sentence Selector for Geographic Composition**")
232
-
233
- # 提取句子
234
- sentences = list(doc_copy.sents)
235
-
236
- # 构建表格数据
237
- rows = []
238
- for idx, sent in enumerate(sentences):
239
- sentence_text = sent.text.strip()
240
- # 生成跳转链接(定位到Tagger)
241
- url = BASE_URL + "Tagger?mode=geocombo&text=" + urllib.parse.quote(sentence_text)
242
- new_row = {
243
- 'Sr.': idx + 1,
244
- 'sentence': sentence_text,
245
- 'Select': f'<a target="_self" href="{url}">Select this sentence</a>'
246
- }
247
- rows.append(new_row)
248
-
249
- # 转为 DataFrame 并渲染为 HTML
250
- df = pd.DataFrame(rows)
251
- st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
252
-
253
-
254
-
255
- def show_spatial_ent_table(doc, text):
256
- global types
257
- if len(doc.ents) > 0:
258
- st.markdown("**______________________________________________________________________________________**")
259
- st.markdown("**Spatial Entities List**")
260
-
261
- # 初始化一个空 DataFrame
262
- df = pd.DataFrame(columns=['Sr.', 'entity', 'label', 'Map', 'GEOJson'])
263
- rows = [] # 用于存储所有行
264
-
265
- for ent in doc.ents:
266
- url_map = BASE_URL + "Tagger?map=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
267
- print(url_map, 'uuurrr')
268
- print(ent._.rse_id, 'pppp')
269
- url_json = BASE_URL + "Tagger?geojson=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
270
-
271
- # 创建新行
272
- new_row = {
273
- 'Sr.': len(rows) + 1,
274
- 'entity': ent.text,
275
- 'label': ent.label_,
276
- 'Map': f'<a target="_self" href="{url_map}">View</a>',
277
- 'GEOJson': f'<a target="_self" href="{url_json}">View</a>'
278
- }
279
-
280
- rows.append(new_row) # 将新行添加到列表中
281
-
282
- # 将所有行转为 DataFrame
283
- df = pd.DataFrame(rows)
284
-
285
- # 使用 Streamlit 显示 HTML 表格
286
- st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
287
-
288
- # params = st.experimental_get_query_params()
289
- # params = st.query_params
290
- # ase, level_1, level_2, level_3 = geoutil.get_ent(params["entity"][0])
291
- # print(geoutil.get_ent(params), 'ppppp')
292
-
293
- def set_header(): # tetis Geospacy LOGO
294
- LOGO_IMAGE = "title.jpg"
295
-
296
- st.markdown(
297
- """
298
- <style>
299
- .container {
300
- display: flex;
301
- }
302
- .logo-text {
303
- font-weight:700 !important;
304
- font-size:50px !important;
305
- color: #52aee3 !important;
306
- padding-left: 10px !important;
307
- }
308
- .logo-img {
309
- float:right;
310
- width: 10%;
311
- height: 10%;
312
- }
313
- </style>
314
- """,
315
- unsafe_allow_html=True
316
- )
317
- st.markdown(
318
- f"""
319
- <div class="container">
320
- <img class="logo-img" src="data:image/png;base64,{base64.b64encode(open(LOGO_IMAGE, "rb").read()).decode()}">
321
- <p class="logo-text">SpatialParse</p>
322
- </div>
323
- """,
324
- unsafe_allow_html=True
325
- )
326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
 
328
- def set_side_menu():
329
- global gpe_selected, loc_selected, rse_selected, model, types
330
- types = ""
331
- params = st.experimental_get_query_params()
332
- st.sidebar.markdown("## Deployment Method")
333
- st.sidebar.markdown("You can select the deployment method for the model.")
334
- deployment_options = ["API", "Local deployment"]
335
- use_local_model = st.sidebar.radio("Choose deployment method:", deployment_options, index=0) == "Local deployment"
336
-
337
- if use_local_model:
338
- local_model_path = st.sidebar.text_input("Enter local model path:", "")
339
-
340
- st.sidebar.markdown("## LLM Model")
341
- st.sidebar.markdown("You can **select** different *LLM model* powered by API.")
342
- models = ['Llama-3-8B', 'Mistral-7B-0.3', 'Gemma-2-10B', 'GPT-4o', 'Gemini Pro', 'Deepseek-R1', 'en_core_web_sm', 'en_core_web_md', 'en_core_web_lg', 'en_core_web_trf']
343
-
344
 
345
 
346
 
347
- if "model" in params:
348
- default_ix = models.index(params["model"][0])
349
- else:
350
- default_ix = models.index('GPT-4o')
351
 
 
 
 
 
352
 
353
 
354
 
355
- model = st.sidebar.selectbox('LLM Model', models, index=default_ix)
356
-
357
- st.sidebar.markdown("## Spatial Entity Labels")
358
 
359
- st.sidebar.markdown("Please **Mark** the Spatial Entities you want to extract.")
360
- tpes = ""
361
- if "type" in params:
362
- tpes = params['type'][0]
363
 
364
- st.sidebar.markdown("### Absolute Spatial Entity:")
365
- if "g" in tpes:
366
- gpe = st.sidebar.checkbox('GPE', value=True)
367
- else:
368
- gpe = st.sidebar.checkbox('GPE')
369
 
370
- if "l" in tpes:
371
- loc = st.sidebar.checkbox('LOC', value=True)
372
- else:
373
- loc = st.sidebar.checkbox('LOC')
374
 
375
- st.sidebar.markdown("### Relative Spatial Entity:")
 
 
 
 
376
 
377
- if "r" in tpes:
378
- rse = st.sidebar.checkbox('RSE', value=True)
379
- else:
380
- rse = st.sidebar.checkbox('RSE')
381
- if (gpe):
382
- gpe_selected = "GPE"
383
- types += "g"
384
 
385
- if (loc):
386
- loc_selected = "LOC"
387
- types += "l"
388
 
389
- if (rse):
390
- rse_selected = "RSE"
391
- types += "r"
 
 
 
 
392
 
 
 
 
393
 
 
 
 
394
 
395
 
396
 
397
- def main():
398
- global gpe_selected, loc_selected, rse_selected, model
399
- #print(displacy.templates.TPL_ENT)
400
- set_header()
401
- set_side_menu()
402
 
403
 
404
- text = set_input()
 
 
 
 
 
 
 
405
 
406
- if(text is not None):
407
- extract_spatial_entities(text)
408
- elif "text" in st.session_state:
409
- text = st.session_state.text
410
- extract_spatial_entities(text)
411
 
412
 
413
- if __name__ == '__main__':
414
- main()
415
 
416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
# Bootstrap step: make sure streamlit is importable before anything uses it.
def main():
    """Install streamlit with pip when it is missing, then import it.

    Raises:
        subprocess.CalledProcessError: if the pip installation exits non-zero.
    """
    # Function-scope imports: the file's top-level import block is commented
    # out, so nothing else brings these names into scope.
    import importlib
    import importlib.util
    import subprocess
    import sys

    # Probe for the package first. Importing streamlit before installing it
    # would fail on a missing package before pip ever got a chance to run.
    if importlib.util.find_spec("streamlit") is None:
        # "sys.executable -m pip" installs into the running interpreter's
        # environment; check=True surfaces a failed install immediately.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "streamlit"],
            check=True,
        )
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()

    import streamlit  # noqa: F401  (imported to confirm it is available)


main()
7
+
8
+
9
+
10
+
11
+ # import subprocess
12
+ # import importlib.util
13
+ # import os
14
+
15
+ # # 只在 geospacy 没有被安装时执行安装(避免重复装)
16
+ # if importlib.util.find_spec("geospacy") is None:
17
+ # subprocess.run(
18
+ # ["pip", "install", "--no-deps", "-r", "requirements_geospacy.txt"],
19
+ # check=True
20
+ # )
21
+
22
+
23
+ # import streamlit as st
24
+ # from spacy import displacy
25
+ # import spacy
26
+ # import geospacy
27
+ # from PIL import Image
28
+ # import base64
29
+ # import sys
30
+ # import pandas as pd
31
+ # import en_core_web_md
32
+ # from spacy.tokens import Span, Doc, Token
33
+ # from utils import geoutil
34
+ # import llm_coding
35
+ # import urllib.parse
36
+
37
+
38
+ # colors = {'GPE': "#43c6fc", "LOC": "#fd9720", "RSE":"#a6e22d"}
39
+ # options = {"ents": ['GPE', 'LOC', "RSE"], "colors": colors}
40
+
41
+ # HTML_WRAPPER = """<div style="overflow-x: auto; border: none solid #a6e22d; border-radius: 0.25rem; padding: 1rem">{}</div>"""
42
+ # model = ""
43
+
44
+ # gpe_selected = "GPE"
45
+ # loc_selected = "LOC"
46
+ # rse_selected = "RSE"
47
+
48
+ # types = ""
49
+
50
+ # #BASE_URL = "http://localhost:8080/"
51
+ # BASE_URL = ""
52
+
53
+
54
+
55
+ # def set_header():
56
+ # LOGO_IMAGE = "tetis-1.png"
57
+
58
+ # st.markdown(
59
+ # """
60
+ # <style>
61
+ # .container {
62
+ # display: flex;
63
+ # }
64
+ # .logo-text {
65
+ # font-weight:700 !important;
66
+ # font-size:50px !important;
67
+ # color: #f9a01b !important;
68
+ # padding-left: 10px !important;
69
+ # }
70
+ # .logo-img {
71
+ # float:right;
72
+ # width: 28%;
73
+ # height: 28%;
74
+ # }
75
+ # </style>
76
+ # """,
77
+ # unsafe_allow_html=True
78
+ # )
79
+ # st.markdown(
80
+ # f"""
81
+ # <div class="container">
82
+ # <img class="logo-img" src="data:image/png;base64,{base64.b64encode(open(LOGO_IMAGE, "rb").read()).decode()}">
83
+ # <p class="logo-text">GeOspaCy</p>
84
+ # </div>
85
+ # """,
86
+ # unsafe_allow_html=True
87
+ # )
88
+
89
+
90
+
91
+ # def set_side_menu():
92
+
93
+ # global gpe_selected, loc_selected, rse_selected, model, types
94
+ # types =""
95
+ # params = st.experimental_get_query_params()
96
+ # # params = st.query_params
97
+ # # print(params, 777)
98
+
99
+ # st.sidebar.markdown("## Spacy Model")
100
+ # st.sidebar.markdown("You can **select** the values of the *spacy model* from Dropdown.")
101
+ # models = ['en_core_web_sm', 'en_core_web_md', 'en_core_web_lg', 'en_core_web_trf']
102
+ # if "model" in params:
103
+ # default_ix = models.index(params["model"][0])
104
+ # else:
105
+ # default_ix = models.index('en_core_web_sm')
106
+ # model = st.sidebar.selectbox('Spacy Model',models, index=default_ix)
107
+
108
+ # st.sidebar.markdown("## Spatial Entity Labels")
109
+ # st.sidebar.markdown("**Mark** the Spatial Entities you want to extract?")
110
+ # tpes = ""
111
+ # if "type" in params:
112
+ # tpes = params['type'][0]
113
+
114
+ # if "g" in tpes:
115
+ # gpe = st.sidebar.checkbox('GPE', value = True)
116
+ # else:
117
+ # gpe = st.sidebar.checkbox('GPE')
118
+
119
+ # if "l" in tpes:
120
+ # loc = st.sidebar.checkbox('LOC', value = True)
121
+ # else:
122
+ # loc = st.sidebar.checkbox('LOC')
123
+ # if "r" in tpes:
124
+ # rse = st.sidebar.checkbox('RSE', value = True)
125
+ # else:
126
+ # rse = st.sidebar.checkbox('RSE')
127
+ # if(gpe):
128
+ # gpe_selected ="GPE"
129
+ # types+="g"
130
+
131
+ # if(loc):
132
+ # loc_selected ="LOC"
133
+ # types+="l"
134
+
135
+ # if(rse):
136
+ # rse_selected ="RSE"
137
+ # types+="r"
138
+
139
+
140
+
141
+ # def set_input():
142
+ # params = st.experimental_get_query_params()
143
+ # # params = st.query_params
144
+
145
+ # if "text" not in params:
146
+ # text = st.text_area("Input unstructured text:", "")
147
+ # else:
148
+ # text = st.text_area("Enter the text to extract {Spatial Entities}", params["text"][0])
149
+ # if(st.button("Extract")):
150
+
151
+ # # return 'France has detected a highly pathogenic strain of bird flu in a pet shop near Paris, days after an identical outbreak in one of Corsica’s main cities.'
152
+
153
+
154
+ # return 'I would like to know where is the area between Burwood and Glebe. Pyrmont.'
155
+ # return '5 km east of Burwood. 3 km south of Glebe. Between Pyrmont and Glebe.'
156
+ # # return 'Between Burwood and Pyrmont.'
157
+ # # return 'Between Burwood and Glebe.'
158
+ # # return 'Between Burwood and Darling Harbour.'
159
+ # # return 'Between China and USA.'
160
+ # # return 'The Burwood city.'
161
+ # # text = "New York is north of Washington. Between Burwood and Pyrmont city."
162
+ # return text
163
+
164
+ # def set_selected_entities(doc):
165
+ # global gpe_selected, loc_selected, rse_selected, model
166
+ # ents = [ent for ent in doc.ents if ent.label_ == gpe_selected or ent.label_ == loc_selected or ent.label_ == rse_selected]
167
+
168
+ # doc.ents = ents
169
+ # return doc
170
+
171
+ # def extract_spatial_entities(text):
172
+ # # nlp = en_core_web_md.load()
173
+
174
+ # # nlp = spacy.load("en_core_web_md")
175
+ # # nlp.add_pipe("spatial_pipeline", after="ner")
176
+ # # doc = nlp(text)
177
+ # # doc = set_selected_entities(doc)
178
+ # # html = displacy.render(doc, style="ent", options=options)
179
+ # # html = html.replace("\n", "")
180
+ # # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
181
+ # # show_spatial_ent_table(doc, text)
182
+
183
+ # nlp = spacy.load("en_core_web_md") #####
184
+ # nlp.add_pipe("spatial_pipeline", after="ner")
185
+ # doc = nlp(text)
186
+
187
+ # # 分句处理
188
+ # sent_ents = []
189
+ # sent_texts = []
190
+ # sent_rse_id = []
191
+ # offset = 0 # 记录当前 token 偏移量
192
+ # sent_start_positions = [0] # 记录句子信息
193
+ # doc_copy = doc.copy() # 用于展示方程组合
194
+ # for sent in doc.sents:
195
+
196
+ # sent_doc = nlp(sent.text) # 逐句处理
197
+ # sent_doc = set_selected_entities(sent_doc) # 这里处理实体
198
+ # sent_texts.append(sent_doc.text)
199
+
200
+ # for ent in sent_doc.ents:
201
+ # sent_rse_id.append(ent._.rse_id)
202
+ # # **调整每个实体的索引,使其匹配完整文本**
203
+ # for ent in sent_doc.ents:
204
+ # new_ent = Span(doc, ent.start + offset, ent.end + offset, label=ent.label_)
205
+ # sent_ents.append(new_ent)
206
+
207
+ # offset += len(sent) # 更新偏移量
208
+ # sent_start_positions.append(sent_start_positions[-1] + len(sent)) # 记录句子起点
209
+ # # **创建新 Doc**
210
+ # final_doc = Doc(nlp.vocab, words=[token.text for token in doc], spaces=[token.whitespace_ for token in doc])
211
+ # for i in sent_start_positions: # 手动标记句子起始点
212
+ # if i < len(final_doc):
213
+ # final_doc[i].is_sent_start = True
214
+ # # **设置实体**
215
+ # final_doc.set_ents(sent_ents)
216
+
217
+ # for i in range(len(sent_rse_id)):
218
+ # final_doc.ents[i]._.rse_id = sent_rse_id[i]
219
+ # print(doc.ents[0].sent, '原始')
220
+ # doc = final_doc
221
+ # print(doc.ents[0].sent, '新')
222
+ # # 分句处理完毕
223
+
224
+ # # doc = set_selected_entities(doc)
225
+ # doc.to_disk("saved_doc.spacy")
226
+
227
+
228
+
229
+
230
+ # html = displacy.render(doc,style="ent", options = options)
231
+ # html = html.replace("\n","")
232
+ # st.write(HTML_WRAPPER.format(html),unsafe_allow_html=True)
233
+ # show_spatial_ent_table(doc, text)
234
+
235
+ # st.markdown("123123")
236
+
237
+ # show_sentence_selector_table(doc_copy)
238
+
239
+ # def show_sentence_selector_table(doc_copy):
240
+ # st.markdown("**______________________________________________________________________________________**")
241
+ # st.markdown("**Sentence Selector for Geographic Composition**")
242
+
243
+ # # 提取句子
244
+ # sentences = list(doc_copy.sents)
245
+
246
+ # # 构建表格数据
247
+ # rows = []
248
+ # for idx, sent in enumerate(sentences):
249
+ # sentence_text = sent.text.strip()
250
+ # # 生成跳转链接(定位到Tagger)
251
+ # url = BASE_URL + "Tagger?mode=geocombo&text=" + urllib.parse.quote(sentence_text)
252
+ # new_row = {
253
+ # 'Sr.': idx + 1,
254
+ # 'sentence': sentence_text,
255
+ # 'Select': f'<a target="_self" href="{url}">Select this sentence</a>'
256
+ # }
257
+ # rows.append(new_row)
258
+
259
+ # # 转为 DataFrame 并渲染为 HTML
260
+ # df = pd.DataFrame(rows)
261
+ # st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
262
+
263
+
264
+
265
+ # def show_spatial_ent_table(doc, text):
266
+ # global types
267
+ # if len(doc.ents) > 0:
268
+ # st.markdown("**______________________________________________________________________________________**")
269
+ # st.markdown("**Spatial Entities List**")
270
+
271
+ # # 初始化一个空 DataFrame
272
+ # df = pd.DataFrame(columns=['Sr.', 'entity', 'label', 'Map', 'GEOJson'])
273
+ # rows = [] # 用于存储所有行
274
+
275
+ # for ent in doc.ents:
276
+ # url_map = BASE_URL + "Tagger?map=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
277
+ # print(url_map, 'uuurrr')
278
+ # print(ent._.rse_id, 'pppp')
279
+ # url_json = BASE_URL + "Tagger?geojson=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
280
+
281
+ # # 创建新行
282
+ # new_row = {
283
+ # 'Sr.': len(rows) + 1,
284
+ # 'entity': ent.text,
285
+ # 'label': ent.label_,
286
+ # 'Map': f'<a target="_self" href="{url_map}">View</a>',
287
+ # 'GEOJson': f'<a target="_self" href="{url_json}">View</a>'
288
+ # }
289
+
290
+ # rows.append(new_row) # 将新行添加到列表中
291
+
292
+ # # 将所有行转为 DataFrame
293
+ # df = pd.DataFrame(rows)
294
+
295
+ # # 使用 Streamlit 显示 HTML 表格
296
+ # st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
297
+
298
+ # # params = st.experimental_get_query_params()
299
+ # # params = st.query_params
300
+ # # ase, level_1, level_2, level_3 = geoutil.get_ent(params["entity"][0])
301
+ # # print(geoutil.get_ent(params), 'ppppp')
302
+
303
+ # def set_header(): # tetis Geospacy LOGO
304
+ # LOGO_IMAGE = "title.jpg"
305
+
306
+ # st.markdown(
307
+ # """
308
+ # <style>
309
+ # .container {
310
+ # display: flex;
311
+ # }
312
+ # .logo-text {
313
+ # font-weight:700 !important;
314
+ # font-size:50px !important;
315
+ # color: #52aee3 !important;
316
+ # padding-left: 10px !important;
317
+ # }
318
+ # .logo-img {
319
+ # float:right;
320
+ # width: 10%;
321
+ # height: 10%;
322
+ # }
323
+ # </style>
324
+ # """,
325
+ # unsafe_allow_html=True
326
+ # )
327
+ # st.markdown(
328
+ # f"""
329
+ # <div class="container">
330
+ # <img class="logo-img" src="data:image/png;base64,{base64.b64encode(open(LOGO_IMAGE, "rb").read()).decode()}">
331
+ # <p class="logo-text">SpatialParse</p>
332
+ # </div>
333
+ # """,
334
+ # unsafe_allow_html=True
335
+ # )
336
 
337
+
338
+ # def set_side_menu():
339
+ # global gpe_selected, loc_selected, rse_selected, model, types
340
+ # types = ""
341
+ # params = st.experimental_get_query_params()
342
+ # st.sidebar.markdown("## Deployment Method")
343
+ # st.sidebar.markdown("You can select the deployment method for the model.")
344
+ # deployment_options = ["API", "Local deployment"]
345
+ # use_local_model = st.sidebar.radio("Choose deployment method:", deployment_options, index=0) == "Local deployment"
346
+
347
+ # if use_local_model:
348
+ # local_model_path = st.sidebar.text_input("Enter local model path:", "")
349
+
350
+ # st.sidebar.markdown("## LLM Model")
351
+ # st.sidebar.markdown("You can **select** different *LLM model* powered by API.")
352
+ # models = ['Llama-3-8B', 'Mistral-7B-0.3', 'Gemma-2-10B', 'GPT-4o', 'Gemini Pro', 'Deepseek-R1', 'en_core_web_sm', 'en_core_web_md', 'en_core_web_lg', 'en_core_web_trf']
353
 
354
 
355
 
 
 
 
 
356
 
357
+ # if "model" in params:
358
+ # default_ix = models.index(params["model"][0])
359
+ # else:
360
+ # default_ix = models.index('GPT-4o')
361
 
362
 
363
 
 
 
 
364
 
365
+ # model = st.sidebar.selectbox('LLM Model', models, index=default_ix)
 
 
 
366
 
367
+ # st.sidebar.markdown("## Spatial Entity Labels")
 
 
 
 
368
 
369
+ # st.sidebar.markdown("Please **Mark** the Spatial Entities you want to extract.")
370
+ # tpes = ""
371
+ # if "type" in params:
372
+ # tpes = params['type'][0]
373
 
374
+ # st.sidebar.markdown("### Absolute Spatial Entity:")
375
+ # if "g" in tpes:
376
+ # gpe = st.sidebar.checkbox('GPE', value=True)
377
+ # else:
378
+ # gpe = st.sidebar.checkbox('GPE')
379
 
380
+ # if "l" in tpes:
381
+ # loc = st.sidebar.checkbox('LOC', value=True)
382
+ # else:
383
+ # loc = st.sidebar.checkbox('LOC')
 
 
 
384
 
385
+ # st.sidebar.markdown("### Relative Spatial Entity:")
 
 
386
 
387
+ # if "r" in tpes:
388
+ # rse = st.sidebar.checkbox('RSE', value=True)
389
+ # else:
390
+ # rse = st.sidebar.checkbox('RSE')
391
+ # if (gpe):
392
+ # gpe_selected = "GPE"
393
+ # types += "g"
394
 
395
+ # if (loc):
396
+ # loc_selected = "LOC"
397
+ # types += "l"
398
 
399
+ # if (rse):
400
+ # rse_selected = "RSE"
401
+ # types += "r"
402
 
403
 
404
 
 
 
 
 
 
405
 
406
 
407
+ # def main():
408
+ # global gpe_selected, loc_selected, rse_selected, model
409
+ # #print(displacy.templates.TPL_ENT)
410
+ # set_header()
411
+ # set_side_menu()
412
+
413
+
414
+ # text = set_input()
415
 
416
+ # if(text is not None):
417
+ # extract_spatial_entities(text)
418
+ # elif "text" in st.session_state:
419
+ # text = st.session_state.text
420
+ # extract_spatial_entities(text)
421
 
422
 
423
+ # if __name__ == '__main__':
424
+ # main()
425
 
426