Spaces:
Build error
Build error
Upload gradio_app.py
Browse files- gradio_app.py +223 -0
gradio_app.py
ADDED
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import gradio as gr
|
3 |
+
import spacy
|
4 |
+
from spacy import displacy
|
5 |
+
|
6 |
+
from bib_tokenizers import create_references_tokenizer
|
7 |
+
|
8 |
+
|
9 |
+
# Load the trained pipeline that detects bibliography-item boundaries.
# NOTE(review): the initial `nlp = None` is immediately overwritten and is
# redundant — kept as-is.
nlp = None
nlp = spacy.load("spacy-pipelines/model-best")
# return score for each token:
# with threshold set to zero each suggested span is returned, and span == token,
# because suggester is configured to suggest spans with len(span) == 1:
# [components.spancat.suggester]
# @misc = "spacy.ngram_suggester.v1"
# sizes = [1]
nlp.get_pipe("spancat").cfg["threshold"] = 0.0  # keep every suggested span so each token carries a score
print(nlp.get_pipe("spancat").cfg)
|
19 |
+
|
20 |
+
|
21 |
+
def create_bib_item_start_scorer_for_doc(doc, spanskey="sc"):
    """Build a scorer mapping a character offset to a bib-item-start score.

    Args:
        doc: a processed Doc whose span group ``doc.spans[spanskey]`` holds
            exactly one single-token span per token (see the suggester config
            note above) with per-span scores in ``span_group.attrs["scores"]``.
        spanskey: key of the span group to read (default "sc").

    Returns:
        ``scorer(char_offset, fuzzy_in_tokens=(0, 0))`` returning
        ``(span, score)`` for the token covering ``char_offset``, where the
        score is the maximum over a small token window around that token.

    Raises:
        KeyError: if ``char_offset`` is not covered by any token's span.
    """
    span_group = doc.spans[spanskey]
    assert not span_group.has_overlap
    assert len(span_group) == len(
        doc
    ), "Check suggester config and the spancat threshold to make sure that spangroup contains single token span for each token"

    # Map every character offset covered by a span to that span's token index.
    spans_idx = {
        offset: span.start
        for span in span_group
        for offset in range(span.start_char, span.end_char + 1)
    }

    def scorer(char_offset, fuzzy_in_tokens=(0, 0)):
        i = spans_idx[char_offset]

        span = span_group[i]
        assert i == span.start

        # fuzziness might improve fault tolerance if the model made a small mistake,
        # e.g., if a number from prev line is classified as "citation number",
        # see example at https://www.deeplearningbook.org/contents/bib.html
        # if fuzzy == (0, 0), it returns the score for the selected span only.
        # BUGFIX: the window bound was `i < len(doc.text)` — a *character*
        # count — which let forward fuzz index past the scores list near the
        # end of the doc (IndexError). Bound by the number of spans instead,
        # and use a distinct variable so the outer `i` is not shadowed.
        return span, max(
            span_group.attrs["scores"][j]
            for j in range(i - fuzzy_in_tokens[0], i + fuzzy_in_tokens[1] + 1)
            if 0 <= j < len(span_group)
        )

    return scorer
|
52 |
+
|
53 |
+
|
54 |
+
# Blank English pipeline sharing the custom reference tokenizer; used to
# tokenize the original (non-normalized, '\n'-containing) input string.
nlp_blank = spacy.blank("en")
# create_references_tokenizer() returns a factory; calling it with the blank
# pipeline yields the tokenizer instance — presumably mirroring how the
# trained pipeline was tokenized (TODO confirm against bib_tokenizers).
nlp_blank.tokenizer = create_references_tokenizer()(nlp_blank)
|
56 |
+
|
57 |
+
|
58 |
+
def split_up_references(
    references: str, is_eol_mode=False, nlp=nlp, nlp_blank=nlp_blank
):
    """
    Split a references section into one sentence per bibliography item.

    Args:
        references - a references section, ideally without a header
        is_eol_mode - when True, each input line's start is scored with the
            spancat and begins a new sentence only if its score clears a
            threshold; when False, the model's sentence boundaries are copied
        nlp - a model that splits up references into separate sentences
        nlp_blank - a blank nlp with the same tokenizer/language

    Returns:
        a Doc over the original string with sentence starts and entities
        transferred from the model's output.
    """

    normalized_references = references.replace("\n", " ")

    # the model trained on 'normalized' references - the ones without '\n'
    doc = nlp(normalized_references)

    # 'transfer' annotations from doc without '\n' (normalized references) to the target doc created from the original input string
    # the problem here is that docs differ in a number of tokens
    # however, it should be easy to align on characters level because both '\n' and ' ' are whitespace, so spans have the same boundaries

    target_doc = nlp_blank(references)
    # map each character offset to the index of the target-doc token covering it
    target_tokens_idx = {
        offset: t.i for t in target_doc for offset in range(t.idx, t.idx + len(t))
    }

    # senter annotations: reset so only the first token starts a sentence,
    # then selectively mark further starts below
    for i, t in enumerate(target_doc):
        t.is_sent_start = i == 0
    if is_eol_mode:
        # use SpanCat scores to set sentence boundaries on the target doc
        char_offset = 0
        f = io.StringIO(references)
        token_scorer = create_bib_item_start_scorer_for_doc(doc)
        threshold = 0.2
        lines = [line for line in f]
        lines_len_in_tokens = [
            _len for _len in map(lambda line: len(nlp_blank.tokenizer(line)), lines)
        ]
        for line_num, line in enumerate(lines):
            # fuzz window: roughly a quarter of the previous/current line, in tokens
            fuzzy = (
                0 if line_num == 0 else lines_len_in_tokens[line_num - 1] // 4,
                lines_len_in_tokens[line_num] // 4,
            )
            span, score = token_scorer(char_offset, fuzzy_in_tokens=fuzzy)
            print(span, score)
            if score > threshold:
                # NOTE(review): assumes char_offset always lands on a token
                # start — a line beginning with whitespace would raise
                # KeyError here; verify against the tokenizer's behavior
                target_doc[target_tokens_idx[char_offset]].is_sent_start = True
            char_offset += len(line)
    else:
        # copy SentenceRecognizer annotations from doc without '\n' to the target doc
        for t in doc:
            if t.is_sent_start:
                target_doc[target_tokens_idx[t.idx]].is_sent_start = True

    # copy ner annotations:
    target_doc.ents = [
        target_doc.char_span(ent.start_char, ent.end_char, ent.label_)
        for ent in doc.ents
        # remove entities crossing sentence boundaries
        if not any([t.is_sent_start for t in ent if t.i != ent.start])
    ]

    return target_doc
|
120 |
+
|
121 |
+
|
122 |
+
def text_analysis(text, is_eol_mode):
    """Parse a bibliography section and render each item as displaCy entity HTML.

    Args:
        text: raw bibliography text from the UI textbox.
        is_eol_mode: forwarded to split_up_references (one bib item per line).

    Returns:
        An HTML string: one displaCy entity rendering per detected bib item,
        wrapped in a scrollable container.
    """
    parsed_doc = split_up_references(
        text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
    )

    rendered_items = []
    for item_num, sentence in enumerate(parsed_doc.sents, start=1):
        item_doc = sentence.as_doc()
        item_doc.user_data = {"title": f"***** Bib Item {item_num}: *****"}
        rendered_items.append(displacy.render(item_doc, style="ent"))

    return (
        "<div style='max-width:100%; max-height:360px; overflow:auto'>"
        + "".join(rendered_items)
        + "</div>"
    )
|
142 |
+
|
143 |
+
|
144 |
+
# ---------------------------------------------------------------------------
# Gradio UI: a textbox for the raw bibliography, a checkbox toggling
# one-bibitem-per-line mode, and an HTML pane showing the parsed items.
# ---------------------------------------------------------------------------
demo = gr.Blocks()
with demo:

    textbox = gr.components.Textbox(
        label="Unparsed Bibliography Section",
        placeholder="Enter bibliography here...",
        lines=20,
    )
    is_eol_mode = gr.components.Checkbox(
        label="a line does not contain more than one bibitem (Multiline bibitems are supported regardless of this choice)"
    )
    html = gr.components.HTML(label="Parsed Bib Items")
    # re-run the analysis whenever either the text or the mode changes
    textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
    is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])

    # Sample bibliographies in several citation styles (numbered, author-year,
    # parenthesized, alpha labels, and a long astronomy reference list).
    gr.Examples(
        examples=[
            [
                """[1] B. Foxman, R. Barlow, H. D'Arcy, B. Gillespie, and J. D. Sobel, "Urinary tract infection: self-reported incidence and associated costs," Ann Epidemiol, vol. 10, pp. 509-515, 2000. [2] B. Foxman, "Epidemiology of urinary tract infections: incidence, morbidity, and economic costs," Am J Med, vol. 113, pp. 5-13, 2002. [3] L. Nicolle, "Urinary tract infections in the elderly," Clin Geriatr Med, vol. 25, pp. 423-436, 2009."""
            ],
            [
                """Barth, Fredrik, ed.
1969 Ethnic groups and boundaries: The social organization of culture difference. Oslo: Scandinavian University Press.
Bondokji, Neven
2016 The Expectation Gap in Humanitarian Operations: Field Perspectives from Jordan. Asian Journal of Peace Building 4(1):1-28.
Bourdieu, Pierre
The forms of capital In Handbook of Theory and Research for the Sociology of Education. J. Richardson, ed. Pp. 241-258. New York: Greenwood Publishesrs.
Carrion, Doris
2015 Are Syrian Refguees a Security Threat to the MIddle East Vol. 2016. London Reuters.
CFR
2016 The Global Humanitarian Regime: Priorities and Prospects for Reform. Council on Foerign Relations, International Institutues and Global Governance Program"""
            ],
            [
                """(2) Hofmann, M.H. et al. Aberrant splicing caused by single nucleotide polymorphism c.516G>T [Q172H], a marker of CYP2B6*6, is responsible for decreased expression and activity of CYP2B6 in liver. J Pharmacol Exp Ther 325, 284-92 (2008).
(3) Zanger, U.M. & Klein, K. Pharmacogenetics of cytochrome P450 2B6 (CYP2B6): advances on polymorphisms, mechanisms, and clinical relevance. Front Genet 4, 24 (2013).
(4) Holzinger, E.R. et al. Genome-wide association study of plasma efavirenz pharmacokinetics in AIDS Clinical Trials Group protocols implicates several CYP2B6 variants. Pharmacogenet Genomics 22, 858-67 (2012).
"""
            ],
            [
                """[Ein05] Albert Einstein. Zur Elektrodynamik bewegter K ̈orper. (German)
[On the electrodynamics of moving bodies]. Annalen der Physik,
322(10):891–921, 1905.
[GMS93] Michel Goossens, Frank Mittelbach, and Alexander Samarin. The LATEX Companion. Addison-Wesley, Reading, Massachusetts, 1993.
[Knu] Donald Knuth. Knuth: Computers and typesetting."""
            ],
            [
                """References
Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
Beuther, H., Walsh, A. J., Thorwirth, S., et al. 2007, A&A, 466, 989
Brogan, C. L., Hunter, T. R., Cyganowski, C. J., et al. 2011, ApJ, 739, L16
Brown, A. T., Little, L. T., MacDonald, G. H., Riley, P. W., & Matheson, D. N.
1981, MNRAS, 195, 607
Brown, R. D. & Cragg, D. M. 1991, ApJ, 378, 445
Carrasco-González, C., Sanna, A., Rodríguez-Kamenetzky, A., et al. 2021, ApJ,
914, L1
Cesaroni, R., Walmsley, C. M., & Churchwell, E. 1992, A&A, 256, 618
Cheung, A. C., Rank, D. M., Townes, C. H., Thornton, D. D., & Welch, W. J.
1968, Phys. Rev. Lett., 21, 1701
Churchwell, E., Babler, B. L., Meade, M. R., et al. 2009, PASP, 121, 213
Cohen, R. J. & Brebner, G. C. 1985, MNRAS, 216, 51P
Comito, C., Schilke, P., Endesfelder, U., Jiménez-Serra, I., & Martín-Pintado, J.
2007, A&A, 469, 207
Curiel, S., Ho, P. T. P., Patel, N. A., et al. 2006, ApJ, 638, 878
Danby, G., Flower, D. R., Valiron, P., Schilke, P., & Walmsley, C. M. 1988,
MNRAS, 235, 229
De Buizer, J. M., Liu, M., Tan, J. C., et al. 2017, ApJ, 843, 33
De Buizer, J. M., Radomski, J. T., Telesco, C. M., & Piña, R. K. 2003, ApJ, 598,
1127
Dzib, S., Loinard, L., Rodríguez, L. F., Mioduszewski, A. J., & Torres, R. M.
2011, ApJ, 733, 71
Flower, D. R., Offer, A., & Schilke, P. 1990, MNRAS, 244, 4P
Galván-Madrid, R., Keto, E., Zhang, Q., et al. 2009, ApJ, 706, 1036"""
            ],
        ],
        inputs=textbox,
    )
# Bind on all interfaces for container deployment; share=False keeps it local.
demo.launch(share=False, server_name="0.0.0.0", server_port=7080)
|