Improved Glottolog retrieval
app.py CHANGED
@@ -3,7 +3,7 @@ import langcodes
 from requests_html import HTMLSession
 import urllib
 import requests
-
+session = HTMLSession()
 # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
 # Big TODO: collate all the results into a big dictionary? Then display that. Reduces if statements?
 # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
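Creating the `HTMLSession` once at module scope (the hunk above) lets the Glottolog helpers further down share a single session and its connection pool instead of opening a new one per lookup. A minimal sketch of that pattern, assuming only that `HTMLSession` behaves like a standard `requests.Session`; the helper name below is illustrative and not part of the commit:

    from requests_html import HTMLSession

    # One shared session for the whole app; HTMLSession subclasses requests.Session,
    # so repeated .get() calls can reuse the underlying HTTP connection.
    session = HTMLSession()

    def fetch_final_url(url):
        # Illustrative helper: follow redirects and report the final URL,
        # the same signal try_retrieving_glottolog_id relies on below.
        r = session.get(url)
        return r.html.url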
@@ -51,17 +51,20 @@ def pull_obsolete_codes(iso_code):
 
 def try_retrieving_glottolog_id(langtext):
     languoid_id = ""
-    session = HTMLSession()
     langtext_quoted = urllib.parse.quote(langtext)
     query_url=f"https://glottolog.org/glottolog?search={langtext_quoted}"
     glottolog_r= session.get(query_url)
     returned_url = glottolog_r.html.url
-
 
     if "languoid" in returned_url:
         last_section = returned_url.split("/")[-1]
         languoid_id = last_section
     return languoid_id
+
+def get_glottolog_json(languoid_id):
+    query_url=f"https://glottolog.org/resource/languoid/id/{languoid_id}.json"
+    glottolog_r = session.get(query_url)
+    return glottolog_r.json()
 
 def try_searching_vachan_engine(langtext):
     results_list = []
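The two helpers above chain together: the Glottolog search endpoint redirects to a languoid page when the query resolves to a single match, and the `.json` resource for that languoid can then be fetched with the new `get_glottolog_json`. A minimal usage sketch; the search term and the printing are illustrative, not part of the commit:

    languoid_id = try_retrieving_glottolog_id("French")  # illustrative search term
    if languoid_id:
        # Languoid IDs are short codes (four letters + four digits); the .json
        # resource mirrors https://glottolog.org/resource/languoid/id/<id>
        record = get_glottolog_json(languoid_id)
        print(record)
    else:
        print("Search did not redirect to a single languoid page")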
@@ -76,7 +79,7 @@ def try_searching_vachan_engine(langtext):
 def main():
     st.write("# Language code/tag search")
     st.write("Fed up with language tag confusion? Here's your one-stop shop!")
-    st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47
+    st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 tag according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
     st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
 
     # https://huggingface.co/blog/streamlit-spaces
@@ -121,15 +124,15 @@ def main():
             st.success(f"* Natural language search found the following BCP-47 tag: {lang}")
         except LookupError as e:
             st.error("## Result: failure!")
-            st.error(f"Unable to look up
+            st.error(f"Unable to look up BCP-47 tag. But all hope is not lost...")
             st.write(f"* You can also try https://r12a.github.io/app-subtags/")
-            st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
             lang = None
 
 
 
 
-    t_variant = None
+    t_variant = None
+    b_variant = None
 
 
     #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
@@ -184,15 +187,16 @@ def main():
     st.write(b_obsolete_codes)
 
     st.write(f"### Glottolog")
-
-
-
-
-
-
-
-
-
+    search_terms_for_glottolog = [langtext, t_variant, b_variant]
+    languoids = []
+    for search_term in search_terms_for_glottolog :
+        if search_term :
+            languoid_id = try_retrieving_glottolog_id(search_term )
+            if languoid_id:
+                if languoid_id not in languoids:
+                    st.write(f"**Glottolog Languoid ID:** Searching for '{search_term}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
+                    # get_glottolog_json(languoid_id)
+                    languoids.append(languoid_id)
 
     results_from_vachan = try_searching_vachan_engine(langtext)
     if results_from_vachan:
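In the loop added above, falsy search terms are skipped and the `languoids` list prevents the same hit from being reported twice when `langtext`, `t_variant`, and `b_variant` resolve to the same languoid; `get_glottolog_json` is wired up but still commented out. A hedged sketch of one way the JSON record could eventually be surfaced in the app (`st.json` is a standard Streamlit call; no particular fields of the Glottolog payload are assumed):

    # Sketch only, not part of this commit; streamlit is already imported as `st` in app.py.
    languoids = []
    for search_term in [langtext, t_variant, b_variant]:
        if not search_term:        # skip None / empty variants
            continue
        languoid_id = try_retrieving_glottolog_id(search_term)
        if languoid_id and languoid_id not in languoids:   # avoid repeating the same hit
            languoids.append(languoid_id)
            st.write(f"**Glottolog Languoid ID:** [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
            st.json(get_glottolog_json(languoid_id))       # hypothetical: render the raw JSON record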