Spaces:
Runtime error
Runtime error
File size: 4,885 Bytes
68a8c29 6570b48 25a3c87 6390590 1dcc788 58b95fa 5a1315d 9989672 1dcc788 d2e9829 39897d9 a0fdec6 abed01c 1392687 39897d9 1392687 39897d9 e3d850e ff7c666 67daf03 cde8835 e3ca56d 726336c cde8835 e3d850e 21247cf 606d796 a397155 0fef655 d6aa39c 0fef655 947dc2d 7e88eb4 a8b6772 0fef655 2f590b1 16fc4ca 92a84ae 9989672 25a3c87 5d305df 25a3c87 5d305df 92a84ae dedac74 8329262 b65ecd9 878ffe0 55f8482 fa7755f 1392687 f023ff1 878ffe0 0005cf0 fa7755f 55f8482 fa7755f 55f8482 1392687 0ca7b2b fa7755f 92a84ae 10bed1d 59b8b02 9c98688 10bed1d 3792961 10bed1d 8492c38 10bed1d 3792961 537dd73 92a84ae ca54e5d fa7755f 07e3fb8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
from urllib.parse import quote_plus

import langcodes
import streamlit as st
from requests_html import HTMLSession
# --- Page header and usage instructions -------------------------------------
st.write("# Language code/tag search")
st.write("Fed up with language tag confusion? Here's your one-stop shop!")
st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English`, and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")

# https://huggingface.co/blog/streamlit-spaces
# https://github.com/psf/requests-html
# https://docs.streamlit.io/library/api-reference/write-magic/st.write

# Raw user query, stripped of surrounding whitespace. It may be a tag/code
# ("en", "eng") or a natural-language name ("English").
langtext = st.text_input("Language Code/Tag Lookup using langcodes", "english").strip()

# --- Step 1: report whether the input is a valid tag as-is -------------------
st.write("Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
if langcodes.tag_is_valid(langtext):
    st.write(f"...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
else:
    st.write(f"...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")

# --- Step 2: try to parse the input directly as a language tag ---------------
# `lang` ends up as a langcodes Language object, or None when the direct
# lookup failed or produced an "unknown" language.
try:
    lang = langcodes.Language.get(langtext)
except langcodes.LanguageTagError:
    st.write("Could not lookup code directly, attempting to search for it as a natural language string.")
    lang = None
else:
    # A well-formed but unassigned tag still parses; its display name then
    # contains "unknown", which is treated here as a failed direct lookup.
    if "unknown" in lang.display_name().lower():
        st.write(f"Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
        lang = None

# --- Step 3: fall back to a natural-language name search ---------------------
if lang is None:
    try:
        found = langcodes.find(langtext)
    except LookupError:
        st.write("Unable to look up language code.")
        st.write("Try also: https://r12a.github.io/app-subtags/")
        # The query is free-form user text: URL-encode it so that spaces and
        # other reserved characters do not truncate or break the link.
        st.write(f"Try also: https://glottolog.org/glottolog?search={quote_plus(langtext)}")
        lang = None
    else:
        lang = found
        st.write(f"natural language search found the following BCP-47 tag: {lang}")
def pull_obsolete_codes(iso_code: str) -> dict:
    """Scrape the SIL ISO 639-3 page for ``iso_code`` and collect the codes it lists.

    The page at ``https://iso639-3.sil.org/code/<iso_code>`` is fetched and
    every element matching the CSS class ``views-field-nothing`` is scanned
    line by line.

    Args:
        iso_code: Code appended to the SIL URL to select the page to fetch.

    Returns:
        A dict keyed by whichever of the labels ``"639-1"``, ``"639-2/B"``,
        ``"639-2/T"`` and ``"639-3"`` appear on a line that also contains a
        colon; each value is the last whitespace-separated token of that
        line. The dict is empty when nothing matched.

    Note:
        Errors raised by the HTTP request are not caught here and propagate
        to the caller.
    """
    code_labels = ("639-1", "639-2/B", "639-2/T", "639-3")
    obsolete_codes = {}

    # Use the session as a context manager so its pooled connections are
    # released when we are done; the script is re-run on every interaction,
    # so an unclosed session would leak on each call.
    with HTMLSession() as session:
        r = session.get(f"https://iso639-3.sil.org/code/{iso_code}")
        # https://www.w3schools.com/cssref/css_selectors.asp
        for found_element in r.html.find(".views-field-nothing", clean=True):
            for line in found_element.text.splitlines():
                # Only "label: value" style lines are of interest.
                if ":" not in line:
                    continue
                for obsolete_code_name in code_labels:
                    if obsolete_code_name in line:
                        obsolete_codes[obsolete_code_name] = line.split()[-1]

    return obsolete_codes
# --- Results page: only rendered when one of the lookups produced a language --
if lang is not None:
    broader_tags = lang.broader_tags()

    # to_alpha3() raises LookupError when langcodes knows no three-letter code
    # for the language. Treat that as "no alpha3 code available" and skip the
    # sections that depend on it instead of crashing the whole page.
    try:
        t_variant = lang.to_alpha3(variant='T')
        b_variant = lang.to_alpha3(variant='B')
    except LookupError:
        t_variant = b_variant = None

    st.write("## Results")
    st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
    st.write("Breakdown of tag components:")
    st.write(lang.describe())
    st.write(f"Display name for {lang}: {lang.display_name()}")
    st.write(f"Autonym for {lang}: {lang.autonym()}")
    st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{langcodes.standardize_tag(lang)}`")

    st.write("## Further Information:")
    st.write("Broader tags for this language, if any:")
    st.write(broader_tags)
    st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}")

    if t_variant is None:
        st.write(f"No three-letter ('alpha3') code is known for {lang}, so the Glottolog and ISO 639 lookups are skipped.")
    else:
        st.write(f"https://glottolog.org/glottolog?search={t_variant} may be of interest, with links to Ethnologue, etc. If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}")
        # ethnologue prefers T for german (deu), and T for French

        st.write("## Older Codes")
        st.write(f"ISO 639-3 'alpha3' code, 'terminology' variant (deprecated): {t_variant}")
        st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' variant (deprecated): {b_variant}")
        st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}")
        st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")

        # Scrape the T-variant page first. Fall back to the B-variant page
        # only when it is a different code: re-fetching the same URL would
        # just repeat the request that already came back empty.
        scraped_code = t_variant
        obsolete_codes = pull_obsolete_codes(t_variant)
        if not obsolete_codes and b_variant != t_variant:
            scraped_code = b_variant
            obsolete_codes = pull_obsolete_codes(b_variant)

        if obsolete_codes:
            st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{scraped_code}:")
            st.write(obsolete_codes)
|