Upload lang_codes.py
Browse files- lang_codes.py +142 -0
lang_codes.py
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
File: lang_codes.py
|
3 |
+
|
4 |
+
Description: Language codes (e.g. used by Tesseract for OCR)
|
5 |
+
|
6 |
+
Author: Didier Guillevic
|
7 |
+
Date: 2024-11-23
|
8 |
+
"""
|
9 |
+
|
10 |
+
# Tesseract language codes:
|
11 |
+
# - https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
|
12 |
+
# - https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes
|
13 |
+
|
14 |
+
# Mapping from a human-readable language (or module) name to the
# corresponding Tesseract language code.
#
# Notes:
# - Entries are listed in alphabetical order of the *code* (the value),
#   not of the display name (the key).
# - 'equ' and 'osd' are not languages: they are the math/equation
#   detection module and the orientation/script detection module.
# - The spelling "Cyrilic" in two keys is kept exactly as is: keys are
#   runtime strings, and callers may look entries up by them
#   (NOTE(review): confirm with callers before correcting to "Cyrillic").
tesseract_lang_codes = {
    'Afrikaans': 'afr',
    'Amharic': 'amh',
    'Arabic': 'ara',
    'Assamese': 'asm',
    'Azerbaijani': 'aze',
    'Azerbaijani - Cyrilic': 'aze_cyrl',
    'Belarusian': 'bel',
    'Bengali': 'ben',
    'Tibetan': 'bod',
    'Bosnian': 'bos',
    'Breton': 'bre',
    'Bulgarian': 'bul',
    'Catalan; Valencian': 'cat',
    'Cebuano': 'ceb',
    'Czech': 'ces',
    'Chinese - Simplified': 'chi_sim',
    'Chinese - Traditional': 'chi_tra',
    'Cherokee': 'chr',
    'Corsican': 'cos',
    'Welsh': 'cym',
    'Danish': 'dan',
    'Danish - Fraktur (contrib)': 'dan_frak',
    'German': 'deu',
    'German - Fraktur (contrib)': 'deu_frak',
    'German (Fraktur Latin)': 'deu_latf',
    'Dzongkha': 'dzo',
    'Greek, Modern (1453-)': 'ell',
    'English': 'eng',
    'English, Middle (1100-1500)': 'enm',
    'Esperanto': 'epo',
    'Math / equation detection module': 'equ',
    'Estonian': 'est',
    'Basque': 'eus',
    'Faroese': 'fao',
    'Persian': 'fas',
    'Filipino (old - Tagalog)': 'fil',
    'Finnish': 'fin',
    'French': 'fra',
    'German - Fraktur (now deu_latf)': 'frk',
    'French, Middle (ca.1400-1600)': 'frm',
    'Western Frisian': 'fry',
    'Scottish Gaelic': 'gla',
    'Irish': 'gle',
    'Galician': 'glg',
    'Greek, Ancient (to 1453) (contrib)': 'grc',
    'Gujarati': 'guj',
    'Haitian; Haitian Creole': 'hat',
    'Hebrew': 'heb',
    'Hindi': 'hin',
    'Croatian': 'hrv',
    'Hungarian': 'hun',
    'Armenian': 'hye',
    'Inuktitut': 'iku',
    'Indonesian': 'ind',
    'Icelandic': 'isl',
    'Italian': 'ita',
    'Italian - Old': 'ita_old',
    'Javanese': 'jav',
    'Japanese': 'jpn',
    'Kannada': 'kan',
    'Georgian': 'kat',
    'Georgian - Old': 'kat_old',
    'Kazakh': 'kaz',
    'Central Khmer': 'khm',
    'Kirghiz; Kyrgyz': 'kir',
    'Kurmanji (Kurdish - Latin Script)': 'kmr',
    'Korean': 'kor',
    'Korean (vertical)': 'kor_vert',
    'Kurdish (Arabic Script)': 'kur',
    'Lao': 'lao',
    'Latin': 'lat',
    'Latvian': 'lav',
    'Lithuanian': 'lit',
    'Luxembourgish': 'ltz',
    'Malayalam': 'mal',
    'Marathi': 'mar',
    'Macedonian': 'mkd',
    'Maltese': 'mlt',
    'Mongolian': 'mon',
    'Maori': 'mri',
    'Malay': 'msa',
    'Burmese': 'mya',
    'Nepali': 'nep',
    'Dutch; Flemish': 'nld',
    'Norwegian': 'nor',
    'Occitan (post 1500)': 'oci',
    'Oriya': 'ori',
    'Orientation and script detection module': 'osd',
    'Panjabi; Punjabi': 'pan',
    'Polish': 'pol',
    'Portuguese': 'por',
    'Pushto; Pashto': 'pus',
    'Quechua': 'que',
    'Romanian; Moldavian; Moldovan': 'ron',
    'Russian': 'rus',
    'Sanskrit': 'san',
    'Sinhala; Sinhalese': 'sin',
    'Slovak': 'slk',
    'Slovak - Fraktur (contrib)': 'slk_frak',
    'Slovenian': 'slv',
    'Sindhi': 'snd',
    'Spanish; Castilian': 'spa',
    'Spanish; Castilian - Old': 'spa_old',
    'Albanian': 'sqi',
    'Serbian': 'srp',
    'Serbian - Latin': 'srp_latn',
    'Sundanese': 'sun',
    'Swahili': 'swa',
    'Swedish': 'swe',
    'Syriac': 'syr',
    'Tamil': 'tam',
    'Tatar': 'tat',
    'Telugu': 'tel',
    'Tajik': 'tgk',
    'Tagalog (new - Filipino)': 'tgl',
    'Thai': 'tha',
    'Tigrinya': 'tir',
    'Tonga': 'ton',
    'Turkish': 'tur',
    'Uighur; Uyghur': 'uig',
    'Ukrainian': 'ukr',
    'Urdu': 'urd',
    'Uzbek': 'uzb',
    'Uzbek - Cyrilic': 'uzb_cyrl',
    'Vietnamese': 'vie',
    'Yiddish': 'yid',
    'Yoruba': 'yor',
}
|