KuangDW commited on
Commit
2aebc50
·
1 Parent(s): 6f67103

add laser tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +1 -1
  2. laser/.gitignore +0 -7
  3. laser/tools-external/fastBPE/LICENSE +21 -0
  4. laser/tools-external/fastBPE/MANIFEST.in +3 -0
  5. laser/tools-external/fastBPE/README.md +83 -0
  6. laser/tools-external/fastBPE/build/lib.linux-x86_64-cpython-37/fastBPE.cpython-37m-x86_64-linux-gnu.so +3 -0
  7. laser/tools-external/fastBPE/fastBPE.egg-info/PKG-INFO +91 -0
  8. laser/tools-external/fastBPE/fastBPE.egg-info/SOURCES.txt +12 -0
  9. laser/tools-external/fastBPE/fastBPE.egg-info/dependency_links.txt +1 -0
  10. laser/tools-external/fastBPE/fastBPE.egg-info/top_level.txt +1 -0
  11. laser/tools-external/fastBPE/fastBPE/fastBPE.cpp +0 -0
  12. laser/tools-external/fastBPE/fastBPE/fastBPE.hpp +692 -0
  13. laser/tools-external/fastBPE/fastBPE/fastBPE.pyx +24 -0
  14. laser/tools-external/fastBPE/fastBPE/main.cc +43 -0
  15. laser/tools-external/fastBPE/setup.py +49 -0
  16. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  17. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
  18. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
  19. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
  20. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.en +121 -0
  21. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
  22. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
  23. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
  24. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
  25. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
  26. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
  27. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
  28. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
  29. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
  30. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
  31. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
  32. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.pt +3 -0
  33. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
  34. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
  35. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
  36. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
  37. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sv +46 -0
  38. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ta +276 -0
  39. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
  40. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
  41. laser/tools-external/moses-tokenizer/tokenizer/basic-protected-patterns +5 -0
  42. laser/tools-external/moses-tokenizer/tokenizer/deescape-special-chars.perl +22 -0
  43. laser/tools-external/moses-tokenizer/tokenizer/detokenizer.perl +373 -0
  44. laser/tools-external/moses-tokenizer/tokenizer/lowercase.perl +14 -0
  45. laser/tools-external/moses-tokenizer/tokenizer/normalize-punctuation.perl +90 -0
  46. laser/tools-external/moses-tokenizer/tokenizer/remove-non-printing-char.perl +22 -0
  47. laser/tools-external/moses-tokenizer/tokenizer/tokenizer.perl +563 -0
  48. laser/tools-external/sentencepiece-master/.github/dependabot.yml +23 -0
  49. laser/tools-external/sentencepiece-master/.github/workflows/cifuzz.yml +30 -0
  50. laser/tools-external/sentencepiece-master/.github/workflows/cmake.yml +86 -0
app.py CHANGED
@@ -65,7 +65,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
65
  print(f"Using device: {device}")
66
  # Load models once
67
  print("Loading models...")
68
- model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
69
  tokenizer = AutoTokenizer.from_pretrained(model_id)
70
  model = AutoModelForCausalLM.from_pretrained(
71
  model_id,
 
65
  print(f"Using device: {device}")
66
  # Load models once
67
  print("Loading models...")
68
+ model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
69
  tokenizer = AutoTokenizer.from_pretrained(model_id)
70
  model = AutoModelForCausalLM.from_pretrained(
71
  model_id,
laser/.gitignore CHANGED
@@ -1,12 +1,5 @@
1
  source/__pycache__
2
  source/lib/__pycache__
3
- models
4
- tools-external
5
- tasks/mldoc/MLDoc
6
- tasks/bucc/downloaded
7
- tasks/similarity/dev/
8
- tasks/xnli/XNLI-1.0*
9
- tasks/xnli/multinli_1.0*
10
  .??*swp
11
  .idea
12
  __pycache__
 
1
  source/__pycache__
2
  source/lib/__pycache__
 
 
 
 
 
 
 
3
  .??*swp
4
  .idea
5
  __pycache__
laser/tools-external/fastBPE/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License
2
+
3
+ Copyright (c) 2019 Guillaume Lample,Timothée Lacroix
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
laser/tools-external/fastBPE/MANIFEST.in ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ include fastBPE/*.cc
2
+ include fastBPE/*.hpp
3
+ include fastBPE/*.pyx
laser/tools-external/fastBPE/README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # fastBPE
3
+
4
+ C++ implementation of [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/abs/1508.07909), with Python API.
5
+
6
+ ## Installation
7
+
8
+ Compile with:
9
+ ```
10
+ g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
11
+ ```
12
+
13
+ ## Usage:
14
+
15
+ ### List commands
16
+ ```
17
+ ./fast
18
+ usage: fastbpe <command> <args>
19
+
20
+ The commands supported by fastBPE are:
21
+
22
+ getvocab input1 [input2] extract the vocabulary from one or two text files
23
+ learnbpe nCodes input1 [input2] learn BPE codes from one or two text files
24
+ applybpe output input codes [vocab] apply BPE codes to a text file
25
+ applybpe_stream codes [vocab] apply BPE codes to stdin and outputs to stdout
26
+ ```
27
+
28
+ fastBPE also supports stdin inputs. For instance, these two commands are equivalent:
29
+ ```
30
+ ./fast getvocab text > vocab
31
+ cat text | ./fast getvocab - > vocab
32
+ ```
33
+ But the first one will memory map the input file to read it efficiently, which can be more than twice faster than stdin on very large files. Similarly, these two commands are equivalent:
34
+ ```
35
+ ./fast applybpe output input codes vocab
36
+ cat input | ./fast applybpe_stream codes vocab > output
37
+ ```
38
+ Although the first one will be significantly faster on large datasets, as it uses multi-threading to pre-compute the BPE splits of all words in the input file.
39
+
40
+ ### Learn codes
41
+ ```
42
+ ./fast learnbpe 40000 train.de train.en > codes
43
+ ```
44
+
45
+ ### Apply codes to train
46
+ ```
47
+ ./fast applybpe train.de.40000 train.de codes
48
+ ./fast applybpe train.en.40000 train.en codes
49
+ ```
50
+
51
+ ### Get train vocabulary
52
+ ```
53
+ ./fast getvocab train.de.40000 > vocab.de.40000
54
+ ./fast getvocab train.en.40000 > vocab.en.40000
55
+ ```
56
+
57
+ ### Apply codes to valid and test
58
+ ```
59
+ ./fast applybpe valid.de.40000 valid.de codes vocab.de.40000
60
+ ./fast applybpe valid.en.40000 valid.en codes vocab.en.40000
61
+ ./fast applybpe test.de.40000 test.de codes vocab.de.40000
62
+ ./fast applybpe test.en.40000 test.en codes vocab.en.40000
63
+ ```
64
+
65
+ ## Python API
66
+
67
+ To install the Python API, simply run:
68
+ ```bash
69
+ python setup.py install
70
+ ```
71
+
72
+ **Note:** For Mac OSX Users, add `export MACOSX_DEPLOYMENT_TARGET=10.x` (x=9 or 10, depending on your version) or `-stdlib=libc++` to the `extra_compile_args` of `setup.py` before/during the above install command, as appropriate.
73
+
74
+ Call the API using:
75
+
76
+ ```python
77
+ import fastBPE
78
+
79
+ bpe = fastBPE.fastBPE(codes_path, vocab_path)
80
+ bpe.apply(["Roasted barramundi fish", "Centrally managed over a client-server architecture"])
81
+
82
+ >> ['Ro@@ asted barr@@ am@@ un@@ di fish', 'Centr@@ ally managed over a cli@@ ent-@@ server architecture']
83
+ ```
laser/tools-external/fastBPE/build/lib.linux-x86_64-cpython-37/fastBPE.cpython-37m-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5585531fdc9895f104c01440761b83c6edd388e1a76de4df9eda4bd21258b63
3
+ size 2622328
laser/tools-external/fastBPE/fastBPE.egg-info/PKG-INFO ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: fastBPE
3
+ Version: 0.1.1
4
+ Summary: C++ implementation of Neural Machine Translation of Rare Words with Subword Units, with Python API.
5
+ Home-page: https://github.com/glample/fastBPE
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+
9
+
10
+ # fastBPE
11
+
12
+ C++ implementation of [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/abs/1508.07909), with Python API.
13
+
14
+ ## Installation
15
+
16
+ Compile with:
17
+ ```
18
+ g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
19
+ ```
20
+
21
+ ## Usage:
22
+
23
+ ### List commands
24
+ ```
25
+ ./fast
26
+ usage: fastbpe <command> <args>
27
+
28
+ The commands supported by fastBPE are:
29
+
30
+ getvocab input1 [input2] extract the vocabulary from one or two text files
31
+ learnbpe nCodes input1 [input2] learn BPE codes from one or two text files
32
+ applybpe output input codes [vocab] apply BPE codes to a text file
33
+ applybpe_stream codes [vocab] apply BPE codes to stdin and outputs to stdout
34
+ ```
35
+
36
+ fastBPE also supports stdin inputs. For instance, these two commands are equivalent:
37
+ ```
38
+ ./fast getvocab text > vocab
39
+ cat text | ./fast getvocab - > vocab
40
+ ```
41
+ But the first one will memory map the input file to read it efficiently, which can be more than twice faster than stdin on very large files. Similarly, these two commands are equivalent:
42
+ ```
43
+ ./fast applybpe output input codes vocab
44
+ cat input | ./fast applybpe_stream codes vocab > output
45
+ ```
46
+ Although the first one will be significantly faster on large datasets, as it uses multi-threading to pre-compute the BPE splits of all words in the input file.
47
+
48
+ ### Learn codes
49
+ ```
50
+ ./fast learnbpe 40000 train.de train.en > codes
51
+ ```
52
+
53
+ ### Apply codes to train
54
+ ```
55
+ ./fast applybpe train.de.40000 train.de codes
56
+ ./fast applybpe train.en.40000 train.en codes
57
+ ```
58
+
59
+ ### Get train vocabulary
60
+ ```
61
+ ./fast getvocab train.de.40000 > vocab.de.40000
62
+ ./fast getvocab train.en.40000 > vocab.en.40000
63
+ ```
64
+
65
+ ### Apply codes to valid and test
66
+ ```
67
+ ./fast applybpe valid.de.40000 valid.de codes vocab.de.40000
68
+ ./fast applybpe valid.en.40000 valid.en codes vocab.en.40000
69
+ ./fast applybpe test.de.40000 test.de codes vocab.de.40000
70
+ ./fast applybpe test.en.40000 test.en codes vocab.en.40000
71
+ ```
72
+
73
+ ## Python API
74
+
75
+ To install the Python API, simply run:
76
+ ```bash
77
+ python setup.py install
78
+ ```
79
+
80
+ **Note:** For Mac OSX Users, add `export MACOSX_DEPLOYMENT_TARGET=10.x` (x=9 or 10, depending on your version) or `-stdlib=libc++` to the `extra_compile_args` of `setup.py` before/during the above install command, as appropriate.
81
+
82
+ Call the API using:
83
+
84
+ ```python
85
+ import fastBPE
86
+
87
+ bpe = fastBPE.fastBPE(codes_path, vocab_path)
88
+ bpe.apply(["Roasted barramundi fish", "Centrally managed over a client-server architecture"])
89
+
90
+ >> ['Ro@@ asted barr@@ am@@ un@@ di fish', 'Centr@@ ally managed over a cli@@ ent-@@ server architecture']
91
+ ```
laser/tools-external/fastBPE/fastBPE.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ setup.py
5
+ fastBPE/fastBPE.cpp
6
+ fastBPE/fastBPE.hpp
7
+ fastBPE/fastBPE.pyx
8
+ fastBPE/main.cc
9
+ fastBPE.egg-info/PKG-INFO
10
+ fastBPE.egg-info/SOURCES.txt
11
+ fastBPE.egg-info/dependency_links.txt
12
+ fastBPE.egg-info/top_level.txt
laser/tools-external/fastBPE/fastBPE.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
laser/tools-external/fastBPE/fastBPE.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ fastBPE
laser/tools-external/fastBPE/fastBPE/fastBPE.cpp ADDED
The diff for this file is too large to render. See raw diff
 
laser/tools-external/fastBPE/fastBPE/fastBPE.hpp ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <algorithm>
4
+ #include <assert.h>
5
+ #include <errno.h>
6
+ #include <fcntl.h>
7
+ #include <fstream>
8
+ #include <functional>
9
+ #include <iostream>
10
+ #include <list>
11
+ #include <set>
12
+ #include <stdio.h>
13
+ #include <string>
14
+ #include <cstring>
15
+ #include <sys/mman.h>
16
+ #include <sys/stat.h>
17
+ #include <thread>
18
+ #include <unistd.h> // ftruncate
19
+ #include <unordered_map>
20
+ #include <unordered_set>
21
+ #include <vector>
22
+
23
+
24
+ namespace fastBPE {
25
+
26
+ using namespace std;
27
+
28
+ const size_t kMaxPairs = 1000 * 1000 * 1000;
29
+ const size_t kThreads = max(1, min(10, int(thread::hardware_concurrency())));
30
+ const char *kEndWord = "</w>";
31
+ const size_t kEndWordLength = 4;
32
+ const char *kTokenDelim = "@@";
33
+ const size_t kTokenDelimLength = 2;
34
+
35
+ int safeOpen(const char *file_path, int flags, mode_t mode = 0) {
36
+ int fd = open(file_path, flags, mode);
37
+ if (fd < 0) {
38
+ fprintf(stderr, "Cannot open text file %s\n", file_path);
39
+ exit(EXIT_FAILURE);
40
+ }
41
+ return fd;
42
+ }
43
+
44
+ void readText(const char *fp, unordered_map<string, uint32_t> &word_count) {
45
+ string cur_word;
46
+ uint64_t total = 0;
47
+ auto deal_with_char = [&](char cur_char){
48
+ if (cur_char == ' ' || cur_char == '\n') {
49
+ if (cur_word.size() == 0)
50
+ return;
51
+ // end of word
52
+ auto it = word_count.find(cur_word);
53
+ int count = it != word_count.end() ? it->second : 0;
54
+ word_count[cur_word] = count + 1;
55
+ total++;
56
+ cur_word.clear();
57
+ } else {
58
+ cur_word.push_back(cur_char);
59
+ }
60
+ };
61
+
62
+ if (string(fp).compare("-") == 0) {
63
+ for (std::string line; std::getline(std::cin, line);) {
64
+ for(char c: line){
65
+ deal_with_char(c);
66
+ }
67
+ deal_with_char('\n');
68
+ }
69
+ }
70
+ else {
71
+ int fd = safeOpen(fp, O_RDONLY);
72
+
73
+ struct stat s;
74
+ fstat(fd, &s);
75
+ fprintf(stderr, "Loading vocabulary from %s ...\n", fp);
76
+
77
+ size_t size = s.st_size;
78
+ char *f = (char *)mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
79
+
80
+ for (size_t i = 0; i < size; i++) {
81
+ deal_with_char(f[i]);
82
+ }
83
+ }
84
+ fprintf(stderr, "Read %lu words (%lu unique) from text file.\n", total,
85
+ word_count.size());
86
+ }
87
+
88
+ std::pair<size_t, uint64_t> output_or_count(
89
+ unordered_map<string, string> &bpe, size_t size, char *f, char *fo
90
+ ) {
91
+ string cur_word;
92
+ size_t charOut = 0;
93
+ uint64_t total = 0;
94
+ for (size_t i = 0; i < size; i++) {
95
+ auto &cur_char = f[i];
96
+ if (cur_char == ' ' || cur_char == '\n') {
97
+ if (cur_word.size() == 0) {
98
+ if (fo != nullptr) fo[charOut] = cur_char;
99
+ charOut++;
100
+ continue;
101
+ }
102
+ // end of word : write bpe to output
103
+ auto it = bpe.find(cur_word);
104
+ assert(it != bpe.end());
105
+ for (auto x : it->second) {
106
+ if (fo != nullptr) fo[charOut] = x;
107
+ charOut++;
108
+ }
109
+ if (fo != nullptr) fo[charOut] = cur_char;
110
+ charOut++;
111
+
112
+ total++;
113
+ cur_word.clear();
114
+ } else {
115
+ cur_word.push_back(cur_char);
116
+ }
117
+ }
118
+ return std::make_pair(charOut, total);
119
+ }
120
+
121
+ void outputText(const char *fpo, const char *fp,
122
+ unordered_map<string, string> &bpe) {
123
+
124
+ int fd = safeOpen(fp, O_RDONLY);
125
+ auto fdOut = safeOpen(fpo, O_RDWR | O_CREAT | O_TRUNC, 0666);
126
+
127
+ struct stat s;
128
+ fstat(fd, &s);
129
+
130
+ fprintf(stderr, "Applying BPE to %s ...\n", fp);
131
+ auto size = s.st_size;
132
+ char *f = (char *)mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
133
+
134
+ auto p = output_or_count(bpe, size, f, nullptr);
135
+ size_t out_size = p.first;
136
+
137
+ if (ftruncate(fdOut, out_size) < 0) {
138
+ fprintf(stderr, "Couldn't truncate output file %s to size %lu\n", fpo,
139
+ out_size);
140
+ exit(EXIT_FAILURE);
141
+ }
142
+
143
+
144
+ char *fo = (char *)mmap(NULL, out_size, PROT_WRITE, MAP_SHARED, fdOut, 0);
145
+ if (fo == MAP_FAILED) {
146
+ fprintf(stderr, "Output memory map failed : %d.\n", errno);
147
+ exit(EXIT_FAILURE);
148
+ }
149
+ p = output_or_count(bpe, size, f, fo);
150
+ fprintf(stderr, "Modified %lu words from text file.\n", p.second);
151
+ munmap(fo, out_size);
152
+ munmap(f, size);
153
+ close(fdOut);
154
+ close(fd);
155
+ }
156
+
157
+ struct pair_hash {
158
+ template <class T1, class T2> size_t operator()(const pair<T1, T2> &p) const {
159
+ auto h1 = hash<T1>{}(p.first);
160
+ auto h2 = hash<T2>{}(p.second);
161
+ size_t seed = h1;
162
+ // boost::hash_combine
163
+ return h2 + 0x9e3779b9 + (seed << 6) + (seed >> 2);
164
+ }
165
+ };
166
+
167
+ void tokenize(const unordered_map<string, uint32_t> &word_count,
168
+ unordered_map<string, uint32_t> &token_to_int,
169
+ vector<string> &int_to_token, vector<list<uint32_t>> &words,
170
+ vector<int32_t> &counts) {
171
+
172
+ for (auto &x : word_count) {
173
+ auto &word = x.first;
174
+
175
+ words.push_back(list<uint32_t>());
176
+ auto &current_word = words.back();
177
+ counts.push_back(x.second);
178
+
179
+ int pos = 0, realLength = 0;
180
+ int lastStart = 0;
181
+ while (word[pos]) {
182
+ bool newChar = (word[pos] & 0xc0) != 0x80; // not a continuation byte
183
+ realLength += newChar;
184
+ // new token
185
+ if (newChar && pos > 0) {
186
+ auto new_token = word.substr(lastStart, pos - lastStart);
187
+ if (token_to_int.count(new_token) == 0) {
188
+ int_to_token.push_back(new_token);
189
+ token_to_int[new_token] = int_to_token.size() - 1;
190
+ }
191
+ current_word.push_back(token_to_int[new_token]);
192
+ lastStart = pos;
193
+ }
194
+ pos++;
195
+ }
196
+ auto new_token = word.substr(lastStart, string::npos) + kEndWord;
197
+ if (token_to_int.count(new_token) == 0) {
198
+ int_to_token.push_back(new_token);
199
+ token_to_int[new_token] = int_to_token.size() - 1;
200
+ }
201
+ current_word.push_back(token_to_int[new_token]);
202
+ }
203
+ }
204
+
205
+ void tokenize_str(const unordered_map<string, uint32_t> &word_count,
206
+ unordered_map<string, vector<string>> &words) {
207
+
208
+ for (auto &x : word_count) {
209
+ auto &word = x.first;
210
+ words[word] = vector<string>();
211
+
212
+ int pos = 0, realLength = 0;
213
+ int lastStart = 0;
214
+ while (word[pos]) {
215
+ bool newChar = (word[pos] & 0xc0) != 0x80; // not a continuation byte
216
+ realLength += newChar;
217
+ // new token
218
+ if (newChar && pos > 0) {
219
+ auto new_token = word.substr(lastStart, pos - lastStart);
220
+ words[word].push_back(new_token);
221
+ lastStart = pos;
222
+ }
223
+ pos++;
224
+ }
225
+ auto new_token = word.substr(lastStart, string::npos) + kEndWord;
226
+ words[word].push_back(new_token);
227
+ }
228
+ }
229
+
230
+ using tp = pair<uint32_t, uint32_t>;
231
+ using tps = pair<string, string>;
232
+ using pc = unordered_map<tp, pair<int32_t, tp> *, pair_hash>;
233
+
234
+ void count_in_word(
235
+ list<uint32_t> &word, uint32_t wi, uint32_t count, pc &pair_counts,
236
+ vector<pair<int32_t, tp>> &contiguous_counts,
237
+ unordered_map<tp, unordered_set<uint32_t>, pair_hash> &where) {
238
+ bool second = false;
239
+ tp cur_pair;
240
+ for (uint32_t token : word) {
241
+ if (second) {
242
+ cur_pair.first = cur_pair.second;
243
+ }
244
+ cur_pair.second = token;
245
+ if (second) {
246
+ auto it = pair_counts.find(cur_pair);
247
+ if (it == pair_counts.end()) {
248
+ contiguous_counts.emplace_back(0, cur_pair);
249
+ auto *added = &contiguous_counts.back();
250
+ pair_counts.emplace(piecewise_construct, forward_as_tuple(cur_pair),
251
+ forward_as_tuple(added));
252
+ where[cur_pair].emplace();
253
+ }
254
+ if (count > 0) {
255
+ where[cur_pair].insert(wi);
256
+ } else {
257
+ where[cur_pair].erase(wi);
258
+ }
259
+ pair_counts[cur_pair]->first += count;
260
+ } else {
261
+ second = true;
262
+ }
263
+ }
264
+ }
265
+
266
+ void find_maxp(vector<pair<int32_t, tp>> &contiguous_counts, tp &maxp,
267
+ int32_t &max_c) {
268
+ max_c = 0;
269
+ for (auto &x : contiguous_counts) {
270
+ if (x.first > max_c) {
271
+ max_c = x.first;
272
+ maxp = x.second;
273
+ } else if (x.first == max_c and x.second < maxp) {
274
+ maxp = x.second;
275
+ }
276
+ }
277
+ }
278
+
279
+ void getvocab(const char *inputFile1, const char *inputFile2) {
280
+ // get vocab
281
+ unordered_map<string, uint32_t> word_count;
282
+ readText(inputFile1, word_count);
283
+ if (strcmp(inputFile2, "") != 0) {
284
+ readText(inputFile2, word_count);
285
+ }
286
+
287
+ // sort vocab
288
+ auto compFunctor = [](pair<string, int> elem1, pair<string, int> elem2) {
289
+ return elem1.second > elem2.second ||
290
+ (elem1.second == elem2.second && elem1.first < elem2.first);
291
+ };
292
+ set<pair<string, int>, decltype(compFunctor)> sorted_vocab(
293
+ word_count.begin(), word_count.end(), compFunctor);
294
+ assert(word_count.size() == sorted_vocab.size());
295
+
296
+ // print sorted vocab
297
+ for (auto element : sorted_vocab)
298
+ cout << element.first << " " << element.second << endl;
299
+ }
300
+
301
+ void learnbpe(const uint32_t kNPairs, const char *inputFile1,
302
+ const char *inputFile2) {
303
+ // get vocab
304
+ unordered_map<string, uint32_t> word_count;
305
+ readText(inputFile1, word_count);
306
+ if (strcmp(inputFile2, "") != 0) {
307
+ readText(inputFile2, word_count);
308
+ }
309
+
310
+ // a token is an int, it represents a string
311
+ unordered_map<string, uint32_t> token_to_int;
312
+ vector<string> int_to_token;
313
+
314
+ vector<list<uint32_t>> words;
315
+ vector<int32_t> counts;
316
+
317
+ tokenize(word_count, token_to_int, int_to_token, words, counts);
318
+
319
+ vector<pair<int32_t, tp>> contiguous_counts;
320
+ contiguous_counts.reserve(kMaxPairs);
321
+
322
+ pc pair_counts;
323
+ unordered_map<tp, unordered_set<uint32_t>, pair_hash> where_to_update;
324
+
325
+ tp cur_pair;
326
+ int32_t max_c = 0;
327
+ tp max_p;
328
+ for (uint32_t wi = 0; wi < words.size(); wi++) {
329
+ count_in_word(words[wi], wi, counts[wi], pair_counts, contiguous_counts,
330
+ where_to_update);
331
+ }
332
+ find_maxp(contiguous_counts, max_p, max_c);
333
+ for (size_t i = 0; i < kNPairs; i++) {
334
+ // create new token for pair. replace
335
+ auto new_token = int_to_token[max_p.first] + int_to_token[max_p.second];
336
+ cout << int_to_token[max_p.first] << " " << int_to_token[max_p.second]
337
+ << " " << max_c << endl;
338
+
339
+ uint32_t new_token_id = int_to_token.size();
340
+ int_to_token.push_back(new_token);
341
+ token_to_int[new_token] = new_token_id;
342
+ max_c = 0;
343
+ auto change_count = [&](tp pair, int32_t v, uint32_t wi) {
344
+ auto it = pair_counts.find(pair);
345
+ if (it != pair_counts.end()) {
346
+ // assert(it->second + v >= 0);
347
+ it->second->first += v;
348
+ } else {
349
+ if (v > 0) {
350
+ contiguous_counts.emplace_back(v, pair);
351
+ pair_counts.emplace(piecewise_construct, forward_as_tuple(pair),
352
+ forward_as_tuple(&(contiguous_counts.back())));
353
+ where_to_update[pair] = unordered_set<uint32_t>();
354
+ }
355
+ }
356
+ if (v > 0)
357
+ where_to_update[pair].insert(wi);
358
+ };
359
+
360
+ for (auto wi : where_to_update[max_p]) {
361
+ auto &cur_word = words[wi];
362
+ auto it = cur_word.begin();
363
+ bool second = false;
364
+ while (it != cur_word.end()) {
365
+ if (second) {
366
+ cur_pair.first = cur_pair.second;
367
+ }
368
+ cur_pair.second = *it;
369
+
370
+ if (second) {
371
+ // found the pair
372
+ if (cur_pair == max_p) {
373
+ it--; // points to first element of pair
374
+ // if there is a token before us
375
+ if (it != cur_word.begin()) {
376
+ it--;
377
+ change_count(make_pair(*it, cur_pair.first), -counts[wi], wi);
378
+ change_count(make_pair(*it, new_token_id), counts[wi], wi);
379
+ it++;
380
+ }
381
+
382
+ it = cur_word.insert(it, new_token_id); // it points to new token
383
+ it++; // it points to first element of pair
384
+ it = cur_word.erase(it); // it points to second element of pair
385
+ it = cur_word.erase(it); // it points to next value
386
+
387
+ // if there is a token after the one we inserted
388
+ if (it != cur_word.end()) {
389
+ change_count(make_pair(cur_pair.second, *it), -counts[wi], wi);
390
+ change_count(make_pair(new_token_id, *it), counts[wi], wi);
391
+ }
392
+ cur_pair.second = new_token_id;
393
+ } else {
394
+ it++;
395
+ }
396
+ } else {
397
+ second = true;
398
+ it++;
399
+ }
400
+ }
401
+ }
402
+
403
+ if (pair_counts.find(max_p) != pair_counts.end()){
404
+ pair_counts[max_p]->first = 0;
405
+ }
406
+ find_maxp(contiguous_counts, max_p, max_c);
407
+ }
408
+ }
409
+
410
+ void split(vector<string> &splits, const string &text, char sep) {
411
+ size_t start = 0, end = 0;
412
+ while ((end = text.find(sep, start)) != string::npos) {
413
+ if (end != start)
414
+ splits.push_back(text.substr(start, end - start));
415
+ start = end + 1;
416
+ }
417
+ if (end != start && start < text.size())
418
+ splits.push_back(text.substr(start));
419
+ }
420
+
421
+ void readVocab(const char *fp, unordered_map<string, uint32_t> &vocab) {
422
+ ifstream file(fp);
423
+ if (!file) {
424
+ fprintf(stderr, "Cannot open vocabulary file %s\n", fp);
425
+ exit(EXIT_FAILURE);
426
+ }
427
+ fprintf(stderr, "Loading vocabulary from %s ...\n", fp);
428
+ string line;
429
+ uint64_t total = 0;
430
+ while (getline(file, line)) {
431
+ vector<string> splits;
432
+ split(splits, line, ' ');
433
+ assert(splits.size() == 2);
434
+ assert(vocab.find(splits[0]) == vocab.end());
435
+ int count = stoi(splits[1]);
436
+ vocab[splits[0]] = count;
437
+ total += count;
438
+ }
439
+ fprintf(stderr, "Read %lu words (%lu unique) from vocabulary file.\n", total,
440
+ vocab.size());
441
+ }
442
+
443
+ void readCodes(const char *fp, unordered_map<tps, uint32_t, pair_hash> &codes,
444
+ unordered_map<string, tps> &reversed_codes) {
445
+ ifstream file(fp);
446
+ if (!file) {
447
+ fprintf(stderr, "Cannot open codes file %s\n", fp);
448
+ exit(EXIT_FAILURE);
449
+ }
450
+ fprintf(stderr, "Loading codes from %s ...\n", fp);
451
+ string line;
452
+ while (getline(file, line)) {
453
+ vector<string> splits;
454
+ split(splits, line, ' ');
455
+ assert(splits.size() == 3);
456
+ auto pair = make_pair(splits[0], splits[1]);
457
+ string concat = splits[0] + splits[1];
458
+ assert(codes.find(pair) == codes.end());
459
+ assert(reversed_codes.find(concat) == reversed_codes.end());
460
+ codes[pair] = codes.size();
461
+ reversed_codes[concat] = pair;
462
+ }
463
+ fprintf(stderr, "Read %lu codes from the codes file.\n", codes.size());
464
+ }
465
+
466
+ void decompose(const string s, vector<string> &newSubwords,
467
+ const unordered_map<string, tps> &reversed_codes,
468
+ const unordered_map<string, uint32_t> &vocab, bool isFinal) {
469
+ auto it = reversed_codes.find(s);
470
+ if (it == reversed_codes.end()) {
471
+ // TODO this whole block below is just some sanity check
472
+ // if we cannot un-merge a subword, it has to be a char
473
+ string s2 = isFinal ? s.substr(0, s.size() - kEndWordLength) : s;
474
+ int count = 0;
475
+ for (size_t j = 0; j < s2.size(); j++) {
476
+ if ((s2[j] & 0xc0) != 0x80) {
477
+ count++;
478
+ }
479
+ }
480
+ assert(count == 1);
481
+ newSubwords.push_back(s);
482
+ return;
483
+ }
484
+ assert(it != reversed_codes.end());
485
+ string token1 = it->second.first;
486
+ if (vocab.find(token1 + kTokenDelim) == vocab.end()) {
487
+ decompose(token1, newSubwords, reversed_codes, vocab, false);
488
+ } else {
489
+ newSubwords.push_back(token1);
490
+ }
491
+ string token2 = it->second.second;
492
+ auto query = token2 + kTokenDelim;
493
+ if (isFinal) {
494
+ query = token2.substr(0, token2.size() - kEndWordLength);
495
+ }
496
+ if (vocab.find(query) == vocab.end()) {
497
+ decompose(token2, newSubwords, reversed_codes, vocab, isFinal);
498
+ } else {
499
+ newSubwords.push_back(token2);
500
+ }
501
+ }
502
+
503
+ void limitVocab(const vector<string> &subwords, vector<string> &newSubwords,
504
+ const unordered_map<string, tps> &reversed_codes,
505
+ const unordered_map<string, uint32_t> &vocab) {
506
+ string query;
507
+ for (size_t i = 0; i < subwords.size(); i++) {
508
+ bool isFinal = i == subwords.size() - 1;
509
+ auto &subword = subwords[i];
510
+ if (isFinal) {
511
+ query = subword.substr(0, subword.size() - kEndWordLength);
512
+ } else {
513
+ query = subword + kTokenDelim;
514
+ }
515
+ if (vocab.find(query) == vocab.end()) {
516
+ decompose(subword, newSubwords, reversed_codes, vocab, isFinal);
517
+ } else {
518
+ newSubwords.push_back(subword);
519
+ }
520
+ }
521
+ }
522
+
523
// Apply BPE merge operations to the subwords of a single word and return the
// space-separated encoded string.  `subwords` initially holds one entry per
// UTF-8 character (the last one carrying the end-of-word marker) and is
// merged in place.  `codes` maps a pair of adjacent tokens to its merge rank
// (lower rank = learned earlier = higher priority); `reversed_codes` and
// `vocab` are used afterwards to re-split merged tokens that fall outside the
// (optional) vocabulary.
string process_bpe(vector<string> &subwords,
                   unordered_map<tps, uint32_t, pair_hash> &codes,
                   unordered_map<string, tps> &reversed_codes,
                   unordered_map<string, uint32_t> &vocab) {
  // merge subWords as much as possible
  vector<string> newSubwords;
  while (subwords.size() > 1) {
    // find the best pair
    int bestPairId = -1;
    auto bestPair = codes.end(); // TODO ugly hack that works
    for (size_t i = 0; i < subwords.size() - 1; i++) {
      auto pair = make_pair(subwords[i], subwords[i + 1]);
      auto it = codes.find(pair);
      int pairRank = it == codes.end() ? -1 : it->second;
      // keep the adjacent pair with the lowest (best) merge rank
      if (pairRank >= 0 && (bestPairId == -1 || int(bestPair->second) > pairRank)) {
        bestPair = it;
        bestPairId = i;
      }
    }
    // if we cannot merge anything, stop
    if (bestPairId == -1) {
      break;
    }
    // otherwise, merge subWords
    // `justMerged` skips the right half of a pair that was just merged, so
    // overlapping occurrences of the same pair are not merged twice
    bool justMerged = false;
    newSubwords = vector<string>();
    for (size_t i = 0; i < subwords.size(); i++) {
      if ((i + 1 < subwords.size()) && (not justMerged) &&
          subwords[i] == bestPair->first.first &&
          subwords[i + 1] == bestPair->first.second) {
        newSubwords.push_back(subwords[i] + subwords[i + 1]);
        justMerged = true;
      } else {
        if (not justMerged) {
          newSubwords.push_back(subwords[i]);
        }
        justMerged = false;
      }
    }
    subwords = newSubwords;
  }
  // check that we are only using words in the dictionary
  if (vocab.size() > 0) {
    vector<string> newSubwords;
    limitVocab(subwords, newSubwords, reversed_codes, vocab);
    subwords = newSubwords;
  }
  // concat subWords
  string result;
  for (auto x : subwords) {
    result = result + x + kTokenDelim + " ";
  }
  // strip the trailing end-of-word marker, delimiter and space from the
  // last subword
  return result.substr(
      0,
      result.size() - kEndWordLength - kTokenDelimLength - 1 // "</w>@@ "
  );
}
580
+
581
// Apply BPE codes (optionally restricted to a vocabulary) to every word of
// `inputFile` and write the encoded text to `outputFile`.  Each distinct word
// is encoded exactly once (deduplicated via a word-count map), with the work
// sharded across kThreads worker threads.
void applybpe(const char *outputFile, const char *inputFile,
              const char *codesPath, const char *vocabPath) {
  // read vocabulary (to which we want to limit the output file)
  unordered_map<string, uint32_t> vocab;
  if (strcmp(vocabPath, "") != 0) {
    readVocab(vocabPath, vocab);
  }

  // read codes
  unordered_map<tps, uint32_t, pair_hash> codes;
  unordered_map<string, tps> reversed_codes;
  readCodes(codesPath, codes, reversed_codes);

  // read input file words
  unordered_map<string, uint32_t> word_count;
  readText(inputFile, word_count);

  // tokenize
  unordered_map<string, vector<string>> bpeTok;
  tokenize_str(word_count, bpeTok);

  // copy into a vector so threads can index words by position
  vector<pair<string, vector<string>>> bpeTokVec;
  for (auto x : bpeTok) {
    bpeTokVec.push_back(x);
  }

  // apply BPE codes to each word; each thread writes only to its own map,
  // so no synchronization is needed
  unordered_map<string, string> bpe[kThreads];
  vector<thread> threads;
  for (size_t i = 0; i < kThreads; i++) {
    threads.emplace_back(
        [&](size_t this_thread) {
          // strided partition: thread t handles words t, t+kThreads, ...
          for (size_t w = this_thread; w < bpeTokVec.size(); w += kThreads) {
            auto &x = bpeTokVec[w];
            bpe[this_thread][x.first] = process_bpe(x.second, codes, reversed_codes, vocab);
          }
        },
        i
    );
  }

  // join the workers and merge their per-thread results into one map
  unordered_map<string, string> final_bpe;
  for (size_t i = 0; i < kThreads; i++) {
    threads[i].join();
    for (auto x : bpe[i]) {
      final_bpe[x.first] = x.second;
    }
  }
  // output
  outputText(outputFile, inputFile, final_bpe);
}
632
+
633
+
634
// Reusable BPE encoder: loads the merge codes (and an optional vocabulary)
// once at construction and can then encode batches of sentences.
class BPEApplyer {
private:
  unordered_map<string, uint32_t> vocab;          // optional output vocabulary
  unordered_map<tps, uint32_t, pair_hash> codes;  // merge pair -> rank
  unordered_map<string, tps> reversed_codes;      // merged token -> source pair

public:
  // `vocabPath` may be empty, in which case no vocabulary filtering is done.
  BPEApplyer(const string& codesPath, const string& vocabPath) {
    if (vocabPath.size() > 0) readVocab(vocabPath.c_str(), vocab);
    readCodes(codesPath.c_str(), codes, reversed_codes);
  }

  // Encode each space-separated sentence; returns one encoded string per
  // input sentence, in order.
  vector<string> apply(vector<string>& sentences) {
    vector<string> res;
    for(auto &s: sentences) {
      res.emplace_back("");
      string& cur = res.back();
      vector<string> words;
      split(words, s, ' ');
      for (size_t i = 0; i < words.size(); i++) {
        auto word = words[i];
        vector<string> word_bpes;
        int pos = 0, realLength = 0;
        int lastStart = 0;
        // split the word into one subword per UTF-8 character
        // (word[pos] at pos == size() reads the terminating '\0')
        while (word[pos]) {
          bool newChar = (word[pos] & 0xc0) != 0x80; // not a continuation byte
          realLength += newChar;
          if (newChar && pos > 0) {
            auto new_token = word.substr(lastStart, pos - lastStart);
            word_bpes.push_back(new_token);
            lastStart = pos;
          }
          pos++;
        }
        // the last character additionally carries the end-of-word marker
        auto bpe = word.substr(lastStart, string::npos) + kEndWord;
        word_bpes.push_back(bpe);
        cur += process_bpe(word_bpes, codes, reversed_codes, vocab);
        if (i < words.size() - 1) cur += " ";
      }
    }
    return res;
  }

};
678
+
679
+
680
+ void applybpe_stream(const char *codesPath, const char *vocabPath) {
681
+ BPEApplyer applyer(codesPath, vocabPath);
682
+ std::string line;
683
+ while(std::getline(std::cin, line)) {
684
+ vector<string> tmp;
685
+ tmp.push_back(line);
686
+ for(auto& l : applyer.apply(tmp)){
687
+ std::cout << l << std::endl;
688
+ }
689
+ }
690
+ }
691
+
692
+ };
laser/tools-external/fastBPE/fastBPE/fastBPE.pyx ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# cython: language_level=3
# distutils: language = c++

from libcpp.vector cimport vector
from libcpp.string cimport string

# C++ declarations from fastBPE.hpp used by the Python wrapper below.
cdef extern from "fastBPE.hpp" namespace "fastBPE":
    cdef cppclass BPEApplyer:
        BPEApplyer(const string& codes_path, const string& vocab_path)
        vector[string] apply(vector[string]& sentences)

# Thin Python wrapper that owns a heap-allocated C++ BPEApplyer.
cdef class fastBPE:
    cdef BPEApplyer* c_obj

    def __dealloc__(self):
        # cdef pointer attributes are zero-initialized by Cython, so `del`
        # is safe even if __init__ raised before assigning c_obj.
        del self.c_obj

    def __init__(self, codes_path, vocab_path=""):
        # Paths are encoded to bytes because the C++ side takes std::string.
        self.c_obj = new BPEApplyer(codes_path.encode(), vocab_path.encode())

    def apply(self, sentences):
        # Encode str -> bytes, run BPE in C++, decode the results back to str.
        cdef vector[string] s = [x.encode() for x in sentences]
        cdef vector[string] res = self.c_obj.apply(s)
        return [x.decode() for x in res]
laser/tools-external/fastBPE/fastBPE/main.cc ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "fastBPE.hpp"
2
+
3
+ using namespace std;
4
+ using namespace fastBPE;
5
+
6
+ void printUsage() {
7
+ cerr
8
+ << "usage: fastbpe <command> <args>\n\n"
9
+ << "The commands supported by fastBPE are:\n\n"
10
+ << "getvocab input1 [input2] extract the vocabulary from one "
11
+ "or two text files\n"
12
+ << "learnbpe nCodes input1 [input2] learn BPE codes from one or two "
13
+ "text files\n"
14
+ << "applybpe output input codes [vocab] apply BPE codes to a text file\n"
15
+ << "applybpe_stream codes [vocab] apply BPE codes to stdin and output to stdout\n"
16
+ << endl;
17
+ }
18
+
19
+
20
+ int main(int argc, char **argv) {
21
+ if (argc < 2) {
22
+ printUsage();
23
+ exit(EXIT_FAILURE);
24
+ }
25
+ string command = argv[1];
26
+ if (command == "getvocab") {
27
+ assert(argc == 3 || argc == 4);
28
+ getvocab(argv[2], argc == 4 ? argv[3] : "");
29
+ } else if (command == "learnbpe") {
30
+ assert(argc == 4 || argc == 5);
31
+ learnbpe(stoi(argv[2]), argv[3], argc == 5 ? argv[4] : "");
32
+ } else if (command == "applybpe") {
33
+ assert(argc == 5 || argc == 6);
34
+ applybpe(argv[2], argv[3], argv[4], argc == 6 ? argv[5] : "");
35
+ } else if (command == "applybpe_stream") {
36
+ assert(argc == 3 || argc == 4);
37
+ applybpe_stream(argv[2], argc == 4 ? argv[3] : "");
38
+ } else {
39
+ printUsage();
40
+ exit(EXIT_FAILURE);
41
+ }
42
+ return 0;
43
+ }
laser/tools-external/fastBPE/setup.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build script for the fastBPE Python extension.

Compiles the Cython wrapper when Cython is installed, otherwise falls back
to the pre-generated C++ source shipped with the source distribution.
"""

from setuptools import setup, find_packages, Extension
from distutils.command.sdist import sdist as _sdist


try:
    from Cython.Build import cythonize
except ImportError:
    use_cython = False
else:
    use_cython = True


# Pick the extension source: .pyx needs Cython, .cpp is the pre-generated
# fallback so end users can build without Cython installed.
if use_cython:
    extension = 'pyx'
else:
    extension = 'cpp'


extensions = [
    Extension(
        'fastBPE',
        ["fastBPE/fastBPE." + extension],
        language='c++',
        extra_compile_args=[
            "-std=c++11", "-Ofast", "-pthread"
        ],
    ),
]
if use_cython:
    extensions = cythonize(extensions)


# Read the long description with an explicit encoding so the build does not
# depend on the locale's default codec (which breaks on non-UTF-8 systems).
with open('README.md', encoding='utf-8') as f:
    readme = f.read()


setup(
    name='fastBPE',
    version='0.1.1',
    description='C++ implementation of Neural Machine Translation of Rare Words with Subword Units, with Python API.',
    url='https://github.com/glample/fastBPE',
    long_description=readme,
    long_description_content_type='text/markdown',
    ext_package='',
    ext_modules=extensions,
    packages=[
        'fastBPE',
    ],
)
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ca ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Dr
2
+ Dra
3
+ pàg
4
+ p
5
+ c
6
+ av
7
+ Sr
8
+ Sra
9
+ adm
10
+ esq
11
+ Prof
12
+ S.A
13
+ S.L
14
+ p.e
15
+ ptes
16
+ Sta
17
+ St
18
+ pl
19
+ màx
20
+ cast
21
+ dir
22
+ nre
23
+ fra
24
+ admdora
25
+ Emm
26
+ Excma
27
+ espf
28
+ dc
29
+ admdor
30
+ tel
31
+ angl
32
+ aprox
33
+ ca
34
+ dept
35
+ dj
36
+ dl
37
+ dt
38
+ ds
39
+ dg
40
+ dv
41
+ ed
42
+ entl
43
+ al
44
+ i.e
45
+ maj
46
+ smin
47
+ n
48
+ núm
49
+ pta
50
+ A
51
+ B
52
+ C
53
+ D
54
+ E
55
+ F
56
+ G
57
+ H
58
+ I
59
+ J
60
+ K
61
+ L
62
+ M
63
+ N
64
+ O
65
+ P
66
+ Q
67
+ R
68
+ S
69
+ T
70
+ U
71
+ V
72
+ W
73
+ X
74
+ Y
75
+ Z
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.cs ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Bc
2
+ BcA
3
+ Ing
4
+ Ing.arch
5
+ MUDr
6
+ MVDr
7
+ MgA
8
+ Mgr
9
+ JUDr
10
+ PhDr
11
+ RNDr
12
+ PharmDr
13
+ ThLic
14
+ ThDr
15
+ Ph.D
16
+ Th.D
17
+ prof
18
+ doc
19
+ CSc
20
+ DrSc
21
+ dr. h. c
22
+ PaedDr
23
+ Dr
24
+ PhMr
25
+ DiS
26
+ abt
27
+ ad
28
+ a.i
29
+ aj
30
+ angl
31
+ anon
32
+ apod
33
+ atd
34
+ atp
35
+ aut
36
+ bd
37
+ biogr
38
+ b.m
39
+ b.p
40
+ b.r
41
+ cca
42
+ cit
43
+ cizojaz
44
+ c.k
45
+ col
46
+ čes
47
+ čín
48
+ čj
49
+ ed
50
+ facs
51
+ fasc
52
+ fol
53
+ fot
54
+ franc
55
+ h.c
56
+ hist
57
+ hl
58
+ hrsg
59
+ ibid
60
+ il
61
+ ind
62
+ inv.č
63
+ jap
64
+ jhdt
65
+ jv
66
+ koed
67
+ kol
68
+ korej
69
+ kl
70
+ krit
71
+ lat
72
+ lit
73
+ m.a
74
+ maď
75
+ mj
76
+ mp
77
+ násl
78
+ např
79
+ nepubl
80
+ něm
81
+ no
82
+ nr
83
+ n.s
84
+ okr
85
+ odd
86
+ odp
87
+ obr
88
+ opr
89
+ orig
90
+ phil
91
+ pl
92
+ pokrač
93
+ pol
94
+ port
95
+ pozn
96
+ př.kr
97
+ př.n.l
98
+ přel
99
+ přeprac
100
+ příl
101
+ pseud
102
+ pt
103
+ red
104
+ repr
105
+ resp
106
+ revid
107
+ rkp
108
+ roč
109
+ roz
110
+ rozš
111
+ samost
112
+ sect
113
+ sest
114
+ seš
115
+ sign
116
+ sl
117
+ srv
118
+ stol
119
+ sv
120
+ šk
121
+ šk.ro
122
+ špan
123
+ tab
124
+ t.č
125
+ tis
126
+ tj
127
+
128
+ tzv
129
+ univ
130
+ uspoř
131
+ vol
132
+ vl.jm
133
+ vs
134
+ vyd
135
+ vyobr
136
+ zal
137
+ zejm
138
+ zkr
139
+ zprac
140
+ zvl
141
+ n.p
142
+ např
143
+ než
144
+ MUDr
145
+ abl
146
+ absol
147
+ adj
148
+ adv
149
+ ak
150
+ ak. sl
151
+ akt
152
+ alch
153
+ amer
154
+ anat
155
+ angl
156
+ anglosas
157
+ arab
158
+ arch
159
+ archit
160
+ arg
161
+ astr
162
+ astrol
163
+ att
164
+ bás
165
+ belg
166
+ bibl
167
+ biol
168
+ boh
169
+ bot
170
+ bulh
171
+ círk
172
+ csl
173
+ č
174
+ čas
175
+ čes
176
+ dat
177
+ děj
178
+ dep
179
+ dět
180
+ dial
181
+ dór
182
+ dopr
183
+ dosl
184
+ ekon
185
+ epic
186
+ etnonym
187
+ eufem
188
+ f
189
+ fam
190
+ fem
191
+ fil
192
+ film
193
+ form
194
+ fot
195
+ fr
196
+ fut
197
+ fyz
198
+ gen
199
+ geogr
200
+ geol
201
+ geom
202
+ germ
203
+ gram
204
+ hebr
205
+ herald
206
+ hist
207
+ hl
208
+ hovor
209
+ hud
210
+ hut
211
+ chcsl
212
+ chem
213
+ ie
214
+ imp
215
+ impf
216
+ ind
217
+ indoevr
218
+ inf
219
+ instr
220
+ interj
221
+ ión
222
+ iron
223
+ it
224
+ kanad
225
+ katalán
226
+ klas
227
+ kniž
228
+ komp
229
+ konj
230
+
231
+ konkr
232
+
233
+ kuch
234
+ lat
235
+ lék
236
+ les
237
+ lid
238
+ lit
239
+ liturg
240
+ lok
241
+ log
242
+ m
243
+ mat
244
+ meteor
245
+ metr
246
+ mod
247
+ ms
248
+ mysl
249
+ n
250
+ náb
251
+ námoř
252
+ neklas
253
+ něm
254
+ nesklon
255
+ nom
256
+ ob
257
+ obch
258
+ obyč
259
+ ojed
260
+ opt
261
+ part
262
+ pas
263
+ pejor
264
+ pers
265
+ pf
266
+ pl
267
+ plpf
268
+
269
+ práv
270
+ prep
271
+ předl
272
+ přivl
273
+ r
274
+ rcsl
275
+ refl
276
+ reg
277
+ rkp
278
+ ř
279
+ řec
280
+ s
281
+ samohl
282
+ sg
283
+ sl
284
+ souhl
285
+ spec
286
+ srov
287
+ stfr
288
+ střv
289
+ stsl
290
+ subj
291
+ subst
292
+ superl
293
+ sv
294
+ sz
295
+ táz
296
+ tech
297
+ telev
298
+ teol
299
+ trans
300
+ typogr
301
+ var
302
+ vedl
303
+ verb
304
+ vl. jm
305
+ voj
306
+ vok
307
+ vůb
308
+ vulg
309
+ výtv
310
+ vztaž
311
+ zahr
312
+ zájm
313
+ zast
314
+ zejm
315
+
316
+ zeměd
317
+ zkr
318
+
319
+ mj
320
+ dl
321
+ atp
322
+ sport
323
+ Mgr
324
+ horn
325
+ MVDr
326
+ JUDr
327
+ RSDr
328
+ Bc
329
+ PhDr
330
+ ThDr
331
+ Ing
332
+ aj
333
+ apod
334
+ PharmDr
335
+ pomn
336
+ ev
337
+ slang
338
+ nprap
339
+ odp
340
+ dop
341
+ pol
342
+ st
343
+ stol
344
+ p. n. l
345
+ před n. l
346
+ n. l
347
+ př. Kr
348
+ po Kr
349
+ př. n. l
350
+ odd
351
+ RNDr
352
+ tzv
353
+ atd
354
+ tzn
355
+ resp
356
+ tj
357
+ p
358
+ br
359
+ č. j
360
+ čj
361
+ č. p
362
+ čp
363
+ a. s
364
+ s. r. o
365
+ spol. s r. o
366
+ p. o
367
+ s. p
368
+ v. o. s
369
+ k. s
370
+ o. p. s
371
+ o. s
372
+ v. r
373
+ v z
374
+ ml
375
+
376
+ kr
377
+ mld
378
+ hod
379
+ popř
380
+ ap
381
+ event
382
+ rus
383
+ slov
384
+ rum
385
+ švýc
386
+ P. T
387
+ zvl
388
+ hor
389
+ dol
390
+ S.O.S
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.de ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ #no german words end in single lower-case letters, so we throw those in too.
7
+ A
8
+ B
9
+ C
10
+ D
11
+ E
12
+ F
13
+ G
14
+ H
15
+ I
16
+ J
17
+ K
18
+ L
19
+ M
20
+ N
21
+ O
22
+ P
23
+ Q
24
+ R
25
+ S
26
+ T
27
+ U
28
+ V
29
+ W
30
+ X
31
+ Y
32
+ Z
33
+ a
34
+ b
35
+ c
36
+ d
37
+ e
38
+ f
39
+ g
40
+ h
41
+ i
42
+ j
43
+ k
44
+ l
45
+ m
46
+ n
47
+ o
48
+ p
49
+ q
50
+ r
51
+ s
52
+ t
53
+ u
54
+ v
55
+ w
56
+ x
57
+ y
58
+ z
59
+
60
+
61
+ #Roman Numerals. A dot after one of these is not a sentence break in German.
62
+ I
63
+ II
64
+ III
65
+ IV
66
+ V
67
+ VI
68
+ VII
69
+ VIII
70
+ IX
71
+ X
72
+ XI
73
+ XII
74
+ XIII
75
+ XIV
76
+ XV
77
+ XVI
78
+ XVII
79
+ XVIII
80
+ XIX
81
+ XX
82
+ i
83
+ ii
84
+ iii
85
+ iv
86
+ v
87
+ vi
88
+ vii
89
+ viii
90
+ ix
91
+ x
92
+ xi
93
+ xii
94
+ xiii
95
+ xiv
96
+ xv
97
+ xvi
98
+ xvii
99
+ xviii
100
+ xix
101
+ xx
102
+
103
+ #Titles and Honorifics
104
+ Adj
105
+ Adm
106
+ Adv
107
+ Asst
108
+ Bart
109
+ Bldg
110
+ Brig
111
+ Bros
112
+ Capt
113
+ Cmdr
114
+ Col
115
+ Comdr
116
+ Con
117
+ Corp
118
+ Cpl
119
+ DR
120
+ Dr
121
+ Ens
122
+ Gen
123
+ Gov
124
+ Hon
125
+ Hosp
126
+ Insp
127
+ Lt
128
+ MM
129
+ MR
130
+ MRS
131
+ MS
132
+ Maj
133
+ Messrs
134
+ Mlle
135
+ Mme
136
+ Mr
137
+ Mrs
138
+ Ms
139
+ Msgr
140
+ Op
141
+ Ord
142
+ Pfc
143
+ Ph
144
+ Prof
145
+ Pvt
146
+ Rep
147
+ Reps
148
+ Res
149
+ Rev
150
+ Rt
151
+ Sen
152
+ Sens
153
+ Sfc
154
+ Sgt
155
+ Sr
156
+ St
157
+ Supt
158
+ Surg
159
+
160
+ #Misc symbols
161
+ Mio
162
+ Mrd
163
+ bzw
164
+ v
165
+ vs
166
+ usw
167
+ d.h
168
+ z.B
169
+ u.a
170
+ etc
171
+ Mrd
172
+ MwSt
173
+ ggf
174
+ d.J
175
+ D.h
176
+ m.E
177
+ vgl
178
+ I.F
179
+ z.T
180
+ sogen
181
+ ff
182
+ u.E
183
+ g.U
184
+ g.g.A
185
+ c.-à-d
186
+ Buchst
187
+ u.s.w
188
+ sog
189
+ u.ä
190
+ Std
191
+ evtl
192
+ Zt
193
+ Chr
194
+ u.U
195
+ o.ä
196
+ Ltd
197
+ b.A
198
+ z.Zt
199
+ spp
200
+ sen
201
+ SA
202
+ k.o
203
+ jun
204
+ i.H.v
205
+ dgl
206
+ dergl
207
+ Co
208
+ zzt
209
+ usf
210
+ s.p.a
211
+ Dkr
212
+ Corp
213
+ bzgl
214
+ BSE
215
+
216
+ #Number indicators
217
+ # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
218
+ No
219
+ Nos
220
+ Art
221
+ Nr
222
+ pp
223
+ ca
224
+ Ca
225
+
226
+ #Ordinals are done with . in German - "1." = "1st" in English
227
+ 1
228
+ 2
229
+ 3
230
+ 4
231
+ 5
232
+ 6
233
+ 7
234
+ 8
235
+ 9
236
+ 10
237
+ 11
238
+ 12
239
+ 13
240
+ 14
241
+ 15
242
+ 16
243
+ 17
244
+ 18
245
+ 19
246
+ 20
247
+ 21
248
+ 22
249
+ 23
250
+ 24
251
+ 25
252
+ 26
253
+ 27
254
+ 28
255
+ 29
256
+ 30
257
+ 31
258
+ 32
259
+ 33
260
+ 34
261
+ 35
262
+ 36
263
+ 37
264
+ 38
265
+ 39
266
+ 40
267
+ 41
268
+ 42
269
+ 43
270
+ 44
271
+ 45
272
+ 46
273
+ 47
274
+ 48
275
+ 49
276
+ 50
277
+ 51
278
+ 52
279
+ 53
280
+ 54
281
+ 55
282
+ 56
283
+ 57
284
+ 58
285
+ 59
286
+ 60
287
+ 61
288
+ 62
289
+ 63
290
+ 64
291
+ 65
292
+ 66
293
+ 67
294
+ 68
295
+ 69
296
+ 70
297
+ 71
298
+ 72
299
+ 73
300
+ 74
301
+ 75
302
+ 76
303
+ 77
304
+ 78
305
+ 79
306
+ 80
307
+ 81
308
+ 82
309
+ 83
310
+ 84
311
+ 85
312
+ 86
313
+ 87
314
+ 88
315
+ 89
316
+ 90
317
+ 91
318
+ 92
319
+ 93
320
+ 94
321
+ 95
322
+ 96
323
+ 97
324
+ 98
325
+ 99
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.el ADDED
@@ -0,0 +1,1568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sigle letters in upper-case are usually abbreviations of names
2
+ Α
3
+ Β
4
+ Γ
5
+ Δ
6
+ Ε
7
+ Ζ
8
+ Η
9
+ Θ
10
+ Ι
11
+ Κ
12
+ Λ
13
+ Μ
14
+ Ν
15
+ Ξ
16
+ Ο
17
+ Π
18
+ Ρ
19
+ Σ
20
+ Τ
21
+ Υ
22
+ Φ
23
+ Χ
24
+ Ψ
25
+ Ω
26
+
27
+ # Includes abbreviations for the Greek language compiled from various sources (Greek grammar books, Greek language related web content).
28
+ Άθαν
29
+ Έγχρ
30
+ Έκθ
31
+ Έσδ
32
+ Έφ
33
+ Όμ
34
+ Α΄Έσδρ
35
+ Α΄Έσδ
36
+ Α΄Βασ
37
+ Α΄Θεσ
38
+ Α΄Ιω
39
+ Α΄Κορινθ
40
+ Α΄Κορ
41
+ Α΄Μακκ
42
+ Α΄Μακ
43
+ Α΄Πέτρ
44
+ Α΄Πέτ
45
+ Α΄Παραλ
46
+ Α΄Πε
47
+ Α΄Σαμ
48
+ Α΄Τιμ
49
+ Α΄Χρον
50
+ Α΄Χρ
51
+ Α.Β.Α
52
+ Α.Β
53
+ Α.Ε
54
+ Α.Κ.Τ.Ο
55
+ Αέθλ
56
+ Αέτ
57
+ Αίλ.Δ
58
+ Αίλ.Τακτ
59
+ Αίσ
60
+ Αββακ
61
+ Αβυδ
62
+ Αβ
63
+ Αγάκλ
64
+ Αγάπ
65
+ Αγάπ.Αμαρτ.Σ
66
+ Αγάπ.Γεωπ
67
+ Αγαθάγγ
68
+ Αγαθήμ
69
+ Αγαθιν
70
+ Αγαθοκλ
71
+ Αγαθρχ
72
+ Αγαθ
73
+ Αγαθ.Ιστ
74
+ Αγαλλ
75
+ Αγαπητ
76
+ Αγγ
77
+ Αγησ
78
+ Αγλ
79
+ Αγορ.Κ
80
+ Αγρο.Κωδ
81
+ Αγρ.Εξ
82
+ Αγρ.Κ
83
+ Αγ.Γρ
84
+ Αδριαν
85
+ Αδρ
86
+ Αετ
87
+ Αθάν
88
+ Αθήν
89
+ Αθήν.Επιγρ
90
+ Αθήν.Επιτ
91
+ Αθήν.Ιατρ
92
+ Αθήν.Μηχ
93
+ Αθανάσ
94
+ Αθαν
95
+ Αθηνί
96
+ Αθηναγ
97
+ Αθηνόδ
98
+ Αθ
99
+ Αθ.Αρχ
100
+ Αιλ
101
+ Αιλ.Επιστ
102
+ Αιλ.ΖΙ
103
+ Αιλ.ΠΙ
104
+ Αιλ.απ
105
+ Αιμιλ
106
+ Αιν.Γαζ
107
+ Αιν.Τακτ
108
+ Αισχίν
109
+ Αισχίν.Επιστ
110
+ Αισχ
111
+ Αισχ.Αγαμ
112
+ Αισχ.Αγ
113
+ Αισχ.Αλ
114
+ Αισχ.Ελεγ
115
+ Αισχ.Επτ.Θ
116
+ Αισχ.Ευμ
117
+ Αισχ.Ικέτ
118
+ Αισχ.Ικ
119
+ Αισχ.Περσ
120
+ Αισχ.Προμ.Δεσμ
121
+ Αισχ.Πρ
122
+ Αισχ.Χοηφ
123
+ Αισχ.Χο
124
+ Αισχ.απ
125
+ ΑιτΕ
126
+ Αιτ
127
+ Αλκ
128
+ Αλχιας
129
+ Αμ.Π.Ο
130
+ Αμβ
131
+ Αμμών
132
+ Αμ.
133
+ Αν.Πειθ.Συμβ.Δικ
134
+ Ανακρ
135
+ Ανακ
136
+ Αναμν.Τόμ
137
+ Αναπλ
138
+ Ανδ
139
+ Ανθλγος
140
+ Ανθστης
141
+ Αντισθ
142
+ Ανχης
143
+ Αν
144
+ Αποκ
145
+ Απρ
146
+ Απόδ
147
+ Απόφ
148
+ Απόφ.Νομ
149
+ Απ
150
+ Απ.Δαπ
151
+ Απ.Διατ
152
+ Απ.Επιστ
153
+ Αριθ
154
+ Αριστοτ
155
+ Αριστοφ
156
+ Αριστοφ.Όρν
157
+ Αριστοφ.Αχ
158
+ Αριστοφ.Βάτρ
159
+ Αριστοφ.Ειρ
160
+ Αριστοφ.Εκκλ
161
+ Αριστοφ.Θεσμ
162
+ Αριστοφ.Ιππ
163
+ Αριστοφ.Λυσ
164
+ Αριστοφ.Νεφ
165
+ Αριστοφ.Πλ
166
+ Αριστοφ.Σφ
167
+ Αριστ
168
+ Αριστ.Αθ.Πολ
169
+ Αριστ.Αισθ
170
+ Αριστ.Αν.Πρ
171
+ Αριστ.Ζ.Ι
172
+ Αριστ.Ηθ.Ευδ
173
+ Αριστ.Ηθ.Νικ
174
+ Αριστ.Κατ
175
+ Αριστ.Μετ
176
+ Αριστ.Πολ
177
+ Αριστ.Φυσιογν
178
+ Αριστ.Φυσ
179
+ Αριστ.Ψυχ
180
+ Αριστ.Ρητ
181
+ Αρμεν
182
+ Αρμ
183
+ Αρχ.Εκ.Καν.Δ
184
+ Αρχ.Ευβ.Μελ
185
+ Αρχ.Ιδ.Δ
186
+ Αρχ.Νομ
187
+ Αρχ.Ν
188
+ Αρχ.Π.Ε
189
+ Αρ
190
+ Αρ.Φορ.Μητρ
191
+ Ασμ
192
+ Ασμ.ασμ
193
+ Αστ.Δ
194
+ Αστ.Χρον
195
+ Ασ
196
+ Ατομ.Γνωμ
197
+ Αυγ
198
+ Αφρ
199
+ Αχ.Νομ
200
+ Α
201
+ Α.Εγχ.Π
202
+ Α.Κ.΄Υδρας
203
+ Β΄Έσδρ
204
+ Β΄Έσδ
205
+ Β΄Βασ
206
+ Β΄Θεσ
207
+ Β΄Ιω
208
+ Β΄Κορινθ
209
+ Β΄Κορ
210
+ Β΄Μακκ
211
+ Β΄Μακ
212
+ Β΄Πέτρ
213
+ Β΄Πέτ
214
+ Β΄Πέ
215
+ Β΄Παραλ
216
+ Β΄Σαμ
217
+ Β΄Τιμ
218
+ Β΄Χρον
219
+ Β΄Χρ
220
+ Β.Ι.Π.Ε
221
+ Β.Κ.Τ
222
+ Β.Κ.Ψ.Β
223
+ Β.Μ
224
+ Β.Ο.Α.Κ
225
+ Β.Ο.Α
226
+ Β.Ο.Δ
227
+ Βίβλ
228
+ Βαρ
229
+ ΒεΘ
230
+ Βι.Περ
231
+ Βιπερ
232
+ Βιργ
233
+ Βλγ
234
+ Βούλ
235
+ Βρ
236
+ Γ΄Βασ
237
+ Γ΄Μακκ
238
+ ΓΕΝμλ
239
+ Γέν
240
+ Γαλ
241
+ Γεν
242
+ Γλ
243
+ Γν.Ν.Σ.Κρ
244
+ Γνωμ
245
+ Γν
246
+ Γράμμ
247
+ Γρηγ.Ναζ
248
+ Γρηγ.Νύσ
249
+ Γ Νοσ
250
+ Γ' Ογκολ
251
+ Γ.Ν
252
+ Δ΄Βασ
253
+ Δ.Β
254
+ Δ.Δίκη
255
+ Δ.Δίκ
256
+ Δ.Ε.Σ
257
+ Δ.Ε.Φ.Α
258
+ Δ.Ε.Φ
259
+ Δ.Εργ.Ν
260
+ Δαμ
261
+ Δαμ.μνημ.έργ
262
+ Δαν
263
+ Δασ.Κ
264
+ Δεκ
265
+ Δελτ.Δικ.Ε.Τ.Ε
266
+ Δελτ.Νομ
267
+ Δελτ.Συνδ.Α.Ε
268
+ Δερμ
269
+ Δευτ
270
+ Δεύτ
271
+ Δημοσθ
272
+ Δημόκρ
273
+ Δι.Δικ
274
+ Διάτ
275
+ Διαιτ.Απ
276
+ Διαιτ
277
+ Διαρκ.Στρατ
278
+ Δικ
279
+ Διοίκ.Πρωτ
280
+ ΔιοικΔνη
281
+ Διοικ.Εφ
282
+ Διον.Αρ
283
+ Διόρθ.Λαθ
284
+ Δ.κ.Π
285
+ Δνη
286
+ Δν
287
+ Δογμ.Όρος
288
+ Δρ
289
+ Δ.τ.Α
290
+ Δτ
291
+ ΔωδΝομ
292
+ Δ.Περ
293
+ Δ.Στρ
294
+ ΕΔΠολ
295
+ ΕΕυρΚ
296
+ ΕΙΣ
297
+ ΕΝαυτΔ
298
+ ΕΣΑμΕΑ
299
+ ΕΣΘ
300
+ ΕΣυγκΔ
301
+ ΕΤρΑξΧρΔ
302
+ Ε.Φ.Ε.Τ
303
+ Ε.Φ.Ι
304
+ Ε.Φ.Ο.Επ.Α
305
+ Εβδ
306
+ Εβρ
307
+ Εγκύκλ.Επιστ
308
+ Εγκ
309
+ Εε.Αιγ
310
+ Εθν.Κ.Τ
311
+ Εθν
312
+ Ειδ.Δικ.Αγ.Κακ
313
+ Εικ
314
+ Ειρ.Αθ
315
+ Ειρην.Αθ
316
+ Ειρην
317
+ Έλεγχ
318
+ Ειρ
319
+ Εισ.Α.Π
320
+ Εισ.Ε
321
+ Εισ.Ν.Α.Κ
322
+ Εισ.Ν.Κ.Πολ.Δ
323
+ Εισ.Πρωτ
324
+ Εισηγ.Έκθ
325
+ Εισ
326
+ Εκκλ
327
+ Εκκ
328
+ Εκ
329
+ Ελλ.Δνη
330
+ Εν.Ε
331
+ Εξ
332
+ Επ.Αν
333
+ Επ.Εργ.Δ
334
+ Επ.Εφ
335
+ Επ.Κυπ.Δ
336
+ Επ.Μεσ.Αρχ
337
+ Επ.Νομ
338
+ Επίκτ
339
+ Επίκ
340
+ Επι.Δ.Ε
341
+ Επιθ.Ναυτ.Δικ
342
+ Επικ
343
+ Επισκ.Ε.Δ
344
+ Επισκ.Εμπ.Δικ
345
+ Επιστ.Επετ.Αρμ
346
+ Επιστ.Επετ
347
+ Επιστ.Ιερ
348
+ Επιτρ.Προστ.Συνδ.Στελ
349
+ Επιφάν
350
+ Επτ.Εφ
351
+ Επ.Ιρ
352
+ Επ.Ι
353
+ Εργ.Ασφ.Νομ
354
+ Ερμ.Α.Κ
355
+ Ερμη.Σ
356
+ Εσθ
357
+ Εσπερ
358
+ Ετρ.Δ
359
+ Ευκλ
360
+ Ευρ.Δ.Δ.Α
361
+ Ευρ.Σ.Δ.Α
362
+ Ευρ.ΣτΕ
363
+ Ευρατόμ
364
+ Ευρ.Άλκ
365
+ Ευρ.Ανδρομ
366
+ Ευρ.Βάκχ
367
+ Ευρ.Εκ
368
+ Ευρ.Ελ
369
+ Ευρ.Ηλ
370
+ Ευρ.Ηρακ
371
+ Ευρ.Ηρ
372
+ Ευρ.Ηρ.Μαιν
373
+ Ευρ.Ικέτ
374
+ Ευρ.Ιππόλ
375
+ Ευρ.Ιφ.Α
376
+ Ευρ.Ιφ.Τ
377
+ Ευρ.Ι.Τ
378
+ Ευρ.Κύκλ
379
+ Ευρ.Μήδ
380
+ Ευρ.Ορ
381
+ Ευρ.Ρήσ
382
+ Ευρ.Τρωάδ
383
+ Ευρ.Φοίν
384
+ Εφ.Αθ
385
+ Εφ.Εν
386
+ Εφ.Επ
387
+ Εφ.Θρ
388
+ Εφ.Θ
389
+ Εφ.Ι
390
+ Εφ.Κερ
391
+ Εφ.Κρ
392
+ Εφ.Λ
393
+ Εφ.Ν
394
+ Εφ.Πατ
395
+ Εφ.Πειρ
396
+ Εφαρμ.Δ.Δ
397
+ Εφαρμ
398
+ Εφεσ
399
+ Εφημ
400
+ Εφ
401
+ Ζαχ
402
+ Ζιγ
403
+ Ζυ
404
+ Ζχ
405
+ ΗΕ.Δ
406
+ Ημερ
407
+ Ηράκλ
408
+ Ηροδ
409
+ Ησίοδ
410
+ Ησ
411
+ Η.Ε.Γ
412
+ ΘΗΣ
413
+ ΘΡ
414
+ Θαλ
415
+ Θεοδ
416
+ Θεοφ
417
+ Θεσ
418
+ Θεόδ.Μοψ
419
+ Θεόκρ
420
+ Θεόφιλ
421
+ Θουκ
422
+ Θρ
423
+ Θρ.Ε
424
+ Θρ.Ιερ
425
+ Θρ.Ιρ
426
+ Ιακ
427
+ Ιαν
428
+ Ιβ
429
+ Ιδθ
430
+ Ιδ
431
+ Ιεζ
432
+ Ιερ
433
+ Ιζ
434
+ Ιησ
435
+ Ιησ.Ν
436
+ Ικ
437
+ Ιλ
438
+ Ιν
439
+ Ιουδ
440
+ Ιουστ
441
+ Ιούδα
442
+ Ιούλ
443
+ Ιούν
444
+ Ιπποκρ
445
+ Ιππόλ
446
+ Ιρ
447
+ Ισίδ.Πηλ
448
+ Ισοκρ
449
+ Ισ.Ν
450
+ Ιωβ
451
+ Ιωλ
452
+ Ιων
453
+ Ιω
454
+ ΚΟΣ
455
+ ΚΟ.ΜΕ.ΚΟΝ
456
+ ΚΠοινΔ
457
+ ΚΠολΔ
458
+ ΚαΒ
459
+ Καλ
460
+ Καλ.Τέχν
461
+ ΚανΒ
462
+ Καν.Διαδ
463
+ Κατάργ
464
+ Κλ
465
+ ΚοινΔ
466
+ Κολσ
467
+ Κολ
468
+ Κον
469
+ Κορ
470
+ Κος
471
+ ΚριτΕπιθ
472
+ ΚριτΕ
473
+ Κριτ
474
+ Κρ
475
+ ΚτΒ
476
+ ΚτΕ
477
+ ΚτΠ
478
+ Κυβ
479
+ Κυπρ
480
+ Κύριλ.Αλεξ
481
+ Κύριλ.Ιερ
482
+ Λεβ
483
+ Λεξ.Σουίδα
484
+ Λευϊτ
485
+ Λευ
486
+ Λκ
487
+ Λογ
488
+ ΛουκΑμ
489
+ Λουκιαν
490
+ Λουκ.Έρωτ
491
+ Λουκ.Ενάλ.Διάλ
492
+ Λουκ.Ερμ
493
+ Λουκ.Εταιρ.Διάλ
494
+ Λουκ.Ε.Δ
495
+ Λουκ.Θε.Δ
496
+ Λουκ.Ικ.
497
+ Λουκ.Ιππ
498
+ Λουκ.Λεξιφ
499
+ Λουκ.Μεν
500
+ Λουκ.Μισθ.Συν
501
+ Λουκ.Ορχ
502
+ Λουκ.Περ
503
+ Λουκ.Συρ
504
+ Λουκ.Τοξ
505
+ Λουκ.Τυρ
506
+ Λουκ.Φιλοψ
507
+ Λουκ.Φιλ
508
+ Λουκ.Χάρ
509
+ Λουκ.
510
+ Λουκ.Αλ
511
+ Λοχ
512
+ Λυδ
513
+ Λυκ
514
+ Λυσ
515
+ Λωζ
516
+ Λ1
517
+ Λ2
518
+ ΜΟΕφ
519
+ Μάρκ
520
+ Μέν
521
+ Μαλ
522
+ Ματθ
523
+ Μα
524
+ Μιχ
525
+ Μκ
526
+ Μλ
527
+ Μμ
528
+ Μον.Δ.Π
529
+ Μον.Πρωτ
530
+ Μον
531
+ Μρ
532
+ Μτ
533
+ Μχ
534
+ Μ.Βασ
535
+ Μ.Πλ
536
+ ΝΑ
537
+ Ναυτ.Χρον
538
+ Να
539
+ Νδικ
540
+ Νεεμ
541
+ Νε
542
+ Νικ
543
+ ΝκΦ
544
+ Νμ
545
+ ΝοΒ
546
+ Νομ.Δελτ.Τρ.Ελ
547
+ Νομ.Δελτ
548
+ Νομ.Σ.Κ
549
+ Νομ.Χρ
550
+ Νομ
551
+ Νομ.Διεύθ
552
+ Νοσ
553
+ Ντ
554
+ Νόσων
555
+ Ν1
556
+ Ν2
557
+ Ν3
558
+ Ν4
559
+ Νtot
560
+ Ξενοφ
561
+ Ξεν
562
+ Ξεν.Ανάβ
563
+ Ξεν.Απολ
564
+ Ξεν.Απομν
565
+ Ξεν.Απομ
566
+ Ξεν.Ελλ
567
+ Ξεν.Ιέρ
568
+ Ξεν.Ιππαρχ
569
+ Ξεν.Ιππ
570
+ Ξεν.Κυρ.Αν
571
+ Ξεν.Κύρ.Παιδ
572
+ Ξεν.Κ.Π
573
+ Ξεν.Λακ.Πολ
574
+ Ξεν.Οικ
575
+ Ξεν.Προσ
576
+ Ξεν.Συμπόσ
577
+ Ξεν.Συμπ
578
+ Ο΄
579
+ Οβδ
580
+ Οβ
581
+ ΟικΕ
582
+ Οικ
583
+ Οικ.Πατρ
584
+ Οικ.Σύν.Βατ
585
+ Ολομ
586
+ Ολ
587
+ Ολ.Α.Π
588
+ Ομ.Ιλ
589
+ Ομ.Οδ
590
+ ΟπΤοιχ
591
+ Οράτ
592
+ Ορθ
593
+ ΠΡΟ.ΠΟ
594
+ Πίνδ
595
+ Πίνδ.Ι
596
+ Πίνδ.Νεμ
597
+ Πίνδ.Ν
598
+ Πίνδ.Ολ
599
+ Πίνδ.Παθ
600
+ Πίνδ.Πυθ
601
+ Πίνδ.Π
602
+ ΠαγΝμλγ
603
+ Παν
604
+ Παρμ
605
+ Παροιμ
606
+ Παρ
607
+ Παυσ
608
+ Πειθ.Συμβ
609
+ ΠειρΝ
610
+ Πελ
611
+ ΠεντΣτρ
612
+ Πεντ
613
+ Πεντ.Εφ
614
+ ΠερΔικ
615
+ Περ.Γεν.Νοσ
616
+ Πετ
617
+ Πλάτ
618
+ Πλάτ.Αλκ
619
+ Πλάτ.Αντ
620
+ Πλάτ.Αξίοχ
621
+ Πλάτ.Απόλ
622
+ Πλάτ.Γοργ
623
+ Πλάτ.Ευθ
624
+ Πλάτ.Θεαίτ
625
+ Πλάτ.Κρατ
626
+ Πλάτ.Κριτ
627
+ Πλάτ.Λύσ
628
+ Πλάτ.Μεν
629
+ Πλάτ.Νόμ
630
+ Πλάτ.Πολιτ
631
+ Πλάτ.Πολ
632
+ Πλάτ.Πρωτ
633
+ Πλάτ.Σοφ.
634
+ Πλάτ.Συμπ
635
+ Πλάτ.Τίμ
636
+ Πλάτ.Φαίδρ
637
+ Πλάτ.Φιλ
638
+ Πλημ
639
+ Πλούτ
640
+ Πλούτ.Άρατ
641
+ Πλούτ.Αιμ
642
+ Πλούτ.Αλέξ
643
+ Πλούτ.Αλκ
644
+ Πλούτ.Αντ
645
+ Πλούτ.Αρτ
646
+ Πλούτ.Ηθ
647
+ Πλούτ.Θεμ
648
+ Πλούτ.Κάμ
649
+ Πλούτ.Καίσ
650
+ Πλούτ.Κικ
651
+ Πλούτ.Κράσ
652
+ Πλούτ.Κ
653
+ Πλούτ.Λυκ
654
+ Πλούτ.Μάρκ
655
+ Πλούτ.Μάρ
656
+ Πλούτ.Περ
657
+ Πλούτ.Ρωμ
658
+ Πλούτ.Σύλλ
659
+ Πλούτ.Φλαμ
660
+ Πλ
661
+ Ποιν.Δικ
662
+ Ποιν.Δ
663
+ Ποιν.Ν
664
+ Ποιν.Χρον
665
+ Ποιν.Χρ
666
+ Πολ.Δ
667
+ Πολ.Πρωτ
668
+ Πολ
669
+ Πολ.Μηχ
670
+ Πολ.Μ
671
+ Πρακτ.Αναθ
672
+ Πρακτ.Ολ
673
+ Πραξ
674
+ Πρμ
675
+ Πρξ
676
+ Πρωτ
677
+ Πρ
678
+ Πρ.Αν
679
+ Πρ.Λογ
680
+ Πταισμ
681
+ Πυρ.Καλ
682
+ Πόλη
683
+ Π.Δ
684
+ Π.Δ.Άσμ
685
+ ΡΜ.Ε
686
+ Ρθ
687
+ Ρμ
688
+ Ρωμ
689
+ ΣΠλημ
690
+ Σαπφ
691
+ Σειρ
692
+ Σολ
693
+ Σοφ
694
+ Σοφ.Αντιγ
695
+ Σοφ.Αντ
696
+ Σοφ.Αποσ
697
+ Σοφ.Απ
698
+ Σοφ.Ηλέκ
699
+ Σοφ.Ηλ
700
+ Σοφ.Οιδ.Κολ
701
+ Σοφ.Οιδ.Τύρ
702
+ Σοφ.Ο.Τ
703
+ Σοφ.Σειρ
704
+ Σοφ.Σολ
705
+ Σοφ.Τραχ
706
+ Σοφ.Φιλοκτ
707
+ Σρ
708
+ Σ.τ.Ε
709
+ Σ.τ.Π
710
+ Στρ.Π.Κ
711
+ Στ.Ευρ
712
+ Συζήτ
713
+ Συλλ.Νομολ
714
+ Συλ.Νομ
715
+ ΣυμβΕπιθ
716
+ Συμπ.Ν
717
+ Συνθ.Αμ
718
+ Συνθ.Ε.Ε
719
+ Συνθ.Ε.Κ
720
+ Συνθ.Ν
721
+ Σφν
722
+ Σφ
723
+ Σφ.Σλ
724
+ Σχ.Πολ.Δ
725
+ Σχ.Συντ.Ε
726
+ Σωσ
727
+ Σύντ
728
+ Σ.Πληρ
729
+ ΤΘ
730
+ ΤΣ.Δ
731
+ Τίτ
732
+ Τβ
733
+ Τελ.Ενημ
734
+ Τελ.Κ
735
+ Τερτυλ
736
+ Τιμ
737
+ Τοπ.Α
738
+ Τρ.Ο
739
+ Τριμ
740
+ Τριμ.Πλ
741
+ Τρ.Πλημ
742
+ Τρ.Π.Δ
743
+ Τ.τ.Ε
744
+ Ττ
745
+ Τωβ
746
+ Υγ
747
+ Υπερ
748
+ Υπ
749
+ Υ.Γ
750
+ Φιλήμ
751
+ Φιλιπ
752
+ Φιλ
753
+ Φλμ
754
+ Φλ
755
+ Φορ.Β
756
+ Φορ.Δ.Ε
757
+ Φορ.Δνη
758
+ Φορ.Δ
759
+ Φορ.Επ
760
+ Φώτ
761
+ Χρ.Ι.Δ
762
+ Χρ.Ιδ.Δ
763
+ Χρ.Ο
764
+ Χρυσ
765
+ Ψήφ
766
+ Ψαλμ
767
+ Ψαλ
768
+ Ψλ
769
+ Ωριγ
770
+ Ωσ
771
+ Ω.Ρ.Λ
772
+ άγν
773
+ άγν.ετυμολ
774
+ άγ
775
+ άκλ
776
+ άνθρ
777
+ άπ
778
+ άρθρ
779
+ άρν
780
+ άρ
781
+ άτ
782
+ άψ
783
+ ά
784
+ έκδ
785
+ έκφρ
786
+ έμψ
787
+ ένθ.αν
788
+ έτ
789
+ έ.α
790
+ ίδ
791
+ αβεστ
792
+ αβησσ
793
+ αγγλ
794
+ αγγ
795
+ αδημ
796
+ αεροναυτ
797
+ αερον
798
+ αεροπ
799
+ αθλητ
800
+ αθλ
801
+ αθροιστ
802
+ αιγυπτ
803
+ αιγ
804
+ αιτιολ
805
+ αιτ
806
+ αι
807
+ ακαδ
808
+ ακκαδ
809
+ αλβ
810
+ αλλ
811
+ αλφαβητ
812
+ αμα
813
+ αμερικ
814
+ αμερ
815
+ αμετάβ
816
+ αμτβ
817
+ αμφιβ
818
+ αμφισβ
819
+ αμφ
820
+ αμ
821
+ ανάλ
822
+ ανάπτ
823
+ ανάτ
824
+ αναβ
825
+ αναδαν
826
+ αναδιπλασ
827
+ αναδιπλ
828
+ αναδρ
829
+ αναλ
830
+ αναν
831
+ ανασυλλ
832
+ ανατολ
833
+ ανατομ
834
+ ανατυπ
835
+ ανατ
836
+ αναφορ
837
+ αναφ
838
+ ανα.ε
839
+ ανδρων
840
+ ανθρωπολ
841
+ ανθρωπ
842
+ ανθ
843
+ ανομ
844
+ αντίτ
845
+ αντδ
846
+ αντιγρ
847
+ αντιθ
848
+ αντικ
849
+ αντιμετάθ
850
+ αντων
851
+ αντ
852
+ ανωτ
853
+ ανόργ
854
+ ανών
855
+ αορ
856
+ απαρέμφ
857
+ απαρφ
858
+ απαρχ
859
+ απαρ
860
+ απλολ
861
+ απλοπ
862
+ αποβ
863
+ αποηχηροπ
864
+ αποθ
865
+ αποκρυφ
866
+ αποφ
867
+ απρμφ
868
+ απρφ
869
+ απρόσ
870
+ απόδ
871
+ απόλ
872
+ απόσπ
873
+ απόφ
874
+ αραβοτουρκ
875
+ αραβ
876
+ αραμ
877
+ αρβαν
878
+ αργκ
879
+ αριθμτ
880
+ αριθμ
881
+ αριθ
882
+ αρκτικόλ
883
+ αρκ
884
+ αρμεν
885
+ αρμ
886
+ αρνητ
887
+ αρσ
888
+ αρχαιολ
889
+ αρχιτεκτ
890
+ αρχιτ
891
+ αρχκ
892
+ αρχ
893
+ αρωμουν
894
+ αρωμ
895
+ αρ
896
+ αρ.μετρ
897
+ αρ.φ
898
+ ασσυρ
899
+ αστρολ
900
+ αστροναυτ
901
+ αστρον
902
+ αττ
903
+ αυστραλ
904
+ αυτοπ
905
+ αυτ
906
+ αφγαν
907
+ αφηρ
908
+ αφομ
909
+ αφρικ
910
+ αχώρ
911
+ αόρ
912
+ α.α
913
+ α/α
914
+ α0
915
+ βαθμ
916
+ βαθ
917
+ βαπτ
918
+ βασκ
919
+ βεβαιωτ
920
+ βεβ
921
+ βεδ
922
+ βενετ
923
+ βεν
924
+ βερβερ
925
+ βιβλγρ
926
+ βιολ
927
+ βιομ
928
+ βιοχημ
929
+ βιοχ
930
+ βλάχ
931
+ βλ
932
+ βλ.λ
933
+ βοταν
934
+ βοτ
935
+ βουλγαρ
936
+ βουλγ
937
+ βούλ
938
+ βραζιλ
939
+ βρετον
940
+ βόρ
941
+ γαλλ
942
+ γενικότ
943
+ γενοβ
944
+ γεν
945
+ γερμαν
946
+ γερμ
947
+ γεωγρ
948
+ γεωλ
949
+ γεωμετρ
950
+ γεωμ
951
+ γεωπ
952
+ γεωργ
953
+ γλυπτ
954
+ γλωσσολ
955
+ γλωσσ
956
+ γλ
957
+ γνμδ
958
+ γνμ
959
+ γνωμ
960
+ γοτθ
961
+ γραμμ
962
+ γραμ
963
+ γρμ
964
+ γρ
965
+ γυμν
966
+ δίδες
967
+ δίκ
968
+ δίφθ
969
+ δαν
970
+ δεικτ
971
+ δεκατ
972
+ δηλ
973
+ δημογρ
974
+ δημοτ
975
+ δημώδ
976
+ δημ
977
+ διάγρ
978
+ διάκρ
979
+ διάλεξ
980
+ διάλ
981
+ διάσπ
982
+ διαλεκτ
983
+ διατρ
984
+ διαφ
985
+ διαχ
986
+ διδα
987
+ διεθν
988
+ διεθ
989
+ δικον
990
+ διστ
991
+ δισύλλ
992
+ δισ
993
+ διφθογγοπ
994
+ δογμ
995
+ δολ
996
+ δοτ
997
+ δρμ
998
+ δρχ
999
+ δρ(α)
1000
+ δωρ
1001
+ δ
1002
+ εβρ
1003
+ εγκλπ
1004
+ εδ
1005
+ εθνολ
1006
+ εθν
1007
+ ειδικότ
1008
+ ειδ
1009
+ ειδ.β
1010
+ εικ
1011
+ ειρ
1012
+ εισ
1013
+ εκατοστμ
1014
+ εκατοστ
1015
+ εκατστ.2
1016
+ εκατστ.3
1017
+ εκατ
1018
+ εκδ
1019
+ εκκλησ
1020
+ εκκλ
1021
+ εκ
1022
+ ελλην
1023
+ ελλ
1024
+ ελνστ
1025
+ ελπ
1026
+ εμβ
1027
+ εμφ
1028
+ εναλλ
1029
+ ενδ
1030
+ ενεργ
1031
+ ενεστ
1032
+ ενικ
1033
+ ενν
1034
+ εν
1035
+ εξέλ
1036
+ εξακολ
1037
+ εξομάλ
1038
+ εξ
1039
+ εο
1040
+ επέκτ
1041
+ επίδρ
1042
+ επίθ
1043
+ επίρρ
1044
+ επίσ
1045
+ επαγγελμ
1046
+ επανάλ
1047
+ επανέκδ
1048
+ επιθ
1049
+ επικ
1050
+ επιμ
1051
+ επιρρ
1052
+ επιστ
1053
+ επιτατ
1054
+ επιφ
1055
+ επών
1056
+ επ
1057
+ εργ
1058
+ ερμ
1059
+ ερρινοπ
1060
+ ερωτ
1061
+ ετρουσκ
1062
+ ετυμ
1063
+ ετ
1064
+ ευφ
1065
+ ευχετ
1066
+ εφ
1067
+ εύχρ
1068
+ ε.α
1069
+ ε/υ
1070
+ ε0
1071
+ ζωγρ
1072
+ ζωολ
1073
+ ηθικ
1074
+ ηθ
1075
+ ηλεκτρολ
1076
+ ηλεκτρον
1077
+ ηλεκτρ
1078
+ ημίτ
1079
+ ημίφ
1080
+ ημιφ
1081
+ ηχηροπ
1082
+ ηχηρ
1083
+ ηχομιμ
1084
+ ηχ
1085
+ η
1086
+ θέατρ
1087
+ θεολ
1088
+ θετ
1089
+ θηλ
1090
+ θρακ
1091
+ θρησκειολ
1092
+ θρησκ
1093
+ θ
1094
+ ιαπων
1095
+ ιατρ
1096
+ ιδιωμ
1097
+ ιδ
1098
+ ινδ
1099
+ ιραν
1100
+ ισπαν
1101
+ ιστορ
1102
+ ιστ
1103
+ ισχυροπ
1104
+ ιταλ
1105
+ ιχθυολ
1106
+ ιων
1107
+ κάτ
1108
+ καθ
1109
+ κακοσ
1110
+ καν
1111
+ καρ
1112
+ κατάλ
1113
+ κατατ
1114
+ κατωτ
1115
+ κατ
1116
+ κα
1117
+ κελτ
1118
+ κεφ
1119
+ κινεζ
1120
+ κινημ
1121
+ κλητ
1122
+ κλιτ
1123
+ κλπ
1124
+ κλ
1125
+ κν
1126
+ κοινωνιολ
1127
+ κοινων
1128
+ κοπτ
1129
+ κουτσοβλαχ
1130
+ κουτσοβλ
1131
+ κπ
1132
+ κρ.γν
1133
+ κτγ
1134
+ κτην
1135
+ κτητ
1136
+ κτλ
1137
+ κτ
1138
+ κυριολ
1139
+ κυρ
1140
+ κύρ
1141
+ κ
1142
+ κ.ά
1143
+ κ.ά.π
1144
+ κ.α
1145
+ κ.εξ
1146
+ κ.επ
1147
+ κ.ε
1148
+ κ.λπ
1149
+ κ.λ.π
1150
+ κ.ού.κ
1151
+ κ.ο.κ
1152
+ κ.τ.λ
1153
+ κ.τ.τ
1154
+ κ.τ.ό
1155
+ λέξ
1156
+ λαογρ
1157
+ λαπ
1158
+ λατιν
1159
+ λατ
1160
+ λαϊκότρ
1161
+ λαϊκ
1162
+ λετ
1163
+ λιθ
1164
+ λογιστ
1165
+ λογοτ
1166
+ λογ
1167
+ λουβ
1168
+ λυδ
1169
+ λόγ
1170
+ λ
1171
+ λ.χ
1172
+ μέλλ
1173
+ μέσ
1174
+ μαθημ
1175
+ μαθ
1176
+ μαιευτ
1177
+ μαλαισ
1178
+ μαλτ
1179
+ μαμμων
1180
+ μεγεθ
1181
+ μεε
1182
+ μειωτ
1183
+ μελ
1184
+ μεξ
1185
+ μεσν
1186
+ μεσογ
1187
+ μεσοπαθ
1188
+ μεσοφ
1189
+ μετάθ
1190
+ μεταβτ
1191
+ μεταβ
1192
+ μετακ
1193
+ μεταπλ
1194
+ μεταπτωτ
1195
+ μεταρ
1196
+ μεταφορ
1197
+ μετβ
1198
+ μετεπιθ
1199
+ μετεπιρρ
1200
+ μετεωρολ
1201
+ μετεωρ
1202
+ μετον
1203
+ μετουσ
1204
+ μετοχ
1205
+ μετρ
1206
+ μετ
1207
+ μητρων
1208
+ μηχανολ
1209
+ μηχ
1210
+ μικροβιολ
1211
+ μογγολ
1212
+ μορφολ
1213
+ μουσ
1214
+ μπενελούξ
1215
+ μσνλατ
1216
+ μσν
1217
+ μτβ
1218
+ μτγν
1219
+ μτγ
1220
+ μτφρδ
1221
+ μτφρ
1222
+ μτφ
1223
+ μτχ
1224
+ μυθ
1225
+ μυκην
1226
+ μυκ
1227
+ μφ
1228
+ μ
1229
+ μ.ε
1230
+ μ.μ
1231
+ μ.π.ε
1232
+ μ.π.π
1233
+ μ0
1234
+ ναυτ
1235
+ νεοελλ
1236
+ νεολατιν
1237
+ νεολατ
1238
+ νεολ
1239
+ νεότ
1240
+ νλατ
1241
+ νομ
1242
+ νορβ
1243
+ νοσ
1244
+ νότ
1245
+ ν
1246
+ ξ.λ
1247
+ οικοδ
1248
+ οικολ
1249
+ οικον
1250
+ οικ
1251
+ ολλανδ
1252
+ ολλ
1253
+ ομηρ
1254
+ ομόρρ
1255
+ ονομ
1256
+ ον
1257
+ οπτ
1258
+ ορθογρ
1259
+ ορθ
1260
+ οριστ
1261
+ ορυκτολ
1262
+ ορυκτ
1263
+ ορ
1264
+ οσε��
1265
+ οσκ
1266
+ ουαλ
1267
+ ουγγρ
1268
+ ουδ
1269
+ ουσιαστικοπ
1270
+ ουσιαστ
1271
+ ουσ
1272
+ πίν
1273
+ παθητ
1274
+ παθολ
1275
+ παθ
1276
+ παιδ
1277
+ παλαιοντ
1278
+ παλαιότ
1279
+ παλ
1280
+ παππων
1281
+ παράγρ
1282
+ παράγ
1283
+ παράλλ
1284
+ παράλ
1285
+ παραγ
1286
+ παρακ
1287
+ παραλ
1288
+ παραπ
1289
+ παρατ
1290
+ παρβ
1291
+ παρετυμ
1292
+ παροξ
1293
+ παρων
1294
+ παρωχ
1295
+ παρ
1296
+ παρ.φρ
1297
+ πατριδων
1298
+ πατρων
1299
+ πβ
1300
+ περιθ
1301
+ περιλ
1302
+ περιφρ
1303
+ περσ
1304
+ περ
1305
+ πιθ
1306
+ πληθ
1307
+ πληροφ
1308
+ ποδ
1309
+ ποιητ
1310
+ πολιτ
1311
+ πολλαπλ
1312
+ πολ
1313
+ πορτογαλ
1314
+ πορτ
1315
+ ποσ
1316
+ πρακριτ
1317
+ πρβλ
1318
+ πρβ
1319
+ πργ
1320
+ πρκμ
1321
+ πρκ
1322
+ πρλ
1323
+ προέλ
1324
+ προβηγκ
1325
+ προελλ
1326
+ προηγ
1327
+ προθεμ
1328
+ προπαραλ
1329
+ προπαροξ
1330
+ προπερισπ
1331
+ προσαρμ
1332
+ προσηγορ
1333
+ προσταχτ
1334
+ προστ
1335
+ προσφών
1336
+ προσ
1337
+ προτακτ
1338
+ προτ.Εισ
1339
+ προφ
1340
+ προχωρ
1341
+ πρτ
1342
+ πρόθ
1343
+ πρόσθ
1344
+ πρόσ
1345
+ πρότ
1346
+ πρ
1347
+ πρ.Εφ
1348
+ πτ
1349
+ πυ
1350
+ π
1351
+ π.Χ
1352
+ π.μ
1353
+ π.χ
1354
+ ρήμ
1355
+ ρίζ
1356
+ ρηματ
1357
+ ρητορ
1358
+ ριν
1359
+ ρουμ
1360
+ ρωμ
1361
+ ρωσ
1362
+ ρ
1363
+ σανσκρ
1364
+ σαξ
1365
+ σελ
1366
+ σερβοκρ
1367
+ σερβ
1368
+ σημασιολ
1369
+ σημδ
1370
+ σημειολ
1371
+ σημερ
1372
+ σημιτ
1373
+ σημ
1374
+ σκανδ
1375
+ σκυθ
1376
+ σκωπτ
1377
+ σλαβ
1378
+ σλοβ
1379
+ σουηδ
1380
+ σουμερ
1381
+ σουπ
1382
+ σπάν
1383
+ σπανιότ
1384
+ σπ
1385
+ σσ
1386
+ στατ
1387
+ στερ
1388
+ στιγμ
1389
+ στιχ
1390
+ στρέμ
1391
+ στρατιωτ
1392
+ στρατ
1393
+ στ
1394
+ συγγ
1395
+ συγκρ
1396
+ συγκ
1397
+ συμπερ
1398
+ συμπλεκτ
1399
+ συμπλ
1400
+ συμπροφ
1401
+ συμφυρ
1402
+ συμφ
1403
+ συνήθ
1404
+ συνίζ
1405
+ συναίρ
1406
+ συναισθ
1407
+ συνδετ
1408
+ συνδ
1409
+ συνεκδ
1410
+ συνηρ
1411
+ συνθετ
1412
+ συνθ
1413
+ συνοπτ
1414
+ συντελ
1415
+ συντομογρ
1416
+ συντ
1417
+ συν
1418
+ συρ
1419
+ σχημ
1420
+ σχ
1421
+ σύγκρ
1422
+ σύμπλ
1423
+ σύμφ
1424
+ σύνδ
1425
+ σύνθ
1426
+ σύντμ
1427
+ σύντ
1428
+ σ
1429
+ σ.π
1430
+ σ/β
1431
+ τακτ
1432
+ τελ
1433
+ τετρ
1434
+ τετρ.μ
1435
+ τεχνλ
1436
+ τεχνολ
1437
+ τεχν
1438
+ τεύχ
1439
+ τηλεπικ
1440
+ τηλεόρ
1441
+ τιμ
1442
+ τιμ.τομ
1443
+ τοΣ
1444
+ τον
1445
+ τοπογρ
1446
+ τοπων
1447
+ τοπ
1448
+ τοσκ
1449
+ τουρκ
1450
+ τοχ
1451
+ τριτοπρόσ
1452
+ τροποπ
1453
+ τροπ
1454
+ τσεχ
1455
+ τσιγγ
1456
+ ττ
1457
+ τυπ
1458
+ τόμ
1459
+ τόνν
1460
+ τ
1461
+ τ.μ
1462
+ τ.χλμ
1463
+ υβρ
1464
+ υπερθ
1465
+ υπερσ
1466
+ υπερ
1467
+ υπεύθ
1468
+ υποθ
1469
+ υποκορ
1470
+ υποκ
1471
+ υποσημ
1472
+ υποτ
1473
+ υποφ
1474
+ υποχωρ
1475
+ υπόλ
1476
+ υπόχρ
1477
+ υπ
1478
+ υστλατ
1479
+ υψόμ
1480
+ υψ
1481
+ φάκ
1482
+ φαρμακολ
1483
+ φαρμ
1484
+ φιλολ
1485
+ φιλοσ
1486
+ φιλοτ
1487
+ φινλ
1488
+ φοινικ
1489
+ φράγκ
1490
+ φρανκον
1491
+ φριζ
1492
+ φρ
1493
+ φυλλ
1494
+ φυσιολ
1495
+ φυσ
1496
+ φωνηεντ
1497
+ φωνητ
1498
+ φωνολ
1499
+ φων
1500
+ φωτογρ
1501
+ φ
1502
+ φ.τ.μ
1503
+ χαμιτ
1504
+ χαρτόσ
1505
+ χαρτ
1506
+ χασμ
1507
+ χαϊδ
1508
+ χγφ
1509
+ χειλ
1510
+ χεττ
1511
+ χημ
1512
+ χιλ
1513
+ χλγρ
1514
+ χλγ
1515
+ χλμ
1516
+ χλμ.2
1517
+ χλμ.3
1518
+ χλσγρ
1519
+ χλστγρ
1520
+ χλστμ
1521
+ χλστμ.2
1522
+ χλστμ.3
1523
+ χλ
1524
+ χργρ
1525
+ χρημ
1526
+ χρον
1527
+ χρ
1528
+ χφ
1529
+ χ.ε
1530
+ χ.κ
1531
+ χ.ο
1532
+ χ.σ
1533
+ χ.τ
1534
+ χ.χ
1535
+ ψευδ
1536
+ ψυχαν
1537
+ ψυχιατρ
1538
+ ψυχολ
1539
+ ψυχ
1540
+ ωκεαν
1541
+ όμ
1542
+ όν
1543
+ όπ.παρ
1544
+ όπ.π
1545
+ ό.π
1546
+ ύψ
1547
+ 1Βσ
1548
+ 1Εσ
1549
+ 1Θσ
1550
+ 1Ιν
1551
+ 1Κρ
1552
+ 1Μκ
1553
+ 1Πρ
1554
+ 1Πτ
1555
+ 1Τμ
1556
+ 2Βσ
1557
+ 2Εσ
1558
+ 2Θσ
1559
+ 2Ιν
1560
+ 2Κρ
1561
+ 2Μκ
1562
+ 2Πρ
1563
+ 2Πτ
1564
+ 2Τμ
1565
+ 3Βσ
1566
+ 3Ιν
1567
+ 3Μκ
1568
+ 4Βσ
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.en ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+
33
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34
+ Adj
35
+ Adm
36
+ Adv
37
+ Asst
38
+ Bart
39
+ Bldg
40
+ Brig
41
+ Bros
42
+ Capt
43
+ Cmdr
44
+ Col
45
+ Comdr
46
+ Con
47
+ Corp
48
+ Cpl
49
+ DR
50
+ Dr
51
+ Drs
52
+ Ens
53
+ Gen
54
+ Gov
55
+ Hon
56
+ Hr
57
+ Hosp
58
+ Insp
59
+ Lt
60
+ MM
61
+ MR
62
+ MRS
63
+ MS
64
+ Maj
65
+ Messrs
66
+ Mlle
67
+ Mme
68
+ Mr
69
+ Mrs
70
+ Ms
71
+ Msgr
72
+ Op
73
+ Ord
74
+ Pfc
75
+ Ph
76
+ Prof
77
+ Pvt
78
+ Rep
79
+ Reps
80
+ Res
81
+ Rev
82
+ Rt
83
+ Sen
84
+ Sens
85
+ Sfc
86
+ Sgt
87
+ Sr
88
+ St
89
+ Supt
90
+ Surg
91
+
92
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
93
+ v
94
+ vs
95
+ i.e
96
+ rev
97
+ e.g
98
+
99
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
100
+ # add NUMERIC_ONLY after the word for this function
101
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
102
+ #if followed by a number, a non-breaking prefix
103
+ No #NUMERIC_ONLY#
104
+ Nos
105
+ Art #NUMERIC_ONLY#
106
+ Nr
107
+ pp #NUMERIC_ONLY#
108
+
109
+ #month abbreviations
110
+ Jan
111
+ Feb
112
+ Mar
113
+ Apr
114
+ #May is a full word
115
+ Jun
116
+ Jul
117
+ Aug
118
+ Sep
119
+ Oct
120
+ Nov
121
+ Dec
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.es ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+
33
+ # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
34
+
35
+ A.C
36
+ Apdo
37
+ Av
38
+ Bco
39
+ CC.AA
40
+ Da
41
+ Dep
42
+ Dn
43
+ Dr
44
+ Dra
45
+ EE.UU
46
+ Excmo
47
+ FF.CC
48
+ Fil
49
+ Gral
50
+ J.C
51
+ Let
52
+ Lic
53
+ N.B
54
+ P.D
55
+ P.V.P
56
+ Prof
57
+ Pts
58
+ Rte
59
+ S.A
60
+ S.A.R
61
+ S.E
62
+ S.L
63
+ S.R.C
64
+ Sr
65
+ Sra
66
+ Srta
67
+ Sta
68
+ Sto
69
+ T.V.E
70
+ Tel
71
+ Ud
72
+ Uds
73
+ V.B
74
+ V.E
75
+ Vd
76
+ Vds
77
+ a/c
78
+ adj
79
+ admón
80
+ afmo
81
+ apdo
82
+ av
83
+ c
84
+ c.f
85
+ c.g
86
+ cap
87
+ cm
88
+ cta
89
+ dcha
90
+ doc
91
+ ej
92
+ entlo
93
+ esq
94
+ etc
95
+ f.c
96
+ gr
97
+ grs
98
+ izq
99
+ kg
100
+ km
101
+ mg
102
+ mm
103
+ núm
104
+ núm
105
+ p
106
+ p.a
107
+ p.ej
108
+ ptas
109
+ pág
110
+ págs
111
+ pág
112
+ págs
113
+ q.e.g.e
114
+ q.e.s.m
115
+ s
116
+ s.s.s
117
+ vid
118
+ vol
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.fi ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT
2
+ #indicate an end-of-sentence marker. Special cases are included for prefixes
3
+ #that ONLY appear before 0-9 numbers.
4
+
5
+ #This list is compiled from omorfi <http://code.google.com/p/omorfi> database
6
+ #by Tommi A Pirinen.
7
+
8
+
9
+ #any single upper case letter followed by a period is not a sentence ender
10
+ A
11
+ B
12
+ C
13
+ D
14
+ E
15
+ F
16
+ G
17
+ H
18
+ I
19
+ J
20
+ K
21
+ L
22
+ M
23
+ N
24
+ O
25
+ P
26
+ Q
27
+ R
28
+ S
29
+ T
30
+ U
31
+ V
32
+ W
33
+ X
34
+ Y
35
+ Z
36
+ Å
37
+ Ä
38
+ Ö
39
+
40
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
41
+ alik
42
+ alil
43
+ amir
44
+ apul
45
+ apul.prof
46
+ arkkit
47
+ ass
48
+ assist
49
+ dipl
50
+ dipl.arkkit
51
+ dipl.ekon
52
+ dipl.ins
53
+ dipl.kielenk
54
+ dipl.kirjeenv
55
+ dipl.kosm
56
+ dipl.urk
57
+ dos
58
+ erikoiseläinl
59
+ erikoishammasl
60
+ erikoisl
61
+ erikoist
62
+ ev.luutn
63
+ evp
64
+ fil
65
+ ft
66
+ hallinton
67
+ hallintot
68
+ hammaslääket
69
+ jatk
70
+ jääk
71
+ kansaned
72
+ kapt
73
+ kapt.luutn
74
+ kenr
75
+ kenr.luutn
76
+ kenr.maj
77
+ kers
78
+ kirjeenv
79
+ kom
80
+ kom.kapt
81
+ komm
82
+ konst
83
+ korpr
84
+ luutn
85
+ maist
86
+ maj
87
+ Mr
88
+ Mrs
89
+ Ms
90
+ M.Sc
91
+ neuv
92
+ nimim
93
+ Ph.D
94
+ prof
95
+ puh.joht
96
+ pääll
97
+ res
98
+ san
99
+ siht
100
+ suom
101
+ sähköp
102
+ säv
103
+ toht
104
+ toim
105
+ toim.apul
106
+ toim.joht
107
+ toim.siht
108
+ tuom
109
+ ups
110
+ vänr
111
+ vääp
112
+ ye.ups
113
+ ylik
114
+ ylil
115
+ ylim
116
+ ylimatr
117
+ yliop
118
+ yliopp
119
+ ylip
120
+ yliv
121
+
122
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall
123
+ #into this category - it sometimes ends a sentence)
124
+ e.g
125
+ ent
126
+ esim
127
+ huom
128
+ i.e
129
+ ilm
130
+ l
131
+ mm
132
+ myöh
133
+ nk
134
+ nyk
135
+ par
136
+ po
137
+ t
138
+ v
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.fr ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+ #
4
+ #any single upper case letter followed by a period is not a sentence ender
5
+ #usually upper case letters are initials in a name
6
+ #no French words end in single lower-case letters, so we throw those in too?
7
+ A
8
+ B
9
+ C
10
+ D
11
+ E
12
+ F
13
+ G
14
+ H
15
+ I
16
+ J
17
+ K
18
+ L
19
+ M
20
+ N
21
+ O
22
+ P
23
+ Q
24
+ R
25
+ S
26
+ T
27
+ U
28
+ V
29
+ W
30
+ X
31
+ Y
32
+ Z
33
+ #a
34
+ b
35
+ c
36
+ d
37
+ e
38
+ f
39
+ g
40
+ h
41
+ i
42
+ j
43
+ k
44
+ l
45
+ m
46
+ n
47
+ o
48
+ p
49
+ q
50
+ r
51
+ s
52
+ t
53
+ u
54
+ v
55
+ w
56
+ x
57
+ y
58
+ z
59
+
60
+ # Period-final abbreviation list for French
61
+ A.C.N
62
+ A.M
63
+ art
64
+ ann
65
+ apr
66
+ av
67
+ auj
68
+ lib
69
+ B.P
70
+ boul
71
+ ca
72
+ c.-à-d
73
+ cf
74
+ ch.-l
75
+ chap
76
+ contr
77
+ C.P.I
78
+ C.Q.F.D
79
+ C.N
80
+ C.N.S
81
+ C.S
82
+ dir
83
+ éd
84
+ e.g
85
+ env
86
+ al
87
+ etc
88
+ E.V
89
+ ex
90
+ fasc
91
+ fém
92
+ fig
93
+ fr
94
+ hab
95
+ ibid
96
+ id
97
+ i.e
98
+ inf
99
+ LL.AA
100
+ LL.AA.II
101
+ LL.AA.RR
102
+ LL.AA.SS
103
+ L.D
104
+ LL.EE
105
+ LL.MM
106
+ LL.MM.II.RR
107
+ loc.cit
108
+ masc
109
+ MM
110
+ ms
111
+ N.B
112
+ N.D.A
113
+ N.D.L.R
114
+ N.D.T
115
+ n/réf
116
+ NN.SS
117
+ N.S
118
+ N.D
119
+ N.P.A.I
120
+ p.c.c
121
+ pl
122
+ pp
123
+ p.ex
124
+ p.j
125
+ P.S
126
+ R.A.S
127
+ R.-V
128
+ R.P
129
+ R.I.P
130
+ SS
131
+ S.S
132
+ S.A
133
+ S.A.I
134
+ S.A.R
135
+ S.A.S
136
+ S.E
137
+ sec
138
+ sect
139
+ sing
140
+ S.M
141
+ S.M.I.R
142
+ sq
143
+ sqq
144
+ suiv
145
+ sup
146
+ suppl
147
+ tél
148
+ T.S.V.P
149
+ vb
150
+ vol
151
+ vs
152
+ X.O
153
+ Z.I
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ga ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ A
3
+ B
4
+ C
5
+ D
6
+ E
7
+ F
8
+ G
9
+ H
10
+ I
11
+ J
12
+ K
13
+ L
14
+ M
15
+ N
16
+ O
17
+ P
18
+ Q
19
+ R
20
+ S
21
+ T
22
+ U
23
+ V
24
+ W
25
+ X
26
+ Y
27
+ Z
28
+ Á
29
+ É
30
+ Í
31
+ Ó
32
+ Ú
33
+
34
+ Uacht
35
+ Dr
36
+ B.Arch
37
+
38
+ m.sh
39
+ .i
40
+ Co
41
+ Cf
42
+ cf
43
+ i.e
44
+ r
45
+ Chr
46
+ lch #NUMERIC_ONLY#
47
+ lgh #NUMERIC_ONLY#
48
+ uimh #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.hu ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+ Á
33
+ É
34
+ Í
35
+ Ó
36
+ Ö
37
+ Ő
38
+ Ú
39
+ Ü
40
+ Ű
41
+
42
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
43
+ Dr
44
+ dr
45
+ kb
46
+ Kb
47
+
48
+
49
+ pl
50
+ Pl
51
+ ca
52
+ Ca
53
+ min
54
+ Min
55
+ max
56
+ Max
57
+ ún
58
+ Ún
59
+ prof
60
+ Prof
61
+ de
62
+ De
63
+ du
64
+ Du
65
+ Szt
66
+ St
67
+
68
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
69
+ # add NUMERIC_ONLY after the word for this function
70
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
71
+ #if followed by a number, a non-breaking prefix
72
+
73
+ # Month name abbreviations
74
+ jan #NUMERIC_ONLY#
75
+ Jan #NUMERIC_ONLY#
76
+ Feb #NUMERIC_ONLY#
77
+ feb #NUMERIC_ONLY#
78
+ márc #NUMERIC_ONLY#
79
+ Márc #NUMERIC_ONLY#
80
+ ápr #NUMERIC_ONLY#
81
+ Ápr #NUMERIC_ONLY#
82
+ máj #NUMERIC_ONLY#
83
+ Máj #NUMERIC_ONLY#
84
+ jún #NUMERIC_ONLY#
85
+ Jún #NUMERIC_ONLY#
86
+ Júl #NUMERIC_ONLY#
87
+ júl #NUMERIC_ONLY#
88
+ aug #NUMERIC_ONLY#
89
+ Aug #NUMERIC_ONLY#
90
+ Szept #NUMERIC_ONLY#
91
+ szept #NUMERIC_ONLY#
92
+ okt #NUMERIC_ONLY#
93
+ Okt #NUMERIC_ONLY#
94
+ nov #NUMERIC_ONLY#
95
+ Nov #NUMERIC_ONLY#
96
+ dec #NUMERIC_ONLY#
97
+ Dec #NUMERIC_ONLY#
98
+
99
+ # Other abbreviations
100
+ tel #NUMERIC_ONLY#
101
+ Tel #NUMERIC_ONLY#
102
+ Fax #NUMERIC_ONLY#
103
+ fax #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.is ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ no #NUMERIC_ONLY#
2
+ No #NUMERIC_ONLY#
3
+ nr #NUMERIC_ONLY#
4
+ Nr #NUMERIC_ONLY#
5
+ nR #NUMERIC_ONLY#
6
+ NR #NUMERIC_ONLY#
7
+ a
8
+ b
9
+ c
10
+ d
11
+ e
12
+ f
13
+ g
14
+ h
15
+ i
16
+ j
17
+ k
18
+ l
19
+ m
20
+ n
21
+ o
22
+ p
23
+ q
24
+ r
25
+ s
26
+ t
27
+ u
28
+ v
29
+ w
30
+ x
31
+ y
32
+ z
33
+ ^
34
+ í
35
+ á
36
+ ó
37
+ æ
38
+ A
39
+ B
40
+ C
41
+ D
42
+ E
43
+ F
44
+ G
45
+ H
46
+ I
47
+ J
48
+ K
49
+ L
50
+ M
51
+ N
52
+ O
53
+ P
54
+ Q
55
+ R
56
+ S
57
+ T
58
+ U
59
+ V
60
+ W
61
+ X
62
+ Y
63
+ Z
64
+ ab.fn
65
+ a.fn
66
+ afs
67
+ al
68
+ alm
69
+ alg
70
+ andh
71
+ ath
72
+ aths
73
+ atr
74
+ ao
75
+ au
76
+ aukaf
77
+ áfn
78
+ áhrl.s
79
+ áhrs
80
+ ákv.gr
81
+ ákv
82
+ bh
83
+ bls
84
+ dr
85
+ e.Kr
86
+ et
87
+ ef
88
+ efn
89
+ ennfr
90
+ eink
91
+ end
92
+ e.st
93
+ erl
94
+ fél
95
+ fskj
96
+ fh
97
+ f.hl
98
+ físl
99
+ fl
100
+ fn
101
+ fo
102
+ forl
103
+ frb
104
+ frl
105
+ frh
106
+ frt
107
+ fsl
108
+ fsh
109
+ fs
110
+ fsk
111
+ fst
112
+ f.Kr
113
+ ft
114
+ fv
115
+ fyrrn
116
+ fyrrv
117
+ germ
118
+ gm
119
+ gr
120
+ hdl
121
+ hdr
122
+ hf
123
+ hl
124
+ hlsk
125
+ hljsk
126
+ hljv
127
+ hljóðv
128
+ hr
129
+ hv
130
+ hvk
131
+ holl
132
+ Hos
133
+ höf
134
+ hk
135
+ hrl
136
+ ísl
137
+ kaf
138
+ kap
139
+ Khöfn
140
+ kk
141
+ kg
142
+ kk
143
+ km
144
+ kl
145
+ klst
146
+ kr
147
+ kt
148
+ kgúrsk
149
+ kvk
150
+ leturbr
151
+ lh
152
+ lh.nt
153
+ lh.þt
154
+ lo
155
+ ltr
156
+ mlja
157
+ mljó
158
+ millj
159
+ mm
160
+ mms
161
+ m.fl
162
+ miðm
163
+ mgr
164
+ mst
165
+ mín
166
+ nf
167
+ nh
168
+ nhm
169
+ nl
170
+ nk
171
+ nmgr
172
+ no
173
+ núv
174
+ nt
175
+ o.áfr
176
+ o.m.fl
177
+ ohf
178
+ o.fl
179
+ o.s.frv
180
+ ófn
181
+ ób
182
+ óákv.gr
183
+ óákv
184
+ pfn
185
+ PR
186
+ pr
187
+ Ritstj
188
+ Rvík
189
+ Rvk
190
+ samb
191
+ samhlj
192
+ samn
193
+ samn
194
+ sbr
195
+ sek
196
+ sérn
197
+ sf
198
+ sfn
199
+ sh
200
+ sfn
201
+ sh
202
+ s.hl
203
+ sk
204
+ skv
205
+ sl
206
+ sn
207
+ so
208
+ ss.us
209
+ s.st
210
+ samþ
211
+ sbr
212
+ shlj
213
+ sign
214
+ skál
215
+ st
216
+ st.s
217
+ stk
218
+
219
+ teg
220
+ tbl
221
+ tfn
222
+ tl
223
+ tvíhlj
224
+ tvt
225
+ till
226
+ to
227
+ umr
228
+ uh
229
+ us
230
+ uppl
231
+ útg
232
+ vb
233
+ Vf
234
+ vh
235
+ vkf
236
+ Vl
237
+ vl
238
+ vlf
239
+ vmf
240
+ 8vo
241
+ vsk
242
+ vth
243
+ þt
244
+ þf
245
+ þjs
246
+ þgf
247
+ þlt
248
+ þolm
249
+ þm
250
+ þml
251
+ þýð
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.it ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+
33
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34
+ Adj
35
+ Adm
36
+ Adv
37
+ Amn
38
+ Arch
39
+ Asst
40
+ Avv
41
+ Bart
42
+ Bcc
43
+ Bldg
44
+ Brig
45
+ Bros
46
+ C.A.P
47
+ C.P
48
+ Capt
49
+ Cc
50
+ Cmdr
51
+ Co
52
+ Col
53
+ Comdr
54
+ Con
55
+ Corp
56
+ Cpl
57
+ DR
58
+ Dott
59
+ Dr
60
+ Drs
61
+ Egr
62
+ Ens
63
+ Gen
64
+ Geom
65
+ Gov
66
+ Hon
67
+ Hosp
68
+ Hr
69
+ Id
70
+ Ing
71
+ Insp
72
+ Lt
73
+ MM
74
+ MR
75
+ MRS
76
+ MS
77
+ Maj
78
+ Messrs
79
+ Mlle
80
+ Mme
81
+ Mo
82
+ Mons
83
+ Mr
84
+ Mrs
85
+ Ms
86
+ Msgr
87
+ N.B
88
+ Op
89
+ Ord
90
+ P.S
91
+ P.T
92
+ Pfc
93
+ Ph
94
+ Prof
95
+ Pvt
96
+ RP
97
+ RSVP
98
+ Rag
99
+ Rep
100
+ Reps
101
+ Res
102
+ Rev
103
+ Rif
104
+ Rt
105
+ S.A
106
+ S.B.F
107
+ S.P.M
108
+ S.p.A
109
+ S.r.l
110
+ Sen
111
+ Sens
112
+ Sfc
113
+ Sgt
114
+ Sig
115
+ Sigg
116
+ Soc
117
+ Spett
118
+ Sr
119
+ St
120
+ Supt
121
+ Surg
122
+ V.P
123
+
124
+ # other
125
+ a.c
126
+ acc
127
+ all
128
+ banc
129
+ c.a
130
+ c.c.p
131
+ c.m
132
+ c.p
133
+ c.s
134
+ c.v
135
+ corr
136
+ dott
137
+ e.p.c
138
+ ecc
139
+ es
140
+ fatt
141
+ gg
142
+ int
143
+ lett
144
+ ogg
145
+ on
146
+ p.c
147
+ p.c.c
148
+ p.es
149
+ p.f
150
+ p.r
151
+ p.v
152
+ post
153
+ pp
154
+ racc
155
+ ric
156
+ s.n.c
157
+ seg
158
+ sgg
159
+ ss
160
+ tel
161
+ u.s
162
+ v.r
163
+ v.s
164
+
165
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
166
+ v
167
+ vs
168
+ i.e
169
+ rev
170
+ e.g
171
+
172
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
173
+ # add NUMERIC_ONLY after the word for this function
174
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
175
+ #if followed by a number, a non-breaking prefix
176
+ No #NUMERIC_ONLY#
177
+ Nos
178
+ Art #NUMERIC_ONLY#
179
+ Nr
180
+ pp #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.lt ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Anything in this file, followed by a period (and an upper-case word),
2
+ # does NOT indicate an end-of-sentence marker.
3
+ # Special cases are included for prefixes that ONLY appear before 0-9 numbers.
4
+
5
+ # Any single upper case letter followed by a period is not a sentence ender
6
+ # (excluding I occasionally, but we leave it in)
7
+ # usually upper case letters are initials in a name
8
+ A
9
+ Ā
10
+ B
11
+ C
12
+ Č
13
+ D
14
+ E
15
+ Ē
16
+ F
17
+ G
18
+ Ģ
19
+ H
20
+ I
21
+ Ī
22
+ J
23
+ K
24
+ Ķ
25
+ L
26
+ Ļ
27
+ M
28
+ N
29
+ Ņ
30
+ O
31
+ P
32
+ Q
33
+ R
34
+ S
35
+ Š
36
+ T
37
+ U
38
+ Ū
39
+ V
40
+ W
41
+ X
42
+ Y
43
+ Z
44
+ Ž
45
+
46
+ # Initialis -- Džonas
47
+ Dz
48
+
49
+ Just
50
+
51
+ # Day and month abbreviations
52
+ # m. menesis d. diena g. gimes
53
+ m
54
+ mėn
55
+ d
56
+ g
57
+ gim
58
+ # Pirmadienis Penktadienis
59
+ Pr
60
+ Pn
61
+ Pirm
62
+ Antr
63
+ Treč
64
+ Ketv
65
+ Penkt
66
+ Šešt
67
+ Sekm
68
+ Saus
69
+ Vas
70
+ Kov
71
+ Bal
72
+ Geg
73
+ Birž
74
+ Liep
75
+ Rugpj
76
+ Rugs
77
+ Spal
78
+ Lapkr
79
+ Gruod
80
+
81
+ # Business, governmental, geographical terms
82
+ a
83
+ # aikštė
84
+ adv
85
+ # advokatas
86
+ akad
87
+ # akademikas
88
+ aklg
89
+ # akligatvis
90
+ akt
91
+ # aktorius
92
+ al
93
+ # alėja
94
+ A.V
95
+ # antspaudo vieta
96
+ aps
97
+ apskr
98
+ # apskritis
99
+ apyg
100
+ # apygarda
101
+ aps
102
+ apskr
103
+ # apskritis
104
+ asist
105
+ # asistentas
106
+ asmv
107
+ avd
108
+ # asmenvardis
109
+ a.k
110
+ asm
111
+ asm.k
112
+ # asmens kodas
113
+ atsak
114
+ # atsakingasis
115
+ atsisk
116
+ sąsk
117
+ # atsiskaitomoji sąskaita
118
+ aut
119
+ # autorius
120
+ b
121
+ k
122
+ b.k
123
+ # banko kodas
124
+ bkl
125
+ # bakalauras
126
+ bt
127
+ # butas
128
+ buv
129
+ # buvęs, -usi
130
+ dail
131
+ # dailininkas
132
+ dek
133
+ # dekanas
134
+ dėst
135
+ # dėstytojas
136
+ dir
137
+ # direktorius
138
+ dirig
139
+ # dirigentas
140
+ doc
141
+ # docentas
142
+ drp
143
+ # durpynas
144
+
145
+ # dešinysis
146
+ egz
147
+ # egzempliorius
148
+ eil
149
+ # eilutė
150
+ ekon
151
+ # ekonomika
152
+ el
153
+ # elektroninis
154
+ etc
155
+
156
+ # ežeras
157
+ faks
158
+ # faksas
159
+ fak
160
+ # fakultetas
161
+ gen
162
+ # generolas
163
+ gyd
164
+ # gydytojas
165
+ gv
166
+ # gyvenvietė
167
+ įl
168
+ # įlanka
169
+ Įn
170
+ # įnagininkas
171
+ insp
172
+ # inspektorius
173
+ pan
174
+ # ir panašiai
175
+ t.t
176
+ # ir taip toliau
177
+ k.a
178
+ # kaip antai
179
+ kand
180
+ # kandidatas
181
+ kat
182
+ # katedra
183
+ kyš
184
+ # kyšulys
185
+ kl
186
+ # klasė
187
+ kln
188
+ # kalnas
189
+ kn
190
+ # knyga
191
+ koresp
192
+ # korespondentas
193
+ kpt
194
+ # kapitonas
195
+ kr
196
+ # kairysis
197
+ kt
198
+ # kitas
199
+ kun
200
+ # kunigas
201
+ l
202
+ e
203
+ p
204
+ l.e.p
205
+ # laikinai einantis pareigas
206
+ ltn
207
+ # leitenantas
208
+ m
209
+ mst
210
+ # miestas
211
+ m.e
212
+ # mūsų eros
213
+ m.m
214
+ # mokslo metai
215
+ mot
216
+ # moteris
217
+ mstl
218
+ # miestelis
219
+ mgr
220
+ # magistras
221
+ mgnt
222
+ # magistrantas
223
+ mjr
224
+ # majoras
225
+ mln
226
+ # milijonas
227
+ mlrd
228
+ # milijardas
229
+ mok
230
+ # mokinys
231
+ mokyt
232
+ # mokytojas
233
+ moksl
234
+ # mokslinis
235
+ nkt
236
+ # nekaitomas
237
+ ntk
238
+ # neteiktinas
239
+ Nr
240
+ nr
241
+ # numeris
242
+ p
243
+ # ponas
244
+ p.d
245
+ a.d
246
+ # pašto dėžutė, abonentinė dėžutė
247
+ p.m.e
248
+ # prieš mūsų erą
249
+ pan
250
+ # ir panašiai
251
+ pav
252
+ # paveikslas
253
+ pavad
254
+ # pavaduotojas
255
+ pirm
256
+ # pirmininkas
257
+ pl
258
+ # plentas
259
+ plg
260
+ # palygink
261
+ plk
262
+ # pulkininkas; pelkė
263
+ pr
264
+ # prospektas
265
+ Kr
266
+ pr.Kr
267
+ # prieš Kristų
268
+ prok
269
+ # prokuroras
270
+ prot
271
+ # protokolas
272
+ pss
273
+ # pusiasalis
274
+ pšt
275
+ # paštas
276
+ pvz
277
+ # pavyzdžiui
278
+ r
279
+ # rajonas
280
+ red
281
+ # redaktorius
282
+
283
+ # raštų kalbos
284
+ sąs
285
+ # sąsiuvinis
286
+ saviv
287
+ sav
288
+ # savivaldybė
289
+ sekr
290
+ # sekretorius
291
+ sen
292
+ # seniūnija, seniūnas
293
+ sk
294
+ # skaityk; skyrius
295
+ skg
296
+ # skersgatvis
297
+ skyr
298
+ sk
299
+ # skyrius
300
+ skv
301
+ # skveras
302
+ sp
303
+ # spauda; spaustuvė
304
+ spec
305
+ # specialistas
306
+ sr
307
+ # sritis
308
+ st
309
+ # stotis
310
+ str
311
+ # straipsnis
312
+ stud
313
+ # studentas
314
+ š
315
+ š.m
316
+ # šių metų
317
+ šnek
318
+ # šnekamosios
319
+ tir
320
+ # tiražas
321
+ tūkst
322
+ # tūkstantis
323
+ up
324
+ # upė
325
+ upl
326
+ # upelis
327
+ vad
328
+ # vadinamasis, -oji
329
+ vlsč
330
+ # valsčius
331
+ ved
332
+ # vedėjas
333
+ vet
334
+ # veterinarija
335
+ virš
336
+ # viršininkas, viršaitis
337
+ vyr
338
+ # vyriausiasis, -ioji; vyras
339
+ vyresn
340
+ # vyresnysis
341
+ vlsč
342
+ # valsčius
343
+ vs
344
+ # viensėdis
345
+ Vt
346
+ vt
347
+ # vietininkas
348
+ vtv
349
+ vv
350
+ # vietovardis
351
+ žml
352
+ # žemėlapis
353
+
354
+ # Technical terms, abbreviations used in guidebooks, advertisments, etc.
355
+ # Generally lower-case.
356
+ air
357
+ # airiškai
358
+ amer
359
+ # amerikanizmas
360
+ anat
361
+ # anatomija
362
+ angl
363
+ # angl. angliskai
364
+ arab
365
+ # arabų
366
+ archeol
367
+ archit
368
+ asm
369
+ # asmuo
370
+ astr
371
+ # astronomija
372
+ austral
373
+ # australiškai
374
+ aut
375
+ # automobilis
376
+ av
377
+ # aviacija
378
+ bažn
379
+ bdv
380
+ # būdvardis
381
+ bibl
382
+ # Biblija
383
+ biol
384
+ # biologija
385
+ bot
386
+ # botanika
387
+ brt
388
+ # burtai, burtažodis.
389
+ brus
390
+ # baltarusių
391
+ buh
392
+ # buhalterija
393
+ chem
394
+ # chemija
395
+ col
396
+ # collectivum
397
+ con
398
+ conj
399
+ # conjunctivus, jungtukas
400
+ dab
401
+ # dab. dabartine
402
+ dgs
403
+ # daugiskaita
404
+ dial
405
+ # dialektizmas
406
+ dipl
407
+ dktv
408
+ # daiktavardis
409
+ džn
410
+ # dažnai
411
+ ekon
412
+ el
413
+ # elektra
414
+ esam
415
+ # esamasis laikas
416
+ euf
417
+ # eufemizmas
418
+ fam
419
+ # familiariai
420
+ farm
421
+ # farmacija
422
+ filol
423
+ # filologija
424
+ filos
425
+ # filosofija
426
+ fin
427
+ # finansai
428
+ fiz
429
+ # fizika
430
+ fiziol
431
+ # fiziologija
432
+ flk
433
+ # folkloras
434
+ fon
435
+ # fonetika
436
+ fot
437
+ # fotografija
438
+ geod
439
+ # geodezija
440
+ geogr
441
+ geol
442
+ # geologija
443
+ geom
444
+ # geometrija
445
+ glžk
446
+ gr
447
+ # graikų
448
+ gram
449
+ her
450
+ # heraldika
451
+ hidr
452
+ # hidrotechnika
453
+ ind
454
+ # Indų
455
+ iron
456
+ # ironiškai
457
+ isp
458
+ # ispanų
459
+ ist
460
+ istor
461
+ # istorija
462
+ it
463
+ # italų
464
+ įv
465
+ reikšm
466
+ įv.reikšm
467
+ # įvairiomis reikšmėmis
468
+ jap
469
+ # japonų
470
+ juok
471
+ # juokaujamai
472
+ jūr
473
+ # jūrininkystė
474
+ kalb
475
+ # kalbotyra
476
+ kar
477
+ # karyba
478
+ kas
479
+ # kasyba
480
+ kin
481
+ # kinematografija
482
+ klaus
483
+ # klausiamasis
484
+ knyg
485
+ # knyginis
486
+ kom
487
+ # komercija
488
+ komp
489
+ # kompiuteris
490
+ kosm
491
+ # kosmonautika
492
+ kt
493
+ # kitas
494
+ kul
495
+ # kulinarija
496
+ kuop
497
+ # kuopine
498
+ l
499
+ # laikas
500
+ lit
501
+ # literatūrinis
502
+ lingv
503
+ # lingvistika
504
+ log
505
+ # logika
506
+ lot
507
+ # lotynų
508
+ mat
509
+ # matematika
510
+ maž
511
+ # mažybinis
512
+ med
513
+ # medicina
514
+ medž
515
+ # medžioklė
516
+ men
517
+ # menas
518
+ menk
519
+ # menkinamai
520
+ metal
521
+ # metalurgija
522
+ meteor
523
+ min
524
+ # mineralogija
525
+ mit
526
+ # mitologija
527
+ mok
528
+ # mokyklinis
529
+ ms
530
+ # mįslė
531
+ muz
532
+ # muzikinis
533
+ n
534
+ # naujasis
535
+ neig
536
+ # neigiamasis
537
+ neol
538
+ # neologizmas
539
+ niek
540
+ # niekinamai
541
+ ofic
542
+ # oficialus
543
+ opt
544
+ # optika
545
+ orig
546
+ # original
547
+ p
548
+ # pietūs
549
+ pan
550
+ # panašiai
551
+ parl
552
+ # parlamentas
553
+ pat
554
+ # patarlė
555
+ paž
556
+ # pažodžiui
557
+ plg
558
+ # palygink
559
+ poet
560
+ # poetizmas
561
+ poez
562
+ # poezija
563
+ poligr
564
+ # poligrafija
565
+ polit
566
+ # politika
567
+ ppr
568
+ # paprastai
569
+ pranc
570
+ pr
571
+ # prancūzų, prūsų
572
+ priet
573
+ # prietaras
574
+ prek
575
+ # prekyba
576
+ prk
577
+ # perkeltine
578
+ prs
579
+ # persona, asmuo
580
+ psn
581
+ # pasenęs žodis
582
+ psich
583
+ # psichologija
584
+ pvz
585
+ # pavyzdžiui
586
+ r
587
+ # rytai
588
+ rad
589
+ # radiotechnika
590
+ rel
591
+ # religija
592
+ ret
593
+ # retai
594
+ rus
595
+ # rusų
596
+ sen
597
+ # senasis
598
+ sl
599
+ # slengas, slavų
600
+ sov
601
+ # sovietinis
602
+ spec
603
+ # specialus
604
+ sport
605
+ stat
606
+ # statyba
607
+ sudurt
608
+ # sudurtinis
609
+ sutr
610
+ # sutrumpintas
611
+ suv
612
+ # suvalkiečių
613
+ š
614
+ # šiaurė
615
+ šach
616
+ # šachmatai
617
+ šiaur
618
+ škot
619
+ # škotiškai
620
+ šnek
621
+ # šnekamoji
622
+ teatr
623
+ tech
624
+ techn
625
+ # technika
626
+ teig
627
+ # teigiamas
628
+ teis
629
+ # teisė
630
+ tekst
631
+ # tekstilė
632
+ tel
633
+ # telefonas
634
+ teol
635
+ # teologija
636
+ v
637
+ # tik vyriškosios, vakarai
638
+ t.p
639
+ t
640
+ p
641
+ # ir taip pat
642
+ t.t
643
+ # ir taip toliau
644
+ t.y
645
+ # tai yra
646
+ vaik
647
+ # vaikų
648
+ vart
649
+ # vartojama
650
+ vet
651
+ # veterinarija
652
+ vid
653
+ # vidurinis
654
+ vksm
655
+ # veiksmažodis
656
+ vns
657
+ # vienaskaita
658
+ vok
659
+ # vokiečių
660
+ vulg
661
+ # vulgariai
662
+ zool
663
+ # zoologija
664
+ žr
665
+ # žiūrėk
666
+ ž.ū
667
+ ž
668
+ ū
669
+ # žemės ūkis
670
+
671
+ # List of titles. These are often followed by upper-case names, but do
672
+ # not indicate sentence breaks
673
+ #
674
+ # Jo Eminencija
675
+ Em.
676
+ # Gerbiamasis
677
+ Gerb
678
+ gerb
679
+ # malonus
680
+ malon
681
+ # profesorius
682
+ Prof
683
+ prof
684
+ # daktaras (mokslų)
685
+ Dr
686
+ dr
687
+ habil
688
+ med
689
+ # inž inžinierius
690
+ inž
691
+ Inž
692
+
693
+
694
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
695
+ # add NUMERIC_ONLY after the word for this function
696
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
697
+ #if followed by a number, a non-breaking prefix
698
+ No #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.lv ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ Ā
8
+ B
9
+ C
10
+ Č
11
+ D
12
+ E
13
+ Ē
14
+ F
15
+ G
16
+ Ģ
17
+ H
18
+ I
19
+ Ī
20
+ J
21
+ K
22
+ Ķ
23
+ L
24
+ Ļ
25
+ M
26
+ N
27
+ Ņ
28
+ O
29
+ P
30
+ Q
31
+ R
32
+ S
33
+ Š
34
+ T
35
+ U
36
+ Ū
37
+ V
38
+ W
39
+ X
40
+ Y
41
+ Z
42
+ Ž
43
+
44
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
45
+ dr
46
+ Dr
47
+ med
48
+ prof
49
+ Prof
50
+ inž
51
+ Inž
52
+ ist.loc
53
+ Ist.loc
54
+ kor.loc
55
+ Kor.loc
56
+ v.i
57
+ vietn
58
+ Vietn
59
+
60
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
61
+ a.l
62
+ t.p
63
+ pārb
64
+ Pārb
65
+ vec
66
+ Vec
67
+ inv
68
+ Inv
69
+ sk
70
+ Sk
71
+ spec
72
+ Spec
73
+ vienk
74
+ Vienk
75
+ virz
76
+ Virz
77
+ māksl
78
+ Māksl
79
+ mūz
80
+ Mūz
81
+ akad
82
+ Akad
83
+ soc
84
+ Soc
85
+ galv
86
+ Galv
87
+ vad
88
+ Vad
89
+ sertif
90
+ Sertif
91
+ folkl
92
+ Folkl
93
+ hum
94
+ Hum
95
+
96
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
97
+ # add NUMERIC_ONLY after the word for this function
98
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
99
+ #if followed by a number, a non-breaking prefix
100
+ Nr #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.nl ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+ #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
4
+ # http://nl.wikipedia.org/wiki/Aanspreekvorm
5
+ # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
6
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
7
+ #usually upper case letters are initials in a name
8
+ A
9
+ B
10
+ C
11
+ D
12
+ E
13
+ F
14
+ G
15
+ H
16
+ I
17
+ J
18
+ K
19
+ L
20
+ M
21
+ N
22
+ O
23
+ P
24
+ Q
25
+ R
26
+ S
27
+ T
28
+ U
29
+ V
30
+ W
31
+ X
32
+ Y
33
+ Z
34
+
35
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
36
+ bacc
37
+ bc
38
+ bgen
39
+ c.i
40
+ dhr
41
+ dr
42
+ dr.h.c
43
+ drs
44
+ drs
45
+ ds
46
+ eint
47
+ fa
48
+ Fa
49
+ fam
50
+ gen
51
+ genm
52
+ ing
53
+ ir
54
+ jhr
55
+ jkvr
56
+ jr
57
+ kand
58
+ kol
59
+ lgen
60
+ lkol
61
+ Lt
62
+ maj
63
+ Mej
64
+ mevr
65
+ Mme
66
+ mr
67
+ mr
68
+ Mw
69
+ o.b.s
70
+ plv
71
+ prof
72
+ ritm
73
+ tint
74
+ Vz
75
+ Z.D
76
+ Z.D.H
77
+ Z.E
78
+ Z.Em
79
+ Z.H
80
+ Z.K.H
81
+ Z.K.M
82
+ Z.M
83
+ z.v
84
+
85
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
86
+ #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
87
+ a.g.v
88
+ bijv
89
+ bijz
90
+ bv
91
+ d.w.z
92
+ e.c
93
+ e.g
94
+ e.k
95
+ ev
96
+ i.p.v
97
+ i.s.m
98
+ i.t.t
99
+ i.v.m
100
+ m.a.w
101
+ m.b.t
102
+ m.b.v
103
+ m.h.o
104
+ m.i
105
+ m.i.v
106
+ v.w.t
107
+
108
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
109
+ # add NUMERIC_ONLY after the word for this function
110
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
111
+ #if followed by a number, a non-breaking prefix
112
+ Nr #NUMERIC_ONLY#
113
+ Nrs
114
+ nrs
115
+ nr #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.pl ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ adw
2
+ afr
3
+ akad
4
+ al
5
+ Al
6
+ am
7
+ amer
8
+ arch
9
+ art
10
+ Art
11
+ artyst
12
+ astr
13
+ austr
14
+ bałt
15
+ bdb
16
+
17
+ bm
18
+ br
19
+ bryg
20
+ bryt
21
+ centr
22
+ ces
23
+ chem
24
+ chiń
25
+ chir
26
+ c.k
27
+ c.o
28
+ cyg
29
+ cyw
30
+ cyt
31
+ czes
32
+ czw
33
+ cd
34
+ Cd
35
+ czyt
36
+ ćw
37
+ ćwicz
38
+ daw
39
+ dcn
40
+ dekl
41
+ demokr
42
+ det
43
+ diec
44
+
45
+ dn
46
+ dot
47
+ dol
48
+ dop
49
+ dost
50
+ dosł
51
+ h.c
52
+ ds
53
+ dst
54
+ duszp
55
+ dypl
56
+ egz
57
+ ekol
58
+ ekon
59
+ elektr
60
+ em
61
+ ew
62
+ fab
63
+ farm
64
+ fot
65
+ fr
66
+ gat
67
+ gastr
68
+ geogr
69
+ geol
70
+ gimn
71
+ głęb
72
+ gm
73
+ godz
74
+ górn
75
+ gosp
76
+ gr
77
+ gram
78
+ hist
79
+ hiszp
80
+ hr
81
+ Hr
82
+ hot
83
+ id
84
+ in
85
+ im
86
+ iron
87
+ jn
88
+ kard
89
+ kat
90
+ katol
91
+ k.k
92
+ kk
93
+ kol
94
+ kl
95
+ k.p.a
96
+ kpc
97
+ k.p.c
98
+ kpt
99
+ kr
100
+ k.r
101
+ krak
102
+ k.r.o
103
+ kryt
104
+ kult
105
+ laic
106
+ łac
107
+ niem
108
+ woj
109
+ nb
110
+ np
111
+ Nb
112
+ Np
113
+ pol
114
+ pow
115
+ m.in
116
+ pt
117
+ ps
118
+ Pt
119
+ Ps
120
+ cdn
121
+ jw
122
+ ryc
123
+ rys
124
+ Ryc
125
+ Rys
126
+ tj
127
+ tzw
128
+ Tzw
129
+ tzn
130
+ zob
131
+ ang
132
+ ub
133
+ ul
134
+ pw
135
+ pn
136
+ pl
137
+ al
138
+ k
139
+ n
140
+ nr #NUMERIC_ONLY#
141
+ Nr #NUMERIC_ONLY#
142
+ ww
143
+
144
+ ur
145
+ zm
146
+ żyd
147
+ żarg
148
+ żyw
149
+ wył
150
+ bp
151
+ bp
152
+ wyst
153
+ tow
154
+ Tow
155
+ o
156
+ sp
157
+ Sp
158
+ st
159
+ spółdz
160
+ Spółdz
161
+ społ
162
+ spółgł
163
+ stoł
164
+ stow
165
+ Stoł
166
+ Stow
167
+ zn
168
+ zew
169
+ zewn
170
+ zdr
171
+ zazw
172
+ zast
173
+ zaw
174
+ zał
175
+ zal
176
+ zam
177
+ zak
178
+ zakł
179
+ zagr
180
+ zach
181
+ adw
182
+ Adw
183
+ lek
184
+ Lek
185
+ med
186
+ mec
187
+ Mec
188
+ doc
189
+ Doc
190
+ dyw
191
+ dyr
192
+ Dyw
193
+ Dyr
194
+ inż
195
+ Inż
196
+ mgr
197
+ Mgr
198
+ dh
199
+ dr
200
+ Dh
201
+ Dr
202
+ p
203
+ P
204
+ red
205
+ Red
206
+ prof
207
+ prok
208
+ Prof
209
+ Prok
210
+ hab
211
+ płk
212
+ Płk
213
+ nadkom
214
+ Nadkom
215
+ podkom
216
+ Podkom
217
+ ks
218
+ Ks
219
+ gen
220
+ Gen
221
+ por
222
+ Por
223
+ reż
224
+ Reż
225
+ przyp
226
+ Przyp
227
+ śp
228
+ św
229
+ śW
230
+ Śp
231
+ Św
232
+ ŚW
233
+ szer
234
+ Szer
235
+ pkt #NUMERIC_ONLY#
236
+ str #NUMERIC_ONLY#
237
+ tab #NUMERIC_ONLY#
238
+ Tab #NUMERIC_ONLY#
239
+ tel
240
+ ust #NUMERIC_ONLY#
241
+ par #NUMERIC_ONLY#
242
+ poz
243
+ pok
244
+ oo
245
+ oO
246
+ Oo
247
+ OO
248
+ r #NUMERIC_ONLY#
249
+ l #NUMERIC_ONLY#
250
+ s #NUMERIC_ONLY#
251
+ najśw
252
+ Najśw
253
+ A
254
+ B
255
+ C
256
+ D
257
+ E
258
+ F
259
+ G
260
+ H
261
+ I
262
+ J
263
+ K
264
+ L
265
+ M
266
+ N
267
+ O
268
+ P
269
+ Q
270
+ R
271
+ S
272
+ T
273
+ U
274
+ V
275
+ W
276
+ X
277
+ Y
278
+ Z
279
+ Ś
280
+ Ć
281
+ Ż
282
+ Ź
283
+ Dz
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4477f9ad690e262c04bc057f4757e12b16777892f80016539130f3b7eebd58b4
3
+ size 1792
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ro ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ A
2
+ B
3
+ C
4
+ D
5
+ E
6
+ F
7
+ G
8
+ H
9
+ I
10
+ J
11
+ K
12
+ L
13
+ M
14
+ N
15
+ O
16
+ P
17
+ Q
18
+ R
19
+ S
20
+ T
21
+ U
22
+ V
23
+ W
24
+ X
25
+ Y
26
+ Z
27
+ dpdv
28
+ etc
29
+ șamd
30
+ M.Ap.N
31
+ dl
32
+ Dl
33
+ d-na
34
+ D-na
35
+ dvs
36
+ Dvs
37
+ pt
38
+ Pt
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ru ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # added Cyrillic uppercase letters [А-Я]
2
+ # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes)
3
+ # edited by Kate Young ([email protected]) 21 May 2013
4
+ А
5
+ Б
6
+ В
7
+ Г
8
+ Д
9
+ Е
10
+ Ж
11
+ З
12
+ И
13
+ Й
14
+ К
15
+ Л
16
+ М
17
+ Н
18
+ О
19
+ П
20
+ Р
21
+ С
22
+ Т
23
+ У
24
+ Ф
25
+ Х
26
+ Ц
27
+ Ч
28
+ Ш
29
+ Щ
30
+ Ъ
31
+ Ы
32
+ Ь
33
+ Э
34
+ Ю
35
+ Я
36
+ A
37
+ B
38
+ C
39
+ D
40
+ E
41
+ F
42
+ G
43
+ H
44
+ I
45
+ J
46
+ K
47
+ L
48
+ M
49
+ N
50
+ O
51
+ P
52
+ Q
53
+ R
54
+ S
55
+ T
56
+ U
57
+ V
58
+ W
59
+ X
60
+ Y
61
+ Z
62
+ 0гг
63
+ 1гг
64
+ 2гг
65
+ 3гг
66
+ 4гг
67
+ 5гг
68
+ 6гг
69
+ 7гг
70
+ 8гг
71
+ 9гг
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+ Xвв
83
+ Vвв
84
+ Iвв
85
+ Lвв
86
+ Mвв
87
+ Cвв
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+ 0мм
105
+ 1мм
106
+ 2мм
107
+ 3мм
108
+ 4мм
109
+ 5мм
110
+ 6мм
111
+ 7мм
112
+ 8мм
113
+ 9мм
114
+ 0см
115
+ 1см
116
+ 2см
117
+ 3см
118
+ 4см
119
+ 5см
120
+ 6см
121
+ 7см
122
+ 8см
123
+ 9см
124
+ 0дм
125
+ 1дм
126
+ 2дм
127
+ 3дм
128
+ 4дм
129
+ 5дм
130
+ 6дм
131
+ 7дм
132
+ 8дм
133
+ 9дм
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+ 0км
145
+ 1км
146
+ 2км
147
+ 3км
148
+ 4км
149
+ 5км
150
+ 6км
151
+ 7км
152
+ 8км
153
+ 9км
154
+ 0га
155
+ 1га
156
+ 2га
157
+ 3га
158
+ 4га
159
+ 5га
160
+ 6га
161
+ 7га
162
+ 8га
163
+ 9га
164
+ 0кг
165
+ 1кг
166
+ 2кг
167
+ 3кг
168
+ 4кг
169
+ 5кг
170
+ 6кг
171
+ 7кг
172
+ 8кг
173
+ 9кг
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+ 0мг
195
+ 1мг
196
+ 2мг
197
+ 3мг
198
+ 4мг
199
+ 5мг
200
+ 6мг
201
+ 7мг
202
+ 8мг
203
+ 9мг
204
+ бульв
205
+ в
206
+ вв
207
+ г
208
+ га
209
+ гг
210
+ гл
211
+ гос
212
+ д
213
+ дм
214
+ доп
215
+ др
216
+ е
217
+ ед
218
+ ед
219
+ зам
220
+ и
221
+ инд
222
+ исп
223
+ Исп
224
+ к
225
+ кап
226
+ кг
227
+ кв
228
+ кл
229
+ км
230
+ кол
231
+ комн
232
+ коп
233
+ куб
234
+ л
235
+ лиц
236
+ лл
237
+ м
238
+ макс
239
+ мг
240
+ мин
241
+ мл
242
+ млн
243
+ млрд
244
+ мм
245
+ н
246
+ наб
247
+ нач
248
+ неуд
249
+ ном
250
+ о
251
+ обл
252
+ обр
253
+ общ
254
+ ок
255
+ ост
256
+ отл
257
+ п
258
+ пер
259
+ перераб
260
+ пл
261
+ пос
262
+ пр
263
+ просп
264
+ проф
265
+ р
266
+ ред
267
+ руб
268
+ с
269
+ сб
270
+ св
271
+ см
272
+ соч
273
+ ср
274
+ ст
275
+ стр
276
+ т
277
+ тел
278
+ Тел
279
+ тех
280
+ тт
281
+ туп
282
+ тыс
283
+ уд
284
+ ул
285
+ уч
286
+ физ
287
+ х
288
+ хор
289
+ ч
290
+ чел
291
+ шт
292
+ экз
293
+ э
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sk ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Bc
2
+ Mgr
3
+ RNDr
4
+ PharmDr
5
+ PhDr
6
+ JUDr
7
+ PaedDr
8
+ ThDr
9
+ Ing
10
+ MUDr
11
+ MDDr
12
+ MVDr
13
+ Dr
14
+ ThLic
15
+ PhD
16
+ ArtD
17
+ ThDr
18
+ Dr
19
+ DrSc
20
+ CSs
21
+ prof
22
+ obr
23
+ Obr
24
+ Č
25
+ č
26
+ absol
27
+ adj
28
+ admin
29
+ adr
30
+ Adr
31
+ adv
32
+ advok
33
+ afr
34
+ ak
35
+ akad
36
+ akc
37
+ akuz
38
+ et
39
+ al
40
+ alch
41
+ amer
42
+ anat
43
+ angl
44
+ Angl
45
+ anglosas
46
+ anorg
47
+ ap
48
+ apod
49
+ arch
50
+ archeol
51
+ archit
52
+ arg
53
+ art
54
+ astr
55
+ astrol
56
+ astron
57
+ atp
58
+ atď
59
+ austr
60
+ Austr
61
+ aut
62
+ belg
63
+ Belg
64
+ bibl
65
+ Bibl
66
+ biol
67
+ bot
68
+ bud
69
+ bás
70
+ býv
71
+ cest
72
+ chem
73
+ cirk
74
+ csl
75
+ čs
76
+ Čs
77
+ dat
78
+ dep
79
+ det
80
+ dial
81
+ diaľ
82
+ dipl
83
+ distrib
84
+ dokl
85
+ dosl
86
+ dopr
87
+ dram
88
+ duš
89
+ dv
90
+ dvojčl
91
+ dór
92
+ ekol
93
+ ekon
94
+ el
95
+ elektr
96
+ elektrotech
97
+ energet
98
+ epic
99
+ est
100
+ etc
101
+ etonym
102
+ eufem
103
+ európ
104
+ Európ
105
+ ev
106
+ evid
107
+ expr
108
+ fa
109
+ fam
110
+ farm
111
+ fem
112
+ feud
113
+ fil
114
+ filat
115
+ filoz
116
+ fi
117
+ fon
118
+ form
119
+ fot
120
+ fr
121
+ Fr
122
+ franc
123
+ Franc
124
+ fraz
125
+ fut
126
+ fyz
127
+ fyziol
128
+ garb
129
+ gen
130
+ genet
131
+ genpor
132
+ geod
133
+ geogr
134
+ geol
135
+ geom
136
+ germ
137
+ gr
138
+ Gr
139
+ gréc
140
+ Gréc
141
+ gréckokat
142
+ hebr
143
+ herald
144
+ hist
145
+ hlav
146
+ hosp
147
+ hromad
148
+ hud
149
+ hypok
150
+ ident
151
+ i.e
152
+ ident
153
+ imp
154
+ impf
155
+ indoeur
156
+ inf
157
+ inform
158
+ instr
159
+ int
160
+ interj
161
+ inšt
162
+ inštr
163
+ iron
164
+ jap
165
+ Jap
166
+ jaz
167
+ jedn
168
+ juhoamer
169
+ juhových
170
+ juhozáp
171
+ juž
172
+ kanad
173
+ Kanad
174
+ kanc
175
+ kapit
176
+ kpt
177
+ kart
178
+ katastr
179
+ knih
180
+ kniž
181
+ komp
182
+ konj
183
+ konkr
184
+ kozmet
185
+ krajč
186
+ kresť
187
+ kt
188
+ kuch
189
+ lat
190
+ latinskoamer
191
+ lek
192
+ lex
193
+ lingv
194
+ lit
195
+ litur
196
+ log
197
+ lok
198
+ max
199
+ Max
200
+ maď
201
+ Maď
202
+ medzinár
203
+ mest
204
+ metr
205
+ mil
206
+ Mil
207
+ min
208
+ Min
209
+ miner
210
+ ml
211
+ mld
212
+ mn
213
+ mod
214
+ mytol
215
+ napr
216
+ nar
217
+ Nar
218
+ nasl
219
+ nedok
220
+ neg
221
+ negat
222
+ neklas
223
+ nem
224
+ Nem
225
+ neodb
226
+ neos
227
+ neskl
228
+ nesklon
229
+ nespis
230
+ nespráv
231
+ neved
232
+ než
233
+ niekt
234
+ niž
235
+ nom
236
+ náb
237
+ nákl
238
+ námor
239
+ nár
240
+ obch
241
+ obj
242
+ obv
243
+ obyč
244
+ obč
245
+ občian
246
+ odb
247
+ odd
248
+ ods
249
+ ojed
250
+ okr
251
+ Okr
252
+ opt
253
+ opyt
254
+ org
255
+ os
256
+ osob
257
+ ot
258
+ ovoc
259
+ par
260
+ part
261
+ pejor
262
+ pers
263
+ pf
264
+ Pf
265
+ P.f
266
+ p.f
267
+ pl
268
+ Plk
269
+ pod
270
+ podst
271
+ pokl
272
+ polit
273
+ politol
274
+ polygr
275
+ pomn
276
+ popl
277
+ por
278
+ porad
279
+ porov
280
+ posch
281
+ potrav
282
+ použ
283
+ poz
284
+ pozit
285
+ poľ
286
+ poľno
287
+ poľnohosp
288
+ poľov
289
+ pošt
290
+ pož
291
+ prac
292
+ predl
293
+ pren
294
+ prep
295
+ preuk
296
+ priezv
297
+ Priezv
298
+ privl
299
+ prof
300
+ práv
301
+ príd
302
+ príj
303
+ prík
304
+ príp
305
+ prír
306
+ prísl
307
+ príslov
308
+ príč
309
+ psych
310
+ publ
311
+ pís
312
+ písm
313
+ pôv
314
+ refl
315
+ reg
316
+ rep
317
+ resp
318
+ rozk
319
+ rozlič
320
+ rozpráv
321
+ roč
322
+ Roč
323
+ ryb
324
+ rádiotech
325
+ rím
326
+ samohl
327
+ semest
328
+ sev
329
+ severoamer
330
+ severových
331
+ severozáp
332
+ sg
333
+ skr
334
+ skup
335
+ sl
336
+ Sloven
337
+ soc
338
+ soch
339
+ sociol
340
+ sp
341
+ spol
342
+ Spol
343
+ spoloč
344
+ spoluhl
345
+ správ
346
+ spôs
347
+ st
348
+ star
349
+ starogréc
350
+ starorím
351
+ s.r.o
352
+ stol
353
+ stor
354
+ str
355
+ stredoamer
356
+ stredoškol
357
+ subj
358
+ subst
359
+ superl
360
+ sv
361
+ sz
362
+ súkr
363
+ súp
364
+ súvzť
365
+ tal
366
+ Tal
367
+ tech
368
+ tel
369
+ Tel
370
+ telef
371
+ teles
372
+ telev
373
+ teol
374
+ trans
375
+ turist
376
+ tuzem
377
+ typogr
378
+ tzn
379
+ tzv
380
+ ukaz
381
+ ul
382
+ Ul
383
+ umel
384
+ univ
385
+ ust
386
+ ved
387
+ vedľ
388
+ verb
389
+ veter
390
+ vin
391
+ viď
392
+ vl
393
+ vod
394
+ vodohosp
395
+ pnl
396
+ vulg
397
+ vyj
398
+ vys
399
+ vysokoškol
400
+ vzťaž
401
+ vôb
402
+ vých
403
+ výd
404
+ výrob
405
+ výsk
406
+ výsl
407
+ výtv
408
+ výtvar
409
+ význ
410
+ včel
411
+
412
+ všeob
413
+ zahr
414
+ zar
415
+ zariad
416
+ zast
417
+ zastar
418
+ zastaráv
419
+ zb
420
+ zdravot
421
+ združ
422
+ zjemn
423
+ zlat
424
+ zn
425
+ Zn
426
+ zool
427
+ zr
428
+ zried
429
+ zv
430
+ záhr
431
+ zák
432
+ zákl
433
+ zám
434
+ záp
435
+ západoeur
436
+ zázn
437
+ územ
438
+ účt
439
+ čast
440
+ čes
441
+ Čes
442
+ čl
443
+ čísl
444
+ živ
445
+ pr
446
+ fak
447
+ Kr
448
+ p.n.l
449
+ A
450
+ B
451
+ C
452
+ D
453
+ E
454
+ F
455
+ G
456
+ H
457
+ I
458
+ J
459
+ K
460
+ L
461
+ M
462
+ N
463
+ O
464
+ P
465
+ Q
466
+ R
467
+ S
468
+ T
469
+ U
470
+ V
471
+ W
472
+ X
473
+ Y
474
+ Z
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sl ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dr
2
+ Dr
3
+ itd
4
+ itn
5
+ št #NUMERIC_ONLY#
6
+ Št #NUMERIC_ONLY#
7
+ d
8
+ jan
9
+ Jan
10
+ feb
11
+ Feb
12
+ mar
13
+ Mar
14
+ apr
15
+ Apr
16
+ jun
17
+ Jun
18
+ jul
19
+ Jul
20
+ avg
21
+ Avg
22
+ sept
23
+ Sept
24
+ sep
25
+ Sep
26
+ okt
27
+ Okt
28
+ nov
29
+ Nov
30
+ dec
31
+ Dec
32
+ tj
33
+ Tj
34
+ npr
35
+ Npr
36
+ sl
37
+ Sl
38
+ op
39
+ Op
40
+ gl
41
+ Gl
42
+ oz
43
+ Oz
44
+ prev
45
+ dipl
46
+ ing
47
+ prim
48
+ Prim
49
+ cf
50
+ Cf
51
+ gl
52
+ Gl
53
+ A
54
+ B
55
+ C
56
+ D
57
+ E
58
+ F
59
+ G
60
+ H
61
+ I
62
+ J
63
+ K
64
+ L
65
+ M
66
+ N
67
+ O
68
+ P
69
+ Q
70
+ R
71
+ S
72
+ T
73
+ U
74
+ V
75
+ W
76
+ X
77
+ Y
78
+ Z
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sv ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #single upper case letter are usually initials
2
+ A
3
+ B
4
+ C
5
+ D
6
+ E
7
+ F
8
+ G
9
+ H
10
+ I
11
+ J
12
+ K
13
+ L
14
+ M
15
+ N
16
+ O
17
+ P
18
+ Q
19
+ R
20
+ S
21
+ T
22
+ U
23
+ V
24
+ W
25
+ X
26
+ Y
27
+ Z
28
+ #misc abbreviations
29
+ AB
30
+ G
31
+ VG
32
+ dvs
33
+ etc
34
+ from
35
+ iaf
36
+ jfr
37
+ kl
38
+ kr
39
+ mao
40
+ mfl
41
+ mm
42
+ osv
43
+ pga
44
+ tex
45
+ tom
46
+ vs
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ta ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+ கா
21
+ கி
22
+ கீ
23
+ கு
24
+ கூ
25
+ கெ
26
+ கே
27
+ கை
28
+ கொ
29
+ கோ
30
+ கௌ
31
+ க்
32
+
33
+ சா
34
+ சி
35
+ சீ
36
+ சு
37
+ சூ
38
+ செ
39
+ சே
40
+ சை
41
+ சொ
42
+ சோ
43
+ சௌ
44
+ ச்
45
+
46
+ டா
47
+ டி
48
+ டீ
49
+ டு
50
+ டூ
51
+ டெ
52
+ டே
53
+ டை
54
+ டொ
55
+ டோ
56
+ டௌ
57
+ ட்
58
+
59
+ தா
60
+ தி
61
+ தீ
62
+ து
63
+ தூ
64
+ தெ
65
+ தே
66
+ தை
67
+ தொ
68
+ தோ
69
+ தௌ
70
+ த்
71
+
72
+ பா
73
+ பி
74
+ பீ
75
+ பு
76
+ பூ
77
+ பெ
78
+ பே
79
+ பை
80
+ பொ
81
+ போ
82
+ பௌ
83
+ ப்
84
+
85
+ றா
86
+ றி
87
+ றீ
88
+ று
89
+ றூ
90
+ றெ
91
+ றே
92
+ றை
93
+ றொ
94
+ றோ
95
+ றௌ
96
+ ற்
97
+
98
+ யா
99
+ யி
100
+ யீ
101
+ யு
102
+ யூ
103
+ யெ
104
+ யே
105
+ யை
106
+ யொ
107
+ யோ
108
+ யௌ
109
+ ய்
110
+
111
+ ரா
112
+ ரி
113
+ ரீ
114
+ ரு
115
+ ரூ
116
+ ரெ
117
+ ரே
118
+ ரை
119
+ ரொ
120
+ ரோ
121
+ ரௌ
122
+ ர்
123
+
124
+ லா
125
+ லி
126
+ லீ
127
+ லு
128
+ லூ
129
+ லெ
130
+ லே
131
+ லை
132
+ லொ
133
+ லோ
134
+ லௌ
135
+ ல்
136
+
137
+ வா
138
+ வி
139
+ வீ
140
+ வு
141
+ வூ
142
+ வெ
143
+ வே
144
+ வை
145
+ வொ
146
+ வோ
147
+ வௌ
148
+ வ்
149
+
150
+ ளா
151
+ ளி
152
+ ளீ
153
+ ளு
154
+ ளூ
155
+ ளெ
156
+ ளே
157
+ ளை
158
+ ளொ
159
+ ளோ
160
+ ளௌ
161
+ ள்
162
+
163
+ ழா
164
+ ழி
165
+ ழீ
166
+ ழு
167
+ ழூ
168
+ ழெ
169
+ ழே
170
+ ழை
171
+ ழொ
172
+ ழோ
173
+ ழௌ
174
+ ழ்
175
+
176
+ ஙா
177
+ ஙி
178
+ ஙீ
179
+ ஙு
180
+ ஙூ
181
+ ஙெ
182
+ ஙே
183
+ ஙை
184
+ ஙொ
185
+ ஙோ
186
+ ஙௌ
187
+ ங்
188
+
189
+ ஞா
190
+ ஞி
191
+ ஞீ
192
+ ஞு
193
+ ஞூ
194
+ ஞெ
195
+ ஞே
196
+ ஞை
197
+ ஞொ
198
+ ஞோ
199
+ ஞௌ
200
+ ஞ்
201
+
202
+ ணா
203
+ ணி
204
+ ணீ
205
+ ணு
206
+ ணூ
207
+ ணெ
208
+ ணே
209
+ ணை
210
+ ணொ
211
+ ணோ
212
+ ணௌ
213
+ ண்
214
+
215
+ நா
216
+ நி
217
+ நீ
218
+ நு
219
+ நூ
220
+ நெ
221
+ நே
222
+ நை
223
+ நொ
224
+ நோ
225
+ நௌ
226
+ ந்
227
+
228
+ மா
229
+ மி
230
+ மீ
231
+ மு
232
+ மூ
233
+ மெ
234
+ மே
235
+ மை
236
+ மொ
237
+ மோ
238
+ மௌ
239
+ ம்
240
+
241
+ னா
242
+ னி
243
+ னீ
244
+ னு
245
+ னூ
246
+ னெ
247
+ னே
248
+ னை
249
+ னொ
250
+ னோ
251
+ னௌ
252
+ ன்
253
+
254
+
255
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
256
+ திரு
257
+ திருமதி
258
+ வண
259
+ கௌரவ
260
+
261
+
262
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
263
+ உ.ம்
264
+ #கா.ம்
265
+ #எ.ம்
266
+
267
+
268
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
269
+ # add NUMERIC_ONLY after the word for this function
270
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
271
+ #if followed by a number, a non-breaking prefix
272
+ No #NUMERIC_ONLY#
273
+ Nos
274
+ Art #NUMERIC_ONLY#
275
+ Nr
276
+ pp #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.yue ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Cantonese (Chinese)
3
+ #
4
+ # Anything in this file, followed by a period,
5
+ # does NOT indicate an end-of-sentence marker.
6
+ #
7
+ # English/Euro-language given-name initials (appearing in
8
+ # news, periodicals, etc.)
9
+ A
10
+ Ā
11
+ B
12
+ C
13
+ Č
14
+ D
15
+ E
16
+ Ē
17
+ F
18
+ G
19
+ Ģ
20
+ H
21
+ I
22
+ Ī
23
+ J
24
+ K
25
+ Ķ
26
+ L
27
+ Ļ
28
+ M
29
+ N
30
+ Ņ
31
+ O
32
+ P
33
+ Q
34
+ R
35
+ S
36
+ Š
37
+ T
38
+ U
39
+ Ū
40
+ V
41
+ W
42
+ X
43
+ Y
44
+ Z
45
+ Ž
46
+
47
+ # Numbers only. These should only induce breaks when followed by
48
+ # a numeric sequence.
49
+ # Add NUMERIC_ONLY after the word for this function. This case is
50
+ # mostly for the english "No." which can either be a sentence of its
51
+ # own, or if followed by a number, a non-breaking prefix.
52
+ No #NUMERIC_ONLY#
53
+ Nr #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.zh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Mandarin (Chinese)
3
+ #
4
+ # Anything in this file, followed by a period,
5
+ # does NOT indicate an end-of-sentence marker.
6
+ #
7
+ # English/Euro-language given-name initials (appearing in
8
+ # news, periodicals, etc.)
9
+ A
10
+ Ā
11
+ B
12
+ C
13
+ Č
14
+ D
15
+ E
16
+ Ē
17
+ F
18
+ G
19
+ Ģ
20
+ H
21
+ I
22
+ Ī
23
+ J
24
+ K
25
+ Ķ
26
+ L
27
+ Ļ
28
+ M
29
+ N
30
+ Ņ
31
+ O
32
+ P
33
+ Q
34
+ R
35
+ S
36
+ Š
37
+ T
38
+ U
39
+ Ū
40
+ V
41
+ W
42
+ X
43
+ Y
44
+ Z
45
+ Ž
46
+
47
+ # Numbers only. These should only induce breaks when followed by
48
+ # a numeric sequence.
49
+ # Add NUMERIC_ONLY after the word for this function. This case is
50
+ # mostly for the english "No." which can either be a sentence of its
51
+ # own, or if followed by a number, a non-breaking prefix.
52
+ No #NUMERIC_ONLY#
53
+ Nr #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/tokenizer/basic-protected-patterns ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ <\/?\S+\/?>
2
+ <\S+( [a-zA-Z0-9]+\=\"?[^\"]\")+ ?\/?>
3
+ <\S+( [a-zA-Z0-9]+\=\'?[^\']\')+ ?\/?>
4
+ [\w\-\_\.]+\@([\w\-\_]+\.)+[a-zA-Z]{2,}
5
+ (http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+
laser/tools-external/moses-tokenizer/tokenizer/deescape-special-chars.perl ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+
9
+ while(<STDIN>) {
10
+ s/\&bar;/\|/g; # factor separator (legacy)
11
+ s/\&#124;/\|/g; # factor separator
12
+ s/\&lt;/\</g; # xml
13
+ s/\&gt;/\>/g; # xml
14
+ s/\&bra;/\[/g; # syntax non-terminal (legacy)
15
+ s/\&ket;/\]/g; # syntax non-terminal (legacy)
16
+ s/\&quot;/\"/g; # xml
17
+ s/\&apos;/\'/g; # xml
18
+ s/\&#91;/\[/g; # syntax non-terminal
19
+ s/\&#93;/\]/g; # syntax non-terminal
20
+ s/\&amp;/\&/g; # escape escape
21
+ print $_;
22
+ }
laser/tools-external/moses-tokenizer/tokenizer/detokenizer.perl ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ # $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
4
+ # Sample De-Tokenizer
5
+ # written by Josh Schroeder, based on code by Philipp Koehn
6
+ # further modifications by Ondrej Bojar
7
+ #
8
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
9
+ # Public License version 2.1 or, at your option, any later version.
10
+
11
+ binmode(STDIN, ":utf8");
12
+ binmode(STDOUT, ":utf8");
13
+
14
+ use warnings;
15
+ use strict;
16
+ use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
17
+
18
+ my $language = "en";
19
+ my $QUIET = 0;
20
+ my $HELP = 0;
21
+ my $UPPERCASE_SENT = 0;
22
+ my $PENN = 0;
23
+
24
+ while (@ARGV) {
25
+ $_ = shift;
26
+ /^-b$/ && ($| = 1, next);
27
+ /^-l$/ && ($language = shift, next);
28
+ /^-q$/ && ($QUIET = 1, next);
29
+ /^-h$/ && ($HELP = 1, next);
30
+ /^-u$/ && ($UPPERCASE_SENT = 1, next);
31
+ /^-penn$/ && ($PENN = 1, next);
32
+ }
33
+
34
+ if ($HELP) {
35
+ print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
36
+ print "Options:\n";
37
+ print " -u ... uppercase the first char in the final sentence.\n";
38
+ print " -q ... don't report detokenizer revision.\n";
39
+ print " -b ... disable Perl buffering.\n";
40
+ print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
41
+ exit;
42
+ }
43
+
44
+ if ($language !~ /^(cs|en|fr|it|fi)$/) {
45
+ print STDERR "Warning: No built-in rules for language $language.\n"
46
+ }
47
+
48
+ if ($PENN && $language ne "en") {
49
+ print STDERR "Error: -penn option only supported for English text.\n";
50
+ exit;
51
+ }
52
+
53
+ if (!$QUIET) {
54
+ print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
55
+ print STDERR "Language: $language\n";
56
+ }
57
+
58
+ while(<STDIN>) {
59
+ if (/^<.+>$/ || /^\s*$/) {
60
+ #don't try to detokenize XML/HTML tag lines
61
+ print $_;
62
+ } elsif ($PENN) {
63
+ print &detokenize_penn($_);
64
+ } else {
65
+ print &detokenize($_);
66
+ }
67
+ }
68
+
69
+
70
+ sub ucsecondarg {
71
+ # uppercase the second argument
72
+ my $arg1 = shift;
73
+ my $arg2 = shift;
74
+ return $arg1.uc($arg2);
75
+ }
76
+
77
+ sub deescape {
78
+ # de-escape special chars
79
+ my ($text) = @_;
80
+ $text =~ s/\&bar;/\|/g; # factor separator (legacy)
81
+ $text =~ s/\&#124;/\|/g; # factor separator
82
+ $text =~ s/\&lt;/\</g; # xml
83
+ $text =~ s/\&gt;/\>/g; # xml
84
+ $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy)
85
+ $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy)
86
+ $text =~ s/\&quot;/\"/g; # xml
87
+ $text =~ s/\&apos;/\'/g; # xml
88
+ $text =~ s/\&#91;/\[/g; # syntax non-terminal
89
+ $text =~ s/\&#93;/\]/g; # syntax non-terminal
90
+ $text =~ s/\&amp;/\&/g; # escape escape
91
+ return $text;
92
+ }
93
+
94
+ sub detokenize {
95
+ my($text) = @_;
96
+ chomp($text);
97
+ $text = " $text ";
98
+ $text =~ s/ \@\-\@ /-/g;
99
+ $text = &deescape($text);
100
+
101
+ my $word;
102
+ my $i;
103
+ my @words = split(/ /,$text);
104
+ $text = "";
105
+ my %quoteCount = ("\'"=>0,"\""=>0);
106
+ my $prependSpace = " ";
107
+ for ($i=0;$i<(scalar(@words));$i++) {
108
+ if (&startsWithCJKChar($words[$i])) {
109
+ if ($i > 0 && &endsWithCJKChar($words[$i-1])) {
110
+ # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word
111
+ $text=$text.$words[$i];
112
+ } else {
113
+ # ... but do nothing special if this is a CJK word that doesn't follow a CJK word
114
+ $text=$text.$prependSpace.$words[$i];
115
+ }
116
+ $prependSpace = " ";
117
+ } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
118
+ #perform right shift on currency and other random punctuation items
119
+ $text = $text.$prependSpace.$words[$i];
120
+ $prependSpace = "";
121
+ } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
122
+ if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) {
123
+ #these punctuations are prefixed with a non-breakable space in french
124
+ $text .= " "; }
125
+ #perform left shift on punctuation items
126
+ $text=$text.$words[$i];
127
+ $prependSpace = " ";
128
+ } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
129
+ #left-shift the contraction for English
130
+ $text=$text.$words[$i];
131
+ $prependSpace = " ";
132
+ } elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) {
133
+ #left-shift floats in Czech
134
+ $text=$text.$words[$i];
135
+ $prependSpace = " ";
136
+ } elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
137
+ #right-shift the contraction for French and Italian
138
+ $text = $text.$prependSpace.$words[$i];
139
+ $prependSpace = "";
140
+ } elsif (($language eq "cs") && ($i<(scalar(@words)-3))
141
+ && ($words[$i] =~ /[\p{IsAlpha}]$/)
142
+ && ($words[$i+1] =~ /^[-–]$/)
143
+ && ($words[$i+2] =~ /^li$|^mail.*/i)
144
+ ) {
145
+ #right-shift "-li" in Czech and a few Czech dashed words (e-mail)
146
+ $text = $text.$prependSpace.$words[$i].$words[$i+1];
147
+ $i++; # advance over the dash
148
+ $prependSpace = "";
149
+ } elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
150
+ #combine punctuation smartly
151
+ my $normalized_quo = $words[$i];
152
+ $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
153
+ $quoteCount{$normalized_quo} = 0
154
+ if !defined $quoteCount{$normalized_quo};
155
+ if ($language eq "cs" && $words[$i] eq "„") {
156
+ # this is always the starting quote in Czech
157
+ $quoteCount{$normalized_quo} = 0;
158
+ }
159
+ if ($language eq "cs" && $words[$i] eq "“") {
160
+ # this is usually the ending quote in Czech
161
+ $quoteCount{$normalized_quo} = 1;
162
+ }
163
+ if (($quoteCount{$normalized_quo} % 2) eq 0) {
164
+ if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
165
+ #single quote for posesssives ending in s... "The Jones' house"
166
+ #left shift
167
+ $text=$text.$words[$i];
168
+ $prependSpace = " ";
169
+ } else {
170
+ #right shift
171
+ $text = $text.$prependSpace.$words[$i];
172
+ $prependSpace = "";
173
+ $quoteCount{$normalized_quo} ++;
174
+
175
+ }
176
+ } else {
177
+ #left shift
178
+ $text=$text.$words[$i];
179
+ $prependSpace = " ";
180
+ $quoteCount{$normalized_quo} ++;
181
+
182
+ }
183
+
184
+ } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) {
185
+ # Finnish : without intervening space if followed by case suffix
186
+ # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
187
+ $text=$text. lc $words[$i];
188
+ $prependSpace = " ";
189
+ } else {
190
+ $text=$text.$prependSpace.$words[$i];
191
+ $prependSpace = " ";
192
+ }
193
+ }
194
+
195
+ # clean up spaces at head and tail of each line as well as any double-spacing
196
+ $text =~ s/ +/ /g;
197
+ $text =~ s/\n /\n/g;
198
+ $text =~ s/ \n/\n/g;
199
+ $text =~ s/^ //g;
200
+ $text =~ s/ $//g;
201
+
202
+ #add trailing break
203
+ $text .= "\n" unless $text =~ /\n$/;
204
+
205
+ $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
206
+
207
+ return $text;
208
+ }
209
+
210
+ sub detokenize_penn {
211
+ my($text) = @_;
212
+
213
+ chomp($text);
214
+ $text = " $text ";
215
+ $text =~ s/ \@\-\@ /-/g;
216
+ $text =~ s/ \@\/\@ /\//g;
217
+ $text = &deescape($text);
218
+
219
+ # merge de-contracted forms except where the second word begins with an
220
+ # apostrophe (those are handled later)
221
+ $text =~ s/ n't /n't /g;
222
+ $text =~ s/ N'T /N'T /g;
223
+ $text =~ s/ ([Cc])an not / $1annot /g;
224
+ $text =~ s/ ([Dd])' ye / $1'ye /g;
225
+ $text =~ s/ ([Gg])im me / $1imme /g;
226
+ $text =~ s/ ([Gg])on na / $1onna /g;
227
+ $text =~ s/ ([Gg])ot ta / $1otta /g;
228
+ $text =~ s/ ([Ll])em me / $1emme /g;
229
+ $text =~ s/ '([Tt]) is / '$1is /g;
230
+ $text =~ s/ '([Tt]) was / '$1was /g;
231
+ $text =~ s/ ([Ww])an na / $1anna /g;
232
+
233
+ # restore brackets
234
+ $text =~ s/-LRB-/\(/g;
235
+ $text =~ s/-RRB-/\)/g;
236
+ $text =~ s/-LSB-/\[/g;
237
+ $text =~ s/-RSB-/\]/g;
238
+ $text =~ s/-LCB-/{/g;
239
+ $text =~ s/-RCB-/}/g;
240
+
241
+ my $i;
242
+ my @words = split(/ /,$text);
243
+ $text = "";
244
+ my $prependSpace = " ";
245
+ for ($i=0;$i<(scalar(@words));$i++) {
246
+ if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
247
+ # perform right shift on currency and other random punctuation items
248
+ $text = $text.$prependSpace.$words[$i];
249
+ $prependSpace = "";
250
+ } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
251
+ # perform left shift on punctuation items
252
+ $text=$text.$words[$i];
253
+ $prependSpace = " ";
254
+ } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
255
+ # left-shift the contraction
256
+ $text=$text.$words[$i];
257
+ $prependSpace = " ";
258
+ } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only
259
+ # opening single quote: convert to straight quote and right-shift
260
+ $text = $text.$prependSpace."\'";
261
+ $prependSpace = "";
262
+ } elsif ($words[$i] eq "``") {
263
+ # opening double quote: convert to straight quote and right-shift
264
+ $text = $text.$prependSpace."\"";
265
+ $prependSpace = "";
266
+ } elsif ($words[$i] eq "\'") {
267
+ # closing single quote: convert to straight quote and left shift
268
+ $text = $text."\'";
269
+ $prependSpace = " ";
270
+ } elsif ($words[$i] eq "\'\'") {
271
+ # closing double quote: convert to straight quote and left shift
272
+ $text = $text."\"";
273
+ $prependSpace = " ";
274
+ } else {
275
+ $text = $text.$prependSpace.$words[$i];
276
+ $prependSpace = " ";
277
+ }
278
+ }
279
+
280
+ # clean up spaces at head and tail of each line as well as any double-spacing
281
+ $text =~ s/ +/ /g;
282
+ $text =~ s/\n /\n/g;
283
+ $text =~ s/ \n/\n/g;
284
+ $text =~ s/^ //g;
285
+ $text =~ s/ $//g;
286
+
287
+ # add trailing break
288
+ $text .= "\n" unless $text =~ /\n$/;
289
+
290
+ $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
291
+
292
+ return $text;
293
+ }
294
+
295
+ sub startsWithCJKChar {
296
+ my ($str) = @_;
297
+ return 0 if length($str) == 0;
298
+ my $firstChar = substr($str, 0, 1);
299
+ return &charIsCJK($firstChar);
300
+ }
301
+
302
+ sub endsWithCJKChar {
303
+ my ($str) = @_;
304
+ return 0 if length($str) == 0;
305
+ my $lastChar = substr($str, length($str)-1, 1);
306
+ return &charIsCJK($lastChar);
307
+ }
308
+
309
+ # Given a string consisting of one character, returns true iff the character
310
+ # is a CJK (Chinese/Japanese/Korean) character
311
+ sub charIsCJK {
312
+ my ($char) = @_;
313
+ # $char should be a string of length 1
314
+ my $codepoint = &codepoint_dec($char);
315
+
316
+ # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
317
+
318
+ # Hangul Jamo (1100–11FF)
319
+ return 1 if (&between_hexes($codepoint, '1100', '11FF'));
320
+
321
+ # CJK Radicals Supplement (2E80–2EFF)
322
+ # Kangxi Radicals (2F00–2FDF)
323
+ # Ideographic Description Characters (2FF0–2FFF)
324
+ # CJK Symbols and Punctuation (3000–303F)
325
+ # Hiragana (3040–309F)
326
+ # Katakana (30A0–30FF)
327
+ # Bopomofo (3100–312F)
328
+ # Hangul Compatibility Jamo (3130–318F)
329
+ # Kanbun (3190–319F)
330
+ # Bopomofo Extended (31A0–31BF)
331
+ # CJK Strokes (31C0–31EF)
332
+ # Katakana Phonetic Extensions (31F0–31FF)
333
+ # Enclosed CJK Letters and Months (3200–32FF)
334
+ # CJK Compatibility (3300–33FF)
335
+ # CJK Unified Ideographs Extension A (3400–4DBF)
336
+ # Yijing Hexagram Symbols (4DC0–4DFF)
337
+ # CJK Unified Ideographs (4E00–9FFF)
338
+ # Yi Syllables (A000–A48F)
339
+ # Yi Radicals (A490–A4CF)
340
+ return 1 if (&between_hexes($codepoint, '2E80', 'A4CF'));
341
+
342
+ # Phags-pa (A840–A87F)
343
+ return 1 if (&between_hexes($codepoint, 'A840', 'A87F'));
344
+
345
+ # Hangul Syllables (AC00–D7AF)
346
+ return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF'));
347
+
348
+ # CJK Compatibility Ideographs (F900–FAFF)
349
+ return 1 if (&between_hexes($codepoint, 'F900', 'FAFF'));
350
+
351
+ # CJK Compatibility Forms (FE30–FE4F)
352
+ return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F'));
353
+
354
+ # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
355
+ return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC'));
356
+
357
+ # Supplementary Ideographic Plane 20000–2FFFF
358
+ return 1 if (&between_hexes($codepoint, '20000', '2FFFF'));
359
+
360
+ return 0;
361
+ }
362
+
363
+ # Returns the code point of a Unicode char, represented as a decimal number
364
+ sub codepoint_dec {
365
+ if (my $char = shift) {
366
+ return unpack('U0U*', $char);
367
+ }
368
+ }
369
+
370
+ sub between_hexes {
371
+ my ($num, $left, $right) = @_;
372
+ return $num >= hex($left) && $num <= hex($right);
373
+ }
laser/tools-external/moses-tokenizer/tokenizer/lowercase.perl ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+
9
+ binmode(STDIN, ":utf8");
10
+ binmode(STDOUT, ":utf8");
11
+
12
+ while(<STDIN>) {
13
+ print lc($_);
14
+ }
laser/tools-external/moses-tokenizer/tokenizer/normalize-punctuation.perl ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+
9
+ my $language = "en";
10
+ my $PENN = 0;
11
+
12
+ while (@ARGV) {
13
+ $_ = shift;
14
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
15
+ /^-l$/ && ($language = shift, next);
16
+ /^[^\-]/ && ($language = $_, next);
17
+ /^-penn$/ && ($PENN = 1, next);
18
+ }
19
+
20
+ while(<STDIN>) {
21
+ s/\r//g;
22
+ # remove extra spaces
23
+ s/\(/ \(/g;
24
+ s/\)/\) /g; s/ +/ /g;
25
+ s/\) ([\.\!\:\?\;\,])/\)$1/g;
26
+ s/\( /\(/g;
27
+ s/ \)/\)/g;
28
+ s/(\d) \%/$1\%/g;
29
+ s/ :/:/g;
30
+ s/ ;/;/g;
31
+ # normalize unicode punctuation
32
+ if ($PENN == 0) {
33
+ s/\`/\'/g;
34
+ s/\'\'/ \" /g;
35
+ }
36
+
37
+ s/„/\"/g;
38
+ s/“/\"/g;
39
+ s/”/\"/g;
40
+ s/–/-/g;
41
+ s/—/ - /g; s/ +/ /g;
42
+ s/´/\'/g;
43
+ s/([a-z])‘([a-z])/$1\'$2/gi;
44
+ s/([a-z])’([a-z])/$1\'$2/gi;
45
+ s/‘/\"/g;
46
+ s/‚/\"/g;
47
+ s/’/\"/g;
48
+ s/''/\"/g;
49
+ s/´´/\"/g;
50
+ s/…/.../g;
51
+ # French quotes
52
+ s/ « / \"/g;
53
+ s/« /\"/g;
54
+ s/«/\"/g;
55
+ s/ » /\" /g;
56
+ s/ »/\"/g;
57
+ s/»/\"/g;
58
+ # handle pseudo-spaces
59
+ s/ \%/\%/g;
60
+ s/nº /nº /g;
61
+ s/ :/:/g;
62
+ s/ ºC/ ºC/g;
63
+ s/ cm/ cm/g;
64
+ s/ \?/\?/g;
65
+ s/ \!/\!/g;
66
+ s/ ;/;/g;
67
+ s/, /, /g; s/ +/ /g;
68
+
69
+ # English "quotation," followed by comma, style
70
+ if ($language eq "en") {
71
+ s/\"([,\.]+)/$1\"/g;
72
+ }
73
+ # Czech is confused
74
+ elsif ($language eq "cs" || $language eq "cz") {
75
+ }
76
+ # German/Spanish/French "quotation", followed by comma, style
77
+ else {
78
+ s/,\"/\",/g;
79
+ s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
80
+ }
81
+
82
+
83
+ if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") {
84
+ s/(\d) (\d)/$1,$2/g;
85
+ }
86
+ else {
87
+ s/(\d) (\d)/$1.$2/g;
88
+ }
89
+ print $_;
90
+ }
laser/tools-external/moses-tokenizer/tokenizer/remove-non-printing-char.perl ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use utf8;
8
+
9
+ binmode(STDIN, ":utf8");
10
+ binmode(STDOUT, ":utf8");
11
+ binmode(STDERR, ":utf8");
12
+
13
+ while (my $line = <STDIN>) {
14
+ chomp($line);
15
+ #$line =~ tr/\040-\176/ /c;
16
+ #$line =~ s/[^[:print:]]/ /g;
17
+ #$line =~ s/\s+/ /g;
18
+ $line =~ s/\p{C}/ /g;
19
+
20
+ print "$line\n";
21
+ }
22
+
laser/tools-external/moses-tokenizer/tokenizer/tokenizer.perl ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+
8
+ # Sample Tokenizer
9
+ ### Version 1.1
10
+ # written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
11
+ # Version 1.1 updates:
12
+ # (1) add multithreading option "-threads NUM_THREADS" (default is 1);
13
+ # (2) add a timing option "-time" to calculate the average speed of this tokenizer;
14
+ # (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
15
+ ### Version 1.0
16
+ # $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
17
+ # written by Josh Schroeder, based on code by Philipp Koehn
18
+
19
+ binmode(STDIN, ":utf8");
20
+ binmode(STDOUT, ":utf8");
21
+
22
+ use warnings;
23
+ use FindBin qw($RealBin);
24
+ use strict;
25
+ use Time::HiRes;
26
+
27
+ if (eval {require Thread;1;}) {
28
+ #module loaded
29
+ Thread->import();
30
+ }
31
+
32
+ my $mydir = "$RealBin/../share/nonbreaking_prefixes";
33
+
34
+ my %NONBREAKING_PREFIX = ();
35
+ my @protected_patterns = ();
36
+ my $protected_patterns_file = "";
37
+ my $language = "en";
38
+ my $QUIET = 0;
39
+ my $HELP = 0;
40
+ my $AGGRESSIVE = 0;
41
+ my $SKIP_XML = 0;
42
+ my $TIMING = 0;
43
+ my $NUM_THREADS = 1;
44
+ my $NUM_SENTENCES_PER_THREAD = 2000;
45
+ my $PENN = 0;
46
+ my $NO_ESCAPING = 0;
47
+ while (@ARGV)
48
+ {
49
+ $_ = shift;
50
+ /^-b$/ && ($| = 1, next);
51
+ /^-l$/ && ($language = shift, next);
52
+ /^-q$/ && ($QUIET = 1, next);
53
+ /^-h$/ && ($HELP = 1, next);
54
+ /^-x$/ && ($SKIP_XML = 1, next);
55
+ /^-a$/ && ($AGGRESSIVE = 1, next);
56
+ /^-time$/ && ($TIMING = 1, next);
57
+ # Option to add list of regexps to be protected
58
+ /^-protected/ && ($protected_patterns_file = shift, next);
59
+ /^-threads$/ && ($NUM_THREADS = int(shift), next);
60
+ /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
61
+ /^-penn$/ && ($PENN = 1, next);
62
+ /^-no-escape/ && ($NO_ESCAPING = 1, next);
63
+ }
64
+
65
+ # for time calculation
66
+ my $start_time;
67
+ if ($TIMING)
68
+ {
69
+ $start_time = [ Time::HiRes::gettimeofday( ) ];
70
+ }
71
+
72
+ # print help message
73
+ if ($HELP)
74
+ {
75
+ print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
76
+ print "Options:\n";
77
+ print " -q ... quiet.\n";
78
+ print " -a ... aggressive hyphen splitting.\n";
79
+ print " -b ... disable Perl buffering.\n";
80
+ print " -time ... enable processing time calculation.\n";
81
+ print " -penn ... use Penn treebank-like tokenization.\n";
82
+ print " -protected FILE ... specify file with patters to be protected in tokenisation.\n";
83
+ print " -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
84
+ exit;
85
+ }
86
+
87
+ if (!$QUIET)
88
+ {
89
+ print STDERR "Tokenizer Version 1.1\n";
90
+ print STDERR "Language: $language\n";
91
+ print STDERR "Number of threads: $NUM_THREADS\n";
92
+ }
93
+
94
+ # load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
95
+ load_prefixes($language,\%NONBREAKING_PREFIX);
96
+
97
+ if (scalar(%NONBREAKING_PREFIX) eq 0)
98
+ {
99
+ print STDERR "Warning: No known abbreviations for language '$language'\n";
100
+ }
101
+
102
+ # Load protected patterns
103
+ if ($protected_patterns_file)
104
+ {
105
+ open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file";
106
+ while(<PP>) {
107
+ chomp;
108
+ push @protected_patterns, $_;
109
+ }
110
+ }
111
+
112
+ my @batch_sentences = ();
113
+ my @thread_list = ();
114
+ my $count_sentences = 0;
115
+
116
+ if ($NUM_THREADS > 1)
117
+ {# multi-threading tokenization
118
+ while(<STDIN>)
119
+ {
120
+ $count_sentences = $count_sentences + 1;
121
+ push(@batch_sentences, $_);
122
+ if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
123
+ {
124
+ # assign each thread work
125
+ for (my $i=0; $i<$NUM_THREADS; $i++)
126
+ {
127
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
128
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
129
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
130
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
131
+ push(@thread_list, $new_thread);
132
+ }
133
+ foreach (@thread_list)
134
+ {
135
+ my $tokenized_list = $_->join;
136
+ foreach (@$tokenized_list)
137
+ {
138
+ print $_;
139
+ }
140
+ }
141
+ # reset for the new run
142
+ @thread_list = ();
143
+ @batch_sentences = ();
144
+ }
145
+ }
146
+ # the last batch
147
+ if (scalar(@batch_sentences)>0)
148
+ {
149
+ # assign each thread work
150
+ for (my $i=0; $i<$NUM_THREADS; $i++)
151
+ {
152
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
153
+ if ($start_index >= scalar(@batch_sentences))
154
+ {
155
+ last;
156
+ }
157
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
158
+ if ($end_index >= scalar(@batch_sentences))
159
+ {
160
+ $end_index = scalar(@batch_sentences)-1;
161
+ }
162
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
163
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
164
+ push(@thread_list, $new_thread);
165
+ }
166
+ foreach (@thread_list)
167
+ {
168
+ my $tokenized_list = $_->join;
169
+ foreach (@$tokenized_list)
170
+ {
171
+ print $_;
172
+ }
173
+ }
174
+ }
175
+ }
176
+ else
177
+ {# single thread only
178
+ while(<STDIN>)
179
+ {
180
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
181
+ {
182
+ #don't try to tokenize XML/HTML tag lines
183
+ print $_;
184
+ }
185
+ else
186
+ {
187
+ print &tokenize($_);
188
+ }
189
+ }
190
+ }
191
+
192
+ if ($TIMING)
193
+ {
194
+ my $duration = Time::HiRes::tv_interval( $start_time );
195
+ print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
196
+ print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
197
+ }
198
+
199
+ #####################################################################################
200
+ # subroutines afterward
201
+
202
+ # tokenize a batch of texts saved in an array
203
+ # input: an array containing a batch of texts
204
+ # return: another array containing a batch of tokenized texts for the input array
205
+ sub tokenize_batch
206
+ {
207
+ my(@text_list) = @_;
208
+ my(@tokenized_list) = ();
209
+ foreach (@text_list)
210
+ {
211
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
212
+ {
213
+ #don't try to tokenize XML/HTML tag lines
214
+ push(@tokenized_list, $_);
215
+ }
216
+ else
217
+ {
218
+ push(@tokenized_list, &tokenize($_));
219
+ }
220
+ }
221
+ return \@tokenized_list;
222
+ }
223
+
224
+ # the actual tokenize function which tokenizes one input string
225
+ # input: one string
226
+ # return: the tokenized string for the input string
227
+ sub tokenize
228
+ {
229
+ my($text) = @_;
230
+
231
+ if ($PENN) {
232
+ return tokenize_penn($text);
233
+ }
234
+
235
+ chomp($text);
236
+ $text = " $text ";
237
+
238
+ # remove ASCII junk
239
+ $text =~ s/\s+/ /g;
240
+ $text =~ s/[\000-\037]//g;
241
+
242
+ # Find protected patterns
243
+ my @protected = ();
244
+ foreach my $protected_pattern (@protected_patterns) {
245
+ my $t = $text;
246
+ while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
247
+ push @protected, $+{PATTERN};
248
+ $t = $+{TAIL};
249
+ }
250
+ }
251
+
252
+ for (my $i = 0; $i < scalar(@protected); ++$i) {
253
+ my $subst = sprintf("THISISPROTECTED%.3d", $i);
254
+ $text =~ s,\Q$protected[$i], $subst ,g;
255
+ }
256
+ $text =~ s/ +/ /g;
257
+ $text =~ s/^ //g;
258
+ $text =~ s/ $//g;
259
+
260
+ # seperate out all "other" special characters
261
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
262
+
263
+ # aggressive hyphen splitting
264
+ if ($AGGRESSIVE)
265
+ {
266
+ $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
267
+ }
268
+
269
+ #multi-dots stay together
270
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
271
+ while($text =~ /DOTMULTI\./)
272
+ {
273
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
274
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
275
+ }
276
+
277
+ # seperate out "," except if within numbers (5,300)
278
+ #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
279
+
280
+ # separate out "," except if within numbers (5,300)
281
+ # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E
282
+ # first application uses up B so rule can't see B,C
283
+ # two-step version here may create extra spaces but these are removed later
284
+ # will also space digit,letter or letter,digit forms (redundant with next section)
285
+ $text =~ s/([^\p{IsN}])[,]/$1 , /g;
286
+ $text =~ s/[,]([^\p{IsN}])/ , $1/g;
287
+
288
+ # separate "," after a number if it's the end of a sentence
289
+ $text =~ s/([\p{IsN}])[,]$/$1 ,/g;
290
+
291
+ # separate , pre and post number
292
+ #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
293
+ #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
294
+
295
+ # turn `into '
296
+ #$text =~ s/\`/\'/g;
297
+
298
+ #turn '' into "
299
+ #$text =~ s/\'\'/ \" /g;
300
+
301
+ if ($language eq "en")
302
+ {
303
+ #split contractions right
304
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
305
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
306
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
307
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
308
+ #special case for "1990's"
309
+ $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
310
+ }
311
+ elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga"))
312
+ {
313
+ #split contractions left
314
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
315
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
316
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
317
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
318
+ }
319
+ else
320
+ {
321
+ $text =~ s/\'/ \' /g;
322
+ }
323
+
324
+ #word token method
325
+ my @words = split(/\s/,$text);
326
+ $text = "";
327
+ for (my $i=0;$i<(scalar(@words));$i++)
328
+ {
329
+ my $word = $words[$i];
330
+ if ( $word =~ /^(\S+)\.$/)
331
+ {
332
+ my $pre = $1;
333
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
334
+ {
335
+ #no change
336
+ }
337
+ elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
338
+ {
339
+ #no change
340
+ }
341
+ else
342
+ {
343
+ $word = $pre." .";
344
+ }
345
+ }
346
+ $text .= $word." ";
347
+ }
348
+
349
+ # clean up extraneous spaces
350
+ $text =~ s/ +/ /g;
351
+ $text =~ s/^ //g;
352
+ $text =~ s/ $//g;
353
+
354
+ # .' at end of sentence is missed
355
+ $text =~ s/\.\' ?$/ . ' /;
356
+
357
+ # restore protected
358
+ for (my $i = 0; $i < scalar(@protected); ++$i) {
359
+ my $subst = sprintf("THISISPROTECTED%.3d", $i);
360
+ $text =~ s/$subst/$protected[$i]/g;
361
+ }
362
+
363
+ #restore multi-dots
364
+ while($text =~ /DOTDOTMULTI/)
365
+ {
366
+ $text =~ s/DOTDOTMULTI/DOTMULTI./g;
367
+ }
368
+ $text =~ s/DOTMULTI/./g;
369
+
370
+ #escape special chars
371
+ if (!$NO_ESCAPING)
372
+ {
373
+ $text =~ s/\&/\&amp;/g; # escape escape
374
+ $text =~ s/\|/\&#124;/g; # factor separator
375
+ $text =~ s/\</\&lt;/g; # xml
376
+ $text =~ s/\>/\&gt;/g; # xml
377
+ $text =~ s/\'/\&apos;/g; # xml
378
+ $text =~ s/\"/\&quot;/g; # xml
379
+ $text =~ s/\[/\&#91;/g; # syntax non-terminal
380
+ $text =~ s/\]/\&#93;/g; # syntax non-terminal
381
+ }
382
+
383
+ #ensure final line break
384
+ $text .= "\n" unless $text =~ /\n$/;
385
+
386
+ return $text;
387
+ }
388
+
389
+ sub tokenize_penn
390
+ {
391
+ # Improved compatibility with Penn Treebank tokenization. Useful if
392
+ # the text is to later be parsed with a PTB-trained parser.
393
+ #
394
+ # Adapted from Robert MacIntyre's sed script:
395
+ # http://www.cis.upenn.edu/~treebank/tokenizer.sed
396
+
397
+ my($text) = @_;
398
+ chomp($text);
399
+
400
+ # remove ASCII junk
401
+ $text =~ s/\s+/ /g;
402
+ $text =~ s/[\000-\037]//g;
403
+
404
+ # attempt to get correct directional quotes
405
+ $text =~ s/^``/`` /g;
406
+ $text =~ s/^"/`` /g;
407
+ $text =~ s/^`([^`])/` $1/g;
408
+ $text =~ s/^'/` /g;
409
+ $text =~ s/([ ([{<])"/$1 `` /g;
410
+ $text =~ s/([ ([{<])``/$1 `` /g;
411
+ $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
412
+ $text =~ s/([ ([{<])'/$1 ` /g;
413
+ # close quotes handled at end
414
+
415
+ $text =~ s=\.\.\.= _ELLIPSIS_ =g;
416
+
417
+ # separate out "," except if within numbers (5,300)
418
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
419
+ # separate , pre and post number
420
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
421
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
422
+
423
+ #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
424
+ $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
425
+
426
+ # Separate out intra-token slashes. PTB tokenization doesn't do this, so
427
+ # the tokens should be merged prior to parsing with a PTB-trained parser
428
+ # (see syntax-hyphen-splitting.perl).
429
+ $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
430
+
431
+ # Assume sentence tokenization has been done first, so split FINAL periods
432
+ # only.
433
+ $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
434
+ # however, we may as well split ALL question marks and exclamation points,
435
+ # since they shouldn't have the abbrev.-marker ambiguity problem
436
+ $text =~ s=([?!])= $1 =g;
437
+
438
+ # parentheses, brackets, etc.
439
+ $text =~ s=([\]\[\(\){}<>])= $1 =g;
440
+ $text =~ s/\(/-LRB-/g;
441
+ $text =~ s/\)/-RRB-/g;
442
+ $text =~ s/\[/-LSB-/g;
443
+ $text =~ s/\]/-RSB-/g;
444
+ $text =~ s/{/-LCB-/g;
445
+ $text =~ s/}/-RCB-/g;
446
+
447
+ $text =~ s=--= -- =g;
448
+
449
+ # First off, add a space to the beginning and end of each line, to reduce
450
+ # necessary number of regexps.
451
+ $text =~ s=$= =;
452
+ $text =~ s=^= =;
453
+
454
+ $text =~ s="= '' =g;
455
+ # possessive or close-single-quote
456
+ $text =~ s=([^'])' =$1 ' =g;
457
+ # as in it's, I'm, we'd
458
+ $text =~ s='([sSmMdD]) = '$1 =g;
459
+ $text =~ s='ll = 'll =g;
460
+ $text =~ s='re = 're =g;
461
+ $text =~ s='ve = 've =g;
462
+ $text =~ s=n't = n't =g;
463
+ $text =~ s='LL = 'LL =g;
464
+ $text =~ s='RE = 'RE =g;
465
+ $text =~ s='VE = 'VE =g;
466
+ $text =~ s=N'T = N'T =g;
467
+
468
+ $text =~ s= ([Cc])annot = $1an not =g;
469
+ $text =~ s= ([Dd])'ye = $1' ye =g;
470
+ $text =~ s= ([Gg])imme = $1im me =g;
471
+ $text =~ s= ([Gg])onna = $1on na =g;
472
+ $text =~ s= ([Gg])otta = $1ot ta =g;
473
+ $text =~ s= ([Ll])emme = $1em me =g;
474
+ $text =~ s= ([Mm])ore'n = $1ore 'n =g;
475
+ $text =~ s= '([Tt])is = '$1 is =g;
476
+ $text =~ s= '([Tt])was = '$1 was =g;
477
+ $text =~ s= ([Ww])anna = $1an na =g;
478
+
479
+ #word token method
480
+ my @words = split(/\s/,$text);
481
+ $text = "";
482
+ for (my $i=0;$i<(scalar(@words));$i++)
483
+ {
484
+ my $word = $words[$i];
485
+ if ( $word =~ /^(\S+)\.$/)
486
+ {
487
+ my $pre = $1;
488
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
489
+ {
490
+ #no change
491
+ }
492
+ elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
493
+ {
494
+ #no change
495
+ }
496
+ else
497
+ {
498
+ $word = $pre." .";
499
+ }
500
+ }
501
+ $text .= $word." ";
502
+ }
503
+
504
+ # restore ellipses
505
+ $text =~ s=_ELLIPSIS_=\.\.\.=g;
506
+
507
+ # clean out extra spaces
508
+ $text =~ s= *= =g;
509
+ $text =~ s=^ *==g;
510
+ $text =~ s= *$==g;
511
+
512
+ #escape special chars
513
+ $text =~ s/\&/\&amp;/g; # escape escape
514
+ $text =~ s/\|/\&#124;/g; # factor separator
515
+ $text =~ s/\</\&lt;/g; # xml
516
+ $text =~ s/\>/\&gt;/g; # xml
517
+ $text =~ s/\'/\&apos;/g; # xml
518
+ $text =~ s/\"/\&quot;/g; # xml
519
+ $text =~ s/\[/\&#91;/g; # syntax non-terminal
520
+ $text =~ s/\]/\&#93;/g; # syntax non-terminal
521
+
522
+ #ensure final line break
523
+ $text .= "\n" unless $text =~ /\n$/;
524
+
525
+ return $text;
526
+ }
527
+
528
+ sub load_prefixes
529
+ {
530
+ my ($language, $PREFIX_REF) = @_;
531
+
532
+ my $prefixfile = "$mydir/nonbreaking_prefix.$language";
533
+
534
+ #default back to English if we don't have a language-specific prefix file
535
+ if (!(-e $prefixfile))
536
+ {
537
+ $prefixfile = "$mydir/nonbreaking_prefix.en";
538
+ print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
539
+ die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
540
+ }
541
+
542
+ if (-e "$prefixfile")
543
+ {
544
+ open(PREFIX, "<:utf8", "$prefixfile");
545
+ while (<PREFIX>)
546
+ {
547
+ my $item = $_;
548
+ chomp($item);
549
+ if (($item) && (substr($item,0,1) ne "#"))
550
+ {
551
+ if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
552
+ {
553
+ $PREFIX_REF->{$1} = 2;
554
+ }
555
+ else
556
+ {
557
+ $PREFIX_REF->{$item} = 1;
558
+ }
559
+ }
560
+ }
561
+ close(PREFIX);
562
+ }
563
+ }
laser/tools-external/sentencepiece-master/.github/dependabot.yml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for all configuration options:
4
+ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5
+
6
+ version: 2
7
+ updates:
8
+ - package-ecosystem: "github-actions"
9
+ directory: "/"
10
+ schedule:
11
+ interval: "monthly"
12
+ groups:
13
+ github-actions:
14
+ patterns:
15
+ - "*"
16
+ - package-ecosystem: "pip"
17
+ directory: "/.github/workflows/requirements"
18
+ schedule:
19
+ interval: "monthly"
20
+ groups:
21
+ build-time-deps:
22
+ patterns:
23
+ - "*"
laser/tools-external/sentencepiece-master/.github/workflows/cifuzz.yml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CIFuzz
2
+ on: [pull_request]
3
+
4
+ permissions:
5
+ contents: read
6
+
7
+ jobs:
8
+ Fuzzing:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - name: Build Fuzzers
12
+ id: build
13
+ uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
14
+ with:
15
+ oss-fuzz-project-name: 'sentencepiece'
16
+ dry-run: false
17
+ language: c++
18
+ - name: Run Fuzzers
19
+ uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
20
+ with:
21
+ oss-fuzz-project-name: 'sentencepiece'
22
+ fuzz-seconds: 300
23
+ dry-run: false
24
+ language: c++
25
+ - name: Upload Crash
26
+ uses: actions/upload-artifact@v3
27
+ if: failure() && steps.build.outcome == 'success'
28
+ with:
29
+ name: artifacts
30
+ path: ./out/artifacts
laser/tools-external/sentencepiece-master/.github/workflows/cmake.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI for general build
2
+
3
+ on:
4
+ push:
5
+ branches: [ master ]
6
+ tags:
7
+ - 'v*'
8
+ pull_request:
9
+ branches: [ master ]
10
+
11
+ permissions:
12
+ contents: read
13
+
14
+ jobs:
15
+ build:
16
+ strategy:
17
+ matrix:
18
+ os: [ ubuntu-latest, ubuntu-20.04, windows-latest, macOS-11 ]
19
+ arch: [ x64 ]
20
+ include:
21
+ - os: windows-latest
22
+ arch: x86
23
+ runs-on: ${{ matrix.os }}
24
+
25
+ permissions:
26
+ contents: write # svenstaro/upload-release-action
27
+
28
+ steps:
29
+ - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
30
+ - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
31
+ with:
32
+ python-version: '3.x'
33
+ architecture: ${{matrix.arch}}
34
+
35
+ - name: Config for Windows
36
+ if: runner.os == 'Windows'
37
+ run: |
38
+ if ("${{matrix.arch}}" -eq "x64") {
39
+ $msbuildPlatform = "x64"
40
+ } else {
41
+ $msbuildPlatform = "Win32"
42
+ }
43
+ cmake -A $msbuildPlatform -B ${{github.workspace}}/build -DSPM_BUILD_TEST=ON -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/build/root
44
+
45
+ - name: Config for Unix
46
+ if: runner.os != 'Windows'
47
+ run: cmake -B ${{github.workspace}}/build -DSPM_BUILD_TEST=ON -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/build/root
48
+ env:
49
+ CMAKE_OSX_ARCHITECTURES: arm64;x86_64
50
+
51
+ - name: Build
52
+ run: cmake --build ${{github.workspace}}/build --config Release --target install --parallel 8
53
+
54
+ - name: Test
55
+ working-directory: ${{github.workspace}}/build
56
+ run: ctest -C Release --output-on-failure
57
+
58
+ - name: Package
59
+ working-directory: ${{github.workspace}}/build
60
+ run: cpack
61
+
62
+ - name: Build Python wrapper
63
+ working-directory: ${{github.workspace}}/python
64
+ run: |
65
+ python -m pip install --require-hashes --no-dependencies -r ../.github/workflows/requirements/base.txt
66
+ python setup.py build
67
+ python setup.py bdist_wheel
68
+ python -m pytest
69
+
70
+ - name: Upload artifcacts
71
+ uses: actions/upload-artifact@v3
72
+ with:
73
+ name: artifcacts
74
+ path: ./build/*.7z
75
+
76
+ - name: Upload Release Assets
77
+ if: startsWith(github.ref, 'refs/tags/')
78
+ uses: svenstaro/upload-release-action@04733e069f2d7f7f0b4aebc4fbdbce8613b03ccd # v2.9.0
79
+ with:
80
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
81
+ file: ./build/*.7z
82
+ tag: ${{ github.ref }}
83
+ overwrite: true
84
+ prerelease: true
85
+ file_glob: true
86
+ body: "This is my release text"