KuangDW commited on
Commit
2aebc50
·
1 Parent(s): 6f67103

add laser tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +1 -1
  2. laser/.gitignore +0 -7
  3. laser/tools-external/fastBPE/LICENSE +21 -0
  4. laser/tools-external/fastBPE/MANIFEST.in +3 -0
  5. laser/tools-external/fastBPE/README.md +83 -0
  6. laser/tools-external/fastBPE/build/lib.linux-x86_64-cpython-37/fastBPE.cpython-37m-x86_64-linux-gnu.so +3 -0
  7. laser/tools-external/fastBPE/fastBPE.egg-info/PKG-INFO +91 -0
  8. laser/tools-external/fastBPE/fastBPE.egg-info/SOURCES.txt +12 -0
  9. laser/tools-external/fastBPE/fastBPE.egg-info/dependency_links.txt +1 -0
  10. laser/tools-external/fastBPE/fastBPE.egg-info/top_level.txt +1 -0
  11. laser/tools-external/fastBPE/fastBPE/fastBPE.cpp +0 -0
  12. laser/tools-external/fastBPE/fastBPE/fastBPE.hpp +692 -0
  13. laser/tools-external/fastBPE/fastBPE/fastBPE.pyx +24 -0
  14. laser/tools-external/fastBPE/fastBPE/main.cc +43 -0
  15. laser/tools-external/fastBPE/setup.py +49 -0
  16. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  17. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
  18. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
  19. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
  20. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.en +121 -0
  21. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
  22. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
  23. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
  24. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
  25. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
  26. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
  27. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
  28. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
  29. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
  30. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
  31. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
  32. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.pt +3 -0
  33. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
  34. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
  35. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
  36. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
  37. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sv +46 -0
  38. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ta +276 -0
  39. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
  40. laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
  41. laser/tools-external/moses-tokenizer/tokenizer/basic-protected-patterns +5 -0
  42. laser/tools-external/moses-tokenizer/tokenizer/deescape-special-chars.perl +22 -0
  43. laser/tools-external/moses-tokenizer/tokenizer/detokenizer.perl +373 -0
  44. laser/tools-external/moses-tokenizer/tokenizer/lowercase.perl +14 -0
  45. laser/tools-external/moses-tokenizer/tokenizer/normalize-punctuation.perl +90 -0
  46. laser/tools-external/moses-tokenizer/tokenizer/remove-non-printing-char.perl +22 -0
  47. laser/tools-external/moses-tokenizer/tokenizer/tokenizer.perl +563 -0
  48. laser/tools-external/sentencepiece-master/.github/dependabot.yml +23 -0
  49. laser/tools-external/sentencepiece-master/.github/workflows/cifuzz.yml +30 -0
  50. laser/tools-external/sentencepiece-master/.github/workflows/cmake.yml +86 -0
app.py CHANGED
@@ -65,7 +65,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
65
  print(f"Using device: {device}")
66
  # Load models once
67
  print("Loading models...")
68
- model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
69
  tokenizer = AutoTokenizer.from_pretrained(model_id)
70
  model = AutoModelForCausalLM.from_pretrained(
71
  model_id,
 
65
  print(f"Using device: {device}")
66
  # Load models once
67
  print("Loading models...")
68
+ model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
69
  tokenizer = AutoTokenizer.from_pretrained(model_id)
70
  model = AutoModelForCausalLM.from_pretrained(
71
  model_id,
laser/.gitignore CHANGED
@@ -1,12 +1,5 @@
1
  source/__pycache__
2
  source/lib/__pycache__
3
- models
4
- tools-external
5
- tasks/mldoc/MLDoc
6
- tasks/bucc/downloaded
7
- tasks/similarity/dev/
8
- tasks/xnli/XNLI-1.0*
9
- tasks/xnli/multinli_1.0*
10
  .??*swp
11
  .idea
12
  __pycache__
 
1
  source/__pycache__
2
  source/lib/__pycache__
 
 
 
 
 
 
 
3
  .??*swp
4
  .idea
5
  __pycache__
laser/tools-external/fastBPE/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License
2
+
3
+ Copyright (c) 2019 Guillaume Lample,Timothée Lacroix
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
laser/tools-external/fastBPE/MANIFEST.in ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ include fastBPE/*.cc
2
+ include fastBPE/*.hpp
3
+ include fastBPE/*.pyx
laser/tools-external/fastBPE/README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # fastBPE
3
+
4
+ C++ implementation of [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/abs/1508.07909), with Python API.
5
+
6
+ ## Installation
7
+
8
+ Compile with:
9
+ ```
10
+ g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
11
+ ```
12
+
13
+ ## Usage:
14
+
15
+ ### List commands
16
+ ```
17
+ ./fast
18
+ usage: fastbpe <command> <args>
19
+
20
+ The commands supported by fastBPE are:
21
+
22
+ getvocab input1 [input2] extract the vocabulary from one or two text files
23
+ learnbpe nCodes input1 [input2] learn BPE codes from one or two text files
24
+ applybpe output input codes [vocab] apply BPE codes to a text file
25
+ applybpe_stream codes [vocab] apply BPE codes to stdin and outputs to stdout
26
+ ```
27
+
28
+ fastBPE also supports stdin inputs. For instance, these two commands are equivalent:
29
+ ```
30
+ ./fast getvocab text > vocab
31
+ cat text | ./fast getvocab - > vocab
32
+ ```
33
+ But the first one will memory map the input file to read it efficiently, which can be more than twice faster than stdin on very large files. Similarly, these two commands are equivalent:
34
+ ```
35
+ ./fast applybpe output input codes vocab
36
+ cat input | ./fast applybpe_stream codes vocab > output
37
+ ```
38
+ Although the first one will be significantly faster on large datasets, as it uses multi-threading to pre-compute the BPE splits of all words in the input file.
39
+
40
+ ### Learn codes
41
+ ```
42
+ ./fast learnbpe 40000 train.de train.en > codes
43
+ ```
44
+
45
+ ### Apply codes to train
46
+ ```
47
+ ./fast applybpe train.de.40000 train.de codes
48
+ ./fast applybpe train.en.40000 train.en codes
49
+ ```
50
+
51
+ ### Get train vocabulary
52
+ ```
53
+ ./fast getvocab train.de.40000 > vocab.de.40000
54
+ ./fast getvocab train.en.40000 > vocab.en.40000
55
+ ```
56
+
57
+ ### Apply codes to valid and test
58
+ ```
59
+ ./fast applybpe valid.de.40000 valid.de codes vocab.de.40000
60
+ ./fast applybpe valid.en.40000 valid.en codes vocab.en.40000
61
+ ./fast applybpe test.de.40000 test.de codes vocab.de.40000
62
+ ./fast applybpe test.en.40000 test.en codes vocab.en.40000
63
+ ```
64
+
65
+ ## Python API
66
+
67
+ To install the Python API, simply run:
68
+ ```bash
69
+ python setup.py install
70
+ ```
71
+
72
+ **Note:** For Mac OSX Users, add `export MACOSX_DEPLOYMENT_TARGET=10.x` (x=9 or 10, depending on your version) or `-stdlib=libc++` to the `extra_compile_args` of `setup.py` before/during the above install command, as appropriate.
73
+
74
+ Call the API using:
75
+
76
+ ```python
77
+ import fastBPE
78
+
79
+ bpe = fastBPE.fastBPE(codes_path, vocab_path)
80
+ bpe.apply(["Roasted barramundi fish", "Centrally managed over a client-server architecture"])
81
+
82
+ >> ['Ro@@ asted barr@@ am@@ un@@ di fish', 'Centr@@ ally managed over a cli@@ ent-@@ server architecture']
83
+ ```
laser/tools-external/fastBPE/build/lib.linux-x86_64-cpython-37/fastBPE.cpython-37m-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5585531fdc9895f104c01440761b83c6edd388e1a76de4df9eda4bd21258b63
3
+ size 2622328
laser/tools-external/fastBPE/fastBPE.egg-info/PKG-INFO ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: fastBPE
3
+ Version: 0.1.1
4
+ Summary: C++ implementation of Neural Machine Translation of Rare Words with Subword Units, with Python API.
5
+ Home-page: https://github.com/glample/fastBPE
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+
9
+
10
+ # fastBPE
11
+
12
+ C++ implementation of [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/abs/1508.07909), with Python API.
13
+
14
+ ## Installation
15
+
16
+ Compile with:
17
+ ```
18
+ g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
19
+ ```
20
+
21
+ ## Usage:
22
+
23
+ ### List commands
24
+ ```
25
+ ./fast
26
+ usage: fastbpe <command> <args>
27
+
28
+ The commands supported by fastBPE are:
29
+
30
+ getvocab input1 [input2] extract the vocabulary from one or two text files
31
+ learnbpe nCodes input1 [input2] learn BPE codes from one or two text files
32
+ applybpe output input codes [vocab] apply BPE codes to a text file
33
+ applybpe_stream codes [vocab] apply BPE codes to stdin and outputs to stdout
34
+ ```
35
+
36
+ fastBPE also supports stdin inputs. For instance, these two commands are equivalent:
37
+ ```
38
+ ./fast getvocab text > vocab
39
+ cat text | ./fast getvocab - > vocab
40
+ ```
41
+ But the first one will memory map the input file to read it efficiently, which can be more than twice faster than stdin on very large files. Similarly, these two commands are equivalent:
42
+ ```
43
+ ./fast applybpe output input codes vocab
44
+ cat input | ./fast applybpe_stream codes vocab > output
45
+ ```
46
+ Although the first one will be significantly faster on large datasets, as it uses multi-threading to pre-compute the BPE splits of all words in the input file.
47
+
48
+ ### Learn codes
49
+ ```
50
+ ./fast learnbpe 40000 train.de train.en > codes
51
+ ```
52
+
53
+ ### Apply codes to train
54
+ ```
55
+ ./fast applybpe train.de.40000 train.de codes
56
+ ./fast applybpe train.en.40000 train.en codes
57
+ ```
58
+
59
+ ### Get train vocabulary
60
+ ```
61
+ ./fast getvocab train.de.40000 > vocab.de.40000
62
+ ./fast getvocab train.en.40000 > vocab.en.40000
63
+ ```
64
+
65
+ ### Apply codes to valid and test
66
+ ```
67
+ ./fast applybpe valid.de.40000 valid.de codes vocab.de.40000
68
+ ./fast applybpe valid.en.40000 valid.en codes vocab.en.40000
69
+ ./fast applybpe test.de.40000 test.de codes vocab.de.40000
70
+ ./fast applybpe test.en.40000 test.en codes vocab.en.40000
71
+ ```
72
+
73
+ ## Python API
74
+
75
+ To install the Python API, simply run:
76
+ ```bash
77
+ python setup.py install
78
+ ```
79
+
80
+ **Note:** For Mac OSX Users, add `export MACOSX_DEPLOYMENT_TARGET=10.x` (x=9 or 10, depending on your version) or `-stdlib=libc++` to the `extra_compile_args` of `setup.py` before/during the above install command, as appropriate.
81
+
82
+ Call the API using:
83
+
84
+ ```python
85
+ import fastBPE
86
+
87
+ bpe = fastBPE.fastBPE(codes_path, vocab_path)
88
+ bpe.apply(["Roasted barramundi fish", "Centrally managed over a client-server architecture"])
89
+
90
+ >> ['Ro@@ asted barr@@ am@@ un@@ di fish', 'Centr@@ ally managed over a cli@@ ent-@@ server architecture']
91
+ ```
laser/tools-external/fastBPE/fastBPE.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ setup.py
5
+ fastBPE/fastBPE.cpp
6
+ fastBPE/fastBPE.hpp
7
+ fastBPE/fastBPE.pyx
8
+ fastBPE/main.cc
9
+ fastBPE.egg-info/PKG-INFO
10
+ fastBPE.egg-info/SOURCES.txt
11
+ fastBPE.egg-info/dependency_links.txt
12
+ fastBPE.egg-info/top_level.txt
laser/tools-external/fastBPE/fastBPE.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
laser/tools-external/fastBPE/fastBPE.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ fastBPE
laser/tools-external/fastBPE/fastBPE/fastBPE.cpp ADDED
The diff for this file is too large to render. See raw diff
 
laser/tools-external/fastBPE/fastBPE/fastBPE.hpp ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <algorithm>
4
+ #include <assert.h>
5
+ #include <errno.h>
6
+ #include <fcntl.h>
7
+ #include <fstream>
8
+ #include <functional>
9
+ #include <iostream>
10
+ #include <list>
11
+ #include <set>
12
+ #include <stdio.h>
13
+ #include <string>
14
+ #include <cstring>
15
+ #include <sys/mman.h>
16
+ #include <sys/stat.h>
17
+ #include <thread>
18
+ #include <unistd.h> // ftruncate
19
+ #include <unordered_map>
20
+ #include <unordered_set>
21
+ #include <vector>
22
+
23
+
24
+ namespace fastBPE {
25
+
26
+ using namespace std;
27
+
28
+ const size_t kMaxPairs = 1000 * 1000 * 1000;
29
+ const size_t kThreads = max(1, min(10, int(thread::hardware_concurrency())));
30
+ const char *kEndWord = "</w>";
31
+ const size_t kEndWordLength = 4;
32
+ const char *kTokenDelim = "@@";
33
+ const size_t kTokenDelimLength = 2;
34
+
35
+ int safeOpen(const char *file_path, int flags, mode_t mode = 0) {
36
+ int fd = open(file_path, flags, mode);
37
+ if (fd < 0) {
38
+ fprintf(stderr, "Cannot open text file %s\n", file_path);
39
+ exit(EXIT_FAILURE);
40
+ }
41
+ return fd;
42
+ }
43
+
44
+ void readText(const char *fp, unordered_map<string, uint32_t> &word_count) {
45
+ string cur_word;
46
+ uint64_t total = 0;
47
+ auto deal_with_char = [&](char cur_char){
48
+ if (cur_char == ' ' || cur_char == '\n') {
49
+ if (cur_word.size() == 0)
50
+ return;
51
+ // end of word
52
+ auto it = word_count.find(cur_word);
53
+ int count = it != word_count.end() ? it->second : 0;
54
+ word_count[cur_word] = count + 1;
55
+ total++;
56
+ cur_word.clear();
57
+ } else {
58
+ cur_word.push_back(cur_char);
59
+ }
60
+ };
61
+
62
+ if (string(fp).compare("-") == 0) {
63
+ for (std::string line; std::getline(std::cin, line);) {
64
+ for(char c: line){
65
+ deal_with_char(c);
66
+ }
67
+ deal_with_char('\n');
68
+ }
69
+ }
70
+ else {
71
+ int fd = safeOpen(fp, O_RDONLY);
72
+
73
+ struct stat s;
74
+ fstat(fd, &s);
75
+ fprintf(stderr, "Loading vocabulary from %s ...\n", fp);
76
+
77
+ size_t size = s.st_size;
78
+ char *f = (char *)mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
79
+
80
+ for (size_t i = 0; i < size; i++) {
81
+ deal_with_char(f[i]);
82
+ }
83
+ }
84
+ fprintf(stderr, "Read %lu words (%lu unique) from text file.\n", total,
85
+ word_count.size());
86
+ }
87
+
88
+ std::pair<size_t, uint64_t> output_or_count(
89
+ unordered_map<string, string> &bpe, size_t size, char *f, char *fo
90
+ ) {
91
+ string cur_word;
92
+ size_t charOut = 0;
93
+ uint64_t total = 0;
94
+ for (size_t i = 0; i < size; i++) {
95
+ auto &cur_char = f[i];
96
+ if (cur_char == ' ' || cur_char == '\n') {
97
+ if (cur_word.size() == 0) {
98
+ if (fo != nullptr) fo[charOut] = cur_char;
99
+ charOut++;
100
+ continue;
101
+ }
102
+ // end of word : write bpe to output
103
+ auto it = bpe.find(cur_word);
104
+ assert(it != bpe.end());
105
+ for (auto x : it->second) {
106
+ if (fo != nullptr) fo[charOut] = x;
107
+ charOut++;
108
+ }
109
+ if (fo != nullptr) fo[charOut] = cur_char;
110
+ charOut++;
111
+
112
+ total++;
113
+ cur_word.clear();
114
+ } else {
115
+ cur_word.push_back(cur_char);
116
+ }
117
+ }
118
+ return std::make_pair(charOut, total);
119
+ }
120
+
121
+ void outputText(const char *fpo, const char *fp,
122
+ unordered_map<string, string> &bpe) {
123
+
124
+ int fd = safeOpen(fp, O_RDONLY);
125
+ auto fdOut = safeOpen(fpo, O_RDWR | O_CREAT | O_TRUNC, 0666);
126
+
127
+ struct stat s;
128
+ fstat(fd, &s);
129
+
130
+ fprintf(stderr, "Applying BPE to %s ...\n", fp);
131
+ auto size = s.st_size;
132
+ char *f = (char *)mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
133
+
134
+ auto p = output_or_count(bpe, size, f, nullptr);
135
+ size_t out_size = p.first;
136
+
137
+ if (ftruncate(fdOut, out_size) < 0) {
138
+ fprintf(stderr, "Couldn't truncate output file %s to size %lu\n", fpo,
139
+ out_size);
140
+ exit(EXIT_FAILURE);
141
+ }
142
+
143
+
144
+ char *fo = (char *)mmap(NULL, out_size, PROT_WRITE, MAP_SHARED, fdOut, 0);
145
+ if (fo == MAP_FAILED) {
146
+ fprintf(stderr, "Output memory map failed : %d.\n", errno);
147
+ exit(EXIT_FAILURE);
148
+ }
149
+ p = output_or_count(bpe, size, f, fo);
150
+ fprintf(stderr, "Modified %lu words from text file.\n", p.second);
151
+ munmap(fo, out_size);
152
+ munmap(f, size);
153
+ close(fdOut);
154
+ close(fd);
155
+ }
156
+
157
+ struct pair_hash {
158
+ template <class T1, class T2> size_t operator()(const pair<T1, T2> &p) const {
159
+ auto h1 = hash<T1>{}(p.first);
160
+ auto h2 = hash<T2>{}(p.second);
161
+ size_t seed = h1;
162
+ // boost::hash_combine
163
+ return h2 + 0x9e3779b9 + (seed << 6) + (seed >> 2);
164
+ }
165
+ };
166
+
167
+ void tokenize(const unordered_map<string, uint32_t> &word_count,
168
+ unordered_map<string, uint32_t> &token_to_int,
169
+ vector<string> &int_to_token, vector<list<uint32_t>> &words,
170
+ vector<int32_t> &counts) {
171
+
172
+ for (auto &x : word_count) {
173
+ auto &word = x.first;
174
+
175
+ words.push_back(list<uint32_t>());
176
+ auto &current_word = words.back();
177
+ counts.push_back(x.second);
178
+
179
+ int pos = 0, realLength = 0;
180
+ int lastStart = 0;
181
+ while (word[pos]) {
182
+ bool newChar = (word[pos] & 0xc0) != 0x80; // not a continuation byte
183
+ realLength += newChar;
184
+ // new token
185
+ if (newChar && pos > 0) {
186
+ auto new_token = word.substr(lastStart, pos - lastStart);
187
+ if (token_to_int.count(new_token) == 0) {
188
+ int_to_token.push_back(new_token);
189
+ token_to_int[new_token] = int_to_token.size() - 1;
190
+ }
191
+ current_word.push_back(token_to_int[new_token]);
192
+ lastStart = pos;
193
+ }
194
+ pos++;
195
+ }
196
+ auto new_token = word.substr(lastStart, string::npos) + kEndWord;
197
+ if (token_to_int.count(new_token) == 0) {
198
+ int_to_token.push_back(new_token);
199
+ token_to_int[new_token] = int_to_token.size() - 1;
200
+ }
201
+ current_word.push_back(token_to_int[new_token]);
202
+ }
203
+ }
204
+
205
+ void tokenize_str(const unordered_map<string, uint32_t> &word_count,
206
+ unordered_map<string, vector<string>> &words) {
207
+
208
+ for (auto &x : word_count) {
209
+ auto &word = x.first;
210
+ words[word] = vector<string>();
211
+
212
+ int pos = 0, realLength = 0;
213
+ int lastStart = 0;
214
+ while (word[pos]) {
215
+ bool newChar = (word[pos] & 0xc0) != 0x80; // not a continuation byte
216
+ realLength += newChar;
217
+ // new token
218
+ if (newChar && pos > 0) {
219
+ auto new_token = word.substr(lastStart, pos - lastStart);
220
+ words[word].push_back(new_token);
221
+ lastStart = pos;
222
+ }
223
+ pos++;
224
+ }
225
+ auto new_token = word.substr(lastStart, string::npos) + kEndWord;
226
+ words[word].push_back(new_token);
227
+ }
228
+ }
229
+
230
+ using tp = pair<uint32_t, uint32_t>;
231
+ using tps = pair<string, string>;
232
+ using pc = unordered_map<tp, pair<int32_t, tp> *, pair_hash>;
233
+
234
+ void count_in_word(
235
+ list<uint32_t> &word, uint32_t wi, uint32_t count, pc &pair_counts,
236
+ vector<pair<int32_t, tp>> &contiguous_counts,
237
+ unordered_map<tp, unordered_set<uint32_t>, pair_hash> &where) {
238
+ bool second = false;
239
+ tp cur_pair;
240
+ for (uint32_t token : word) {
241
+ if (second) {
242
+ cur_pair.first = cur_pair.second;
243
+ }
244
+ cur_pair.second = token;
245
+ if (second) {
246
+ auto it = pair_counts.find(cur_pair);
247
+ if (it == pair_counts.end()) {
248
+ contiguous_counts.emplace_back(0, cur_pair);
249
+ auto *added = &contiguous_counts.back();
250
+ pair_counts.emplace(piecewise_construct, forward_as_tuple(cur_pair),
251
+ forward_as_tuple(added));
252
+ where[cur_pair].emplace();
253
+ }
254
+ if (count > 0) {
255
+ where[cur_pair].insert(wi);
256
+ } else {
257
+ where[cur_pair].erase(wi);
258
+ }
259
+ pair_counts[cur_pair]->first += count;
260
+ } else {
261
+ second = true;
262
+ }
263
+ }
264
+ }
265
+
266
+ void find_maxp(vector<pair<int32_t, tp>> &contiguous_counts, tp &maxp,
267
+ int32_t &max_c) {
268
+ max_c = 0;
269
+ for (auto &x : contiguous_counts) {
270
+ if (x.first > max_c) {
271
+ max_c = x.first;
272
+ maxp = x.second;
273
+ } else if (x.first == max_c and x.second < maxp) {
274
+ maxp = x.second;
275
+ }
276
+ }
277
+ }
278
+
279
+ void getvocab(const char *inputFile1, const char *inputFile2) {
280
+ // get vocab
281
+ unordered_map<string, uint32_t> word_count;
282
+ readText(inputFile1, word_count);
283
+ if (strcmp(inputFile2, "") != 0) {
284
+ readText(inputFile2, word_count);
285
+ }
286
+
287
+ // sort vocab
288
+ auto compFunctor = [](pair<string, int> elem1, pair<string, int> elem2) {
289
+ return elem1.second > elem2.second ||
290
+ (elem1.second == elem2.second && elem1.first < elem2.first);
291
+ };
292
+ set<pair<string, int>, decltype(compFunctor)> sorted_vocab(
293
+ word_count.begin(), word_count.end(), compFunctor);
294
+ assert(word_count.size() == sorted_vocab.size());
295
+
296
+ // print sorted vocab
297
+ for (auto element : sorted_vocab)
298
+ cout << element.first << " " << element.second << endl;
299
+ }
300
+
301
+ void learnbpe(const uint32_t kNPairs, const char *inputFile1,
302
+ const char *inputFile2) {
303
+ // get vocab
304
+ unordered_map<string, uint32_t> word_count;
305
+ readText(inputFile1, word_count);
306
+ if (strcmp(inputFile2, "") != 0) {
307
+ readText(inputFile2, word_count);
308
+ }
309
+
310
+ // a token is an int, it represents a string
311
+ unordered_map<string, uint32_t> token_to_int;
312
+ vector<string> int_to_token;
313
+
314
+ vector<list<uint32_t>> words;
315
+ vector<int32_t> counts;
316
+
317
+ tokenize(word_count, token_to_int, int_to_token, words, counts);
318
+
319
+ vector<pair<int32_t, tp>> contiguous_counts;
320
+ contiguous_counts.reserve(kMaxPairs);
321
+
322
+ pc pair_counts;
323
+ unordered_map<tp, unordered_set<uint32_t>, pair_hash> where_to_update;
324
+
325
+ tp cur_pair;
326
+ int32_t max_c = 0;
327
+ tp max_p;
328
+ for (uint32_t wi = 0; wi < words.size(); wi++) {
329
+ count_in_word(words[wi], wi, counts[wi], pair_counts, contiguous_counts,
330
+ where_to_update);
331
+ }
332
+ find_maxp(contiguous_counts, max_p, max_c);
333
+ for (size_t i = 0; i < kNPairs; i++) {
334
+ // create new token for pair. replace
335
+ auto new_token = int_to_token[max_p.first] + int_to_token[max_p.second];
336
+ cout << int_to_token[max_p.first] << " " << int_to_token[max_p.second]
337
+ << " " << max_c << endl;
338
+
339
+ uint32_t new_token_id = int_to_token.size();
340
+ int_to_token.push_back(new_token);
341
+ token_to_int[new_token] = new_token_id;
342
+ max_c = 0;
343
+ auto change_count = [&](tp pair, int32_t v, uint32_t wi) {
344
+ auto it = pair_counts.find(pair);
345
+ if (it != pair_counts.end()) {
346
+ // assert(it->second + v >= 0);
347
+ it->second->first += v;
348
+ } else {
349
+ if (v > 0) {
350
+ contiguous_counts.emplace_back(v, pair);
351
+ pair_counts.emplace(piecewise_construct, forward_as_tuple(pair),
352
+ forward_as_tuple(&(contiguous_counts.back())));
353
+ where_to_update[pair] = unordered_set<uint32_t>();
354
+ }
355
+ }
356
+ if (v > 0)
357
+ where_to_update[pair].insert(wi);
358
+ };
359
+
360
+ for (auto wi : where_to_update[max_p]) {
361
+ auto &cur_word = words[wi];
362
+ auto it = cur_word.begin();
363
+ bool second = false;
364
+ while (it != cur_word.end()) {
365
+ if (second) {
366
+ cur_pair.first = cur_pair.second;
367
+ }
368
+ cur_pair.second = *it;
369
+
370
+ if (second) {
371
+ // found the pair
372
+ if (cur_pair == max_p) {
373
+ it--; // points to first element of pair
374
+ // if there is a token before us
375
+ if (it != cur_word.begin()) {
376
+ it--;
377
+ change_count(make_pair(*it, cur_pair.first), -counts[wi], wi);
378
+ change_count(make_pair(*it, new_token_id), counts[wi], wi);
379
+ it++;
380
+ }
381
+
382
+ it = cur_word.insert(it, new_token_id); // it points to new token
383
+ it++; // it points to first element of pair
384
+ it = cur_word.erase(it); // it points to second element of pair
385
+ it = cur_word.erase(it); // it points to next value
386
+
387
+ // if there is a token after the one we inserted
388
+ if (it != cur_word.end()) {
389
+ change_count(make_pair(cur_pair.second, *it), -counts[wi], wi);
390
+ change_count(make_pair(new_token_id, *it), counts[wi], wi);
391
+ }
392
+ cur_pair.second = new_token_id;
393
+ } else {
394
+ it++;
395
+ }
396
+ } else {
397
+ second = true;
398
+ it++;
399
+ }
400
+ }
401
+ }
402
+
403
+ if (pair_counts.find(max_p) != pair_counts.end()){
404
+ pair_counts[max_p]->first = 0;
405
+ }
406
+ find_maxp(contiguous_counts, max_p, max_c);
407
+ }
408
+ }
409
+
410
+ void split(vector<string> &splits, const string &text, char sep) {
411
+ size_t start = 0, end = 0;
412
+ while ((end = text.find(sep, start)) != string::npos) {
413
+ if (end != start)
414
+ splits.push_back(text.substr(start, end - start));
415
+ start = end + 1;
416
+ }
417
+ if (end != start && start < text.size())
418
+ splits.push_back(text.substr(start));
419
+ }
420
+
421
+ void readVocab(const char *fp, unordered_map<string, uint32_t> &vocab) {
422
+ ifstream file(fp);
423
+ if (!file) {
424
+ fprintf(stderr, "Cannot open vocabulary file %s\n", fp);
425
+ exit(EXIT_FAILURE);
426
+ }
427
+ fprintf(stderr, "Loading vocabulary from %s ...\n", fp);
428
+ string line;
429
+ uint64_t total = 0;
430
+ while (getline(file, line)) {
431
+ vector<string> splits;
432
+ split(splits, line, ' ');
433
+ assert(splits.size() == 2);
434
+ assert(vocab.find(splits[0]) == vocab.end());
435
+ int count = stoi(splits[1]);
436
+ vocab[splits[0]] = count;
437
+ total += count;
438
+ }
439
+ fprintf(stderr, "Read %lu words (%lu unique) from vocabulary file.\n", total,
440
+ vocab.size());
441
+ }
442
+
443
+ void readCodes(const char *fp, unordered_map<tps, uint32_t, pair_hash> &codes,
444
+ unordered_map<string, tps> &reversed_codes) {
445
+ ifstream file(fp);
446
+ if (!file) {
447
+ fprintf(stderr, "Cannot open codes file %s\n", fp);
448
+ exit(EXIT_FAILURE);
449
+ }
450
+ fprintf(stderr, "Loading codes from %s ...\n", fp);
451
+ string line;
452
+ while (getline(file, line)) {
453
+ vector<string> splits;
454
+ split(splits, line, ' ');
455
+ assert(splits.size() == 3);
456
+ auto pair = make_pair(splits[0], splits[1]);
457
+ string concat = splits[0] + splits[1];
458
+ assert(codes.find(pair) == codes.end());
459
+ assert(reversed_codes.find(concat) == reversed_codes.end());
460
+ codes[pair] = codes.size();
461
+ reversed_codes[concat] = pair;
462
+ }
463
+ fprintf(stderr, "Read %lu codes from the codes file.\n", codes.size());
464
+ }
465
+
466
+ void decompose(const string s, vector<string> &newSubwords,
467
+ const unordered_map<string, tps> &reversed_codes,
468
+ const unordered_map<string, uint32_t> &vocab, bool isFinal) {
469
+ auto it = reversed_codes.find(s);
470
+ if (it == reversed_codes.end()) {
471
+ // TODO this whole block below is just some sanity check
472
+ // if we cannot un-merge a subword, it has to be a char
473
+ string s2 = isFinal ? s.substr(0, s.size() - kEndWordLength) : s;
474
+ int count = 0;
475
+ for (size_t j = 0; j < s2.size(); j++) {
476
+ if ((s2[j] & 0xc0) != 0x80) {
477
+ count++;
478
+ }
479
+ }
480
+ assert(count == 1);
481
+ newSubwords.push_back(s);
482
+ return;
483
+ }
484
+ assert(it != reversed_codes.end());
485
+ string token1 = it->second.first;
486
+ if (vocab.find(token1 + kTokenDelim) == vocab.end()) {
487
+ decompose(token1, newSubwords, reversed_codes, vocab, false);
488
+ } else {
489
+ newSubwords.push_back(token1);
490
+ }
491
+ string token2 = it->second.second;
492
+ auto query = token2 + kTokenDelim;
493
+ if (isFinal) {
494
+ query = token2.substr(0, token2.size() - kEndWordLength);
495
+ }
496
+ if (vocab.find(query) == vocab.end()) {
497
+ decompose(token2, newSubwords, reversed_codes, vocab, isFinal);
498
+ } else {
499
+ newSubwords.push_back(token2);
500
+ }
501
+ }
502
+
503
+ void limitVocab(const vector<string> &subwords, vector<string> &newSubwords,
504
+ const unordered_map<string, tps> &reversed_codes,
505
+ const unordered_map<string, uint32_t> &vocab) {
506
+ string query;
507
+ for (size_t i = 0; i < subwords.size(); i++) {
508
+ bool isFinal = i == subwords.size() - 1;
509
+ auto &subword = subwords[i];
510
+ if (isFinal) {
511
+ query = subword.substr(0, subword.size() - kEndWordLength);
512
+ } else {
513
+ query = subword + kTokenDelim;
514
+ }
515
+ if (vocab.find(query) == vocab.end()) {
516
+ decompose(subword, newSubwords, reversed_codes, vocab, isFinal);
517
+ } else {
518
+ newSubwords.push_back(subword);
519
+ }
520
+ }
521
+ }
522
+
523
// Apply BPE merge operations to the subwords of a single word and return the
// space-separated encoded string.  `subwords` initially holds one entry per
// UTF-8 character (the last one carrying the end-of-word marker) and is
// merged in place.  `codes` maps a pair of adjacent tokens to its merge rank
// (lower rank = learned earlier = higher priority); `reversed_codes` and
// `vocab` are used afterwards to re-split merged tokens that fall outside the
// (optional) vocabulary.
string process_bpe(vector<string> &subwords,
                   unordered_map<tps, uint32_t, pair_hash> &codes,
                   unordered_map<string, tps> &reversed_codes,
                   unordered_map<string, uint32_t> &vocab) {
  // merge subWords as much as possible
  vector<string> newSubwords;
  while (subwords.size() > 1) {
    // find the best pair
    int bestPairId = -1;
    auto bestPair = codes.end(); // TODO ugly hack that works
    for (size_t i = 0; i < subwords.size() - 1; i++) {
      auto pair = make_pair(subwords[i], subwords[i + 1]);
      auto it = codes.find(pair);
      int pairRank = it == codes.end() ? -1 : it->second;
      // keep the adjacent pair with the lowest (best) merge rank
      if (pairRank >= 0 && (bestPairId == -1 || int(bestPair->second) > pairRank)) {
        bestPair = it;
        bestPairId = i;
      }
    }
    // if we cannot merge anything, stop
    if (bestPairId == -1) {
      break;
    }
    // otherwise, merge subWords
    // `justMerged` skips the right half of a pair that was just merged, so
    // overlapping occurrences of the same pair are not merged twice
    bool justMerged = false;
    newSubwords = vector<string>();
    for (size_t i = 0; i < subwords.size(); i++) {
      if ((i + 1 < subwords.size()) && (not justMerged) &&
          subwords[i] == bestPair->first.first &&
          subwords[i + 1] == bestPair->first.second) {
        newSubwords.push_back(subwords[i] + subwords[i + 1]);
        justMerged = true;
      } else {
        if (not justMerged) {
          newSubwords.push_back(subwords[i]);
        }
        justMerged = false;
      }
    }
    subwords = newSubwords;
  }
  // check that we are only using words in the dictionary
  if (vocab.size() > 0) {
    vector<string> newSubwords;
    limitVocab(subwords, newSubwords, reversed_codes, vocab);
    subwords = newSubwords;
  }
  // concat subWords
  string result;
  for (auto x : subwords) {
    result = result + x + kTokenDelim + " ";
  }
  // strip the trailing end-of-word marker, delimiter and space from the
  // last subword
  return result.substr(
      0,
      result.size() - kEndWordLength - kTokenDelimLength - 1 // "</w>@@ "
  );
}
580
+
581
// Apply BPE codes (optionally restricted to a vocabulary) to every word of
// `inputFile` and write the encoded text to `outputFile`.  Each distinct word
// is encoded exactly once (deduplicated via a word-count map), with the work
// sharded across kThreads worker threads.
void applybpe(const char *outputFile, const char *inputFile,
              const char *codesPath, const char *vocabPath) {
  // read vocabulary (to which we want to limit the output file)
  unordered_map<string, uint32_t> vocab;
  if (strcmp(vocabPath, "") != 0) {
    readVocab(vocabPath, vocab);
  }

  // read codes
  unordered_map<tps, uint32_t, pair_hash> codes;
  unordered_map<string, tps> reversed_codes;
  readCodes(codesPath, codes, reversed_codes);

  // read input file words
  unordered_map<string, uint32_t> word_count;
  readText(inputFile, word_count);

  // tokenize
  unordered_map<string, vector<string>> bpeTok;
  tokenize_str(word_count, bpeTok);

  // copy into a vector so threads can index words by position
  vector<pair<string, vector<string>>> bpeTokVec;
  for (auto x : bpeTok) {
    bpeTokVec.push_back(x);
  }

  // apply BPE codes to each word; each thread writes only to its own map,
  // so no synchronization is needed
  unordered_map<string, string> bpe[kThreads];
  vector<thread> threads;
  for (size_t i = 0; i < kThreads; i++) {
    threads.emplace_back(
        [&](size_t this_thread) {
          // strided partition: thread t handles words t, t+kThreads, ...
          for (size_t w = this_thread; w < bpeTokVec.size(); w += kThreads) {
            auto &x = bpeTokVec[w];
            bpe[this_thread][x.first] = process_bpe(x.second, codes, reversed_codes, vocab);
          }
        },
        i
    );
  }

  // join the workers and merge their per-thread results into one map
  unordered_map<string, string> final_bpe;
  for (size_t i = 0; i < kThreads; i++) {
    threads[i].join();
    for (auto x : bpe[i]) {
      final_bpe[x.first] = x.second;
    }
  }
  // output
  outputText(outputFile, inputFile, final_bpe);
}
632
+
633
+
634
// Reusable BPE encoder: loads the merge codes (and an optional vocabulary)
// once at construction and can then encode batches of sentences.
class BPEApplyer {
private:
  unordered_map<string, uint32_t> vocab;          // optional output vocabulary
  unordered_map<tps, uint32_t, pair_hash> codes;  // merge pair -> rank
  unordered_map<string, tps> reversed_codes;      // merged token -> source pair

public:
  // `vocabPath` may be empty, in which case no vocabulary filtering is done.
  BPEApplyer(const string& codesPath, const string& vocabPath) {
    if (vocabPath.size() > 0) readVocab(vocabPath.c_str(), vocab);
    readCodes(codesPath.c_str(), codes, reversed_codes);
  }

  // Encode each space-separated sentence; returns one encoded string per
  // input sentence, in order.
  vector<string> apply(vector<string>& sentences) {
    vector<string> res;
    for(auto &s: sentences) {
      res.emplace_back("");
      string& cur = res.back();
      vector<string> words;
      split(words, s, ' ');
      for (size_t i = 0; i < words.size(); i++) {
        auto word = words[i];
        vector<string> word_bpes;
        int pos = 0, realLength = 0;
        int lastStart = 0;
        // split the word into one subword per UTF-8 character
        // (word[pos] at pos == size() reads the terminating '\0')
        while (word[pos]) {
          bool newChar = (word[pos] & 0xc0) != 0x80; // not a continuation byte
          realLength += newChar;
          if (newChar && pos > 0) {
            auto new_token = word.substr(lastStart, pos - lastStart);
            word_bpes.push_back(new_token);
            lastStart = pos;
          }
          pos++;
        }
        // the last character additionally carries the end-of-word marker
        auto bpe = word.substr(lastStart, string::npos) + kEndWord;
        word_bpes.push_back(bpe);
        cur += process_bpe(word_bpes, codes, reversed_codes, vocab);
        if (i < words.size() - 1) cur += " ";
      }
    }
    return res;
  }

};
678
+
679
+
680
+ void applybpe_stream(const char *codesPath, const char *vocabPath) {
681
+ BPEApplyer applyer(codesPath, vocabPath);
682
+ std::string line;
683
+ while(std::getline(std::cin, line)) {
684
+ vector<string> tmp;
685
+ tmp.push_back(line);
686
+ for(auto& l : applyer.apply(tmp)){
687
+ std::cout << l << std::endl;
688
+ }
689
+ }
690
+ }
691
+
692
+ };
laser/tools-external/fastBPE/fastBPE/fastBPE.pyx ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# cython: language_level=3
# distutils: language = c++

from libcpp.vector cimport vector
from libcpp.string cimport string

# C++ declarations from fastBPE.hpp used by the Python wrapper below.
cdef extern from "fastBPE.hpp" namespace "fastBPE":
    cdef cppclass BPEApplyer:
        BPEApplyer(const string& codes_path, const string& vocab_path)
        vector[string] apply(vector[string]& sentences)

# Thin Python wrapper that owns a heap-allocated C++ BPEApplyer.
cdef class fastBPE:
    cdef BPEApplyer* c_obj

    def __dealloc__(self):
        # cdef pointer attributes are zero-initialized by Cython, so `del`
        # is safe even if __init__ raised before assigning c_obj.
        del self.c_obj

    def __init__(self, codes_path, vocab_path=""):
        # Paths are encoded to bytes because the C++ side takes std::string.
        self.c_obj = new BPEApplyer(codes_path.encode(), vocab_path.encode())

    def apply(self, sentences):
        # Encode str -> bytes, run BPE in C++, decode the results back to str.
        cdef vector[string] s = [x.encode() for x in sentences]
        cdef vector[string] res = self.c_obj.apply(s)
        return [x.decode() for x in res]
laser/tools-external/fastBPE/fastBPE/main.cc ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "fastBPE.hpp"
2
+
3
+ using namespace std;
4
+ using namespace fastBPE;
5
+
6
+ void printUsage() {
7
+ cerr
8
+ << "usage: fastbpe <command> <args>\n\n"
9
+ << "The commands supported by fastBPE are:\n\n"
10
+ << "getvocab input1 [input2] extract the vocabulary from one "
11
+ "or two text files\n"
12
+ << "learnbpe nCodes input1 [input2] learn BPE codes from one or two "
13
+ "text files\n"
14
+ << "applybpe output input codes [vocab] apply BPE codes to a text file\n"
15
+ << "applybpe_stream codes [vocab] apply BPE codes to stdin and output to stdout\n"
16
+ << endl;
17
+ }
18
+
19
+
20
+ int main(int argc, char **argv) {
21
+ if (argc < 2) {
22
+ printUsage();
23
+ exit(EXIT_FAILURE);
24
+ }
25
+ string command = argv[1];
26
+ if (command == "getvocab") {
27
+ assert(argc == 3 || argc == 4);
28
+ getvocab(argv[2], argc == 4 ? argv[3] : "");
29
+ } else if (command == "learnbpe") {
30
+ assert(argc == 4 || argc == 5);
31
+ learnbpe(stoi(argv[2]), argv[3], argc == 5 ? argv[4] : "");
32
+ } else if (command == "applybpe") {
33
+ assert(argc == 5 || argc == 6);
34
+ applybpe(argv[2], argv[3], argv[4], argc == 6 ? argv[5] : "");
35
+ } else if (command == "applybpe_stream") {
36
+ assert(argc == 3 || argc == 4);
37
+ applybpe_stream(argv[2], argc == 4 ? argv[3] : "");
38
+ } else {
39
+ printUsage();
40
+ exit(EXIT_FAILURE);
41
+ }
42
+ return 0;
43
+ }
laser/tools-external/fastBPE/setup.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build script for the fastBPE Python extension.

Compiles the Cython wrapper when Cython is installed, otherwise falls back
to the pre-generated C++ source shipped with the source distribution.
"""

from setuptools import setup, find_packages, Extension
from distutils.command.sdist import sdist as _sdist


try:
    from Cython.Build import cythonize
except ImportError:
    use_cython = False
else:
    use_cython = True


# Pick the extension source: .pyx needs Cython, .cpp is the pre-generated
# fallback so end users can build without Cython installed.
if use_cython:
    extension = 'pyx'
else:
    extension = 'cpp'


extensions = [
    Extension(
        'fastBPE',
        ["fastBPE/fastBPE." + extension],
        language='c++',
        extra_compile_args=[
            "-std=c++11", "-Ofast", "-pthread"
        ],
    ),
]
if use_cython:
    extensions = cythonize(extensions)


# Read the long description with an explicit encoding so the build does not
# depend on the locale's default codec (which breaks on non-UTF-8 systems).
with open('README.md', encoding='utf-8') as f:
    readme = f.read()


setup(
    name='fastBPE',
    version='0.1.1',
    description='C++ implementation of Neural Machine Translation of Rare Words with Subword Units, with Python API.',
    url='https://github.com/glample/fastBPE',
    long_description=readme,
    long_description_content_type='text/markdown',
    ext_package='',
    ext_modules=extensions,
    packages=[
        'fastBPE',
    ],
)
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ca ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Dr
2
+ Dra
3
+ pàg
4
+ p
5
+ c
6
+ av
7
+ Sr
8
+ Sra
9
+ adm
10
+ esq
11
+ Prof
12
+ S.A
13
+ S.L
14
+ p.e
15
+ ptes
16
+ Sta
17
+ St
18
+ pl
19
+ màx
20
+ cast
21
+ dir
22
+ nre
23
+ fra
24
+ admdora
25
+ Emm
26
+ Excma
27
+ espf
28
+ dc
29
+ admdor
30
+ tel
31
+ angl
32
+ aprox
33
+ ca
34
+ dept
35
+ dj
36
+ dl
37
+ dt
38
+ ds
39
+ dg
40
+ dv
41
+ ed
42
+ entl
43
+ al
44
+ i.e
45
+ maj
46
+ smin
47
+ n
48
+ núm
49
+ pta
50
+ A
51
+ B
52
+ C
53
+ D
54
+ E
55
+ F
56
+ G
57
+ H
58
+ I
59
+ J
60
+ K
61
+ L
62
+ M
63
+ N
64
+ O
65
+ P
66
+ Q
67
+ R
68
+ S
69
+ T
70
+ U
71
+ V
72
+ W
73
+ X
74
+ Y
75
+ Z
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.cs ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Bc
2
+ BcA
3
+ Ing
4
+ Ing.arch
5
+ MUDr
6
+ MVDr
7
+ MgA
8
+ Mgr
9
+ JUDr
10
+ PhDr
11
+ RNDr
12
+ PharmDr
13
+ ThLic
14
+ ThDr
15
+ Ph.D
16
+ Th.D
17
+ prof
18
+ doc
19
+ CSc
20
+ DrSc
21
+ dr. h. c
22
+ PaedDr
23
+ Dr
24
+ PhMr
25
+ DiS
26
+ abt
27
+ ad
28
+ a.i
29
+ aj
30
+ angl
31
+ anon
32
+ apod
33
+ atd
34
+ atp
35
+ aut
36
+ bd
37
+ biogr
38
+ b.m
39
+ b.p
40
+ b.r
41
+ cca
42
+ cit
43
+ cizojaz
44
+ c.k
45
+ col
46
+ čes
47
+ čín
48
+ čj
49
+ ed
50
+ facs
51
+ fasc
52
+ fol
53
+ fot
54
+ franc
55
+ h.c
56
+ hist
57
+ hl
58
+ hrsg
59
+ ibid
60
+ il
61
+ ind
62
+ inv.č
63
+ jap
64
+ jhdt
65
+ jv
66
+ koed
67
+ kol
68
+ korej
69
+ kl
70
+ krit
71
+ lat
72
+ lit
73
+ m.a
74
+ maď
75
+ mj
76
+ mp
77
+ násl
78
+ např
79
+ nepubl
80
+ něm
81
+ no
82
+ nr
83
+ n.s
84
+ okr
85
+ odd
86
+ odp
87
+ obr
88
+ opr
89
+ orig
90
+ phil
91
+ pl
92
+ pokrač
93
+ pol
94
+ port
95
+ pozn
96
+ př.kr
97
+ př.n.l
98
+ přel
99
+ přeprac
100
+ příl
101
+ pseud
102
+ pt
103
+ red
104
+ repr
105
+ resp
106
+ revid
107
+ rkp
108
+ roč
109
+ roz
110
+ rozš
111
+ samost
112
+ sect
113
+ sest
114
+ seš
115
+ sign
116
+ sl
117
+ srv
118
+ stol
119
+ sv
120
+ šk
121
+ šk.ro
122
+ špan
123
+ tab
124
+ t.č
125
+ tis
126
+ tj
127
+
128
+ tzv
129
+ univ
130
+ uspoř
131
+ vol
132
+ vl.jm
133
+ vs
134
+ vyd
135
+ vyobr
136
+ zal
137
+ zejm
138
+ zkr
139
+ zprac
140
+ zvl
141
+ n.p
142
+ např
143
+ než
144
+ MUDr
145
+ abl
146
+ absol
147
+ adj
148
+ adv
149
+ ak
150
+ ak. sl
151
+ akt
152
+ alch
153
+ amer
154
+ anat
155
+ angl
156
+ anglosas
157
+ arab
158
+ arch
159
+ archit
160
+ arg
161
+ astr
162
+ astrol
163
+ att
164
+ bás
165
+ belg
166
+ bibl
167
+ biol
168
+ boh
169
+ bot
170
+ bulh
171
+ círk
172
+ csl
173
+ č
174
+ čas
175
+ čes
176
+ dat
177
+ děj
178
+ dep
179
+ dět
180
+ dial
181
+ dór
182
+ dopr
183
+ dosl
184
+ ekon
185
+ epic
186
+ etnonym
187
+ eufem
188
+ f
189
+ fam
190
+ fem
191
+ fil
192
+ film
193
+ form
194
+ fot
195
+ fr
196
+ fut
197
+ fyz
198
+ gen
199
+ geogr
200
+ geol
201
+ geom
202
+ germ
203
+ gram
204
+ hebr
205
+ herald
206
+ hist
207
+ hl
208
+ hovor
209
+ hud
210
+ hut
211
+ chcsl
212
+ chem
213
+ ie
214
+ imp
215
+ impf
216
+ ind
217
+ indoevr
218
+ inf
219
+ instr
220
+ interj
221
+ ión
222
+ iron
223
+ it
224
+ kanad
225
+ katalán
226
+ klas
227
+ kniž
228
+ komp
229
+ konj
230
+
231
+ konkr
232
+
233
+ kuch
234
+ lat
235
+ lék
236
+ les
237
+ lid
238
+ lit
239
+ liturg
240
+ lok
241
+ log
242
+ m
243
+ mat
244
+ meteor
245
+ metr
246
+ mod
247
+ ms
248
+ mysl
249
+ n
250
+ náb
251
+ námoř
252
+ neklas
253
+ něm
254
+ nesklon
255
+ nom
256
+ ob
257
+ obch
258
+ obyč
259
+ ojed
260
+ opt
261
+ part
262
+ pas
263
+ pejor
264
+ pers
265
+ pf
266
+ pl
267
+ plpf
268
+
269
+ práv
270
+ prep
271
+ předl
272
+ přivl
273
+ r
274
+ rcsl
275
+ refl
276
+ reg
277
+ rkp
278
+ ř
279
+ řec
280
+ s
281
+ samohl
282
+ sg
283
+ sl
284
+ souhl
285
+ spec
286
+ srov
287
+ stfr
288
+ střv
289
+ stsl
290
+ subj
291
+ subst
292
+ superl
293
+ sv
294
+ sz
295
+ táz
296
+ tech
297
+ telev
298
+ teol
299
+ trans
300
+ typogr
301
+ var
302
+ vedl
303
+ verb
304
+ vl. jm
305
+ voj
306
+ vok
307
+ vůb
308
+ vulg
309
+ výtv
310
+ vztaž
311
+ zahr
312
+ zájm
313
+ zast
314
+ zejm
315
+
316
+ zeměd
317
+ zkr
318
+
319
+ mj
320
+ dl
321
+ atp
322
+ sport
323
+ Mgr
324
+ horn
325
+ MVDr
326
+ JUDr
327
+ RSDr
328
+ Bc
329
+ PhDr
330
+ ThDr
331
+ Ing
332
+ aj
333
+ apod
334
+ PharmDr
335
+ pomn
336
+ ev
337
+ slang
338
+ nprap
339
+ odp
340
+ dop
341
+ pol
342
+ st
343
+ stol
344
+ p. n. l
345
+ před n. l
346
+ n. l
347
+ př. Kr
348
+ po Kr
349
+ př. n. l
350
+ odd
351
+ RNDr
352
+ tzv
353
+ atd
354
+ tzn
355
+ resp
356
+ tj
357
+ p
358
+ br
359
+ č. j
360
+ čj
361
+ č. p
362
+ čp
363
+ a. s
364
+ s. r. o
365
+ spol. s r. o
366
+ p. o
367
+ s. p
368
+ v. o. s
369
+ k. s
370
+ o. p. s
371
+ o. s
372
+ v. r
373
+ v z
374
+ ml
375
+
376
+ kr
377
+ mld
378
+ hod
379
+ popř
380
+ ap
381
+ event
382
+ rus
383
+ slov
384
+ rum
385
+ švýc
386
+ P. T
387
+ zvl
388
+ hor
389
+ dol
390
+ S.O.S
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.de ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ #no german words end in single lower-case letters, so we throw those in too.
7
+ A
8
+ B
9
+ C
10
+ D
11
+ E
12
+ F
13
+ G
14
+ H
15
+ I
16
+ J
17
+ K
18
+ L
19
+ M
20
+ N
21
+ O
22
+ P
23
+ Q
24
+ R
25
+ S
26
+ T
27
+ U
28
+ V
29
+ W
30
+ X
31
+ Y
32
+ Z
33
+ a
34
+ b
35
+ c
36
+ d
37
+ e
38
+ f
39
+ g
40
+ h
41
+ i
42
+ j
43
+ k
44
+ l
45
+ m
46
+ n
47
+ o
48
+ p
49
+ q
50
+ r
51
+ s
52
+ t
53
+ u
54
+ v
55
+ w
56
+ x
57
+ y
58
+ z
59
+
60
+
61
+ #Roman Numerals. A dot after one of these is not a sentence break in German.
62
+ I
63
+ II
64
+ III
65
+ IV
66
+ V
67
+ VI
68
+ VII
69
+ VIII
70
+ IX
71
+ X
72
+ XI
73
+ XII
74
+ XIII
75
+ XIV
76
+ XV
77
+ XVI
78
+ XVII
79
+ XVIII
80
+ XIX
81
+ XX
82
+ i
83
+ ii
84
+ iii
85
+ iv
86
+ v
87
+ vi
88
+ vii
89
+ viii
90
+ ix
91
+ x
92
+ xi
93
+ xii
94
+ xiii
95
+ xiv
96
+ xv
97
+ xvi
98
+ xvii
99
+ xviii
100
+ xix
101
+ xx
102
+
103
+ #Titles and Honorifics
104
+ Adj
105
+ Adm
106
+ Adv
107
+ Asst
108
+ Bart
109
+ Bldg
110
+ Brig
111
+ Bros
112
+ Capt
113
+ Cmdr
114
+ Col
115
+ Comdr
116
+ Con
117
+ Corp
118
+ Cpl
119
+ DR
120
+ Dr
121
+ Ens
122
+ Gen
123
+ Gov
124
+ Hon
125
+ Hosp
126
+ Insp
127
+ Lt
128
+ MM
129
+ MR
130
+ MRS
131
+ MS
132
+ Maj
133
+ Messrs
134
+ Mlle
135
+ Mme
136
+ Mr
137
+ Mrs
138
+ Ms
139
+ Msgr
140
+ Op
141
+ Ord
142
+ Pfc
143
+ Ph
144
+ Prof
145
+ Pvt
146
+ Rep
147
+ Reps
148
+ Res
149
+ Rev
150
+ Rt
151
+ Sen
152
+ Sens
153
+ Sfc
154
+ Sgt
155
+ Sr
156
+ St
157
+ Supt
158
+ Surg
159
+
160
+ #Misc symbols
161
+ Mio
162
+ Mrd
163
+ bzw
164
+ v
165
+ vs
166
+ usw
167
+ d.h
168
+ z.B
169
+ u.a
170
+ etc
171
+ Mrd
172
+ MwSt
173
+ ggf
174
+ d.J
175
+ D.h
176
+ m.E
177
+ vgl
178
+ I.F
179
+ z.T
180
+ sogen
181
+ ff
182
+ u.E
183
+ g.U
184
+ g.g.A
185
+ c.-à-d
186
+ Buchst
187
+ u.s.w
188
+ sog
189
+ u.ä
190
+ Std
191
+ evtl
192
+ Zt
193
+ Chr
194
+ u.U
195
+ o.ä
196
+ Ltd
197
+ b.A
198
+ z.Zt
199
+ spp
200
+ sen
201
+ SA
202
+ k.o
203
+ jun
204
+ i.H.v
205
+ dgl
206
+ dergl
207
+ Co
208
+ zzt
209
+ usf
210
+ s.p.a
211
+ Dkr
212
+ Corp
213
+ bzgl
214
+ BSE
215
+
216
+ #Number indicators
217
+ # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
218
+ No
219
+ Nos
220
+ Art
221
+ Nr
222
+ pp
223
+ ca
224
+ Ca
225
+
226
+ #Ordinals are done with . in German - "1." = "1st" in English
227
+ 1
228
+ 2
229
+ 3
230
+ 4
231
+ 5
232
+ 6
233
+ 7
234
+ 8
235
+ 9
236
+ 10
237
+ 11
238
+ 12
239
+ 13
240
+ 14
241
+ 15
242
+ 16
243
+ 17
244
+ 18
245
+ 19
246
+ 20
247
+ 21
248
+ 22
249
+ 23
250
+ 24
251
+ 25
252
+ 26
253
+ 27
254
+ 28
255
+ 29
256
+ 30
257
+ 31
258
+ 32
259
+ 33
260
+ 34
261
+ 35
262
+ 36
263
+ 37
264
+ 38
265
+ 39
266
+ 40
267
+ 41
268
+ 42
269
+ 43
270
+ 44
271
+ 45
272
+ 46
273
+ 47
274
+ 48
275
+ 49
276
+ 50
277
+ 51
278
+ 52
279
+ 53
280
+ 54
281
+ 55
282
+ 56
283
+ 57
284
+ 58
285
+ 59
286
+ 60
287
+ 61
288
+ 62
289
+ 63
290
+ 64
291
+ 65
292
+ 66
293
+ 67
294
+ 68
295
+ 69
296
+ 70
297
+ 71
298
+ 72
299
+ 73
300
+ 74
301
+ 75
302
+ 76
303
+ 77
304
+ 78
305
+ 79
306
+ 80
307
+ 81
308
+ 82
309
+ 83
310
+ 84
311
+ 85
312
+ 86
313
+ 87
314
+ 88
315
+ 89
316
+ 90
317
+ 91
318
+ 92
319
+ 93
320
+ 94
321
+ 95
322
+ 96
323
+ 97
324
+ 98
325
+ 99
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.el ADDED
@@ -0,0 +1,1568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sigle letters in upper-case are usually abbreviations of names
2
+ Α
3
+ Β
4
+ Γ
5
+ Δ
6
+ Ε
7
+ Ζ
8
+ Η
9
+ Θ
10
+ Ι
11
+ Κ
12
+ Λ
13
+ Μ
14
+ Ν
15
+ Ξ
16
+ Ο
17
+ Π
18
+ Ρ
19
+ Σ
20
+ Τ
21
+ Υ
22
+ Φ
23
+ Χ
24
+ Ψ
25
+ Ω
26
+
27
+ # Includes abbreviations for the Greek language compiled from various sources (Greek grammar books, Greek language related web content).
28
+ Άθαν
29
+ Έγχρ
30
+ Έκθ
31
+ Έσδ
32
+ Έφ
33
+ Όμ
34
+ Α΄Έσδρ
35
+ Α΄Έσδ
36
+ Α΄Βασ
37
+ Α΄Θεσ
38
+ Α΄Ιω
39
+ Α΄Κορινθ
40
+ Α΄Κορ
41
+ Α΄Μακκ
42
+ Α΄Μακ
43
+ Α΄Πέτρ
44
+ Α΄Πέτ
45
+ Α΄Παραλ
46
+ Α΄Πε
47
+ Α΄Σαμ
48
+ Α΄Τιμ
49
+ Α΄Χρον
50
+ Α΄Χρ
51
+ Α.Β.Α
52
+ Α.Β
53
+ Α.Ε
54
+ Α.Κ.Τ.Ο
55
+ Αέθλ
56
+ Αέτ
57
+ Αίλ.Δ
58
+ Αίλ.Τακτ
59
+ Αίσ
60
+ Αββακ
61
+ Αβυδ
62
+ Αβ
63
+ Αγάκλ
64
+ Αγάπ
65
+ Αγάπ.Αμαρτ.Σ
66
+ Αγάπ.Γεωπ
67
+ Αγαθάγγ
68
+ Αγαθήμ
69
+ Αγαθιν
70
+ Αγαθοκλ
71
+ Αγαθρχ
72
+ Αγαθ
73
+ Αγαθ.Ιστ
74
+ Αγαλλ
75
+ Αγαπητ
76
+ Αγγ
77
+ Αγησ
78
+ Αγλ
79
+ Αγορ.Κ
80
+ Αγρο.Κωδ
81
+ Αγρ.Εξ
82
+ Αγρ.Κ
83
+ Αγ.Γρ
84
+ Αδριαν
85
+ Αδρ
86
+ Αετ
87
+ Αθάν
88
+ Αθήν
89
+ Αθήν.Επιγρ
90
+ Αθήν.Επιτ
91
+ Αθήν.Ιατρ
92
+ Αθήν.Μηχ
93
+ Αθανάσ
94
+ Αθαν
95
+ Αθηνί
96
+ Αθηναγ
97
+ Αθηνόδ
98
+ Αθ
99
+ Αθ.Αρχ
100
+ Αιλ
101
+ Αιλ.Επιστ
102
+ Αιλ.ΖΙ
103
+ Αιλ.ΠΙ
104
+ Αιλ.απ
105
+ Αιμιλ
106
+ Αιν.Γαζ
107
+ Αιν.Τακτ
108
+ Αισχίν
109
+ Αισχίν.Επιστ
110
+ Αισχ
111
+ Αισχ.Αγαμ
112
+ Αισχ.Αγ
113
+ Αισχ.Αλ
114
+ Αισχ.Ελεγ
115
+ Αισχ.Επτ.Θ
116
+ Αισχ.Ευμ
117
+ Αισχ.Ικέτ
118
+ Αισχ.Ικ
119
+ Αισχ.Περσ
120
+ Αισχ.Προμ.Δεσμ
121
+ Αισχ.Πρ
122
+ Αισχ.Χοηφ
123
+ Αισχ.Χο
124
+ Αισχ.απ
125
+ ΑιτΕ
126
+ Αιτ
127
+ Αλκ
128
+ Αλχιας
129
+ Αμ.Π.Ο
130
+ Αμβ
131
+ Αμμών
132
+ Αμ.
133
+ Αν.Πειθ.Συμβ.Δικ
134
+ Ανακρ
135
+ Ανακ
136
+ Αναμν.Τόμ
137
+ Αναπλ
138
+ Ανδ
139
+ Ανθλγος
140
+ Ανθστης
141
+ Αντισθ
142
+ Ανχης
143
+ Αν
144
+ Αποκ
145
+ Απρ
146
+ Απόδ
147
+ Απόφ
148
+ Απόφ.Νομ
149
+ Απ
150
+ Απ.Δαπ
151
+ Απ.Διατ
152
+ Απ.Επιστ
153
+ Αριθ
154
+ Αριστοτ
155
+ Αριστοφ
156
+ Αριστοφ.Όρν
157
+ Αριστοφ.Αχ
158
+ Αριστοφ.Βάτρ
159
+ Αριστοφ.Ειρ
160
+ Αριστοφ.Εκκλ
161
+ Αριστοφ.Θεσμ
162
+ Αριστοφ.Ιππ
163
+ Αριστοφ.Λυσ
164
+ Αριστοφ.Νεφ
165
+ Αριστοφ.Πλ
166
+ Αριστοφ.Σφ
167
+ Αριστ
168
+ Αριστ.Αθ.Πολ
169
+ Αριστ.Αισθ
170
+ Αριστ.Αν.Πρ
171
+ Αριστ.Ζ.Ι
172
+ Αριστ.Ηθ.Ευδ
173
+ Αριστ.Ηθ.Νικ
174
+ Αριστ.Κατ
175
+ Αριστ.Μετ
176
+ Αριστ.Πολ
177
+ Αριστ.Φυσιογν
178
+ Αριστ.Φυσ
179
+ Αριστ.Ψυχ
180
+ Αριστ.Ρητ
181
+ Αρμεν
182
+ Αρμ
183
+ Αρχ.Εκ.Καν.Δ
184
+ Αρχ.Ευβ.Μελ
185
+ Αρχ.Ιδ.Δ
186
+ Αρχ.Νομ
187
+ Αρχ.Ν
188
+ Αρχ.Π.Ε
189
+ Αρ
190
+ Αρ.Φορ.Μητρ
191
+ Ασμ
192
+ Ασμ.ασμ
193
+ Αστ.Δ
194
+ Αστ.Χρον
195
+ Ασ
196
+ Ατομ.Γνωμ
197
+ Αυγ
198
+ Αφρ
199
+ Αχ.Νομ
200
+ Α
201
+ Α.Εγχ.Π
202
+ Α.Κ.΄Υδρας
203
+ Β΄Έσδρ
204
+ Β΄Έσδ
205
+ Β΄Βασ
206
+ Β΄Θεσ
207
+ Β΄Ιω
208
+ Β΄Κορινθ
209
+ Β΄Κορ
210
+ Β΄Μακκ
211
+ Β΄Μακ
212
+ Β΄Πέτρ
213
+ Β΄Πέτ
214
+ Β΄Πέ
215
+ Β΄Παραλ
216
+ Β΄Σαμ
217
+ Β΄Τιμ
218
+ Β΄Χρον
219
+ Β΄Χρ
220
+ Β.Ι.Π.Ε
221
+ Β.Κ.Τ
222
+ Β.Κ.Ψ.Β
223
+ Β.Μ
224
+ Β.Ο.Α.Κ
225
+ Β.Ο.Α
226
+ Β.Ο.Δ
227
+ Βίβλ
228
+ Βαρ
229
+ ΒεΘ
230
+ Βι.Περ
231
+ Βιπερ
232
+ Βιργ
233
+ Βλγ
234
+ Βούλ
235
+ Βρ
236
+ Γ΄Βασ
237
+ Γ΄Μακκ
238
+ ΓΕΝμλ
239
+ Γέν
240
+ Γαλ
241
+ Γεν
242
+ Γλ
243
+ Γν.Ν.Σ.Κρ
244
+ Γνωμ
245
+ Γν
246
+ Γράμμ
247
+ Γρηγ.Ναζ
248
+ Γρηγ.Νύσ
249
+ Γ Νοσ
250
+ Γ' Ογκολ
251
+ Γ.Ν
252
+ Δ΄Βασ
253
+ Δ.Β
254
+ Δ.Δίκη
255
+ Δ.Δίκ
256
+ Δ.Ε.Σ
257
+ Δ.Ε.Φ.Α
258
+ Δ.Ε.Φ
259
+ Δ.Εργ.Ν
260
+ Δαμ
261
+ Δαμ.μνημ.έργ
262
+ Δαν
263
+ Δασ.Κ
264
+ Δεκ
265
+ Δελτ.Δικ.Ε.Τ.Ε
266
+ Δελτ.Νομ
267
+ Δελτ.Συνδ.Α.Ε
268
+ Δερμ
269
+ Δευτ
270
+ Δεύτ
271
+ Δημοσθ
272
+ Δημόκρ
273
+ Δι.Δικ
274
+ Διάτ
275
+ Διαιτ.Απ
276
+ Διαιτ
277
+ Διαρκ.Στρατ
278
+ Δικ
279
+ Διοίκ.Πρωτ
280
+ ΔιοικΔνη
281
+ Διοικ.Εφ
282
+ Διον.Αρ
283
+ Διόρθ.Λαθ
284
+ Δ.κ.Π
285
+ Δνη
286
+ Δν
287
+ Δογμ.Όρος
288
+ Δρ
289
+ Δ.τ.Α
290
+ Δτ
291
+ ΔωδΝομ
292
+ Δ.Περ
293
+ Δ.Στρ
294
+ ΕΔΠολ
295
+ ΕΕυρΚ
296
+ ΕΙΣ
297
+ ΕΝαυτΔ
298
+ ΕΣΑμΕΑ
299
+ ΕΣΘ
300
+ ΕΣυγκΔ
301
+ ΕΤρΑξΧρΔ
302
+ Ε.Φ.Ε.Τ
303
+ Ε.Φ.Ι
304
+ Ε.Φ.Ο.Επ.Α
305
+ Εβδ
306
+ Εβρ
307
+ Εγκύκλ.Επιστ
308
+ Εγκ
309
+ Εε.Αιγ
310
+ Εθν.Κ.Τ
311
+ Εθν
312
+ Ειδ.Δικ.Αγ.Κακ
313
+ Εικ
314
+ Ειρ.Αθ
315
+ Ειρην.Αθ
316
+ Ειρην
317
+ Έλεγχ
318
+ Ειρ
319
+ Εισ.Α.Π
320
+ Εισ.Ε
321
+ Εισ.Ν.Α.Κ
322
+ Εισ.Ν.Κ.Πολ.Δ
323
+ Εισ.Πρωτ
324
+ Εισηγ.Έκθ
325
+ Εισ
326
+ Εκκλ
327
+ Εκκ
328
+ Εκ
329
+ Ελλ.Δνη
330
+ Εν.Ε
331
+ Εξ
332
+ Επ.Αν
333
+ Επ.Εργ.Δ
334
+ Επ.Εφ
335
+ Επ.Κυπ.Δ
336
+ Επ.Μεσ.Αρχ
337
+ Επ.Νομ
338
+ Επίκτ
339
+ Επίκ
340
+ Επι.Δ.Ε
341
+ Επιθ.Ναυτ.Δικ
342
+ Επικ
343
+ Επισκ.Ε.Δ
344
+ Επισκ.Εμπ.Δικ
345
+ Επιστ.Επετ.Αρμ
346
+ Επιστ.Επετ
347
+ Επιστ.Ιερ
348
+ Επιτρ.Προστ.Συνδ.Στελ
349
+ Επιφάν
350
+ Επτ.Εφ
351
+ Επ.Ιρ
352
+ Επ.Ι
353
+ Εργ.Ασφ.Νομ
354
+ Ερμ.Α.Κ
355
+ Ερμη.Σ
356
+ Εσθ
357
+ Εσπερ
358
+ Ετρ.Δ
359
+ Ευκλ
360
+ Ευρ.Δ.Δ.Α
361
+ Ευρ.Σ.Δ.Α
362
+ Ευρ.ΣτΕ
363
+ Ευρατόμ
364
+ Ευρ.Άλκ
365
+ Ευρ.Ανδρομ
366
+ Ευρ.Βάκχ
367
+ Ευρ.Εκ
368
+ Ευρ.Ελ
369
+ Ευρ.Ηλ
370
+ Ευρ.Ηρακ
371
+ Ευρ.Ηρ
372
+ Ευρ.Ηρ.Μαιν
373
+ Ευρ.Ικέτ
374
+ Ευρ.Ιππόλ
375
+ Ευρ.Ιφ.Α
376
+ Ευρ.Ιφ.Τ
377
+ Ευρ.Ι.Τ
378
+ Ευρ.Κύκλ
379
+ Ευρ.Μήδ
380
+ Ευρ.Ορ
381
+ Ευρ.Ρήσ
382
+ Ευρ.Τρωάδ
383
+ Ευρ.Φοίν
384
+ Εφ.Αθ
385
+ Εφ.Εν
386
+ Εφ.Επ
387
+ Εφ.Θρ
388
+ Εφ.Θ
389
+ Εφ.Ι
390
+ Εφ.Κερ
391
+ Εφ.Κρ
392
+ Εφ.Λ
393
+ Εφ.Ν
394
+ Εφ.Πατ
395
+ Εφ.Πειρ
396
+ Εφαρμ.Δ.Δ
397
+ Εφαρμ
398
+ Εφεσ
399
+ Εφημ
400
+ Εφ
401
+ Ζαχ
402
+ Ζιγ
403
+ Ζυ
404
+ Ζχ
405
+ ΗΕ.Δ
406
+ Ημερ
407
+ Ηράκλ
408
+ Ηροδ
409
+ Ησίοδ
410
+ Ησ
411
+ Η.Ε.Γ
412
+ ΘΗΣ
413
+ ΘΡ
414
+ Θαλ
415
+ Θεοδ
416
+ Θεοφ
417
+ Θεσ
418
+ Θεόδ.Μοψ
419
+ Θεόκρ
420
+ Θεόφιλ
421
+ Θουκ
422
+ Θρ
423
+ Θρ.Ε
424
+ Θρ.Ιερ
425
+ Θρ.Ιρ
426
+ Ιακ
427
+ Ιαν
428
+ Ιβ
429
+ Ιδθ
430
+ Ιδ
431
+ Ιεζ
432
+ Ιερ
433
+ Ιζ
434
+ Ιησ
435
+ Ιησ.Ν
436
+ Ικ
437
+ Ιλ
438
+ Ιν
439
+ Ιουδ
440
+ Ιουστ
441
+ Ιούδα
442
+ Ιούλ
443
+ Ιούν
444
+ Ιπποκρ
445
+ Ιππόλ
446
+ Ιρ
447
+ Ισίδ.Πηλ
448
+ Ισοκρ
449
+ Ισ.Ν
450
+ Ιωβ
451
+ Ιωλ
452
+ Ιων
453
+ Ιω
454
+ ΚΟΣ
455
+ ΚΟ.ΜΕ.ΚΟΝ
456
+ ΚΠοινΔ
457
+ ΚΠολΔ
458
+ ΚαΒ
459
+ Καλ
460
+ Καλ.Τέχν
461
+ ΚανΒ
462
+ Καν.Διαδ
463
+ Κατάργ
464
+ Κλ
465
+ ΚοινΔ
466
+ Κολσ
467
+ Κολ
468
+ Κον
469
+ Κορ
470
+ Κος
471
+ ΚριτΕπιθ
472
+ ΚριτΕ
473
+ Κριτ
474
+ Κρ
475
+ ΚτΒ
476
+ ΚτΕ
477
+ ΚτΠ
478
+ Κυβ
479
+ Κυπρ
480
+ Κύριλ.Αλεξ
481
+ Κύριλ.Ιερ
482
+ Λεβ
483
+ Λεξ.Σουίδα
484
+ Λευϊτ
485
+ Λευ
486
+ Λκ
487
+ Λογ
488
+ ΛουκΑμ
489
+ Λουκιαν
490
+ Λουκ.Έρωτ
491
+ Λουκ.Ενάλ.Διάλ
492
+ Λουκ.Ερμ
493
+ Λουκ.Εταιρ.Διάλ
494
+ Λουκ.Ε.Δ
495
+ Λουκ.Θε.Δ
496
+ Λουκ.Ικ.
497
+ Λουκ.Ιππ
498
+ Λουκ.Λεξιφ
499
+ Λουκ.Μεν
500
+ Λουκ.Μισθ.Συν
501
+ Λουκ.Ορχ
502
+ Λουκ.Περ
503
+ Λουκ.Συρ
504
+ Λουκ.Τοξ
505
+ Λουκ.Τυρ
506
+ Λουκ.Φιλοψ
507
+ Λουκ.Φιλ
508
+ Λουκ.Χάρ
509
+ Λουκ.
510
+ Λουκ.Αλ
511
+ Λοχ
512
+ Λυδ
513
+ Λυκ
514
+ Λυσ
515
+ Λωζ
516
+ Λ1
517
+ Λ2
518
+ ΜΟΕφ
519
+ Μάρκ
520
+ Μέν
521
+ Μαλ
522
+ Ματθ
523
+ Μα
524
+ Μιχ
525
+ Μκ
526
+ Μλ
527
+ Μμ
528
+ Μον.Δ.Π
529
+ Μον.Πρωτ
530
+ Μον
531
+ Μρ
532
+ Μτ
533
+ Μχ
534
+ Μ.Βασ
535
+ Μ.Πλ
536
+ ΝΑ
537
+ Ναυτ.Χρον
538
+ Να
539
+ Νδικ
540
+ Νεεμ
541
+ Νε
542
+ Νικ
543
+ ΝκΦ
544
+ Νμ
545
+ ΝοΒ
546
+ Νομ.Δελτ.Τρ.Ελ
547
+ Νομ.Δελτ
548
+ Νομ.Σ.Κ
549
+ Νομ.Χρ
550
+ Νομ
551
+ Νομ.Διεύθ
552
+ Νοσ
553
+ Ντ
554
+ Νόσων
555
+ Ν1
556
+ Ν2
557
+ Ν3
558
+ Ν4
559
+ Νtot
560
+ Ξενοφ
561
+ Ξεν
562
+ Ξεν.Ανάβ
563
+ Ξεν.Απολ
564
+ Ξεν.Απομν
565
+ Ξεν.Απομ
566
+ Ξεν.Ελλ
567
+ Ξεν.Ιέρ
568
+ Ξεν.Ιππαρχ
569
+ Ξεν.Ιππ
570
+ Ξεν.Κυρ.Αν
571
+ Ξεν.Κύρ.Παιδ
572
+ Ξεν.Κ.Π
573
+ Ξεν.Λακ.Πολ
574
+ Ξεν.Οικ
575
+ Ξεν.Προσ
576
+ Ξεν.Συμπόσ
577
+ Ξεν.Συμπ
578
+ Ο΄
579
+ Οβδ
580
+ Οβ
581
+ ΟικΕ
582
+ Οικ
583
+ Οικ.Πατρ
584
+ Οικ.Σύν.Βατ
585
+ Ολομ
586
+ Ολ
587
+ Ολ.Α.Π
588
+ Ομ.Ιλ
589
+ Ομ.Οδ
590
+ ΟπΤοιχ
591
+ Οράτ
592
+ Ορθ
593
+ ΠΡΟ.ΠΟ
594
+ Πίνδ
595
+ Πίνδ.Ι
596
+ Πίνδ.Νεμ
597
+ Πίνδ.Ν
598
+ Πίνδ.Ολ
599
+ Πίνδ.Παθ
600
+ Πίνδ.Πυθ
601
+ Πίνδ.Π
602
+ ΠαγΝμλγ
603
+ Παν
604
+ Παρμ
605
+ Παροιμ
606
+ Παρ
607
+ Παυσ
608
+ Πειθ.Συμβ
609
+ ΠειρΝ
610
+ Πελ
611
+ ΠεντΣτρ
612
+ Πεντ
613
+ Πεντ.Εφ
614
+ ΠερΔικ
615
+ Περ.Γεν.Νοσ
616
+ Πετ
617
+ Πλάτ
618
+ Πλάτ.Αλκ
619
+ Πλάτ.Αντ
620
+ Πλάτ.Αξίοχ
621
+ Πλάτ.Απόλ
622
+ Πλάτ.Γοργ
623
+ Πλάτ.Ευθ
624
+ Πλάτ.Θεαίτ
625
+ Πλάτ.Κρατ
626
+ Πλάτ.Κριτ
627
+ Πλάτ.Λύσ
628
+ Πλάτ.Μεν
629
+ Πλάτ.Νόμ
630
+ Πλάτ.Πολιτ
631
+ Πλάτ.Πολ
632
+ Πλάτ.Πρωτ
633
+ Πλάτ.Σοφ.
634
+ Πλάτ.Συμπ
635
+ Πλάτ.Τίμ
636
+ Πλάτ.Φαίδρ
637
+ Πλάτ.Φιλ
638
+ Πλημ
639
+ Πλούτ
640
+ Πλούτ.Άρατ
641
+ Πλούτ.Αιμ
642
+ Πλούτ.Αλέξ
643
+ Πλούτ.Αλκ
644
+ Πλούτ.Αντ
645
+ Πλούτ.Αρτ
646
+ Πλούτ.Ηθ
647
+ Πλούτ.Θεμ
648
+ Πλούτ.Κάμ
649
+ Πλούτ.Καίσ
650
+ Πλούτ.Κικ
651
+ Πλούτ.Κράσ
652
+ Πλούτ.Κ
653
+ Πλούτ.Λυκ
654
+ Πλούτ.Μάρκ
655
+ Πλούτ.Μάρ
656
+ Πλούτ.Περ
657
+ Πλούτ.Ρωμ
658
+ Πλούτ.Σύλλ
659
+ Πλούτ.Φλαμ
660
+ Πλ
661
+ Ποιν.Δικ
662
+ Ποιν.Δ
663
+ Ποιν.Ν
664
+ Ποιν.Χρον
665
+ Ποιν.Χρ
666
+ Πολ.Δ
667
+ Πολ.Πρωτ
668
+ Πολ
669
+ Πολ.Μηχ
670
+ Πολ.Μ
671
+ Πρακτ.Αναθ
672
+ Πρακτ.Ολ
673
+ Πραξ
674
+ Πρμ
675
+ Πρξ
676
+ Πρωτ
677
+ Πρ
678
+ Πρ.Αν
679
+ Πρ.Λογ
680
+ Πταισμ
681
+ Πυρ.Καλ
682
+ Πόλη
683
+ Π.Δ
684
+ Π.Δ.Άσμ
685
+ ΡΜ.Ε
686
+ Ρθ
687
+ Ρμ
688
+ Ρωμ
689
+ ΣΠλημ
690
+ Σαπφ
691
+ Σειρ
692
+ Σολ
693
+ Σοφ
694
+ Σοφ.Αντιγ
695
+ Σοφ.Αντ
696
+ Σοφ.Αποσ
697
+ Σοφ.Απ
698
+ Σοφ.Ηλέκ
699
+ Σοφ.Ηλ
700
+ Σοφ.Οιδ.Κολ
701
+ Σοφ.Οιδ.Τύρ
702
+ Σοφ.Ο.Τ
703
+ Σοφ.Σειρ
704
+ Σοφ.Σολ
705
+ Σοφ.Τραχ
706
+ Σοφ.Φιλοκτ
707
+ Σρ
708
+ Σ.τ.Ε
709
+ Σ.τ.Π
710
+ Στρ.Π.Κ
711
+ Στ.Ευρ
712
+ Συζήτ
713
+ Συλλ.Νομολ
714
+ Συλ.Νομ
715
+ ΣυμβΕπιθ
716
+ Συμπ.Ν
717
+ Συνθ.Αμ
718
+ Συνθ.Ε.Ε
719
+ Συνθ.Ε.Κ
720
+ Συνθ.Ν
721
+ Σφν
722
+ Σφ
723
+ Σφ.Σλ
724
+ Σχ.Πολ.Δ
725
+ Σχ.Συντ.Ε
726
+ Σωσ
727
+ Σύντ
728
+ Σ.Πληρ
729
+ ΤΘ
730
+ ΤΣ.Δ
731
+ Τίτ
732
+ Τβ
733
+ Τελ.Ενημ
734
+ Τελ.Κ
735
+ Τερτυλ
736
+ Τιμ
737
+ Τοπ.Α
738
+ Τρ.Ο
739
+ Τριμ
740
+ Τριμ.Πλ
741
+ Τρ.Πλημ
742
+ Τρ.Π.Δ
743
+ Τ.τ.Ε
744
+ Ττ
745
+ Τωβ
746
+ Υγ
747
+ Υπερ
748
+ Υπ
749
+ Υ.Γ
750
+ Φιλήμ
751
+ Φιλιπ
752
+ Φιλ
753
+ Φλμ
754
+ Φλ
755
+ Φορ.Β
756
+ Φορ.Δ.Ε
757
+ Φορ.Δνη
758
+ Φορ.Δ
759
+ Φορ.Επ
760
+ Φώτ
761
+ Χρ.Ι.Δ
762
+ Χρ.Ιδ.Δ
763
+ Χρ.Ο
764
+ Χρυσ
765
+ Ψήφ
766
+ Ψαλμ
767
+ Ψαλ
768
+ Ψλ
769
+ Ωριγ
770
+ Ωσ
771
+ Ω.Ρ.Λ
772
+ άγν
773
+ άγν.ετυμολ
774
+ άγ
775
+ άκλ
776
+ άνθρ
777
+ άπ
778
+ άρθρ
779
+ άρν
780
+ άρ
781
+ άτ
782
+ άψ
783
+ ά
784
+ έκδ
785
+ έκφρ
786
+ έμψ
787
+ ένθ.αν
788
+ έτ
789
+ έ.α
790
+ ίδ
791
+ αβεστ
792
+ αβησσ
793
+ αγγλ
794
+ αγγ
795
+ αδημ
796
+ αεροναυτ
797
+ αερον
798
+ αεροπ
799
+ αθλητ
800
+ αθλ
801
+ αθροιστ
802
+ αιγυπτ
803
+ αιγ
804
+ αιτιολ
805
+ αιτ
806
+ αι
807
+ ακαδ
808
+ ακκαδ
809
+ αλβ
810
+ αλλ
811
+ αλφαβητ
812
+ αμα
813
+ αμερικ
814
+ αμερ
815
+ αμετάβ
816
+ αμτβ
817
+ αμφιβ
818
+ αμφισβ
819
+ αμφ
820
+ αμ
821
+ ανάλ
822
+ ανάπτ
823
+ ανάτ
824
+ αναβ
825
+ αναδαν
826
+ αναδιπλασ
827
+ αναδιπλ
828
+ αναδρ
829
+ αναλ
830
+ αναν
831
+ ανασυλλ
832
+ ανατολ
833
+ ανατομ
834
+ ανατυπ
835
+ ανατ
836
+ αναφορ
837
+ αναφ
838
+ ανα.ε
839
+ ανδρων
840
+ ανθρωπολ
841
+ ανθρωπ
842
+ ανθ
843
+ ανομ
844
+ αντίτ
845
+ αντδ
846
+ αντιγρ
847
+ αντιθ
848
+ αντικ
849
+ αντιμετάθ
850
+ αντων
851
+ αντ
852
+ ανωτ
853
+ ανόργ
854
+ ανών
855
+ αορ
856
+ απαρέμφ
857
+ απαρφ
858
+ απαρχ
859
+ απαρ
860
+ απλολ
861
+ απλοπ
862
+ αποβ
863
+ αποηχηροπ
864
+ αποθ
865
+ αποκρυφ
866
+ αποφ
867
+ απρμφ
868
+ απρφ
869
+ απρόσ
870
+ απόδ
871
+ απόλ
872
+ απόσπ
873
+ απόφ
874
+ αραβοτουρκ
875
+ αραβ
876
+ αραμ
877
+ αρβαν
878
+ αργκ
879
+ αριθμτ
880
+ αριθμ
881
+ αριθ
882
+ αρκτικόλ
883
+ αρκ
884
+ αρμεν
885
+ αρμ
886
+ αρνητ
887
+ αρσ
888
+ αρχαιολ
889
+ αρχιτεκτ
890
+ αρχιτ
891
+ αρχκ
892
+ αρχ
893
+ αρωμουν
894
+ αρωμ
895
+ αρ
896
+ αρ.μετρ
897
+ αρ.φ
898
+ ασσυρ
899
+ αστρολ
900
+ αστροναυτ
901
+ αστρον
902
+ αττ
903
+ αυστραλ
904
+ αυτοπ
905
+ αυτ
906
+ αφγαν
907
+ αφηρ
908
+ αφομ
909
+ αφρικ
910
+ αχώρ
911
+ αόρ
912
+ α.α
913
+ α/α
914
+ α0
915
+ βαθμ
916
+ βαθ
917
+ βαπτ
918
+ βασκ
919
+ βεβαιωτ
920
+ βεβ
921
+ βεδ
922
+ βενετ
923
+ βεν
924
+ βερβερ
925
+ βιβλγρ
926
+ βιολ
927
+ βιομ
928
+ βιοχημ
929
+ βιοχ
930
+ βλάχ
931
+ βλ
932
+ βλ.λ
933
+ βοταν
934
+ βοτ
935
+ βουλγαρ
936
+ βουλγ
937
+ βούλ
938
+ βραζιλ
939
+ βρετον
940
+ βόρ
941
+ γαλλ
942
+ γενικότ
943
+ γενοβ
944
+ γεν
945
+ γερμαν
946
+ γερμ
947
+ γεωγρ
948
+ γεωλ
949
+ γεωμετρ
950
+ γεωμ
951
+ γεωπ
952
+ γεωργ
953
+ γλυπτ
954
+ γλωσσολ
955
+ γλωσσ
956
+ γλ
957
+ γνμδ
958
+ γνμ
959
+ γνωμ
960
+ γοτθ
961
+ γραμμ
962
+ γραμ
963
+ γρμ
964
+ γρ
965
+ γυμν
966
+ δίδες
967
+ δίκ
968
+ δίφθ
969
+ δαν
970
+ δεικτ
971
+ δεκατ
972
+ δηλ
973
+ δημογρ
974
+ δημοτ
975
+ δημώδ
976
+ δημ
977
+ διάγρ
978
+ διάκρ
979
+ διάλεξ
980
+ διάλ
981
+ διάσπ
982
+ διαλεκτ
983
+ διατρ
984
+ διαφ
985
+ διαχ
986
+ διδα
987
+ διεθν
988
+ διεθ
989
+ δικον
990
+ διστ
991
+ δισύλλ
992
+ δισ
993
+ διφθογγοπ
994
+ δογμ
995
+ δολ
996
+ δοτ
997
+ δρμ
998
+ δρχ
999
+ δρ(α)
1000
+ δωρ
1001
+ δ
1002
+ εβρ
1003
+ εγκλπ
1004
+ εδ
1005
+ εθνολ
1006
+ εθν
1007
+ ειδικότ
1008
+ ειδ
1009
+ ειδ.β
1010
+ εικ
1011
+ ειρ
1012
+ εισ
1013
+ εκατοστμ
1014
+ εκατοστ
1015
+ εκατστ.2
1016
+ εκατστ.3
1017
+ εκατ
1018
+ εκδ
1019
+ εκκλησ
1020
+ εκκλ
1021
+ εκ
1022
+ ελλην
1023
+ ελλ
1024
+ ελνστ
1025
+ ελπ
1026
+ εμβ
1027
+ εμφ
1028
+ εναλλ
1029
+ ενδ
1030
+ ενεργ
1031
+ ενεστ
1032
+ ενικ
1033
+ ενν
1034
+ εν
1035
+ εξέλ
1036
+ εξακολ
1037
+ εξομάλ
1038
+ εξ
1039
+ εο
1040
+ επέκτ
1041
+ επίδρ
1042
+ επίθ
1043
+ επίρρ
1044
+ επίσ
1045
+ επαγγελμ
1046
+ επανάλ
1047
+ επανέκδ
1048
+ επιθ
1049
+ επικ
1050
+ επιμ
1051
+ επιρρ
1052
+ επιστ
1053
+ επιτατ
1054
+ επιφ
1055
+ επών
1056
+ επ
1057
+ εργ
1058
+ ερμ
1059
+ ερρινοπ
1060
+ ερωτ
1061
+ ετρουσκ
1062
+ ετυμ
1063
+ ετ
1064
+ ευφ
1065
+ ευχετ
1066
+ εφ
1067
+ εύχρ
1068
+ ε.α
1069
+ ε/υ
1070
+ ε0
1071
+ ζωγρ
1072
+ ζωολ
1073
+ ηθικ
1074
+ ηθ
1075
+ ηλεκτρολ
1076
+ ηλεκτρον
1077
+ ηλεκτρ
1078
+ ημίτ
1079
+ ημίφ
1080
+ ημιφ
1081
+ ηχηροπ
1082
+ ηχηρ
1083
+ ηχομιμ
1084
+ ηχ
1085
+ η
1086
+ θέατρ
1087
+ θεολ
1088
+ θετ
1089
+ θηλ
1090
+ θρακ
1091
+ θρησκειολ
1092
+ θρησκ
1093
+ θ
1094
+ ιαπων
1095
+ ιατρ
1096
+ ιδιωμ
1097
+ ιδ
1098
+ ινδ
1099
+ ιραν
1100
+ ισπαν
1101
+ ιστορ
1102
+ ιστ
1103
+ ισχυροπ
1104
+ ιταλ
1105
+ ιχθυολ
1106
+ ιων
1107
+ κάτ
1108
+ καθ
1109
+ κακοσ
1110
+ καν
1111
+ καρ
1112
+ κατάλ
1113
+ κατατ
1114
+ κατωτ
1115
+ κατ
1116
+ κα
1117
+ κελτ
1118
+ κεφ
1119
+ κινεζ
1120
+ κινημ
1121
+ κλητ
1122
+ κλιτ
1123
+ κλπ
1124
+ κλ
1125
+ κν
1126
+ κοινωνιολ
1127
+ κοινων
1128
+ κοπτ
1129
+ κουτσοβλαχ
1130
+ κουτσοβλ
1131
+ κπ
1132
+ κρ.γν
1133
+ κτγ
1134
+ κτην
1135
+ κτητ
1136
+ κτλ
1137
+ κτ
1138
+ κυριολ
1139
+ κυρ
1140
+ κύρ
1141
+ κ
1142
+ κ.ά
1143
+ κ.ά.π
1144
+ κ.α
1145
+ κ.εξ
1146
+ κ.επ
1147
+ κ.ε
1148
+ κ.λπ
1149
+ κ.λ.π
1150
+ κ.ού.κ
1151
+ κ.ο.κ
1152
+ κ.τ.λ
1153
+ κ.τ.τ
1154
+ κ.τ.ό
1155
+ λέξ
1156
+ λαογρ
1157
+ λαπ
1158
+ λατιν
1159
+ λατ
1160
+ λαϊκότρ
1161
+ λαϊκ
1162
+ λετ
1163
+ λιθ
1164
+ λογιστ
1165
+ λογοτ
1166
+ λογ
1167
+ λουβ
1168
+ λυδ
1169
+ λόγ
1170
+ λ
1171
+ λ.χ
1172
+ μέλλ
1173
+ μέσ
1174
+ μαθημ
1175
+ μαθ
1176
+ μαιευτ
1177
+ μαλαισ
1178
+ μαλτ
1179
+ μαμμων
1180
+ μεγεθ
1181
+ μεε
1182
+ μειωτ
1183
+ μελ
1184
+ μεξ
1185
+ μεσν
1186
+ μεσογ
1187
+ μεσοπαθ
1188
+ μεσοφ
1189
+ μετάθ
1190
+ μεταβτ
1191
+ μεταβ
1192
+ μετακ
1193
+ μεταπλ
1194
+ μεταπτωτ
1195
+ μεταρ
1196
+ μεταφορ
1197
+ μετβ
1198
+ μετεπιθ
1199
+ μετεπιρρ
1200
+ μετεωρολ
1201
+ μετεωρ
1202
+ μετον
1203
+ μετουσ
1204
+ μετοχ
1205
+ μετρ
1206
+ μετ
1207
+ μητρων
1208
+ μηχανολ
1209
+ μηχ
1210
+ μικροβιολ
1211
+ μογγολ
1212
+ μορφολ
1213
+ μουσ
1214
+ μπενελούξ
1215
+ μσνλατ
1216
+ μσν
1217
+ μτβ
1218
+ μτγν
1219
+ μτγ
1220
+ μτφρδ
1221
+ μτφρ
1222
+ μτφ
1223
+ μτχ
1224
+ μυθ
1225
+ μυκην
1226
+ μυκ
1227
+ μφ
1228
+ μ
1229
+ μ.ε
1230
+ μ.μ
1231
+ μ.π.ε
1232
+ μ.π.π
1233
+ μ0
1234
+ ναυτ
1235
+ νεοελλ
1236
+ νεολατιν
1237
+ νεολατ
1238
+ νεολ
1239
+ νεότ
1240
+ νλατ
1241
+ νομ
1242
+ νορβ
1243
+ νοσ
1244
+ νότ
1245
+ ν
1246
+ ξ.λ
1247
+ οικοδ
1248
+ οικολ
1249
+ οικον
1250
+ οικ
1251
+ ολλανδ
1252
+ ολλ
1253
+ ομηρ
1254
+ ομόρρ
1255
+ ονομ
1256
+ ον
1257
+ οπτ
1258
+ ορθογρ
1259
+ ορθ
1260
+ οριστ
1261
+ ορυκτολ
1262
+ ορυκτ
1263
+ ορ
1264
+ οσε��
1265
+ οσκ
1266
+ ουαλ
1267
+ ουγγρ
1268
+ ουδ
1269
+ ουσιαστικοπ
1270
+ ουσιαστ
1271
+ ουσ
1272
+ πίν
1273
+ παθητ
1274
+ παθολ
1275
+ παθ
1276
+ παιδ
1277
+ παλαιοντ
1278
+ παλαιότ
1279
+ παλ
1280
+ παππων
1281
+ παράγρ
1282
+ παράγ
1283
+ παράλλ
1284
+ παράλ
1285
+ παραγ
1286
+ παρακ
1287
+ παραλ
1288
+ παραπ
1289
+ παρατ
1290
+ παρβ
1291
+ παρετυμ
1292
+ παροξ
1293
+ παρων
1294
+ παρωχ
1295
+ παρ
1296
+ παρ.φρ
1297
+ πατριδων
1298
+ πατρων
1299
+ πβ
1300
+ περιθ
1301
+ περιλ
1302
+ περιφρ
1303
+ περσ
1304
+ περ
1305
+ πιθ
1306
+ πληθ
1307
+ πληροφ
1308
+ ποδ
1309
+ ποιητ
1310
+ πολιτ
1311
+ πολλαπλ
1312
+ πολ
1313
+ πορτογαλ
1314
+ πορτ
1315
+ ποσ
1316
+ πρακριτ
1317
+ πρβλ
1318
+ πρβ
1319
+ πργ
1320
+ πρκμ
1321
+ πρκ
1322
+ πρλ
1323
+ προέλ
1324
+ προβηγκ
1325
+ προελλ
1326
+ προηγ
1327
+ προθεμ
1328
+ προπαραλ
1329
+ προπαροξ
1330
+ προπερισπ
1331
+ προσαρμ
1332
+ προσηγορ
1333
+ προσταχτ
1334
+ προστ
1335
+ προσφών
1336
+ προσ
1337
+ προτακτ
1338
+ προτ.Εισ
1339
+ προφ
1340
+ προχωρ
1341
+ πρτ
1342
+ πρόθ
1343
+ πρόσθ
1344
+ πρόσ
1345
+ πρότ
1346
+ πρ
1347
+ πρ.Εφ
1348
+ πτ
1349
+ πυ
1350
+ π
1351
+ π.Χ
1352
+ π.μ
1353
+ π.χ
1354
+ ρήμ
1355
+ ρίζ
1356
+ ρηματ
1357
+ ρητορ
1358
+ ριν
1359
+ ρουμ
1360
+ ρωμ
1361
+ ρωσ
1362
+ ρ
1363
+ σανσκρ
1364
+ σαξ
1365
+ σελ
1366
+ σερβοκρ
1367
+ σερβ
1368
+ σημασιολ
1369
+ σημδ
1370
+ σημειολ
1371
+ σημερ
1372
+ σημιτ
1373
+ σημ
1374
+ σκανδ
1375
+ σκυθ
1376
+ σκωπτ
1377
+ σλαβ
1378
+ σλοβ
1379
+ σουηδ
1380
+ σουμερ
1381
+ σουπ
1382
+ σπάν
1383
+ σπανιότ
1384
+ σπ
1385
+ σσ
1386
+ στατ
1387
+ στερ
1388
+ στιγμ
1389
+ στιχ
1390
+ στρέμ
1391
+ στρατιωτ
1392
+ στρατ
1393
+ στ
1394
+ συγγ
1395
+ συγκρ
1396
+ συγκ
1397
+ συμπερ
1398
+ συμπλεκτ
1399
+ συμπλ
1400
+ συμπροφ
1401
+ συμφυρ
1402
+ συμφ
1403
+ συνήθ
1404
+ συνίζ
1405
+ συναίρ
1406
+ συναισθ
1407
+ συνδετ
1408
+ συνδ
1409
+ συνεκδ
1410
+ συνηρ
1411
+ συνθετ
1412
+ συνθ
1413
+ συνοπτ
1414
+ συντελ
1415
+ συντομογρ
1416
+ συντ
1417
+ συν
1418
+ συρ
1419
+ σχημ
1420
+ σχ
1421
+ σύγκρ
1422
+ σύμπλ
1423
+ σύμφ
1424
+ σύνδ
1425
+ σύνθ
1426
+ σύντμ
1427
+ σύντ
1428
+ σ
1429
+ σ.π
1430
+ σ/β
1431
+ τακτ
1432
+ τελ
1433
+ τετρ
1434
+ τετρ.μ
1435
+ τεχνλ
1436
+ τεχνολ
1437
+ τεχν
1438
+ τεύχ
1439
+ τηλεπικ
1440
+ τηλεόρ
1441
+ τιμ
1442
+ τιμ.τομ
1443
+ τοΣ
1444
+ τον
1445
+ τοπογρ
1446
+ τοπων
1447
+ τοπ
1448
+ τοσκ
1449
+ τουρκ
1450
+ τοχ
1451
+ τριτοπρόσ
1452
+ τροποπ
1453
+ τροπ
1454
+ τσεχ
1455
+ τσιγγ
1456
+ ττ
1457
+ τυπ
1458
+ τόμ
1459
+ τόνν
1460
+ τ
1461
+ τ.μ
1462
+ τ.χλμ
1463
+ υβρ
1464
+ υπερθ
1465
+ υπερσ
1466
+ υπερ
1467
+ υπεύθ
1468
+ υποθ
1469
+ υποκορ
1470
+ υποκ
1471
+ υποσημ
1472
+ υποτ
1473
+ υποφ
1474
+ υποχωρ
1475
+ υπόλ
1476
+ υπόχρ
1477
+ υπ
1478
+ υστλατ
1479
+ υψόμ
1480
+ υψ
1481
+ φάκ
1482
+ φαρμακολ
1483
+ φαρμ
1484
+ φιλολ
1485
+ φιλοσ
1486
+ φιλοτ
1487
+ φινλ
1488
+ φοινικ
1489
+ φράγκ
1490
+ φρανκον
1491
+ φριζ
1492
+ φρ
1493
+ φυλλ
1494
+ φυσιολ
1495
+ φυσ
1496
+ φωνηεντ
1497
+ φωνητ
1498
+ φωνολ
1499
+ φων
1500
+ φωτογρ
1501
+ φ
1502
+ φ.τ.μ
1503
+ χαμιτ
1504
+ χαρτόσ
1505
+ χαρτ
1506
+ χασμ
1507
+ χαϊδ
1508
+ χγφ
1509
+ χειλ
1510
+ χεττ
1511
+ χημ
1512
+ χιλ
1513
+ χλγρ
1514
+ χλγ
1515
+ χλμ
1516
+ χλμ.2
1517
+ χλμ.3
1518
+ χλσγρ
1519
+ χλστγρ
1520
+ χλστμ
1521
+ χλστμ.2
1522
+ χλστμ.3
1523
+ χλ
1524
+ χργρ
1525
+ χρημ
1526
+ χρον
1527
+ χρ
1528
+ χφ
1529
+ χ.ε
1530
+ χ.κ
1531
+ χ.ο
1532
+ χ.σ
1533
+ χ.τ
1534
+ χ.χ
1535
+ ψευδ
1536
+ ψυχαν
1537
+ ψυχιατρ
1538
+ ψυχολ
1539
+ ψυχ
1540
+ ωκεαν
1541
+ όμ
1542
+ όν
1543
+ όπ.παρ
1544
+ όπ.π
1545
+ ό.π
1546
+ ύψ
1547
+ 1Βσ
1548
+ 1Εσ
1549
+ 1Θσ
1550
+ 1Ιν
1551
+ 1Κρ
1552
+ 1Μκ
1553
+ 1Πρ
1554
+ 1Πτ
1555
+ 1Τμ
1556
+ 2Βσ
1557
+ 2Εσ
1558
+ 2Θσ
1559
+ 2Ιν
1560
+ 2Κρ
1561
+ 2Μκ
1562
+ 2Πρ
1563
+ 2Πτ
1564
+ 2Τμ
1565
+ 3Βσ
1566
+ 3Ιν
1567
+ 3Μκ
1568
+ 4Βσ
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.en ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+
33
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34
+ Adj
35
+ Adm
36
+ Adv
37
+ Asst
38
+ Bart
39
+ Bldg
40
+ Brig
41
+ Bros
42
+ Capt
43
+ Cmdr
44
+ Col
45
+ Comdr
46
+ Con
47
+ Corp
48
+ Cpl
49
+ DR
50
+ Dr
51
+ Drs
52
+ Ens
53
+ Gen
54
+ Gov
55
+ Hon
56
+ Hr
57
+ Hosp
58
+ Insp
59
+ Lt
60
+ MM
61
+ MR
62
+ MRS
63
+ MS
64
+ Maj
65
+ Messrs
66
+ Mlle
67
+ Mme
68
+ Mr
69
+ Mrs
70
+ Ms
71
+ Msgr
72
+ Op
73
+ Ord
74
+ Pfc
75
+ Ph
76
+ Prof
77
+ Pvt
78
+ Rep
79
+ Reps
80
+ Res
81
+ Rev
82
+ Rt
83
+ Sen
84
+ Sens
85
+ Sfc
86
+ Sgt
87
+ Sr
88
+ St
89
+ Supt
90
+ Surg
91
+
92
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
93
+ v
94
+ vs
95
+ i.e
96
+ rev
97
+ e.g
98
+
99
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
100
+ # add NUMERIC_ONLY after the word for this function
101
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
102
+ #if followed by a number, a non-breaking prefix
103
+ No #NUMERIC_ONLY#
104
+ Nos
105
+ Art #NUMERIC_ONLY#
106
+ Nr
107
+ pp #NUMERIC_ONLY#
108
+
109
+ #month abbreviations
110
+ Jan
111
+ Feb
112
+ Mar
113
+ Apr
114
+ #May is a full word
115
+ Jun
116
+ Jul
117
+ Aug
118
+ Sep
119
+ Oct
120
+ Nov
121
+ Dec
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.es ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+
33
+ # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
34
+
35
+ A.C
36
+ Apdo
37
+ Av
38
+ Bco
39
+ CC.AA
40
+ Da
41
+ Dep
42
+ Dn
43
+ Dr
44
+ Dra
45
+ EE.UU
46
+ Excmo
47
+ FF.CC
48
+ Fil
49
+ Gral
50
+ J.C
51
+ Let
52
+ Lic
53
+ N.B
54
+ P.D
55
+ P.V.P
56
+ Prof
57
+ Pts
58
+ Rte
59
+ S.A
60
+ S.A.R
61
+ S.E
62
+ S.L
63
+ S.R.C
64
+ Sr
65
+ Sra
66
+ Srta
67
+ Sta
68
+ Sto
69
+ T.V.E
70
+ Tel
71
+ Ud
72
+ Uds
73
+ V.B
74
+ V.E
75
+ Vd
76
+ Vds
77
+ a/c
78
+ adj
79
+ admón
80
+ afmo
81
+ apdo
82
+ av
83
+ c
84
+ c.f
85
+ c.g
86
+ cap
87
+ cm
88
+ cta
89
+ dcha
90
+ doc
91
+ ej
92
+ entlo
93
+ esq
94
+ etc
95
+ f.c
96
+ gr
97
+ grs
98
+ izq
99
+ kg
100
+ km
101
+ mg
102
+ mm
103
+ núm
104
+ núm
105
+ p
106
+ p.a
107
+ p.ej
108
+ ptas
109
+ pág
110
+ págs
111
+ pág
112
+ págs
113
+ q.e.g.e
114
+ q.e.s.m
115
+ s
116
+ s.s.s
117
+ vid
118
+ vol
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.fi ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT
2
+ #indicate an end-of-sentence marker. Special cases are included for prefixes
3
+ #that ONLY appear before 0-9 numbers.
4
+
5
+ #This list is compiled from omorfi <http://code.google.com/p/omorfi> database
6
+ #by Tommi A Pirinen.
7
+
8
+
9
+ #any single upper case letter followed by a period is not a sentence ender
10
+ A
11
+ B
12
+ C
13
+ D
14
+ E
15
+ F
16
+ G
17
+ H
18
+ I
19
+ J
20
+ K
21
+ L
22
+ M
23
+ N
24
+ O
25
+ P
26
+ Q
27
+ R
28
+ S
29
+ T
30
+ U
31
+ V
32
+ W
33
+ X
34
+ Y
35
+ Z
36
+ Å
37
+ Ä
38
+ Ö
39
+
40
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
41
+ alik
42
+ alil
43
+ amir
44
+ apul
45
+ apul.prof
46
+ arkkit
47
+ ass
48
+ assist
49
+ dipl
50
+ dipl.arkkit
51
+ dipl.ekon
52
+ dipl.ins
53
+ dipl.kielenk
54
+ dipl.kirjeenv
55
+ dipl.kosm
56
+ dipl.urk
57
+ dos
58
+ erikoiseläinl
59
+ erikoishammasl
60
+ erikoisl
61
+ erikoist
62
+ ev.luutn
63
+ evp
64
+ fil
65
+ ft
66
+ hallinton
67
+ hallintot
68
+ hammaslääket
69
+ jatk
70
+ jääk
71
+ kansaned
72
+ kapt
73
+ kapt.luutn
74
+ kenr
75
+ kenr.luutn
76
+ kenr.maj
77
+ kers
78
+ kirjeenv
79
+ kom
80
+ kom.kapt
81
+ komm
82
+ konst
83
+ korpr
84
+ luutn
85
+ maist
86
+ maj
87
+ Mr
88
+ Mrs
89
+ Ms
90
+ M.Sc
91
+ neuv
92
+ nimim
93
+ Ph.D
94
+ prof
95
+ puh.joht
96
+ pääll
97
+ res
98
+ san
99
+ siht
100
+ suom
101
+ sähköp
102
+ säv
103
+ toht
104
+ toim
105
+ toim.apul
106
+ toim.joht
107
+ toim.siht
108
+ tuom
109
+ ups
110
+ vänr
111
+ vääp
112
+ ye.ups
113
+ ylik
114
+ ylil
115
+ ylim
116
+ ylimatr
117
+ yliop
118
+ yliopp
119
+ ylip
120
+ yliv
121
+
122
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall
123
+ #into this category - it sometimes ends a sentence)
124
+ e.g
125
+ ent
126
+ esim
127
+ huom
128
+ i.e
129
+ ilm
130
+ l
131
+ mm
132
+ myöh
133
+ nk
134
+ nyk
135
+ par
136
+ po
137
+ t
138
+ v
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.fr ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+ #
4
+ #any single upper case letter followed by a period is not a sentence ender
5
+ #usually upper case letters are initials in a name
6
+ #no French words end in single lower-case letters, so we throw those in too?
7
+ A
8
+ B
9
+ C
10
+ D
11
+ E
12
+ F
13
+ G
14
+ H
15
+ I
16
+ J
17
+ K
18
+ L
19
+ M
20
+ N
21
+ O
22
+ P
23
+ Q
24
+ R
25
+ S
26
+ T
27
+ U
28
+ V
29
+ W
30
+ X
31
+ Y
32
+ Z
33
+ #a
34
+ b
35
+ c
36
+ d
37
+ e
38
+ f
39
+ g
40
+ h
41
+ i
42
+ j
43
+ k
44
+ l
45
+ m
46
+ n
47
+ o
48
+ p
49
+ q
50
+ r
51
+ s
52
+ t
53
+ u
54
+ v
55
+ w
56
+ x
57
+ y
58
+ z
59
+
60
+ # Period-final abbreviation list for French
61
+ A.C.N
62
+ A.M
63
+ art
64
+ ann
65
+ apr
66
+ av
67
+ auj
68
+ lib
69
+ B.P
70
+ boul
71
+ ca
72
+ c.-à-d
73
+ cf
74
+ ch.-l
75
+ chap
76
+ contr
77
+ C.P.I
78
+ C.Q.F.D
79
+ C.N
80
+ C.N.S
81
+ C.S
82
+ dir
83
+ éd
84
+ e.g
85
+ env
86
+ al
87
+ etc
88
+ E.V
89
+ ex
90
+ fasc
91
+ fém
92
+ fig
93
+ fr
94
+ hab
95
+ ibid
96
+ id
97
+ i.e
98
+ inf
99
+ LL.AA
100
+ LL.AA.II
101
+ LL.AA.RR
102
+ LL.AA.SS
103
+ L.D
104
+ LL.EE
105
+ LL.MM
106
+ LL.MM.II.RR
107
+ loc.cit
108
+ masc
109
+ MM
110
+ ms
111
+ N.B
112
+ N.D.A
113
+ N.D.L.R
114
+ N.D.T
115
+ n/réf
116
+ NN.SS
117
+ N.S
118
+ N.D
119
+ N.P.A.I
120
+ p.c.c
121
+ pl
122
+ pp
123
+ p.ex
124
+ p.j
125
+ P.S
126
+ R.A.S
127
+ R.-V
128
+ R.P
129
+ R.I.P
130
+ SS
131
+ S.S
132
+ S.A
133
+ S.A.I
134
+ S.A.R
135
+ S.A.S
136
+ S.E
137
+ sec
138
+ sect
139
+ sing
140
+ S.M
141
+ S.M.I.R
142
+ sq
143
+ sqq
144
+ suiv
145
+ sup
146
+ suppl
147
+ tél
148
+ T.S.V.P
149
+ vb
150
+ vol
151
+ vs
152
+ X.O
153
+ Z.I
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ga ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ A
3
+ B
4
+ C
5
+ D
6
+ E
7
+ F
8
+ G
9
+ H
10
+ I
11
+ J
12
+ K
13
+ L
14
+ M
15
+ N
16
+ O
17
+ P
18
+ Q
19
+ R
20
+ S
21
+ T
22
+ U
23
+ V
24
+ W
25
+ X
26
+ Y
27
+ Z
28
+ Á
29
+ É
30
+ Í
31
+ Ó
32
+ Ú
33
+
34
+ Uacht
35
+ Dr
36
+ B.Arch
37
+
38
+ m.sh
39
+ .i
40
+ Co
41
+ Cf
42
+ cf
43
+ i.e
44
+ r
45
+ Chr
46
+ lch #NUMERIC_ONLY#
47
+ lgh #NUMERIC_ONLY#
48
+ uimh #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.hu ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+ Á
33
+ É
34
+ Í
35
+ Ó
36
+ Ö
37
+ Ő
38
+ Ú
39
+ Ü
40
+ Ű
41
+
42
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
43
+ Dr
44
+ dr
45
+ kb
46
+ Kb
47
+
48
+
49
+ pl
50
+ Pl
51
+ ca
52
+ Ca
53
+ min
54
+ Min
55
+ max
56
+ Max
57
+ ún
58
+ Ún
59
+ prof
60
+ Prof
61
+ de
62
+ De
63
+ du
64
+ Du
65
+ Szt
66
+ St
67
+
68
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
69
+ # add NUMERIC_ONLY after the word for this function
70
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
71
+ #if followed by a number, a non-breaking prefix
72
+
73
+ # Month name abbreviations
74
+ jan #NUMERIC_ONLY#
75
+ Jan #NUMERIC_ONLY#
76
+ Feb #NUMERIC_ONLY#
77
+ feb #NUMERIC_ONLY#
78
+ márc #NUMERIC_ONLY#
79
+ Márc #NUMERIC_ONLY#
80
+ ápr #NUMERIC_ONLY#
81
+ Ápr #NUMERIC_ONLY#
82
+ máj #NUMERIC_ONLY#
83
+ Máj #NUMERIC_ONLY#
84
+ jún #NUMERIC_ONLY#
85
+ Jún #NUMERIC_ONLY#
86
+ Júl #NUMERIC_ONLY#
87
+ júl #NUMERIC_ONLY#
88
+ aug #NUMERIC_ONLY#
89
+ Aug #NUMERIC_ONLY#
90
+ Szept #NUMERIC_ONLY#
91
+ szept #NUMERIC_ONLY#
92
+ okt #NUMERIC_ONLY#
93
+ Okt #NUMERIC_ONLY#
94
+ nov #NUMERIC_ONLY#
95
+ Nov #NUMERIC_ONLY#
96
+ dec #NUMERIC_ONLY#
97
+ Dec #NUMERIC_ONLY#
98
+
99
+ # Other abbreviations
100
+ tel #NUMERIC_ONLY#
101
+ Tel #NUMERIC_ONLY#
102
+ Fax #NUMERIC_ONLY#
103
+ fax #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.is ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ no #NUMERIC_ONLY#
2
+ No #NUMERIC_ONLY#
3
+ nr #NUMERIC_ONLY#
4
+ Nr #NUMERIC_ONLY#
5
+ nR #NUMERIC_ONLY#
6
+ NR #NUMERIC_ONLY#
7
+ a
8
+ b
9
+ c
10
+ d
11
+ e
12
+ f
13
+ g
14
+ h
15
+ i
16
+ j
17
+ k
18
+ l
19
+ m
20
+ n
21
+ o
22
+ p
23
+ q
24
+ r
25
+ s
26
+ t
27
+ u
28
+ v
29
+ w
30
+ x
31
+ y
32
+ z
33
+ ^
34
+ í
35
+ á
36
+ ó
37
+ æ
38
+ A
39
+ B
40
+ C
41
+ D
42
+ E
43
+ F
44
+ G
45
+ H
46
+ I
47
+ J
48
+ K
49
+ L
50
+ M
51
+ N
52
+ O
53
+ P
54
+ Q
55
+ R
56
+ S
57
+ T
58
+ U
59
+ V
60
+ W
61
+ X
62
+ Y
63
+ Z
64
+ ab.fn
65
+ a.fn
66
+ afs
67
+ al
68
+ alm
69
+ alg
70
+ andh
71
+ ath
72
+ aths
73
+ atr
74
+ ao
75
+ au
76
+ aukaf
77
+ áfn
78
+ áhrl.s
79
+ áhrs
80
+ ákv.gr
81
+ ákv
82
+ bh
83
+ bls
84
+ dr
85
+ e.Kr
86
+ et
87
+ ef
88
+ efn
89
+ ennfr
90
+ eink
91
+ end
92
+ e.st
93
+ erl
94
+ fél
95
+ fskj
96
+ fh
97
+ f.hl
98
+ físl
99
+ fl
100
+ fn
101
+ fo
102
+ forl
103
+ frb
104
+ frl
105
+ frh
106
+ frt
107
+ fsl
108
+ fsh
109
+ fs
110
+ fsk
111
+ fst
112
+ f.Kr
113
+ ft
114
+ fv
115
+ fyrrn
116
+ fyrrv
117
+ germ
118
+ gm
119
+ gr
120
+ hdl
121
+ hdr
122
+ hf
123
+ hl
124
+ hlsk
125
+ hljsk
126
+ hljv
127
+ hljóðv
128
+ hr
129
+ hv
130
+ hvk
131
+ holl
132
+ Hos
133
+ höf
134
+ hk
135
+ hrl
136
+ ísl
137
+ kaf
138
+ kap
139
+ Khöfn
140
+ kk
141
+ kg
142
+ kk
143
+ km
144
+ kl
145
+ klst
146
+ kr
147
+ kt
148
+ kgúrsk
149
+ kvk
150
+ leturbr
151
+ lh
152
+ lh.nt
153
+ lh.þt
154
+ lo
155
+ ltr
156
+ mlja
157
+ mljó
158
+ millj
159
+ mm
160
+ mms
161
+ m.fl
162
+ miðm
163
+ mgr
164
+ mst
165
+ mín
166
+ nf
167
+ nh
168
+ nhm
169
+ nl
170
+ nk
171
+ nmgr
172
+ no
173
+ núv
174
+ nt
175
+ o.áfr
176
+ o.m.fl
177
+ ohf
178
+ o.fl
179
+ o.s.frv
180
+ ófn
181
+ ób
182
+ óákv.gr
183
+ óákv
184
+ pfn
185
+ PR
186
+ pr
187
+ Ritstj
188
+ Rvík
189
+ Rvk
190
+ samb
191
+ samhlj
192
+ samn
193
+ samn
194
+ sbr
195
+ sek
196
+ sérn
197
+ sf
198
+ sfn
199
+ sh
200
+ sfn
201
+ sh
202
+ s.hl
203
+ sk
204
+ skv
205
+ sl
206
+ sn
207
+ so
208
+ ss.us
209
+ s.st
210
+ samþ
211
+ sbr
212
+ shlj
213
+ sign
214
+ skál
215
+ st
216
+ st.s
217
+ stk
218
+
219
+ teg
220
+ tbl
221
+ tfn
222
+ tl
223
+ tvíhlj
224
+ tvt
225
+ till
226
+ to
227
+ umr
228
+ uh
229
+ us
230
+ uppl
231
+ útg
232
+ vb
233
+ Vf
234
+ vh
235
+ vkf
236
+ Vl
237
+ vl
238
+ vlf
239
+ vmf
240
+ 8vo
241
+ vsk
242
+ vth
243
+ þt
244
+ þf
245
+ þjs
246
+ þgf
247
+ þlt
248
+ þolm
249
+ þm
250
+ þml
251
+ þýð
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.it ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+
33
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34
+ Adj
35
+ Adm
36
+ Adv
37
+ Amn
38
+ Arch
39
+ Asst
40
+ Avv
41
+ Bart
42
+ Bcc
43
+ Bldg
44
+ Brig
45
+ Bros
46
+ C.A.P
47
+ C.P
48
+ Capt
49
+ Cc
50
+ Cmdr
51
+ Co
52
+ Col
53
+ Comdr
54
+ Con
55
+ Corp
56
+ Cpl
57
+ DR
58
+ Dott
59
+ Dr
60
+ Drs
61
+ Egr
62
+ Ens
63
+ Gen
64
+ Geom
65
+ Gov
66
+ Hon
67
+ Hosp
68
+ Hr
69
+ Id
70
+ Ing
71
+ Insp
72
+ Lt
73
+ MM
74
+ MR
75
+ MRS
76
+ MS
77
+ Maj
78
+ Messrs
79
+ Mlle
80
+ Mme
81
+ Mo
82
+ Mons
83
+ Mr
84
+ Mrs
85
+ Ms
86
+ Msgr
87
+ N.B
88
+ Op
89
+ Ord
90
+ P.S
91
+ P.T
92
+ Pfc
93
+ Ph
94
+ Prof
95
+ Pvt
96
+ RP
97
+ RSVP
98
+ Rag
99
+ Rep
100
+ Reps
101
+ Res
102
+ Rev
103
+ Rif
104
+ Rt
105
+ S.A
106
+ S.B.F
107
+ S.P.M
108
+ S.p.A
109
+ S.r.l
110
+ Sen
111
+ Sens
112
+ Sfc
113
+ Sgt
114
+ Sig
115
+ Sigg
116
+ Soc
117
+ Spett
118
+ Sr
119
+ St
120
+ Supt
121
+ Surg
122
+ V.P
123
+
124
+ # other
125
+ a.c
126
+ acc
127
+ all
128
+ banc
129
+ c.a
130
+ c.c.p
131
+ c.m
132
+ c.p
133
+ c.s
134
+ c.v
135
+ corr
136
+ dott
137
+ e.p.c
138
+ ecc
139
+ es
140
+ fatt
141
+ gg
142
+ int
143
+ lett
144
+ ogg
145
+ on
146
+ p.c
147
+ p.c.c
148
+ p.es
149
+ p.f
150
+ p.r
151
+ p.v
152
+ post
153
+ pp
154
+ racc
155
+ ric
156
+ s.n.c
157
+ seg
158
+ sgg
159
+ ss
160
+ tel
161
+ u.s
162
+ v.r
163
+ v.s
164
+
165
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
166
+ v
167
+ vs
168
+ i.e
169
+ rev
170
+ e.g
171
+
172
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
173
+ # add NUMERIC_ONLY after the word for this function
174
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
175
+ #if followed by a number, a non-breaking prefix
176
+ No #NUMERIC_ONLY#
177
+ Nos
178
+ Art #NUMERIC_ONLY#
179
+ Nr
180
+ pp #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.lt ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Anything in this file, followed by a period (and an upper-case word),
2
+ # does NOT indicate an end-of-sentence marker.
3
+ # Special cases are included for prefixes that ONLY appear before 0-9 numbers.
4
+
5
+ # Any single upper case letter followed by a period is not a sentence ender
6
+ # (excluding I occasionally, but we leave it in)
7
+ # usually upper case letters are initials in a name
8
+ A
9
+ Ā
10
+ B
11
+ C
12
+ Č
13
+ D
14
+ E
15
+ Ē
16
+ F
17
+ G
18
+ Ģ
19
+ H
20
+ I
21
+ Ī
22
+ J
23
+ K
24
+ Ķ
25
+ L
26
+ Ļ
27
+ M
28
+ N
29
+ Ņ
30
+ O
31
+ P
32
+ Q
33
+ R
34
+ S
35
+ Š
36
+ T
37
+ U
38
+ Ū
39
+ V
40
+ W
41
+ X
42
+ Y
43
+ Z
44
+ Ž
45
+
46
+ # Initialis -- Džonas
47
+ Dz
48
+
49
+ Just
50
+
51
+ # Day and month abbreviations
52
+ # m. menesis d. diena g. gimes
53
+ m
54
+ mėn
55
+ d
56
+ g
57
+ gim
58
+ # Pirmadienis Penktadienis
59
+ Pr
60
+ Pn
61
+ Pirm
62
+ Antr
63
+ Treč
64
+ Ketv
65
+ Penkt
66
+ Šešt
67
+ Sekm
68
+ Saus
69
+ Vas
70
+ Kov
71
+ Bal
72
+ Geg
73
+ Birž
74
+ Liep
75
+ Rugpj
76
+ Rugs
77
+ Spal
78
+ Lapkr
79
+ Gruod
80
+
81
+ # Business, governmental, geographical terms
82
+ a
83
+ # aikštė
84
+ adv
85
+ # advokatas
86
+ akad
87
+ # akademikas
88
+ aklg
89
+ # akligatvis
90
+ akt
91
+ # aktorius
92
+ al
93
+ # alėja
94
+ A.V
95
+ # antspaudo vieta
96
+ aps
97
+ apskr
98
+ # apskritis
99
+ apyg
100
+ # apygarda
101
+ aps
102
+ apskr
103
+ # apskritis
104
+ asist
105
+ # asistentas
106
+ asmv
107
+ avd
108
+ # asmenvardis
109
+ a.k
110
+ asm
111
+ asm.k
112
+ # asmens kodas
113
+ atsak
114
+ # atsakingasis
115
+ atsisk
116
+ sąsk
117
+ # atsiskaitomoji sąskaita
118
+ aut
119
+ # autorius
120
+ b
121
+ k
122
+ b.k
123
+ # banko kodas
124
+ bkl
125
+ # bakalauras
126
+ bt
127
+ # butas
128
+ buv
129
+ # buvęs, -usi
130
+ dail
131
+ # dailininkas
132
+ dek
133
+ # dekanas
134
+ dėst
135
+ # dėstytojas
136
+ dir
137
+ # direktorius
138
+ dirig
139
+ # dirigentas
140
+ doc
141
+ # docentas
142
+ drp
143
+ # durpynas
144
+
145
+ # dešinysis
146
+ egz
147
+ # egzempliorius
148
+ eil
149
+ # eilutė
150
+ ekon
151
+ # ekonomika
152
+ el
153
+ # elektroninis
154
+ etc
155
+
156
+ # ežeras
157
+ faks
158
+ # faksas
159
+ fak
160
+ # fakultetas
161
+ gen
162
+ # generolas
163
+ gyd
164
+ # gydytojas
165
+ gv
166
+ # gyvenvietė
167
+ įl
168
+ # įlanka
169
+ Įn
170
+ # įnagininkas
171
+ insp
172
+ # inspektorius
173
+ pan
174
+ # ir panašiai
175
+ t.t
176
+ # ir taip toliau
177
+ k.a
178
+ # kaip antai
179
+ kand
180
+ # kandidatas
181
+ kat
182
+ # katedra
183
+ kyš
184
+ # kyšulys
185
+ kl
186
+ # klasė
187
+ kln
188
+ # kalnas
189
+ kn
190
+ # knyga
191
+ koresp
192
+ # korespondentas
193
+ kpt
194
+ # kapitonas
195
+ kr
196
+ # kairysis
197
+ kt
198
+ # kitas
199
+ kun
200
+ # kunigas
201
+ l
202
+ e
203
+ p
204
+ l.e.p
205
+ # laikinai einantis pareigas
206
+ ltn
207
+ # leitenantas
208
+ m
209
+ mst
210
+ # miestas
211
+ m.e
212
+ # mūsų eros
213
+ m.m
214
+ # mokslo metai
215
+ mot
216
+ # moteris
217
+ mstl
218
+ # miestelis
219
+ mgr
220
+ # magistras
221
+ mgnt
222
+ # magistrantas
223
+ mjr
224
+ # majoras
225
+ mln
226
+ # milijonas
227
+ mlrd
228
+ # milijardas
229
+ mok
230
+ # mokinys
231
+ mokyt
232
+ # mokytojas
233
+ moksl
234
+ # mokslinis
235
+ nkt
236
+ # nekaitomas
237
+ ntk
238
+ # neteiktinas
239
+ Nr
240
+ nr
241
+ # numeris
242
+ p
243
+ # ponas
244
+ p.d
245
+ a.d
246
+ # pašto dėžutė, abonentinė dėžutė
247
+ p.m.e
248
+ # prieš mūsų erą
249
+ pan
250
+ # ir panašiai
251
+ pav
252
+ # paveikslas
253
+ pavad
254
+ # pavaduotojas
255
+ pirm
256
+ # pirmininkas
257
+ pl
258
+ # plentas
259
+ plg
260
+ # palygink
261
+ plk
262
+ # pulkininkas; pelkė
263
+ pr
264
+ # prospektas
265
+ Kr
266
+ pr.Kr
267
+ # prieš Kristų
268
+ prok
269
+ # prokuroras
270
+ prot
271
+ # protokolas
272
+ pss
273
+ # pusiasalis
274
+ pšt
275
+ # paštas
276
+ pvz
277
+ # pavyzdžiui
278
+ r
279
+ # rajonas
280
+ red
281
+ # redaktorius
282
+
283
+ # raštų kalbos
284
+ sąs
285
+ # sąsiuvinis
286
+ saviv
287
+ sav
288
+ # savivaldybė
289
+ sekr
290
+ # sekretorius
291
+ sen
292
+ # seniūnija, seniūnas
293
+ sk
294
+ # skaityk; skyrius
295
+ skg
296
+ # skersgatvis
297
+ skyr
298
+ sk
299
+ # skyrius
300
+ skv
301
+ # skveras
302
+ sp
303
+ # spauda; spaustuvė
304
+ spec
305
+ # specialistas
306
+ sr
307
+ # sritis
308
+ st
309
+ # stotis
310
+ str
311
+ # straipsnis
312
+ stud
313
+ # studentas
314
+ š
315
+ š.m
316
+ # šių metų
317
+ šnek
318
+ # šnekamosios
319
+ tir
320
+ # tiražas
321
+ tūkst
322
+ # tūkstantis
323
+ up
324
+ # upė
325
+ upl
326
+ # upelis
327
+ vad
328
+ # vadinamasis, -oji
329
+ vlsč
330
+ # valsčius
331
+ ved
332
+ # vedėjas
333
+ vet
334
+ # veterinarija
335
+ virš
336
+ # viršininkas, viršaitis
337
+ vyr
338
+ # vyriausiasis, -ioji; vyras
339
+ vyresn
340
+ # vyresnysis
341
+ vlsč
342
+ # valsčius
343
+ vs
344
+ # viensėdis
345
+ Vt
346
+ vt
347
+ # vietininkas
348
+ vtv
349
+ vv
350
+ # vietovardis
351
+ žml
352
+ # žemėlapis
353
+
354
+ # Technical terms, abbreviations used in guidebooks, advertisments, etc.
355
+ # Generally lower-case.
356
+ air
357
+ # airiškai
358
+ amer
359
+ # amerikanizmas
360
+ anat
361
+ # anatomija
362
+ angl
363
+ # angl. angliskai
364
+ arab
365
+ # arabų
366
+ archeol
367
+ archit
368
+ asm
369
+ # asmuo
370
+ astr
371
+ # astronomija
372
+ austral
373
+ # australiškai
374
+ aut
375
+ # automobilis
376
+ av
377
+ # aviacija
378
+ bažn
379
+ bdv
380
+ # būdvardis
381
+ bibl
382
+ # Biblija
383
+ biol
384
+ # biologija
385
+ bot
386
+ # botanika
387
+ brt
388
+ # burtai, burtažodis.
389
+ brus
390
+ # baltarusių
391
+ buh
392
+ # buhalterija
393
+ chem
394
+ # chemija
395
+ col
396
+ # collectivum
397
+ con
398
+ conj
399
+ # conjunctivus, jungtukas
400
+ dab
401
+ # dab. dabartine
402
+ dgs
403
+ # daugiskaita
404
+ dial
405
+ # dialektizmas
406
+ dipl
407
+ dktv
408
+ # daiktavardis
409
+ džn
410
+ # dažnai
411
+ ekon
412
+ el
413
+ # elektra
414
+ esam
415
+ # esamasis laikas
416
+ euf
417
+ # eufemizmas
418
+ fam
419
+ # familiariai
420
+ farm
421
+ # farmacija
422
+ filol
423
+ # filologija
424
+ filos
425
+ # filosofija
426
+ fin
427
+ # finansai
428
+ fiz
429
+ # fizika
430
+ fiziol
431
+ # fiziologija
432
+ flk
433
+ # folkloras
434
+ fon
435
+ # fonetika
436
+ fot
437
+ # fotografija
438
+ geod
439
+ # geodezija
440
+ geogr
441
+ geol
442
+ # geologija
443
+ geom
444
+ # geometrija
445
+ glžk
446
+ gr
447
+ # graikų
448
+ gram
449
+ her
450
+ # heraldika
451
+ hidr
452
+ # hidrotechnika
453
+ ind
454
+ # Indų
455
+ iron
456
+ # ironiškai
457
+ isp
458
+ # ispanų
459
+ ist
460
+ istor
461
+ # istorija
462
+ it
463
+ # italų
464
+ įv
465
+ reikšm
466
+ įv.reikšm
467
+ # įvairiomis reikšmėmis
468
+ jap
469
+ # japonų
470
+ juok
471
+ # juokaujamai
472
+ jūr
473
+ # jūrininkystė
474
+ kalb
475
+ # kalbotyra
476
+ kar
477
+ # karyba
478
+ kas
479
+ # kasyba
480
+ kin
481
+ # kinematografija
482
+ klaus
483
+ # klausiamasis
484
+ knyg
485
+ # knyginis
486
+ kom
487
+ # komercija
488
+ komp
489
+ # kompiuteris
490
+ kosm
491
+ # kosmonautika
492
+ kt
493
+ # kitas
494
+ kul
495
+ # kulinarija
496
+ kuop
497
+ # kuopine
498
+ l
499
+ # laikas
500
+ lit
501
+ # literatūrinis
502
+ lingv
503
+ # lingvistika
504
+ log
505
+ # logika
506
+ lot
507
+ # lotynų
508
+ mat
509
+ # matematika
510
+ maž
511
+ # mažybinis
512
+ med
513
+ # medicina
514
+ medž
515
+ # medžioklė
516
+ men
517
+ # menas
518
+ menk
519
+ # menkinamai
520
+ metal
521
+ # metalurgija
522
+ meteor
523
+ min
524
+ # mineralogija
525
+ mit
526
+ # mitologija
527
+ mok
528
+ # mokyklinis
529
+ ms
530
+ # mįslė
531
+ muz
532
+ # muzikinis
533
+ n
534
+ # naujasis
535
+ neig
536
+ # neigiamasis
537
+ neol
538
+ # neologizmas
539
+ niek
540
+ # niekinamai
541
+ ofic
542
+ # oficialus
543
+ opt
544
+ # optika
545
+ orig
546
+ # original
547
+ p
548
+ # pietūs
549
+ pan
550
+ # panašiai
551
+ parl
552
+ # parlamentas
553
+ pat
554
+ # patarlė
555
+ paž
556
+ # pažodžiui
557
+ plg
558
+ # palygink
559
+ poet
560
+ # poetizmas
561
+ poez
562
+ # poezija
563
+ poligr
564
+ # poligrafija
565
+ polit
566
+ # politika
567
+ ppr
568
+ # paprastai
569
+ pranc
570
+ pr
571
+ # prancūzų, prūsų
572
+ priet
573
+ # prietaras
574
+ prek
575
+ # prekyba
576
+ prk
577
+ # perkeltine
578
+ prs
579
+ # persona, asmuo
580
+ psn
581
+ # pasenęs žodis
582
+ psich
583
+ # psichologija
584
+ pvz
585
+ # pavyzdžiui
586
+ r
587
+ # rytai
588
+ rad
589
+ # radiotechnika
590
+ rel
591
+ # religija
592
+ ret
593
+ # retai
594
+ rus
595
+ # rusų
596
+ sen
597
+ # senasis
598
+ sl
599
+ # slengas, slavų
600
+ sov
601
+ # sovietinis
602
+ spec
603
+ # specialus
604
+ sport
605
+ stat
606
+ # statyba
607
+ sudurt
608
+ # sudurtinis
609
+ sutr
610
+ # sutrumpintas
611
+ suv
612
+ # suvalkiečių
613
+ š
614
+ # šiaurė
615
+ šach
616
+ # šachmatai
617
+ šiaur
618
+ škot
619
+ # škotiškai
620
+ šnek
621
+ # šnekamoji
622
+ teatr
623
+ tech
624
+ techn
625
+ # technika
626
+ teig
627
+ # teigiamas
628
+ teis
629
+ # teisė
630
+ tekst
631
+ # tekstilė
632
+ tel
633
+ # telefonas
634
+ teol
635
+ # teologija
636
+ v
637
+ # tik vyriškosios, vakarai
638
+ t.p
639
+ t
640
+ p
641
+ # ir taip pat
642
+ t.t
643
+ # ir taip toliau
644
+ t.y
645
+ # tai yra
646
+ vaik
647
+ # vaikų
648
+ vart
649
+ # vartojama
650
+ vet
651
+ # veterinarija
652
+ vid
653
+ # vidurinis
654
+ vksm
655
+ # veiksmažodis
656
+ vns
657
+ # vienaskaita
658
+ vok
659
+ # vokiečių
660
+ vulg
661
+ # vulgariai
662
+ zool
663
+ # zoologija
664
+ žr
665
+ # žiūrėk
666
+ ž.ū
667
+ ž
668
+ ū
669
+ # žemės ūkis
670
+
671
+ # List of titles. These are often followed by upper-case names, but do
672
+ # not indicate sentence breaks
673
+ #
674
+ # Jo Eminencija
675
+ Em.
676
+ # Gerbiamasis
677
+ Gerb
678
+ gerb
679
+ # malonus
680
+ malon
681
+ # profesorius
682
+ Prof
683
+ prof
684
+ # daktaras (mokslų)
685
+ Dr
686
+ dr
687
+ habil
688
+ med
689
+ # inž inžinierius
690
+ inž
691
+ Inž
692
+
693
+
694
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
695
+ # add NUMERIC_ONLY after the word for this function
696
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
697
+ #if followed by a number, a non-breaking prefix
698
+ No #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.lv ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ Ā
8
+ B
9
+ C
10
+ Č
11
+ D
12
+ E
13
+ Ē
14
+ F
15
+ G
16
+ Ģ
17
+ H
18
+ I
19
+ Ī
20
+ J
21
+ K
22
+ Ķ
23
+ L
24
+ Ļ
25
+ M
26
+ N
27
+ Ņ
28
+ O
29
+ P
30
+ Q
31
+ R
32
+ S
33
+ Š
34
+ T
35
+ U
36
+ Ū
37
+ V
38
+ W
39
+ X
40
+ Y
41
+ Z
42
+ Ž
43
+
44
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
45
+ dr
46
+ Dr
47
+ med
48
+ prof
49
+ Prof
50
+ inž
51
+ Inž
52
+ ist.loc
53
+ Ist.loc
54
+ kor.loc
55
+ Kor.loc
56
+ v.i
57
+ vietn
58
+ Vietn
59
+
60
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
61
+ a.l
62
+ t.p
63
+ pārb
64
+ Pārb
65
+ vec
66
+ Vec
67
+ inv
68
+ Inv
69
+ sk
70
+ Sk
71
+ spec
72
+ Spec
73
+ vienk
74
+ Vienk
75
+ virz
76
+ Virz
77
+ māksl
78
+ Māksl
79
+ mūz
80
+ Mūz
81
+ akad
82
+ Akad
83
+ soc
84
+ Soc
85
+ galv
86
+ Galv
87
+ vad
88
+ Vad
89
+ sertif
90
+ Sertif
91
+ folkl
92
+ Folkl
93
+ hum
94
+ Hum
95
+
96
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
97
+ # add NUMERIC_ONLY after the word for this function
98
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
99
+ #if followed by a number, a non-breaking prefix
100
+ Nr #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.nl ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+ #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
4
+ # http://nl.wikipedia.org/wiki/Aanspreekvorm
5
+ # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
6
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
7
+ #usually upper case letters are initials in a name
8
+ A
9
+ B
10
+ C
11
+ D
12
+ E
13
+ F
14
+ G
15
+ H
16
+ I
17
+ J
18
+ K
19
+ L
20
+ M
21
+ N
22
+ O
23
+ P
24
+ Q
25
+ R
26
+ S
27
+ T
28
+ U
29
+ V
30
+ W
31
+ X
32
+ Y
33
+ Z
34
+
35
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
36
+ bacc
37
+ bc
38
+ bgen
39
+ c.i
40
+ dhr
41
+ dr
42
+ dr.h.c
43
+ drs
44
+ drs
45
+ ds
46
+ eint
47
+ fa
48
+ Fa
49
+ fam
50
+ gen
51
+ genm
52
+ ing
53
+ ir
54
+ jhr
55
+ jkvr
56
+ jr
57
+ kand
58
+ kol
59
+ lgen
60
+ lkol
61
+ Lt
62
+ maj
63
+ Mej
64
+ mevr
65
+ Mme
66
+ mr
67
+ mr
68
+ Mw
69
+ o.b.s
70
+ plv
71
+ prof
72
+ ritm
73
+ tint
74
+ Vz
75
+ Z.D
76
+ Z.D.H
77
+ Z.E
78
+ Z.Em
79
+ Z.H
80
+ Z.K.H
81
+ Z.K.M
82
+ Z.M
83
+ z.v
84
+
85
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
86
+ #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
87
+ a.g.v
88
+ bijv
89
+ bijz
90
+ bv
91
+ d.w.z
92
+ e.c
93
+ e.g
94
+ e.k
95
+ ev
96
+ i.p.v
97
+ i.s.m
98
+ i.t.t
99
+ i.v.m
100
+ m.a.w
101
+ m.b.t
102
+ m.b.v
103
+ m.h.o
104
+ m.i
105
+ m.i.v
106
+ v.w.t
107
+
108
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
109
+ # add NUMERIC_ONLY after the word for this function
110
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
111
+ #if followed by a number, a non-breaking prefix
112
+ Nr #NUMERIC_ONLY#
113
+ Nrs
114
+ nrs
115
+ nr #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.pl ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ adw
2
+ afr
3
+ akad
4
+ al
5
+ Al
6
+ am
7
+ amer
8
+ arch
9
+ art
10
+ Art
11
+ artyst
12
+ astr
13
+ austr
14
+ bałt
15
+ bdb
16
+
17
+ bm
18
+ br
19
+ bryg
20
+ bryt
21
+ centr
22
+ ces
23
+ chem
24
+ chiń
25
+ chir
26
+ c.k
27
+ c.o
28
+ cyg
29
+ cyw
30
+ cyt
31
+ czes
32
+ czw
33
+ cd
34
+ Cd
35
+ czyt
36
+ ćw
37
+ ćwicz
38
+ daw
39
+ dcn
40
+ dekl
41
+ demokr
42
+ det
43
+ diec
44
+
45
+ dn
46
+ dot
47
+ dol
48
+ dop
49
+ dost
50
+ dosł
51
+ h.c
52
+ ds
53
+ dst
54
+ duszp
55
+ dypl
56
+ egz
57
+ ekol
58
+ ekon
59
+ elektr
60
+ em
61
+ ew
62
+ fab
63
+ farm
64
+ fot
65
+ fr
66
+ gat
67
+ gastr
68
+ geogr
69
+ geol
70
+ gimn
71
+ głęb
72
+ gm
73
+ godz
74
+ górn
75
+ gosp
76
+ gr
77
+ gram
78
+ hist
79
+ hiszp
80
+ hr
81
+ Hr
82
+ hot
83
+ id
84
+ in
85
+ im
86
+ iron
87
+ jn
88
+ kard
89
+ kat
90
+ katol
91
+ k.k
92
+ kk
93
+ kol
94
+ kl
95
+ k.p.a
96
+ kpc
97
+ k.p.c
98
+ kpt
99
+ kr
100
+ k.r
101
+ krak
102
+ k.r.o
103
+ kryt
104
+ kult
105
+ laic
106
+ łac
107
+ niem
108
+ woj
109
+ nb
110
+ np
111
+ Nb
112
+ Np
113
+ pol
114
+ pow
115
+ m.in
116
+ pt
117
+ ps
118
+ Pt
119
+ Ps
120
+ cdn
121
+ jw
122
+ ryc
123
+ rys
124
+ Ryc
125
+ Rys
126
+ tj
127
+ tzw
128
+ Tzw
129
+ tzn
130
+ zob
131
+ ang
132
+ ub
133
+ ul
134
+ pw
135
+ pn
136
+ pl
137
+ al
138
+ k
139
+ n
140
+ nr #NUMERIC_ONLY#
141
+ Nr #NUMERIC_ONLY#
142
+ ww
143
+
144
+ ur
145
+ zm
146
+ żyd
147
+ żarg
148
+ żyw
149
+ wył
150
+ bp
151
+ bp
152
+ wyst
153
+ tow
154
+ Tow
155
+ o
156
+ sp
157
+ Sp
158
+ st
159
+ spółdz
160
+ Spółdz
161
+ społ
162
+ spółgł
163
+ stoł
164
+ stow
165
+ Stoł
166
+ Stow
167
+ zn
168
+ zew
169
+ zewn
170
+ zdr
171
+ zazw
172
+ zast
173
+ zaw
174
+ zał
175
+ zal
176
+ zam
177
+ zak
178
+ zakł
179
+ zagr
180
+ zach
181
+ adw
182
+ Adw
183
+ lek
184
+ Lek
185
+ med
186
+ mec
187
+ Mec
188
+ doc
189
+ Doc
190
+ dyw
191
+ dyr
192
+ Dyw
193
+ Dyr
194
+ inż
195
+ Inż
196
+ mgr
197
+ Mgr
198
+ dh
199
+ dr
200
+ Dh
201
+ Dr
202
+ p
203
+ P
204
+ red
205
+ Red
206
+ prof
207
+ prok
208
+ Prof
209
+ Prok
210
+ hab
211
+ płk
212
+ Płk
213
+ nadkom
214
+ Nadkom
215
+ podkom
216
+ Podkom
217
+ ks
218
+ Ks
219
+ gen
220
+ Gen
221
+ por
222
+ Por
223
+ reż
224
+ Reż
225
+ przyp
226
+ Przyp
227
+ śp
228
+ św
229
+ śW
230
+ Śp
231
+ Św
232
+ ŚW
233
+ szer
234
+ Szer
235
+ pkt #NUMERIC_ONLY#
236
+ str #NUMERIC_ONLY#
237
+ tab #NUMERIC_ONLY#
238
+ Tab #NUMERIC_ONLY#
239
+ tel
240
+ ust #NUMERIC_ONLY#
241
+ par #NUMERIC_ONLY#
242
+ poz
243
+ pok
244
+ oo
245
+ oO
246
+ Oo
247
+ OO
248
+ r #NUMERIC_ONLY#
249
+ l #NUMERIC_ONLY#
250
+ s #NUMERIC_ONLY#
251
+ najśw
252
+ Najśw
253
+ A
254
+ B
255
+ C
256
+ D
257
+ E
258
+ F
259
+ G
260
+ H
261
+ I
262
+ J
263
+ K
264
+ L
265
+ M
266
+ N
267
+ O
268
+ P
269
+ Q
270
+ R
271
+ S
272
+ T
273
+ U
274
+ V
275
+ W
276
+ X
277
+ Y
278
+ Z
279
+ Ś
280
+ Ć
281
+ Ż
282
+ Ź
283
+ Dz
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4477f9ad690e262c04bc057f4757e12b16777892f80016539130f3b7eebd58b4
3
+ size 1792
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ro ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ A
2
+ B
3
+ C
4
+ D
5
+ E
6
+ F
7
+ G
8
+ H
9
+ I
10
+ J
11
+ K
12
+ L
13
+ M
14
+ N
15
+ O
16
+ P
17
+ Q
18
+ R
19
+ S
20
+ T
21
+ U
22
+ V
23
+ W
24
+ X
25
+ Y
26
+ Z
27
+ dpdv
28
+ etc
29
+ șamd
30
+ M.Ap.N
31
+ dl
32
+ Dl
33
+ d-na
34
+ D-na
35
+ dvs
36
+ Dvs
37
+ pt
38
+ Pt
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ru ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # added Cyrillic uppercase letters [А-Я]
2
+ # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes)
3
+ # edited by Kate Young ([email protected]) 21 May 2013
4
+ А
5
+ Б
6
+ В
7
+ Г
8
+ Д
9
+ Е
10
+ Ж
11
+ З
12
+ И
13
+ Й
14
+ К
15
+ Л
16
+ М
17
+ Н
18
+ О
19
+ П
20
+ Р
21
+ С
22
+ Т
23
+ У
24
+ Ф
25
+ Х
26
+ Ц
27
+ Ч
28
+ Ш
29
+ Щ
30
+ Ъ
31
+ Ы
32
+ Ь
33
+ Э
34
+ Ю
35
+ Я
36
+ A
37
+ B
38
+ C
39
+ D
40
+ E
41
+ F
42
+ G
43
+ H
44
+ I
45
+ J
46
+ K
47
+ L
48
+ M
49
+ N
50
+ O
51
+ P
52
+ Q
53
+ R
54
+ S
55
+ T
56
+ U
57
+ V
58
+ W
59
+ X
60
+ Y
61
+ Z
62
+ 0гг
63
+ 1гг
64
+ 2гг
65
+ 3гг
66
+ 4гг
67
+ 5гг
68
+ 6гг
69
+ 7гг
70
+ 8гг
71
+ 9гг
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+ Xвв
83
+ Vвв
84
+ Iвв
85
+ Lвв
86
+ Mвв
87
+ Cвв
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+ 0мм
105
+ 1мм
106
+ 2мм
107
+ 3мм
108
+ 4мм
109
+ 5мм
110
+ 6мм
111
+ 7мм
112
+ 8мм
113
+ 9мм
114
+ 0см
115
+ 1см
116
+ 2см
117
+ 3см
118
+ 4см
119
+ 5см
120
+ 6см
121
+ 7см
122
+ 8см
123
+ 9см
124
+ 0дм
125
+ 1дм
126
+ 2дм
127
+ 3дм
128
+ 4дм
129
+ 5дм
130
+ 6дм
131
+ 7дм
132
+ 8дм
133
+ 9дм
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+ 0км
145
+ 1км
146
+ 2км
147
+ 3км
148
+ 4км
149
+ 5км
150
+ 6км
151
+ 7км
152
+ 8км
153
+ 9км
154
+ 0га
155
+ 1га
156
+ 2га
157
+ 3га
158
+ 4га
159
+ 5га
160
+ 6га
161
+ 7га
162
+ 8га
163
+ 9га
164
+ 0кг
165
+ 1кг
166
+ 2кг
167
+ 3кг
168
+ 4кг
169
+ 5кг
170
+ 6кг
171
+ 7кг
172
+ 8кг
173
+ 9кг
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+ 0мг
195
+ 1мг
196
+ 2мг
197
+ 3мг
198
+ 4мг
199
+ 5мг
200
+ 6мг
201
+ 7мг
202
+ 8мг
203
+ 9мг
204
+ бульв
205
+ в
206
+ вв
207
+ г
208
+ га
209
+ гг
210
+ гл
211
+ гос
212
+ д
213
+ дм
214
+ доп
215
+ др
216
+ е
217
+ ед
218
+ ед
219
+ зам
220
+ и
221
+ инд
222
+ исп
223
+ Исп
224
+ к
225
+ кап
226
+ кг
227
+ кв
228
+ кл
229
+ км
230
+ кол
231
+ комн
232
+ коп
233
+ куб
234
+ л
235
+ лиц
236
+ лл
237
+ м
238
+ макс
239
+ мг
240
+ мин
241
+ мл
242
+ млн
243
+ млрд
244
+ мм
245
+ н
246
+ наб
247
+ нач
248
+ неуд
249
+ ном
250
+ о
251
+ обл
252
+ обр
253
+ общ
254
+ ок
255
+ ост
256
+ отл
257
+ п
258
+ пер
259
+ перераб
260
+ пл
261
+ пос
262
+ пр
263
+ просп
264
+ проф
265
+ р
266
+ ред
267
+ руб
268
+ с
269
+ сб
270
+ св
271
+ см
272
+ соч
273
+ ср
274
+ ст
275
+ стр
276
+ т
277
+ тел
278
+ Тел
279
+ тех
280
+ тт
281
+ туп
282
+ тыс
283
+ уд
284
+ ул
285
+ уч
286
+ физ
287
+ х
288
+ хор
289
+ ч
290
+ чел
291
+ шт
292
+ экз
293
+ э
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sk ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Bc
2
+ Mgr
3
+ RNDr
4
+ PharmDr
5
+ PhDr
6
+ JUDr
7
+ PaedDr
8
+ ThDr
9
+ Ing
10
+ MUDr
11
+ MDDr
12
+ MVDr
13
+ Dr
14
+ ThLic
15
+ PhD
16
+ ArtD
17
+ ThDr
18
+ Dr
19
+ DrSc
20
+ CSs
21
+ prof
22
+ obr
23
+ Obr
24
+ Č
25
+ č
26
+ absol
27
+ adj
28
+ admin
29
+ adr
30
+ Adr
31
+ adv
32
+ advok
33
+ afr
34
+ ak
35
+ akad
36
+ akc
37
+ akuz
38
+ et
39
+ al
40
+ alch
41
+ amer
42
+ anat
43
+ angl
44
+ Angl
45
+ anglosas
46
+ anorg
47
+ ap
48
+ apod
49
+ arch
50
+ archeol
51
+ archit
52
+ arg
53
+ art
54
+ astr
55
+ astrol
56
+ astron
57
+ atp
58
+ atď
59
+ austr
60
+ Austr
61
+ aut
62
+ belg
63
+ Belg
64
+ bibl
65
+ Bibl
66
+ biol
67
+ bot
68
+ bud
69
+ bás
70
+ býv
71
+ cest
72
+ chem
73
+ cirk
74
+ csl
75
+ čs
76
+ Čs
77
+ dat
78
+ dep
79
+ det
80
+ dial
81
+ diaľ
82
+ dipl
83
+ distrib
84
+ dokl
85
+ dosl
86
+ dopr
87
+ dram
88
+ duš
89
+ dv
90
+ dvojčl
91
+ dór
92
+ ekol
93
+ ekon
94
+ el
95
+ elektr
96
+ elektrotech
97
+ energet
98
+ epic
99
+ est
100
+ etc
101
+ etonym
102
+ eufem
103
+ európ
104
+ Európ
105
+ ev
106
+ evid
107
+ expr
108
+ fa
109
+ fam
110
+ farm
111
+ fem
112
+ feud
113
+ fil
114
+ filat
115
+ filoz
116
+ fi
117
+ fon
118
+ form
119
+ fot
120
+ fr
121
+ Fr
122
+ franc
123
+ Franc
124
+ fraz
125
+ fut
126
+ fyz
127
+ fyziol
128
+ garb
129
+ gen
130
+ genet
131
+ genpor
132
+ geod
133
+ geogr
134
+ geol
135
+ geom
136
+ germ
137
+ gr
138
+ Gr
139
+ gréc
140
+ Gréc
141
+ gréckokat
142
+ hebr
143
+ herald
144
+ hist
145
+ hlav
146
+ hosp
147
+ hromad
148
+ hud
149
+ hypok
150
+ ident
151
+ i.e
152
+ ident
153
+ imp
154
+ impf
155
+ indoeur
156
+ inf
157
+ inform
158
+ instr
159
+ int
160
+ interj
161
+ inšt
162
+ inštr
163
+ iron
164
+ jap
165
+ Jap
166
+ jaz
167
+ jedn
168
+ juhoamer
169
+ juhových
170
+ juhozáp
171
+ juž
172
+ kanad
173
+ Kanad
174
+ kanc
175
+ kapit
176
+ kpt
177
+ kart
178
+ katastr
179
+ knih
180
+ kniž
181
+ komp
182
+ konj
183
+ konkr
184
+ kozmet
185
+ krajč
186
+ kresť
187
+ kt
188
+ kuch
189
+ lat
190
+ latinskoamer
191
+ lek
192
+ lex
193
+ lingv
194
+ lit
195
+ litur
196
+ log
197
+ lok
198
+ max
199
+ Max
200
+ maď
201
+ Maď
202
+ medzinár
203
+ mest
204
+ metr
205
+ mil
206
+ Mil
207
+ min
208
+ Min
209
+ miner
210
+ ml
211
+ mld
212
+ mn
213
+ mod
214
+ mytol
215
+ napr
216
+ nar
217
+ Nar
218
+ nasl
219
+ nedok
220
+ neg
221
+ negat
222
+ neklas
223
+ nem
224
+ Nem
225
+ neodb
226
+ neos
227
+ neskl
228
+ nesklon
229
+ nespis
230
+ nespráv
231
+ neved
232
+ než
233
+ niekt
234
+ niž
235
+ nom
236
+ náb
237
+ nákl
238
+ námor
239
+ nár
240
+ obch
241
+ obj
242
+ obv
243
+ obyč
244
+ obč
245
+ občian
246
+ odb
247
+ odd
248
+ ods
249
+ ojed
250
+ okr
251
+ Okr
252
+ opt
253
+ opyt
254
+ org
255
+ os
256
+ osob
257
+ ot
258
+ ovoc
259
+ par
260
+ part
261
+ pejor
262
+ pers
263
+ pf
264
+ Pf
265
+ P.f
266
+ p.f
267
+ pl
268
+ Plk
269
+ pod
270
+ podst
271
+ pokl
272
+ polit
273
+ politol
274
+ polygr
275
+ pomn
276
+ popl
277
+ por
278
+ porad
279
+ porov
280
+ posch
281
+ potrav
282
+ použ
283
+ poz
284
+ pozit
285
+ poľ
286
+ poľno
287
+ poľnohosp
288
+ poľov
289
+ pošt
290
+ pož
291
+ prac
292
+ predl
293
+ pren
294
+ prep
295
+ preuk
296
+ priezv
297
+ Priezv
298
+ privl
299
+ prof
300
+ práv
301
+ príd
302
+ príj
303
+ prík
304
+ príp
305
+ prír
306
+ prísl
307
+ príslov
308
+ príč
309
+ psych
310
+ publ
311
+ pís
312
+ písm
313
+ pôv
314
+ refl
315
+ reg
316
+ rep
317
+ resp
318
+ rozk
319
+ rozlič
320
+ rozpráv
321
+ roč
322
+ Roč
323
+ ryb
324
+ rádiotech
325
+ rím
326
+ samohl
327
+ semest
328
+ sev
329
+ severoamer
330
+ severových
331
+ severozáp
332
+ sg
333
+ skr
334
+ skup
335
+ sl
336
+ Sloven
337
+ soc
338
+ soch
339
+ sociol
340
+ sp
341
+ spol
342
+ Spol
343
+ spoloč
344
+ spoluhl
345
+ správ
346
+ spôs
347
+ st
348
+ star
349
+ starogréc
350
+ starorím
351
+ s.r.o
352
+ stol
353
+ stor
354
+ str
355
+ stredoamer
356
+ stredoškol
357
+ subj
358
+ subst
359
+ superl
360
+ sv
361
+ sz
362
+ súkr
363
+ súp
364
+ súvzť
365
+ tal
366
+ Tal
367
+ tech
368
+ tel
369
+ Tel
370
+ telef
371
+ teles
372
+ telev
373
+ teol
374
+ trans
375
+ turist
376
+ tuzem
377
+ typogr
378
+ tzn
379
+ tzv
380
+ ukaz
381
+ ul
382
+ Ul
383
+ umel
384
+ univ
385
+ ust
386
+ ved
387
+ vedľ
388
+ verb
389
+ veter
390
+ vin
391
+ viď
392
+ vl
393
+ vod
394
+ vodohosp
395
+ pnl
396
+ vulg
397
+ vyj
398
+ vys
399
+ vysokoškol
400
+ vzťaž
401
+ vôb
402
+ vých
403
+ výd
404
+ výrob
405
+ výsk
406
+ výsl
407
+ výtv
408
+ výtvar
409
+ význ
410
+ včel
411
+
412
+ všeob
413
+ zahr
414
+ zar
415
+ zariad
416
+ zast
417
+ zastar
418
+ zastaráv
419
+ zb
420
+ zdravot
421
+ združ
422
+ zjemn
423
+ zlat
424
+ zn
425
+ Zn
426
+ zool
427
+ zr
428
+ zried
429
+ zv
430
+ záhr
431
+ zák
432
+ zákl
433
+ zám
434
+ záp
435
+ západoeur
436
+ zázn
437
+ územ
438
+ účt
439
+ čast
440
+ čes
441
+ Čes
442
+ čl
443
+ čísl
444
+ živ
445
+ pr
446
+ fak
447
+ Kr
448
+ p.n.l
449
+ A
450
+ B
451
+ C
452
+ D
453
+ E
454
+ F
455
+ G
456
+ H
457
+ I
458
+ J
459
+ K
460
+ L
461
+ M
462
+ N
463
+ O
464
+ P
465
+ Q
466
+ R
467
+ S
468
+ T
469
+ U
470
+ V
471
+ W
472
+ X
473
+ Y
474
+ Z
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sl ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dr
2
+ Dr
3
+ itd
4
+ itn
5
+ št #NUMERIC_ONLY#
6
+ Št #NUMERIC_ONLY#
7
+ d
8
+ jan
9
+ Jan
10
+ feb
11
+ Feb
12
+ mar
13
+ Mar
14
+ apr
15
+ Apr
16
+ jun
17
+ Jun
18
+ jul
19
+ Jul
20
+ avg
21
+ Avg
22
+ sept
23
+ Sept
24
+ sep
25
+ Sep
26
+ okt
27
+ Okt
28
+ nov
29
+ Nov
30
+ dec
31
+ Dec
32
+ tj
33
+ Tj
34
+ npr
35
+ Npr
36
+ sl
37
+ Sl
38
+ op
39
+ Op
40
+ gl
41
+ Gl
42
+ oz
43
+ Oz
44
+ prev
45
+ dipl
46
+ ing
47
+ prim
48
+ Prim
49
+ cf
50
+ Cf
51
+ gl
52
+ Gl
53
+ A
54
+ B
55
+ C
56
+ D
57
+ E
58
+ F
59
+ G
60
+ H
61
+ I
62
+ J
63
+ K
64
+ L
65
+ M
66
+ N
67
+ O
68
+ P
69
+ Q
70
+ R
71
+ S
72
+ T
73
+ U
74
+ V
75
+ W
76
+ X
77
+ Y
78
+ Z
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.sv ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #single upper case letter are usually initials
2
+ A
3
+ B
4
+ C
5
+ D
6
+ E
7
+ F
8
+ G
9
+ H
10
+ I
11
+ J
12
+ K
13
+ L
14
+ M
15
+ N
16
+ O
17
+ P
18
+ Q
19
+ R
20
+ S
21
+ T
22
+ U
23
+ V
24
+ W
25
+ X
26
+ Y
27
+ Z
28
+ #misc abbreviations
29
+ AB
30
+ G
31
+ VG
32
+ dvs
33
+ etc
34
+ from
35
+ iaf
36
+ jfr
37
+ kl
38
+ kr
39
+ mao
40
+ mfl
41
+ mm
42
+ osv
43
+ pga
44
+ tex
45
+ tom
46
+ vs
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.ta ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+ கா
21
+ கி
22
+ கீ
23
+ கு
24
+ கூ
25
+ கெ
26
+ கே
27
+ கை
28
+ கொ
29
+ கோ
30
+ கௌ
31
+ க்
32
+
33
+ சா
34
+ சி
35
+ சீ
36
+ சு
37
+ சூ
38
+ செ
39
+ சே
40
+ சை
41
+ சொ
42
+ சோ
43
+ சௌ
44
+ ச்
45
+
46
+ டா
47
+ டி
48
+ டீ
49
+ டு
50
+ டூ
51
+ டெ
52
+ டே
53
+ டை
54
+ டொ
55
+ டோ
56
+ டௌ
57
+ ட்
58
+
59
+ தா
60
+ தி
61
+ தீ
62
+ து
63
+ தூ
64
+ தெ
65
+ தே
66
+ தை
67
+ தொ
68
+ தோ
69
+ தௌ
70
+ த்
71
+
72
+ பா
73
+ பி
74
+ பீ
75
+ பு
76
+ பூ
77
+ பெ
78
+ பே
79
+ பை
80
+ பொ
81
+ போ
82
+ பௌ
83
+ ப்
84
+
85
+ றா
86
+ றி
87
+ றீ
88
+ று
89
+ றூ
90
+ றெ
91
+ றே
92
+ றை
93
+ றொ
94
+ றோ
95
+ றௌ
96
+ ற்
97
+
98
+ யா
99
+ யி
100
+ யீ
101
+ யு
102
+ யூ
103
+ யெ
104
+ யே
105
+ யை
106
+ யொ
107
+ யோ
108
+ யௌ
109
+ ய்
110
+
111
+ ரா
112
+ ரி
113
+ ரீ
114
+ ரு
115
+ ரூ
116
+ ரெ
117
+ ரே
118
+ ரை
119
+ ரொ
120
+ ரோ
121
+ ரௌ
122
+ ர்
123
+
124
+ லா
125
+ லி
126
+ லீ
127
+ லு
128
+ லூ
129
+ லெ
130
+ லே
131
+ லை
132
+ லொ
133
+ லோ
134
+ லௌ
135
+ ல்
136
+
137
+ வா
138
+ வி
139
+ வீ
140
+ வு
141
+ வூ
142
+ வெ
143
+ வே
144
+ வை
145
+ வொ
146
+ வோ
147
+ வௌ
148
+ வ்
149
+
150
+ ளா
151
+ ளி
152
+ ளீ
153
+ ளு
154
+ ளூ
155
+ ளெ
156
+ ளே
157
+ ளை
158
+ ளொ
159
+ ளோ
160
+ ளௌ
161
+ ள்
162
+
163
+ ழா
164
+ ழி
165
+ ழீ
166
+ ழு
167
+ ழூ
168
+ ழெ
169
+ ழே
170
+ ழை
171
+ ழொ
172
+ ழோ
173
+ ழௌ
174
+ ழ்
175
+
176
+ ஙா
177
+ ஙி
178
+ ஙீ
179
+ ஙு
180
+ ஙூ
181
+ ஙெ
182
+ ஙே
183
+ ஙை
184
+ ஙொ
185
+ ஙோ
186
+ ஙௌ
187
+ ங்
188
+
189
+ ஞா
190
+ ஞி
191
+ ஞீ
192
+ ஞு
193
+ ஞூ
194
+ ஞெ
195
+ ஞே
196
+ ஞை
197
+ ஞொ
198
+ ஞோ
199
+ ஞௌ
200
+ ஞ்
201
+
202
+ ணா
203
+ ணி
204
+ ணீ
205
+ ணு
206
+ ணூ
207
+ ணெ
208
+ ணே
209
+ ணை
210
+ ணொ
211
+ ணோ
212
+ ணௌ
213
+ ண்
214
+
215
+ நா
216
+ நி
217
+ நீ
218
+ நு
219
+ நூ
220
+ நெ
221
+ நே
222
+ நை
223
+ நொ
224
+ நோ
225
+ நௌ
226
+ ந்
227
+
228
+ மா
229
+ மி
230
+ மீ
231
+ மு
232
+ மூ
233
+ மெ
234
+ மே
235
+ மை
236
+ மொ
237
+ மோ
238
+ மௌ
239
+ ம்
240
+
241
+ னா
242
+ னி
243
+ னீ
244
+ னு
245
+ னூ
246
+ னெ
247
+ னே
248
+ னை
249
+ னொ
250
+ னோ
251
+ னௌ
252
+ ன்
253
+
254
+
255
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
256
+ திரு
257
+ திருமதி
258
+ வண
259
+ கௌரவ
260
+
261
+
262
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
263
+ உ.ம்
264
+ #கா.ம்
265
+ #எ.ம்
266
+
267
+
268
+ #Numbers only. These should only induce breaks when followed by a numeric sequence
269
+ # add NUMERIC_ONLY after the word for this function
270
+ #This case is mostly for the english "No." which can either be a sentence of its own, or
271
+ #if followed by a number, a non-breaking prefix
272
+ No #NUMERIC_ONLY#
273
+ Nos
274
+ Art #NUMERIC_ONLY#
275
+ Nr
276
+ pp #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.yue ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Cantonese (Chinese)
3
+ #
4
+ # Anything in this file, followed by a period,
5
+ # does NOT indicate an end-of-sentence marker.
6
+ #
7
+ # English/Euro-language given-name initials (appearing in
8
+ # news, periodicals, etc.)
9
+ A
10
+ Ā
11
+ B
12
+ C
13
+ Č
14
+ D
15
+ E
16
+ Ē
17
+ F
18
+ G
19
+ Ģ
20
+ H
21
+ I
22
+ Ī
23
+ J
24
+ K
25
+ Ķ
26
+ L
27
+ Ļ
28
+ M
29
+ N
30
+ Ņ
31
+ O
32
+ P
33
+ Q
34
+ R
35
+ S
36
+ Š
37
+ T
38
+ U
39
+ Ū
40
+ V
41
+ W
42
+ X
43
+ Y
44
+ Z
45
+ Ž
46
+
47
+ # Numbers only. These should only induce breaks when followed by
48
+ # a numeric sequence.
49
+ # Add NUMERIC_ONLY after the word for this function. This case is
50
+ # mostly for the english "No." which can either be a sentence of its
51
+ # own, or if followed by a number, a non-breaking prefix.
52
+ No #NUMERIC_ONLY#
53
+ Nr #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/share/nonbreaking_prefixes/nonbreaking_prefix.zh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Mandarin (Chinese)
3
+ #
4
+ # Anything in this file, followed by a period,
5
+ # does NOT indicate an end-of-sentence marker.
6
+ #
7
+ # English/Euro-language given-name initials (appearing in
8
+ # news, periodicals, etc.)
9
+ A
10
+ Ā
11
+ B
12
+ C
13
+ Č
14
+ D
15
+ E
16
+ Ē
17
+ F
18
+ G
19
+ Ģ
20
+ H
21
+ I
22
+ Ī
23
+ J
24
+ K
25
+ Ķ
26
+ L
27
+ Ļ
28
+ M
29
+ N
30
+ Ņ
31
+ O
32
+ P
33
+ Q
34
+ R
35
+ S
36
+ Š
37
+ T
38
+ U
39
+ Ū
40
+ V
41
+ W
42
+ X
43
+ Y
44
+ Z
45
+ Ž
46
+
47
+ # Numbers only. These should only induce breaks when followed by
48
+ # a numeric sequence.
49
+ # Add NUMERIC_ONLY after the word for this function. This case is
50
+ # mostly for the english "No." which can either be a sentence of its
51
+ # own, or if followed by a number, a non-breaking prefix.
52
+ No #NUMERIC_ONLY#
53
+ Nr #NUMERIC_ONLY#
laser/tools-external/moses-tokenizer/tokenizer/basic-protected-patterns ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ <\/?\S+\/?>
2
+ <\S+( [a-zA-Z0-9]+\=\"?[^\"]\")+ ?\/?>
3
+ <\S+( [a-zA-Z0-9]+\=\'?[^\']\')+ ?\/?>
4
+ [\w\-\_\.]+\@([\w\-\_]+\.)+[a-zA-Z]{2,}
5
+ (http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+
laser/tools-external/moses-tokenizer/tokenizer/deescape-special-chars.perl ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+
9
+ while(<STDIN>) {
10
+ s/\&bar;/\|/g; # factor separator (legacy)
11
+ s/\&#124;/\|/g; # factor separator
12
+ s/\&lt;/\</g; # xml
13
+ s/\&gt;/\>/g; # xml
14
+ s/\&bra;/\[/g; # syntax non-terminal (legacy)
15
+ s/\&ket;/\]/g; # syntax non-terminal (legacy)
16
+ s/\&quot;/\"/g; # xml
17
+ s/\&apos;/\'/g; # xml
18
+ s/\&#91;/\[/g; # syntax non-terminal
19
+ s/\&#93;/\]/g; # syntax non-terminal
20
+ s/\&amp;/\&/g; # escape escape
21
+ print $_;
22
+ }
laser/tools-external/moses-tokenizer/tokenizer/detokenizer.perl ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ # $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
4
+ # Sample De-Tokenizer
5
+ # written by Josh Schroeder, based on code by Philipp Koehn
6
+ # further modifications by Ondrej Bojar
7
+ #
8
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
9
+ # Public License version 2.1 or, at your option, any later version.
10
+
11
+ binmode(STDIN, ":utf8");
12
+ binmode(STDOUT, ":utf8");
13
+
14
+ use warnings;
15
+ use strict;
16
+ use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
17
+
18
+ my $language = "en";
19
+ my $QUIET = 0;
20
+ my $HELP = 0;
21
+ my $UPPERCASE_SENT = 0;
22
+ my $PENN = 0;
23
+
24
+ while (@ARGV) {
25
+ $_ = shift;
26
+ /^-b$/ && ($| = 1, next);
27
+ /^-l$/ && ($language = shift, next);
28
+ /^-q$/ && ($QUIET = 1, next);
29
+ /^-h$/ && ($HELP = 1, next);
30
+ /^-u$/ && ($UPPERCASE_SENT = 1, next);
31
+ /^-penn$/ && ($PENN = 1, next);
32
+ }
33
+
34
+ if ($HELP) {
35
+ print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
36
+ print "Options:\n";
37
+ print " -u ... uppercase the first char in the final sentence.\n";
38
+ print " -q ... don't report detokenizer revision.\n";
39
+ print " -b ... disable Perl buffering.\n";
40
+ print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
41
+ exit;
42
+ }
43
+
44
+ if ($language !~ /^(cs|en|fr|it|fi)$/) {
45
+ print STDERR "Warning: No built-in rules for language $language.\n"
46
+ }
47
+
48
+ if ($PENN && $language ne "en") {
49
+ print STDERR "Error: -penn option only supported for English text.\n";
50
+ exit;
51
+ }
52
+
53
+ if (!$QUIET) {
54
+ print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
55
+ print STDERR "Language: $language\n";
56
+ }
57
+
58
+ while(<STDIN>) {
59
+ if (/^<.+>$/ || /^\s*$/) {
60
+ #don't try to detokenize XML/HTML tag lines
61
+ print $_;
62
+ } elsif ($PENN) {
63
+ print &detokenize_penn($_);
64
+ } else {
65
+ print &detokenize($_);
66
+ }
67
+ }
68
+
69
+
70
+ sub ucsecondarg {
71
+ # uppercase the second argument
72
+ my $arg1 = shift;
73
+ my $arg2 = shift;
74
+ return $arg1.uc($arg2);
75
+ }
76
+
77
+ sub deescape {
78
+ # de-escape special chars
79
+ my ($text) = @_;
80
+ $text =~ s/\&bar;/\|/g; # factor separator (legacy)
81
+ $text =~ s/\&#124;/\|/g; # factor separator
82
+ $text =~ s/\&lt;/\</g; # xml
83
+ $text =~ s/\&gt;/\>/g; # xml
84
+ $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy)
85
+ $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy)
86
+ $text =~ s/\&quot;/\"/g; # xml
87
+ $text =~ s/\&apos;/\'/g; # xml
88
+ $text =~ s/\&#91;/\[/g; # syntax non-terminal
89
+ $text =~ s/\&#93;/\]/g; # syntax non-terminal
90
+ $text =~ s/\&amp;/\&/g; # escape escape
91
+ return $text;
92
+ }
93
+
94
+ sub detokenize {
95
+ my($text) = @_;
96
+ chomp($text);
97
+ $text = " $text ";
98
+ $text =~ s/ \@\-\@ /-/g;
99
+ $text = &deescape($text);
100
+
101
+ my $word;
102
+ my $i;
103
+ my @words = split(/ /,$text);
104
+ $text = "";
105
+ my %quoteCount = ("\'"=>0,"\""=>0);
106
+ my $prependSpace = " ";
107
+ for ($i=0;$i<(scalar(@words));$i++) {
108
+ if (&startsWithCJKChar($words[$i])) {
109
+ if ($i > 0 && &endsWithCJKChar($words[$i-1])) {
110
+ # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word
111
+ $text=$text.$words[$i];
112
+ } else {
113
+ # ... but do nothing special if this is a CJK word that doesn't follow a CJK word
114
+ $text=$text.$prependSpace.$words[$i];
115
+ }
116
+ $prependSpace = " ";
117
+ } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
118
+ #perform right shift on currency and other random punctuation items
119
+ $text = $text.$prependSpace.$words[$i];
120
+ $prependSpace = "";
121
+ } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
122
+ if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) {
123
+ #these punctuations are prefixed with a non-breakable space in french
124
+ $text .= " "; }
125
+ #perform left shift on punctuation items
126
+ $text=$text.$words[$i];
127
+ $prependSpace = " ";
128
+ } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
129
+ #left-shift the contraction for English
130
+ $text=$text.$words[$i];
131
+ $prependSpace = " ";
132
+ } elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) {
133
+ #left-shift floats in Czech
134
+ $text=$text.$words[$i];
135
+ $prependSpace = " ";
136
+ } elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
137
+ #right-shift the contraction for French and Italian
138
+ $text = $text.$prependSpace.$words[$i];
139
+ $prependSpace = "";
140
+ } elsif (($language eq "cs") && ($i<(scalar(@words)-3))
141
+ && ($words[$i] =~ /[\p{IsAlpha}]$/)
142
+ && ($words[$i+1] =~ /^[-–]$/)
143
+ && ($words[$i+2] =~ /^li$|^mail.*/i)
144
+ ) {
145
+ #right-shift "-li" in Czech and a few Czech dashed words (e-mail)
146
+ $text = $text.$prependSpace.$words[$i].$words[$i+1];
147
+ $i++; # advance over the dash
148
+ $prependSpace = "";
149
+ } elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
150
+ #combine punctuation smartly
151
+ my $normalized_quo = $words[$i];
152
+ $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
153
+ $quoteCount{$normalized_quo} = 0
154
+ if !defined $quoteCount{$normalized_quo};
155
+ if ($language eq "cs" && $words[$i] eq "„") {
156
+ # this is always the starting quote in Czech
157
+ $quoteCount{$normalized_quo} = 0;
158
+ }
159
+ if ($language eq "cs" && $words[$i] eq "“") {
160
+ # this is usually the ending quote in Czech
161
+ $quoteCount{$normalized_quo} = 1;
162
+ }
163
+ if (($quoteCount{$normalized_quo} % 2) eq 0) {
164
+ if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
165
+ #single quote for posesssives ending in s... "The Jones' house"
166
+ #left shift
167
+ $text=$text.$words[$i];
168
+ $prependSpace = " ";
169
+ } else {
170
+ #right shift
171
+ $text = $text.$prependSpace.$words[$i];
172
+ $prependSpace = "";
173
+ $quoteCount{$normalized_quo} ++;
174
+
175
+ }
176
+ } else {
177
+ #left shift
178
+ $text=$text.$words[$i];
179
+ $prependSpace = " ";
180
+ $quoteCount{$normalized_quo} ++;
181
+
182
+ }
183
+
184
+ } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) {
185
+ # Finnish : without intervening space if followed by case suffix
186
+ # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
187
+ $text=$text. lc $words[$i];
188
+ $prependSpace = " ";
189
+ } else {
190
+ $text=$text.$prependSpace.$words[$i];
191
+ $prependSpace = " ";
192
+ }
193
+ }
194
+
195
+ # clean up spaces at head and tail of each line as well as any double-spacing
196
+ $text =~ s/ +/ /g;
197
+ $text =~ s/\n /\n/g;
198
+ $text =~ s/ \n/\n/g;
199
+ $text =~ s/^ //g;
200
+ $text =~ s/ $//g;
201
+
202
+ #add trailing break
203
+ $text .= "\n" unless $text =~ /\n$/;
204
+
205
+ $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
206
+
207
+ return $text;
208
+ }
209
+
210
+ sub detokenize_penn {
211
+ my($text) = @_;
212
+
213
+ chomp($text);
214
+ $text = " $text ";
215
+ $text =~ s/ \@\-\@ /-/g;
216
+ $text =~ s/ \@\/\@ /\//g;
217
+ $text = &deescape($text);
218
+
219
+ # merge de-contracted forms except where the second word begins with an
220
+ # apostrophe (those are handled later)
221
+ $text =~ s/ n't /n't /g;
222
+ $text =~ s/ N'T /N'T /g;
223
+ $text =~ s/ ([Cc])an not / $1annot /g;
224
+ $text =~ s/ ([Dd])' ye / $1'ye /g;
225
+ $text =~ s/ ([Gg])im me / $1imme /g;
226
+ $text =~ s/ ([Gg])on na / $1onna /g;
227
+ $text =~ s/ ([Gg])ot ta / $1otta /g;
228
+ $text =~ s/ ([Ll])em me / $1emme /g;
229
+ $text =~ s/ '([Tt]) is / '$1is /g;
230
+ $text =~ s/ '([Tt]) was / '$1was /g;
231
+ $text =~ s/ ([Ww])an na / $1anna /g;
232
+
233
+ # restore brackets
234
+ $text =~ s/-LRB-/\(/g;
235
+ $text =~ s/-RRB-/\)/g;
236
+ $text =~ s/-LSB-/\[/g;
237
+ $text =~ s/-RSB-/\]/g;
238
+ $text =~ s/-LCB-/{/g;
239
+ $text =~ s/-RCB-/}/g;
240
+
241
+ my $i;
242
+ my @words = split(/ /,$text);
243
+ $text = "";
244
+ my $prependSpace = " ";
245
+ for ($i=0;$i<(scalar(@words));$i++) {
246
+ if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
247
+ # perform right shift on currency and other random punctuation items
248
+ $text = $text.$prependSpace.$words[$i];
249
+ $prependSpace = "";
250
+ } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
251
+ # perform left shift on punctuation items
252
+ $text=$text.$words[$i];
253
+ $prependSpace = " ";
254
+ } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
255
+ # left-shift the contraction
256
+ $text=$text.$words[$i];
257
+ $prependSpace = " ";
258
+ } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only
259
+ # opening single quote: convert to straight quote and right-shift
260
+ $text = $text.$prependSpace."\'";
261
+ $prependSpace = "";
262
+ } elsif ($words[$i] eq "``") {
263
+ # opening double quote: convert to straight quote and right-shift
264
+ $text = $text.$prependSpace."\"";
265
+ $prependSpace = "";
266
+ } elsif ($words[$i] eq "\'") {
267
+ # closing single quote: convert to straight quote and left shift
268
+ $text = $text."\'";
269
+ $prependSpace = " ";
270
+ } elsif ($words[$i] eq "\'\'") {
271
+ # closing double quote: convert to straight quote and left shift
272
+ $text = $text."\"";
273
+ $prependSpace = " ";
274
+ } else {
275
+ $text = $text.$prependSpace.$words[$i];
276
+ $prependSpace = " ";
277
+ }
278
+ }
279
+
280
+ # clean up spaces at head and tail of each line as well as any double-spacing
281
+ $text =~ s/ +/ /g;
282
+ $text =~ s/\n /\n/g;
283
+ $text =~ s/ \n/\n/g;
284
+ $text =~ s/^ //g;
285
+ $text =~ s/ $//g;
286
+
287
+ # add trailing break
288
+ $text .= "\n" unless $text =~ /\n$/;
289
+
290
+ $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
291
+
292
+ return $text;
293
+ }
294
+
295
+ sub startsWithCJKChar {
296
+ my ($str) = @_;
297
+ return 0 if length($str) == 0;
298
+ my $firstChar = substr($str, 0, 1);
299
+ return &charIsCJK($firstChar);
300
+ }
301
+
302
+ sub endsWithCJKChar {
303
+ my ($str) = @_;
304
+ return 0 if length($str) == 0;
305
+ my $lastChar = substr($str, length($str)-1, 1);
306
+ return &charIsCJK($lastChar);
307
+ }
308
+
309
+ # Given a string consisting of one character, returns true iff the character
310
+ # is a CJK (Chinese/Japanese/Korean) character
311
+ sub charIsCJK {
312
+ my ($char) = @_;
313
+ # $char should be a string of length 1
314
+ my $codepoint = &codepoint_dec($char);
315
+
316
+ # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
317
+
318
+ # Hangul Jamo (1100–11FF)
319
+ return 1 if (&between_hexes($codepoint, '1100', '11FF'));
320
+
321
+ # CJK Radicals Supplement (2E80–2EFF)
322
+ # Kangxi Radicals (2F00–2FDF)
323
+ # Ideographic Description Characters (2FF0–2FFF)
324
+ # CJK Symbols and Punctuation (3000–303F)
325
+ # Hiragana (3040–309F)
326
+ # Katakana (30A0–30FF)
327
+ # Bopomofo (3100–312F)
328
+ # Hangul Compatibility Jamo (3130–318F)
329
+ # Kanbun (3190–319F)
330
+ # Bopomofo Extended (31A0–31BF)
331
+ # CJK Strokes (31C0–31EF)
332
+ # Katakana Phonetic Extensions (31F0–31FF)
333
+ # Enclosed CJK Letters and Months (3200–32FF)
334
+ # CJK Compatibility (3300–33FF)
335
+ # CJK Unified Ideographs Extension A (3400–4DBF)
336
+ # Yijing Hexagram Symbols (4DC0–4DFF)
337
+ # CJK Unified Ideographs (4E00–9FFF)
338
+ # Yi Syllables (A000–A48F)
339
+ # Yi Radicals (A490–A4CF)
340
+ return 1 if (&between_hexes($codepoint, '2E80', 'A4CF'));
341
+
342
+ # Phags-pa (A840–A87F)
343
+ return 1 if (&between_hexes($codepoint, 'A840', 'A87F'));
344
+
345
+ # Hangul Syllables (AC00–D7AF)
346
+ return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF'));
347
+
348
+ # CJK Compatibility Ideographs (F900–FAFF)
349
+ return 1 if (&between_hexes($codepoint, 'F900', 'FAFF'));
350
+
351
+ # CJK Compatibility Forms (FE30–FE4F)
352
+ return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F'));
353
+
354
+ # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
355
+ return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC'));
356
+
357
+ # Supplementary Ideographic Plane 20000–2FFFF
358
+ return 1 if (&between_hexes($codepoint, '20000', '2FFFF'));
359
+
360
+ return 0;
361
+ }
362
+
363
+ # Returns the code point of a Unicode char, represented as a decimal number
364
+ sub codepoint_dec {
365
+ if (my $char = shift) {
366
+ return unpack('U0U*', $char);
367
+ }
368
+ }
369
+
370
+ sub between_hexes {
371
+ my ($num, $left, $right) = @_;
372
+ return $num >= hex($left) && $num <= hex($right);
373
+ }
laser/tools-external/moses-tokenizer/tokenizer/lowercase.perl ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+
9
+ binmode(STDIN, ":utf8");
10
+ binmode(STDOUT, ":utf8");
11
+
12
+ while(<STDIN>) {
13
+ print lc($_);
14
+ }
laser/tools-external/moses-tokenizer/tokenizer/normalize-punctuation.perl ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use strict;
8
+
9
+ my $language = "en";
10
+ my $PENN = 0;
11
+
12
+ while (@ARGV) {
13
+ $_ = shift;
14
+ /^-b$/ && ($| = 1, next); # not buffered (flush each line)
15
+ /^-l$/ && ($language = shift, next);
16
+ /^[^\-]/ && ($language = $_, next);
17
+ /^-penn$/ && ($PENN = 1, next);
18
+ }
19
+
20
+ while(<STDIN>) {
21
+ s/\r//g;
22
+ # remove extra spaces
23
+ s/\(/ \(/g;
24
+ s/\)/\) /g; s/ +/ /g;
25
+ s/\) ([\.\!\:\?\;\,])/\)$1/g;
26
+ s/\( /\(/g;
27
+ s/ \)/\)/g;
28
+ s/(\d) \%/$1\%/g;
29
+ s/ :/:/g;
30
+ s/ ;/;/g;
31
+ # normalize unicode punctuation
32
+ if ($PENN == 0) {
33
+ s/\`/\'/g;
34
+ s/\'\'/ \" /g;
35
+ }
36
+
37
+ s/„/\"/g;
38
+ s/“/\"/g;
39
+ s/”/\"/g;
40
+ s/–/-/g;
41
+ s/—/ - /g; s/ +/ /g;
42
+ s/´/\'/g;
43
+ s/([a-z])‘([a-z])/$1\'$2/gi;
44
+ s/([a-z])’([a-z])/$1\'$2/gi;
45
+ s/‘/\"/g;
46
+ s/‚/\"/g;
47
+ s/’/\"/g;
48
+ s/''/\"/g;
49
+ s/´´/\"/g;
50
+ s/…/.../g;
51
+ # French quotes
52
+ s/ « / \"/g;
53
+ s/« /\"/g;
54
+ s/«/\"/g;
55
+ s/ » /\" /g;
56
+ s/ »/\"/g;
57
+ s/»/\"/g;
58
+ # handle pseudo-spaces
59
+ s/ \%/\%/g;
60
+ s/nº /nº /g;
61
+ s/ :/:/g;
62
+ s/ ºC/ ºC/g;
63
+ s/ cm/ cm/g;
64
+ s/ \?/\?/g;
65
+ s/ \!/\!/g;
66
+ s/ ;/;/g;
67
+ s/, /, /g; s/ +/ /g;
68
+
69
+ # English "quotation," followed by comma, style
70
+ if ($language eq "en") {
71
+ s/\"([,\.]+)/$1\"/g;
72
+ }
73
+ # Czech is confused
74
+ elsif ($language eq "cs" || $language eq "cz") {
75
+ }
76
+ # German/Spanish/French "quotation", followed by comma, style
77
+ else {
78
+ s/,\"/\",/g;
79
+ s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
80
+ }
81
+
82
+
83
+ if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") {
84
+ s/(\d) (\d)/$1,$2/g;
85
+ }
86
+ else {
87
+ s/(\d) (\d)/$1.$2/g;
88
+ }
89
+ print $_;
90
+ }
laser/tools-external/moses-tokenizer/tokenizer/remove-non-printing-char.perl ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+ use utf8;
8
+
9
+ binmode(STDIN, ":utf8");
10
+ binmode(STDOUT, ":utf8");
11
+ binmode(STDERR, ":utf8");
12
+
13
+ while (my $line = <STDIN>) {
14
+ chomp($line);
15
+ #$line =~ tr/\040-\176/ /c;
16
+ #$line =~ s/[^[:print:]]/ /g;
17
+ #$line =~ s/\s+/ /g;
18
+ $line =~ s/\p{C}/ /g;
19
+
20
+ print "$line\n";
21
+ }
22
+
laser/tools-external/moses-tokenizer/tokenizer/tokenizer.perl ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+ #
3
+ # This file is part of moses. Its use is licensed under the GNU Lesser General
4
+ # Public License version 2.1 or, at your option, any later version.
5
+
6
+ use warnings;
7
+
8
+ # Sample Tokenizer
9
+ ### Version 1.1
10
+ # written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
11
+ # Version 1.1 updates:
12
+ # (1) add multithreading option "-threads NUM_THREADS" (default is 1);
13
+ # (2) add a timing option "-time" to calculate the average speed of this tokenizer;
14
+ # (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
15
+ ### Version 1.0
16
+ # $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
17
+ # written by Josh Schroeder, based on code by Philipp Koehn
18
+
19
+ binmode(STDIN, ":utf8");
20
+ binmode(STDOUT, ":utf8");
21
+
22
+ use warnings;
23
+ use FindBin qw($RealBin);
24
+ use strict;
25
+ use Time::HiRes;
26
+
27
+ if (eval {require Thread;1;}) {
28
+ #module loaded
29
+ Thread->import();
30
+ }
31
+
32
+ my $mydir = "$RealBin/../share/nonbreaking_prefixes";
33
+
34
+ my %NONBREAKING_PREFIX = ();
35
+ my @protected_patterns = ();
36
+ my $protected_patterns_file = "";
37
+ my $language = "en";
38
+ my $QUIET = 0;
39
+ my $HELP = 0;
40
+ my $AGGRESSIVE = 0;
41
+ my $SKIP_XML = 0;
42
+ my $TIMING = 0;
43
+ my $NUM_THREADS = 1;
44
+ my $NUM_SENTENCES_PER_THREAD = 2000;
45
+ my $PENN = 0;
46
+ my $NO_ESCAPING = 0;
47
+ while (@ARGV)
48
+ {
49
+ $_ = shift;
50
+ /^-b$/ && ($| = 1, next);
51
+ /^-l$/ && ($language = shift, next);
52
+ /^-q$/ && ($QUIET = 1, next);
53
+ /^-h$/ && ($HELP = 1, next);
54
+ /^-x$/ && ($SKIP_XML = 1, next);
55
+ /^-a$/ && ($AGGRESSIVE = 1, next);
56
+ /^-time$/ && ($TIMING = 1, next);
57
+ # Option to add list of regexps to be protected
58
+ /^-protected/ && ($protected_patterns_file = shift, next);
59
+ /^-threads$/ && ($NUM_THREADS = int(shift), next);
60
+ /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
61
+ /^-penn$/ && ($PENN = 1, next);
62
+ /^-no-escape/ && ($NO_ESCAPING = 1, next);
63
+ }
64
+
65
+ # for time calculation
66
+ my $start_time;
67
+ if ($TIMING)
68
+ {
69
+ $start_time = [ Time::HiRes::gettimeofday( ) ];
70
+ }
71
+
72
+ # print help message
73
+ if ($HELP)
74
+ {
75
+ print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
76
+ print "Options:\n";
77
+ print " -q ... quiet.\n";
78
+ print " -a ... aggressive hyphen splitting.\n";
79
+ print " -b ... disable Perl buffering.\n";
80
+ print " -time ... enable processing time calculation.\n";
81
+ print " -penn ... use Penn treebank-like tokenization.\n";
82
+ print " -protected FILE ... specify file with patters to be protected in tokenisation.\n";
83
+ print " -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
84
+ exit;
85
+ }
86
+
87
+ if (!$QUIET)
88
+ {
89
+ print STDERR "Tokenizer Version 1.1\n";
90
+ print STDERR "Language: $language\n";
91
+ print STDERR "Number of threads: $NUM_THREADS\n";
92
+ }
93
+
94
+ # load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
95
+ load_prefixes($language,\%NONBREAKING_PREFIX);
96
+
97
+ if (scalar(%NONBREAKING_PREFIX) eq 0)
98
+ {
99
+ print STDERR "Warning: No known abbreviations for language '$language'\n";
100
+ }
101
+
102
+ # Load protected patterns
103
+ if ($protected_patterns_file)
104
+ {
105
+ open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file";
106
+ while(<PP>) {
107
+ chomp;
108
+ push @protected_patterns, $_;
109
+ }
110
+ }
111
+
112
+ my @batch_sentences = ();
113
+ my @thread_list = ();
114
+ my $count_sentences = 0;
115
+
116
+ if ($NUM_THREADS > 1)
117
+ {# multi-threading tokenization
118
+ while(<STDIN>)
119
+ {
120
+ $count_sentences = $count_sentences + 1;
121
+ push(@batch_sentences, $_);
122
+ if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
123
+ {
124
+ # assign each thread work
125
+ for (my $i=0; $i<$NUM_THREADS; $i++)
126
+ {
127
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
128
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
129
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
130
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
131
+ push(@thread_list, $new_thread);
132
+ }
133
+ foreach (@thread_list)
134
+ {
135
+ my $tokenized_list = $_->join;
136
+ foreach (@$tokenized_list)
137
+ {
138
+ print $_;
139
+ }
140
+ }
141
+ # reset for the new run
142
+ @thread_list = ();
143
+ @batch_sentences = ();
144
+ }
145
+ }
146
+ # the last batch
147
+ if (scalar(@batch_sentences)>0)
148
+ {
149
+ # assign each thread work
150
+ for (my $i=0; $i<$NUM_THREADS; $i++)
151
+ {
152
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
153
+ if ($start_index >= scalar(@batch_sentences))
154
+ {
155
+ last;
156
+ }
157
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
158
+ if ($end_index >= scalar(@batch_sentences))
159
+ {
160
+ $end_index = scalar(@batch_sentences)-1;
161
+ }
162
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
163
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
164
+ push(@thread_list, $new_thread);
165
+ }
166
+ foreach (@thread_list)
167
+ {
168
+ my $tokenized_list = $_->join;
169
+ foreach (@$tokenized_list)
170
+ {
171
+ print $_;
172
+ }
173
+ }
174
+ }
175
+ }
176
+ else
177
+ {# single thread only
178
+ while(<STDIN>)
179
+ {
180
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
181
+ {
182
+ #don't try to tokenize XML/HTML tag lines
183
+ print $_;
184
+ }
185
+ else
186
+ {
187
+ print &tokenize($_);
188
+ }
189
+ }
190
+ }
191
+
192
+ if ($TIMING)
193
+ {
194
+ my $duration = Time::HiRes::tv_interval( $start_time );
195
+ print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
196
+ print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
197
+ }
198
+
199
+ #####################################################################################
200
+ # subroutines afterward
201
+
202
+ # tokenize a batch of texts saved in an array
203
+ # input: an array containing a batch of texts
204
+ # return: another array containing a batch of tokenized texts for the input array
205
+ sub tokenize_batch
206
+ {
207
+ my(@text_list) = @_;
208
+ my(@tokenized_list) = ();
209
+ foreach (@text_list)
210
+ {
211
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
212
+ {
213
+ #don't try to tokenize XML/HTML tag lines
214
+ push(@tokenized_list, $_);
215
+ }
216
+ else
217
+ {
218
+ push(@tokenized_list, &tokenize($_));
219
+ }
220
+ }
221
+ return \@tokenized_list;
222
+ }
223
+
224
+ # the actual tokenize function which tokenizes one input string
225
+ # input: one string
226
+ # return: the tokenized string for the input string
227
+ sub tokenize
228
+ {
229
+ my($text) = @_;
230
+
231
+ if ($PENN) {
232
+ return tokenize_penn($text);
233
+ }
234
+
235
+ chomp($text);
236
+ $text = " $text ";
237
+
238
+ # remove ASCII junk
239
+ $text =~ s/\s+/ /g;
240
+ $text =~ s/[\000-\037]//g;
241
+
242
+ # Find protected patterns
243
+ my @protected = ();
244
+ foreach my $protected_pattern (@protected_patterns) {
245
+ my $t = $text;
246
+ while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
247
+ push @protected, $+{PATTERN};
248
+ $t = $+{TAIL};
249
+ }
250
+ }
251
+
252
+ for (my $i = 0; $i < scalar(@protected); ++$i) {
253
+ my $subst = sprintf("THISISPROTECTED%.3d", $i);
254
+ $text =~ s,\Q$protected[$i], $subst ,g;
255
+ }
256
+ $text =~ s/ +/ /g;
257
+ $text =~ s/^ //g;
258
+ $text =~ s/ $//g;
259
+
260
+ # seperate out all "other" special characters
261
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
262
+
263
+ # aggressive hyphen splitting
264
+ if ($AGGRESSIVE)
265
+ {
266
+ $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
267
+ }
268
+
269
+ #multi-dots stay together
270
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
271
+ while($text =~ /DOTMULTI\./)
272
+ {
273
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
274
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
275
+ }
276
+
277
+ # seperate out "," except if within numbers (5,300)
278
+ #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
279
+
280
+ # separate out "," except if within numbers (5,300)
281
+ # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E
282
+ # first application uses up B so rule can't see B,C
283
+ # two-step version here may create extra spaces but these are removed later
284
+ # will also space digit,letter or letter,digit forms (redundant with next section)
285
+ $text =~ s/([^\p{IsN}])[,]/$1 , /g;
286
+ $text =~ s/[,]([^\p{IsN}])/ , $1/g;
287
+
288
+ # separate "," after a number if it's the end of a sentence
289
+ $text =~ s/([\p{IsN}])[,]$/$1 ,/g;
290
+
291
+ # separate , pre and post number
292
+ #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
293
+ #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
294
+
295
+ # turn `into '
296
+ #$text =~ s/\`/\'/g;
297
+
298
+ #turn '' into "
299
+ #$text =~ s/\'\'/ \" /g;
300
+
301
+ if ($language eq "en")
302
+ {
303
+ #split contractions right
304
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
305
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
306
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
307
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
308
+ #special case for "1990's"
309
+ $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
310
+ }
311
+ elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga"))
312
+ {
313
+ #split contractions left
314
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
315
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
316
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
317
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
318
+ }
319
+ else
320
+ {
321
+ $text =~ s/\'/ \' /g;
322
+ }
323
+
324
+ #word token method
325
+ my @words = split(/\s/,$text);
326
+ $text = "";
327
+ for (my $i=0;$i<(scalar(@words));$i++)
328
+ {
329
+ my $word = $words[$i];
330
+ if ( $word =~ /^(\S+)\.$/)
331
+ {
332
+ my $pre = $1;
333
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
334
+ {
335
+ #no change
336
+ }
337
+ elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
338
+ {
339
+ #no change
340
+ }
341
+ else
342
+ {
343
+ $word = $pre." .";
344
+ }
345
+ }
346
+ $text .= $word." ";
347
+ }
348
+
349
+ # clean up extraneous spaces
350
+ $text =~ s/ +/ /g;
351
+ $text =~ s/^ //g;
352
+ $text =~ s/ $//g;
353
+
354
+ # .' at end of sentence is missed
355
+ $text =~ s/\.\' ?$/ . ' /;
356
+
357
+ # restore protected
358
+ for (my $i = 0; $i < scalar(@protected); ++$i) {
359
+ my $subst = sprintf("THISISPROTECTED%.3d", $i);
360
+ $text =~ s/$subst/$protected[$i]/g;
361
+ }
362
+
363
+ #restore multi-dots
364
+ while($text =~ /DOTDOTMULTI/)
365
+ {
366
+ $text =~ s/DOTDOTMULTI/DOTMULTI./g;
367
+ }
368
+ $text =~ s/DOTMULTI/./g;
369
+
370
+ #escape special chars
371
+ if (!$NO_ESCAPING)
372
+ {
373
+ $text =~ s/\&/\&amp;/g; # escape escape
374
+ $text =~ s/\|/\&#124;/g; # factor separator
375
+ $text =~ s/\</\&lt;/g; # xml
376
+ $text =~ s/\>/\&gt;/g; # xml
377
+ $text =~ s/\'/\&apos;/g; # xml
378
+ $text =~ s/\"/\&quot;/g; # xml
379
+ $text =~ s/\[/\&#91;/g; # syntax non-terminal
380
+ $text =~ s/\]/\&#93;/g; # syntax non-terminal
381
+ }
382
+
383
+ #ensure final line break
384
+ $text .= "\n" unless $text =~ /\n$/;
385
+
386
+ return $text;
387
+ }
388
+
389
+ sub tokenize_penn
390
+ {
391
+ # Improved compatibility with Penn Treebank tokenization. Useful if
392
+ # the text is to later be parsed with a PTB-trained parser.
393
+ #
394
+ # Adapted from Robert MacIntyre's sed script:
395
+ # http://www.cis.upenn.edu/~treebank/tokenizer.sed
396
+
397
+ my($text) = @_;
398
+ chomp($text);
399
+
400
+ # remove ASCII junk
401
+ $text =~ s/\s+/ /g;
402
+ $text =~ s/[\000-\037]//g;
403
+
404
+ # attempt to get correct directional quotes
405
+ $text =~ s/^``/`` /g;
406
+ $text =~ s/^"/`` /g;
407
+ $text =~ s/^`([^`])/` $1/g;
408
+ $text =~ s/^'/` /g;
409
+ $text =~ s/([ ([{<])"/$1 `` /g;
410
+ $text =~ s/([ ([{<])``/$1 `` /g;
411
+ $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
412
+ $text =~ s/([ ([{<])'/$1 ` /g;
413
+ # close quotes handled at end
414
+
415
+ $text =~ s=\.\.\.= _ELLIPSIS_ =g;
416
+
417
+ # separate out "," except if within numbers (5,300)
418
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
419
+ # separate , pre and post number
420
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
421
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
422
+
423
+ #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
424
+ $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
425
+
426
+ # Separate out intra-token slashes. PTB tokenization doesn't do this, so
427
+ # the tokens should be merged prior to parsing with a PTB-trained parser
428
+ # (see syntax-hyphen-splitting.perl).
429
+ $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
430
+
431
+ # Assume sentence tokenization has been done first, so split FINAL periods
432
+ # only.
433
+ $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
434
+ # however, we may as well split ALL question marks and exclamation points,
435
+ # since they shouldn't have the abbrev.-marker ambiguity problem
436
+ $text =~ s=([?!])= $1 =g;
437
+
438
+ # parentheses, brackets, etc.
439
+ $text =~ s=([\]\[\(\){}<>])= $1 =g;
440
+ $text =~ s/\(/-LRB-/g;
441
+ $text =~ s/\)/-RRB-/g;
442
+ $text =~ s/\[/-LSB-/g;
443
+ $text =~ s/\]/-RSB-/g;
444
+ $text =~ s/{/-LCB-/g;
445
+ $text =~ s/}/-RCB-/g;
446
+
447
+ $text =~ s=--= -- =g;
448
+
449
+ # First off, add a space to the beginning and end of each line, to reduce
450
+ # necessary number of regexps.
451
+ $text =~ s=$= =;
452
+ $text =~ s=^= =;
453
+
454
+ $text =~ s="= '' =g;
455
+ # possessive or close-single-quote
456
+ $text =~ s=([^'])' =$1 ' =g;
457
+ # as in it's, I'm, we'd
458
+ $text =~ s='([sSmMdD]) = '$1 =g;
459
+ $text =~ s='ll = 'll =g;
460
+ $text =~ s='re = 're =g;
461
+ $text =~ s='ve = 've =g;
462
+ $text =~ s=n't = n't =g;
463
+ $text =~ s='LL = 'LL =g;
464
+ $text =~ s='RE = 'RE =g;
465
+ $text =~ s='VE = 'VE =g;
466
+ $text =~ s=N'T = N'T =g;
467
+
468
+ $text =~ s= ([Cc])annot = $1an not =g;
469
+ $text =~ s= ([Dd])'ye = $1' ye =g;
470
+ $text =~ s= ([Gg])imme = $1im me =g;
471
+ $text =~ s= ([Gg])onna = $1on na =g;
472
+ $text =~ s= ([Gg])otta = $1ot ta =g;
473
+ $text =~ s= ([Ll])emme = $1em me =g;
474
+ $text =~ s= ([Mm])ore'n = $1ore 'n =g;
475
+ $text =~ s= '([Tt])is = '$1 is =g;
476
+ $text =~ s= '([Tt])was = '$1 was =g;
477
+ $text =~ s= ([Ww])anna = $1an na =g;
478
+
479
+ #word token method
480
+ my @words = split(/\s/,$text);
481
+ $text = "";
482
+ for (my $i=0;$i<(scalar(@words));$i++)
483
+ {
484
+ my $word = $words[$i];
485
+ if ( $word =~ /^(\S+)\.$/)
486
+ {
487
+ my $pre = $1;
488
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
489
+ {
490
+ #no change
491
+ }
492
+ elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
493
+ {
494
+ #no change
495
+ }
496
+ else
497
+ {
498
+ $word = $pre." .";
499
+ }
500
+ }
501
+ $text .= $word." ";
502
+ }
503
+
504
+ # restore ellipses
505
+ $text =~ s=_ELLIPSIS_=\.\.\.=g;
506
+
507
+ # clean out extra spaces
508
+ $text =~ s= *= =g;
509
+ $text =~ s=^ *==g;
510
+ $text =~ s= *$==g;
511
+
512
+ #escape special chars
513
+ $text =~ s/\&/\&amp;/g; # escape escape
514
+ $text =~ s/\|/\&#124;/g; # factor separator
515
+ $text =~ s/\</\&lt;/g; # xml
516
+ $text =~ s/\>/\&gt;/g; # xml
517
+ $text =~ s/\'/\&apos;/g; # xml
518
+ $text =~ s/\"/\&quot;/g; # xml
519
+ $text =~ s/\[/\&#91;/g; # syntax non-terminal
520
+ $text =~ s/\]/\&#93;/g; # syntax non-terminal
521
+
522
+ #ensure final line break
523
+ $text .= "\n" unless $text =~ /\n$/;
524
+
525
+ return $text;
526
+ }
527
+
528
+ sub load_prefixes
529
+ {
530
+ my ($language, $PREFIX_REF) = @_;
531
+
532
+ my $prefixfile = "$mydir/nonbreaking_prefix.$language";
533
+
534
+ #default back to English if we don't have a language-specific prefix file
535
+ if (!(-e $prefixfile))
536
+ {
537
+ $prefixfile = "$mydir/nonbreaking_prefix.en";
538
+ print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
539
+ die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
540
+ }
541
+
542
+ if (-e "$prefixfile")
543
+ {
544
+ open(PREFIX, "<:utf8", "$prefixfile");
545
+ while (<PREFIX>)
546
+ {
547
+ my $item = $_;
548
+ chomp($item);
549
+ if (($item) && (substr($item,0,1) ne "#"))
550
+ {
551
+ if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
552
+ {
553
+ $PREFIX_REF->{$1} = 2;
554
+ }
555
+ else
556
+ {
557
+ $PREFIX_REF->{$item} = 1;
558
+ }
559
+ }
560
+ }
561
+ close(PREFIX);
562
+ }
563
+ }
laser/tools-external/sentencepiece-master/.github/dependabot.yml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for all configuration options:
4
+ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5
+
6
+ version: 2
7
+ updates:
8
+ - package-ecosystem: "github-actions"
9
+ directory: "/"
10
+ schedule:
11
+ interval: "monthly"
12
+ groups:
13
+ github-actions:
14
+ patterns:
15
+ - "*"
16
+ - package-ecosystem: "pip"
17
+ directory: "/.github/workflows/requirements"
18
+ schedule:
19
+ interval: "monthly"
20
+ groups:
21
+ build-time-deps:
22
+ patterns:
23
+ - "*"
laser/tools-external/sentencepiece-master/.github/workflows/cifuzz.yml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CIFuzz
2
+ on: [pull_request]
3
+
4
+ permissions:
5
+ contents: read
6
+
7
+ jobs:
8
+ Fuzzing:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - name: Build Fuzzers
12
+ id: build
13
+ uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
14
+ with:
15
+ oss-fuzz-project-name: 'sentencepiece'
16
+ dry-run: false
17
+ language: c++
18
+ - name: Run Fuzzers
19
+ uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
20
+ with:
21
+ oss-fuzz-project-name: 'sentencepiece'
22
+ fuzz-seconds: 300
23
+ dry-run: false
24
+ language: c++
25
+ - name: Upload Crash
26
+ uses: actions/upload-artifact@v3
27
+ if: failure() && steps.build.outcome == 'success'
28
+ with:
29
+ name: artifacts
30
+ path: ./out/artifacts
laser/tools-external/sentencepiece-master/.github/workflows/cmake.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI for general build
2
+
3
+ on:
4
+ push:
5
+ branches: [ master ]
6
+ tags:
7
+ - 'v*'
8
+ pull_request:
9
+ branches: [ master ]
10
+
11
+ permissions:
12
+ contents: read
13
+
14
+ jobs:
15
+ build:
16
+ strategy:
17
+ matrix:
18
+ os: [ ubuntu-latest, ubuntu-20.04, windows-latest, macOS-11 ]
19
+ arch: [ x64 ]
20
+ include:
21
+ - os: windows-latest
22
+ arch: x86
23
+ runs-on: ${{ matrix.os }}
24
+
25
+ permissions:
26
+ contents: write # svenstaro/upload-release-action
27
+
28
+ steps:
29
+ - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
30
+ - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
31
+ with:
32
+ python-version: '3.x'
33
+ architecture: ${{matrix.arch}}
34
+
35
+ - name: Config for Windows
36
+ if: runner.os == 'Windows'
37
+ run: |
38
+ if ("${{matrix.arch}}" -eq "x64") {
39
+ $msbuildPlatform = "x64"
40
+ } else {
41
+ $msbuildPlatform = "Win32"
42
+ }
43
+ cmake -A $msbuildPlatform -B ${{github.workspace}}/build -DSPM_BUILD_TEST=ON -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/build/root
44
+
45
+ - name: Config for Unix
46
+ if: runner.os != 'Windows'
47
+ run: cmake -B ${{github.workspace}}/build -DSPM_BUILD_TEST=ON -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/build/root
48
+ env:
49
+ CMAKE_OSX_ARCHITECTURES: arm64;x86_64
50
+
51
+ - name: Build
52
+ run: cmake --build ${{github.workspace}}/build --config Release --target install --parallel 8
53
+
54
+ - name: Test
55
+ working-directory: ${{github.workspace}}/build
56
+ run: ctest -C Release --output-on-failure
57
+
58
+ - name: Package
59
+ working-directory: ${{github.workspace}}/build
60
+ run: cpack
61
+
62
+ - name: Build Python wrapper
63
+ working-directory: ${{github.workspace}}/python
64
+ run: |
65
+ python -m pip install --require-hashes --no-dependencies -r ../.github/workflows/requirements/base.txt
66
+ python setup.py build
67
+ python setup.py bdist_wheel
68
+ python -m pytest
69
+
70
+ - name: Upload artifcacts
71
+ uses: actions/upload-artifact@v3
72
+ with:
73
+ name: artifcacts
74
+ path: ./build/*.7z
75
+
76
+ - name: Upload Release Assets
77
+ if: startsWith(github.ref, 'refs/tags/')
78
+ uses: svenstaro/upload-release-action@04733e069f2d7f7f0b4aebc4fbdbce8613b03ccd # v2.9.0
79
+ with:
80
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
81
+ file: ./build/*.7z
82
+ tag: ${{ github.ref }}
83
+ overwrite: true
84
+ prerelease: true
85
+ file_glob: true
86
+ body: "This is my release text"