Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3 | |
""" | |
Copyright 2019 Brian Thompson | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
https://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
""" | |
import argparse | |
from dp_utils import yield_overlaps | |
def go(output_file, input_files, num_overlaps): | |
output = set() | |
for fin in input_files: | |
lines = open(fin, 'rt', encoding="utf-8").readlines() | |
for out_line in yield_overlaps(lines, num_overlaps): | |
output.add(out_line) | |
# for reproducibility | |
output = list(output) | |
output.sort() | |
with open(output_file, 'wt', encoding="utf-8") as fout: | |
for line in output: | |
fout.write(line + '\n') | |
def _main(): | |
parser = argparse.ArgumentParser('Create text file containing overlapping sentences.', | |
formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument('-i', '--inputs', type=str, nargs='+', | |
help='input text file(s).') | |
parser.add_argument('-o', '--output', type=str, | |
help='output text file containing overlapping sentneces') | |
parser.add_argument('-n', '--num_overlaps', type=int, default=4, | |
help='Maximum number of allowed overlaps.') | |
args = parser.parse_args() | |
go(output_file=args.output, | |
num_overlaps=args.num_overlaps, | |
input_files=args.inputs) | |
if __name__ == '__main__': | |
_main() | |