File size: 3,318 Bytes
256a159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from typing import List, Tuple
from modules.alignment import read_cilin, read_confusion, Alignment
from modules.merger import Merger
from modules.classifier import Classifier

class Annotator:
    """
    Align a source/target sentence pair and render the resulting edits as
    text annotation lines.

    The heavy lifting is delegated to three collaborators supplied at
    construction time: an aligner, a merger and a classifier. This class only
    wires them together and formats their output.
    """

    def __init__(self,
                 align: Alignment,
                 merger: Merger,
                 classifier: Classifier,
                 granularity: str = "word",
                 strategy: str = "first"):
        """
        Store the collaborators and configuration.

        :param align: callable invoked as ``align(src, tgt)``; its result is
            sliced and iterated, one item per candidate alignment
        :param merger: callable invoked as
            ``merger(align_obj, src, tgt, verbose)``; returns the edits for
            one alignment
        :param classifier: callable invoked as
            ``classifier(src, tgt, edits, verbose)``; returns an iterable of
            objects exposing ``op``, ``toks`` and ``inds``
        :param granularity: stored on the instance; not read by ``__call__``
        :param strategy: ``"first"`` keeps only the first alignment; any
            other value processes every alignment returned by ``align``
        """
        self.align = align
        self.merger = merger
        self.classifier = classifier
        self.granularity = granularity
        self.strategy = strategy

    @classmethod
    def create_default(cls, granularity: str = "word", strategy: str = "first"):
        """
        Default parameters used in the paper

        Builds the aligner from the dictionaries returned by ``read_cilin()``
        and ``read_confusion()``, and builds the merger and classifier with
        the same ``granularity``.
        """
        # semantic_class is returned by read_cilin() but is not needed here.
        semantic_dict, semantic_class = read_cilin()
        confusion_dict = read_confusion()
        align = Alignment(semantic_dict, confusion_dict, granularity)
        merger = Merger(granularity)
        classifier = Classifier(granularity)
        return cls(align, merger, classifier, granularity, strategy)

    @staticmethod
    def _format_edit(inds, op, toks, annotator_id) -> str:
        """Render one edit as an ``A start end|||op|||toks|||...`` line."""
        return f"A {inds[0]} {inds[1]}|||{op}|||{toks}|||REQUIRED|||-NONE-|||{annotator_id}\n"

    def __call__(self,
                 src: List[Tuple],
                 tgt: List[Tuple],
                 annotator_id: int = 0,
                 verbose: bool = False):
        """
        Align sentences and annotate them with error type information

        :param src: source sentence as a list of tuples whose first element
            is the token text
        :param tgt: target sentence in the same form as ``src``
        :param annotator_id: id written into the ``T`` and ``A`` lines
        :param verbose: forwarded unchanged to the merger and the classifier
        :return: ``(annotations_out, cors)``. ``annotations_out`` is a list of
            newline-terminated strings: one ``S`` line, then ``T``/``A``
            lines, then a lone ``"\\n"``. ``cors`` is ``[tgt_str]`` for the
            two sentinel cases; otherwise it is the classifier output for the
            last distinct alignment processed, or ``[]`` when the aligner
            produced no alignment at all.
        """
        src_tokens = [x[0] for x in src]
        tgt_tokens = [x[0] for x in tgt]
        src_str = "".join(src_tokens)
        tgt_str = "".join(tgt_tokens)
        # convert to text form
        annotations_out = ["S " + " ".join(src_tokens) + "\n"]
        if tgt_str == "没有错误" or src_str == tgt_str:   # Error Free Case
            annotations_out.append(f"T{annotator_id} 没有错误\n")
            cors = [tgt_str]
            annotations_out.append(
                self._format_edit((-1, -1), "noop", "-NONE-", annotator_id))
        elif tgt_str == "无法标注":  # Not Annotatable Case
            annotations_out.append(f"T{annotator_id} 无法标注\n")
            cors = [tgt_str]
            annotations_out.append(
                self._format_edit((-1, -1), "NA", "-NONE-", annotator_id))
        else:  # Other
            # Bind cors up front: if the aligner returns nothing the loop body
            # never runs, and the final return would otherwise raise
            # UnboundLocalError.
            cors = []
            align_objs = self.align(src, tgt)
            if self.strategy == "first":
                align_objs = align_objs[:1]
            edit_objs = []  # edit sets already emitted, used to skip duplicates
            align_idx = 0
            for align_obj in align_objs:
                edits = self.merger(align_obj, src, tgt, verbose)
                if edits in edit_objs:
                    # Different alignments can merge into identical edits;
                    # emit each distinct edit set only once.
                    continue
                edit_objs.append(edits)
                annotations_out.append(
                    f"T{annotator_id}-A{align_idx} " + " ".join(tgt_tokens) + "\n")
                align_idx += 1
                # NOTE: cors is overwritten on every distinct alignment, so
                # the returned value reflects only the last one processed.
                cors = self.classifier(src, tgt, edits, verbose)
                for cor in cors:
                    annotations_out.append(
                        self._format_edit(cor.inds, cor.op, cor.toks, annotator_id))
        annotations_out.append("\n")
        return annotations_out, cors