# NOTE(review): removed non-Python extraction residue that preceded the module
# (file-size line, commit hash, and a flattened line-number gutter).
import argparse
import os
from collections import defaultdict
from typing import Dict, Any
from concrete.util import CommunicationWriterTGZ
from nltk.corpus import framenet, framenet15
from tqdm import tqdm
from sftp.data_reader.concrete_srl import concrete_doc
from tools.framenet.fn_util import framenet_split, Sentence as TokSentence
def process_sentence(sent) -> Dict[str, Any]:
    """Convert one FrameNet sentence into a plain dict of text, token spans,
    and frame annotations.

    :param sent: nltk FrameNet sentence object; may carry an ``annotationSet``
        list (full-text docs) and/or a top-level ``Target`` (exemplars).
    :return: dict with keys:
        - 'sentence': raw sentence text,
        - 'tokenization': list of (char_start, char_end) pairs, end-inclusive,
        - 'annotations': list of {'label': frame name,
                                  'span': (first_tok, last_tok),
                                  'children': [{'span': ..., 'label': FE name}]}.
    """
    ret = {'sentence': sent.text, 'tokenization': list(), 'annotations': list()}
    tok_sent = TokSentence(sent.text)
    # Character spans are stored end-inclusive, hence the -1.
    for token in tok_sent.tokens:
        ret['tokenization'].append((token.idx, token.idx_end - 1))

    def process_one_ann_set(ann_set):
        # One annotation set == one frame instance (target + frame elements).
        ret['annotations'].append(event := {'label': ann_set.frame.name, 'children': (arg_list := list())})
        target_list = list()
        for tar_start, tar_end in ann_set.Target:
            # Hoisted: the original called tok_sent.span(...) twice per target.
            span_start, span_end = tok_sent.span(tar_start, tar_end)
            target_list.extend(range(span_start, span_end + 1))
        target_list.sort()
        # Targets may be discontinuous; the span covers first..last target token.
        event['span'] = (target_list[0], target_list[-1])
        for fe_start, fe_end, fe_name in ann_set.FE[0]:
            fe_start, fe_end = tok_sent.span(fe_start, fe_end)
            arg_list.append({
                'span': (fe_start, fe_end),
                'label': fe_name
            })

    if 'annotationSet' in sent:
        # Full-text sentence: possibly several frame annotation sets.
        for ann_item in sent.annotationSet:
            if 'Target' not in ann_item:
                continue  # skip layers without a frame target (e.g. POS-only sets)
            process_one_ann_set(ann_item)
    if 'Target' in sent:
        # Exemplar sentence: the sentence object itself is one annotation set.
        process_one_ann_set(sent)
    return ret
def process_doc(docs, dst_path: str):
    """Serialize FrameNet full-text documents into one Concrete .tgz archive.

    :param docs: iterable of nltk FrameNet doc objects (each exposing
        ``.sentence`` and ``.filename``).
    :param dst_path: path of the output .tar.gz archive.
    """
    writer = CommunicationWriterTGZ(dst_path)
    try:
        for doc in tqdm(docs):
            sentences = [process_sentence(sent) for sent in doc.sentence]
            comm = concrete_doc(sentences, doc.filename)
            writer.write(comm, comm.id + '.concrete')
    finally:
        # Finalize the archive even if a document fails mid-loop; the original
        # leaked the writer (and an unfinished tgz) on any exception.
        writer.close()
def process_exemplar(dst_path, fn):
    """Dump all annotated FrameNet exemplar sentences as a single Concrete doc.

    :param dst_path: path of the output .tar.gz archive.
    :param fn: nltk FrameNet corpus reader (1.5 or 1.7).
    """
    print('Loading exemplars...')
    bar = tqdm()
    raw_annotations = list()
    try:
        for ann_sent in fn.annotations(full_text=False):
            if 'Target' not in ann_sent:
                continue  # keep only sentences that carry a frame target
            bar.update()
            raw_annotations.append(ann_sent)
    except RuntimeError:
        # NOTE(review): the exemplar iterator can raise mid-stream; the original
        # deliberately treats whatever was collected so far as the full set.
        pass
    finally:
        bar.close()
    # (Removed dead `char_idx_offset` accumulator — it was never read.)
    sentences = [process_sentence(sent) for sent in raw_annotations]
    comm = concrete_doc(sentences, 'exemplar')
    writer = CommunicationWriterTGZ(dst_path)
    try:
        writer.write(comm, 'exemplar.concrete')
    finally:
        # The original never closed the writer, leaving the tgz unfinalized;
        # close it, matching process_doc's handling.
        writer.close()
def run():
    """CLI entry point: export FrameNet full-text splits and exemplars to Concrete archives."""
    parser = argparse.ArgumentParser()
    parser.add_argument('dst', metavar='DESTINATION', type=str,
                        help='Destination folder path.')
    parser.add_argument('-v', metavar='VERSION', default='1.7', type=str,
                        choices=['1.5', '1.7'],
                        help='Version of FrameNet. Either 1.5 or 1.7.')
    args = parser.parse_args()

    # choices above restricts -v to exactly these two versions.
    fn = framenet15 if args.v == '1.5' else framenet
    os.makedirs(args.dst, exist_ok=True)

    # Bucket full-text docs by split; anything not in dev/test goes to train.
    doc_group = defaultdict(list)
    for doc in fn.docs():
        if doc.filename in framenet_split['dev']:
            split = 'dev'
        elif doc.filename in framenet_split['test']:
            split = 'test'
        else:
            split = 'train'
        doc_group[split].append(doc)

    for sp in framenet_split:
        print(f'Loaded {len(doc_group[sp])} docs for {sp}.')
    for sp in framenet_split:
        process_doc(doc_group[sp], dst_path=os.path.join(args.dst, f'{sp}.tar.gz'))
    process_exemplar(os.path.join(args.dst, 'exemplar.tar.gz'), fn)
# Script entry point: only run the export when invoked directly, not on import.
if __name__ == '__main__':
    run()