#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This file provides helpers to process passages and queries.
"""

import re

import spacy
def read_stopwords(fileName='stopwords.txt', lower_case=True):
    """Reads a set of stopwords from a file (one word per line).
       By default, words are read from 'stopwords.txt' and lowercased.
      :param fileName: a stopword file name
      :param lower_case: a boolean flag indicating if lowercasing is needed
      :return: a set of stopwords
    """
    stopwords = set()
    with open(fileName) as f:
        for w in f:
            w = w.strip()
            if w:
                if lower_case:
                    w = w.lower()
                stopwords.add(w)
    return stopwords
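
# A minimal usage sketch (assumes a local file 'stopwords.txt' with one word
# per line; the file name here is just the default and is illustrative):
#
#   stopwords = read_stopwords('stopwords.txt')
#   print('the' in stopwords)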

def is_alpha_num(s):
    """Checks if a string is non-empty and consists only of letters, digits, and the characters - _ ."""
    return s and (re.match("^[a-zA-Z-_.0-9]+$", s) is not None)

class SpacyTextParser:
    def __init__(self, model_name, stopwords,
                 remove_punct=True,
                 sent_split=False,
                 keep_only_alpha_num=False,
                 lower_case=True,
                 enable_POS=True):
        """Constructor.
                :param  model_name    a name of the spacy model to use, e.g., en_core_web_sm
                :param  stopwords    a list of stop words to be excluded (case insensitive);
                                     a token is also excluded when its lemma is in the stop word list.
                :param  remove_punct  a bool flag indicating if the punctuation tokens need to be removed
                :param  sent_split    a bool flag indicating if sentence splitting is necessary
                :param  keep_only_alpha_num a bool flag indicating if we need to keep only alpha-numeric characters
                :param  enable_POS    a bool flag that enables POS tagging (which, e.g., can improve lemmatization)
        """

        disable_list = ['ner', 'parser']
        if not enable_POS:
            disable_list.append('tagger')
        print('Disabled spaCy components: ', disable_list)

        self._nlp = spacy.load(model_name, disable=disable_list)
        if sent_split:
            # spaCy v3 API: built-in components are added by name
            # (spaCy v2 used create_pipe() + add_pipe(component) instead).
            self._nlp.add_pipe('sentencizer')

        self._remove_punct = remove_punct
        self._stopwords = frozenset([w.lower() for w in stopwords])
        self._keep_only_alpha_num = keep_only_alpha_num
        self._lower_case = lower_case

    @staticmethod
    def _basic_clean(text):
        # Normalize curly apostrophes to plain ASCII quotes.
        return text.replace("’", "'")

    def __call__(self, text):
        """A thin wrapper that merely calls spacy.
        :param text:    input text string
        :return:        a spacy Doc object
        """

        return self._nlp(SpacyTextParser._basic_clean(text))

    def proc_text(self, text):
        """Processes text: removes stopwords and obtains lemmas, but does not split
        the text into sentences. This function should not emit newlines!
        :param text:    input text string
        :return:        a tuple (lemmatized text, original-form text); both are white-space separated
        """

        lemmas = []
        tokens = []
        doc = self(text)
        for tok_obj in doc:
            if self._remove_punct and tok_obj.is_punct:
                continue
            lemma = tok_obj.lemma_
            tok_text = tok_obj.text
            if self._keep_only_alpha_num and not is_alpha_num(tok_text):
                continue
            # Stopword matching is always case insensitive and checks both
            # the surface form and the lemma.
            tok1 = tok_text.lower()
            tok2 = lemma.lower()
            if tok1 in self._stopwords or tok2 in self._stopwords:
                continue

            if self._lower_case:
                tok_text = tok_text.lower()
                lemma = lemma.lower()

            lemmas.append(lemma)
            tokens.append(tok_text)

        return ' '.join(lemmas), ' '.join(tokens)
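
# A minimal usage sketch (assumes the spaCy model en_core_web_sm has been
# installed, e.g., via `python -m spacy download en_core_web_sm`):
#
#   stopwords = read_stopwords('stopwords.txt')
#   nlp = SpacyTextParser('en_core_web_sm', stopwords, keep_only_alpha_num=True)
#   lemmas, tokens = nlp.proc_text('Pyserini is a toolkit for reproducible IR research.')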


def get_retokenized(tokenizer, text):
    """Obtain a space-separated re-tokenized text.
    :param tokenizer:  a tokenizer that has a tokenize function
                       that returns an array of tokens.
    :param text:       a text to re-tokenize.
    :return:           a string of tokens joined by single spaces.
    """
    return ' '.join(tokenizer.tokenize(text))
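
# A hedged example, assuming a HuggingFace transformers tokenizer (any object
# with a tokenize() method returning a list of strings would work):
#
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained('bert-base-uncased')
#   get_retokenized(tok, 'retokenization')  # subword tokens joined by spaces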


def add_retokenized_field(data_entry,
                          src_field,
                          dst_field,
                          tokenizer):
    """
    Create a re-tokenized field from an existing one.
    :param data_entry:   a dictionary of entries (keys are field names, values are text items)
    :param src_field:    a source field
    :param dst_field:    a target field
    :param tokenizer:    a tokenizer to use; if None, nothing is done
    """
    if tokenizer is not None:
        dst = ''
        if src_field in data_entry:
            dst = get_retokenized(tokenizer, data_entry[src_field])

        data_entry[dst_field] = dst
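
# A minimal usage sketch (the field names and tokenizer are illustrative):
#
#   entry = {'text': 'a passage about neural retrieval'}
#   add_retokenized_field(entry, 'text', 'text_bert_tok', tok)  # tok as in the example above
#   # entry['text_bert_tok'] now holds the re-tokenized text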