Robert committed
Commit · b7158e7
Parent(s): 9889a50

- Remove useless paragraphs that only contain formulas
- Added some code to run the script over all questions to calculate overall performance

Files changed:
- main.py (+36 -2)
- src/retrievers/faiss_retriever.py (+3 -0)
- src/utils/preprocessing.py (+19 -0)
main.py CHANGED

@@ -1,6 +1,7 @@
 import os
 import random
 from typing import cast
+import time
 
 import torch
 import transformers
@@ -32,8 +33,8 @@ if __name__ == '__main__':
         "GroNLP/ik-nlp-22_slp", "paragraphs"))
 
     # Initialize retriever
-
-    retriever = ESRetriever(dataset_paragraphs)
+    retriever = FaissRetriever(dataset_paragraphs)
+    #retriever = ESRetriever(dataset_paragraphs)
 
     # Retrieve example
     # random.seed(111)
@@ -84,3 +85,36 @@ if __name__ == '__main__':
           f"Predicted answer: {answers[highest_index].text}\n"
           f"Exact match: {exact_match:.02f}\n"
           f"F1-score: {f1_score:.02f}")
+
+    # Calculate overall performance
+    # total_f1 = 0
+    # total_exact = 0
+    # total_len = len(questions_test["question"])
+    # start_time = time.time()
+    # for i, question in enumerate(questions_test["question"]):
+    #     print(question)
+    #     answer = questions_test["answer"][i]
+    #     print(answer)
+    #
+    #     scores, result = retriever.retrieve(question)
+    #     reader_input = result_to_reader_input(result)
+    #     answers = reader.read(question, reader_input)
+    #
+    #     document_scores = sm(torch.Tensor(
+    #         [pred.relevance_score for pred in answers]))
+    #     span_scores = sm(torch.Tensor(
+    #         [pred.span_score for pred in answers]))
+    #
+    #     highest, highest_index = 0, 0
+    #     for j, value in enumerate(span_scores):
+    #         if value + document_scores[j] > highest:
+    #             highest = value + document_scores[j]
+    #             highest_index = j
+    #     print(answers[highest_index])
+    #     exact_match, f1_score = evaluate(answer, answers[highest_index].text)
+    #     total_f1 += f1_score
+    #     total_exact += exact_match
+    # print(f"Total time:", round(time.time() - start_time, 2), "seconds.")
+    # print(total_f1)
+    # print(total_exact)
+    # print(total_f1/total_len)
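
Note on the block added at the end of main.py: it is committed entirely commented out. The sketch below factors the same loop into a standalone helper so it can be toggled without uncommenting thirty lines. It is not part of the commit and assumes the repository's interfaces as used above: retriever.retrieve(question) returning (scores, result), reader.read(question, reader_input) returning predictions with .relevance_score, .span_score and .text, plus the project's result_to_reader_input and evaluate helpers; sm is assumed to be torch.nn.Softmax(dim=0) as in main.py.

import time

import torch


def evaluate_all(retriever, reader, questions, answers,
                 result_to_reader_input, evaluate):
    """Run retrieval + reading over every test question and report averages.

    Sketch only: mirrors the commented-out loop in main.py under the
    interface assumptions described above.
    """
    sm = torch.nn.Softmax(dim=0)
    total_f1, total_exact = 0.0, 0.0
    start_time = time.time()

    for question, answer in zip(questions, answers):
        _, result = retriever.retrieve(question)
        reader_input = result_to_reader_input(result)
        predictions = reader.read(question, reader_input)

        # Softmax document relevance and span scores, then pick the span with
        # the highest combined score (same selection rule as the loop above,
        # since softmax outputs are non-negative).
        document_scores = sm(torch.tensor(
            [pred.relevance_score for pred in predictions]))
        span_scores = sm(torch.tensor(
            [pred.span_score for pred in predictions]))
        best = int(torch.argmax(document_scores + span_scores))

        exact_match, f1_score = evaluate(answer, predictions[best].text)
        total_f1 += f1_score
        total_exact += exact_match

    n = len(questions)
    print(f"Total time: {time.time() - start_time:.2f} seconds.")
    print(f"Average exact match: {total_exact / n:.02f}")
    print(f"Average F1-score: {total_f1 / n:.02f}")
    return total_exact / n, total_f1 / n

Called as evaluate_all(retriever, reader, questions_test["question"], questions_test["answer"], result_to_reader_input, evaluate) at the end of main.py, it would print the same averages the commented block computes.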
src/retrievers/faiss_retriever.py CHANGED

@@ -12,6 +12,7 @@ from transformers import (
 
 from src.retrievers.base_retriever import Retriever
 from src.utils.log import get_logger
+from src.utils.preprocessing import remove_formulas
 
 # Hacky fix for FAISS error on macOS
 # See https://stackoverflow.com/a/63374568/4545692
@@ -55,6 +56,8 @@ class FaissRetriever(Retriever):
                  force_new_embedding: bool = False):
 
         ds = self.dataset["train"]
+        ds = ds.map(remove_formulas)
+
 
         if not force_new_embedding and os.path.exists(self.embedding_path):
            ds.load_faiss_index(
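
The only functional change here is ds = ds.map(remove_formulas), which runs the new preprocessing function over every row of the "train" split before the FAISS embeddings are built or loaded. A minimal standalone illustration of what that map does, using a toy in-memory dataset instead of GroNLP/ik-nlp-22_slp and assuming the snippet is run from the repository root so src.utils.preprocessing is importable:

from datasets import Dataset

from src.utils.preprocessing import remove_formulas

# Toy stand-in for the paragraphs dataset: one prose row, and one formula-like
# row whose "words" are mostly single symbols (average word length <= 3.5).
ds = Dataset.from_dict({
    "text": [
        "Language models assign probabilities to sequences of words.",
        "P ( w | h ) = C ( h w ) / C ( h )",
    ]
})

ds = ds.map(remove_formulas)
print(ds["text"])
# The prose row is kept unchanged; the formula-like row is replaced by ''.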
src/utils/preprocessing.py CHANGED

@@ -33,3 +33,22 @@ def result_to_reader_input(result: Dict[str, List[str]]) \
         reader_result['texts'].append(result['text'][n])
 
     return reader_result
+
+
+def remove_formulas(ds):
+    """Replaces text in the 'text' column of the ds which has an average
+    word length of <= 3.5 with blanks. This essentially means that most
+    of the formulas are removed.
+    To-do:
+        - more preprocessing
+        - a summarization model perhaps
+    Args:
+        ds: HuggingFace dataset that contains the information for the retriever
+    Returns:
+        ds: preprocessed HuggingFace dataset
+    """
+    words = ds['text'].split()
+    average = sum(len(word) for word in words) / len(words)
+    if average <= 3.5:
+        ds['text'] = ''
+    return ds
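
One caveat with the committed heuristic: remove_formulas divides by len(words), so a row whose 'text' is empty or whitespace-only raises ZeroDivisionError inside Dataset.map. Below is a hedged variant of the same average-word-length check with that edge case guarded; it is not part of the commit, and the threshold parameter is hypothetical (the commit hard-codes 3.5).

def remove_formulas_safe(example, threshold=3.5):
    """Average-word-length heuristic from remove_formulas, tolerant of
    empty rows. Sketch only, not the committed implementation."""
    words = example["text"].split()
    if not words:
        # Nothing to measure: leave empty or whitespace-only rows untouched.
        return example
    average = sum(len(word) for word in words) / len(words)
    if average <= threshold:
        # Mostly short tokens: treat as a formula-only paragraph and blank it.
        example["text"] = ""
    return example

It is used the same way as the original, e.g. ds = ds.map(remove_formulas_safe).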