Liyan06 committed
Commit 1104bf8 · 1 Parent(s): a496016

add web search

Files changed (2)
  1. handler.py +50 -8
  2. web_retrieval.py +153 -0
handler.py CHANGED
@@ -1,4 +1,5 @@
 from minicheck_web.minicheck import MiniCheck
+from web_retrieval import *
 
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
@@ -24,17 +25,58 @@ class EndpointHandler():
 
     def __call__(self, data):
 
-        _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
-
         # Using user-provided document to do fact-checking
-        if data['inputs']['docs'] != "":
+        if len(data['inputs']['docs']) == 1 and data['inputs']['docs'][0] != '':
+            _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
             ranked_docs, scores = sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk)
-        else:
-            raise NotImplementedError("Currently, only user-provided document is supported.")
-
-        outputs = {
-            'ranked_docs': ranked_docs,
-            'scores': scores
-        }
-
-        return outputs
+
+            outputs = {
+                'ranked_docs': ranked_docs,
+                'scores': scores
+            }
+
+        else:
+            assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
+
+            claim = data['inputs']['claims'][0]
+            ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim)
+
+            outputs = {
+                'ranked_docs': ranked_docs,
+                'scores': scores,
+                'ranked_urls': ranked_urls
+            }
+
+        return outputs
+
+
+    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
+
+        search_results = search_google(claim, timeout=timeout)
+
+        print('Searching webpages...')
+        start = time()
+        with concurrent.futures.ThreadPoolExecutor() as e:
+            scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
+        end = time()
+        print(f"Finished searching in {round((end - start), 1)} seconds.\n")
+        scraped_results = [(r[0][:50000], r[1]) for r in scraped_results if r[0] and '��' not in r[0] and ".pdf" not in r[1]]
+
+        retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
+
+        print('Scoring webpages...')
+        start = time()
+        retrieved_data = {
+            'inputs': {
+                'docs': list(retrieved_docs),
+                'claims': [claim]*len(retrieved_docs)
+            }
+        }
+        _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=retrieved_data)
+        end = time()
+        num_chunks = len([item for items in used_chunk for item in items])
+        print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
+
+        ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
+
+        return ranked_docs, scores, ranked_urls
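
For reference, a minimal sketch of how the two branches of the updated __call__ might be exercised. The EndpointHandler name and the {'inputs': {'docs': [...], 'claims': [...]}} payload shape are taken from handler.py above; the import path, the argument-free constructor (the __init__ is not shown in this diff), and the claim/document strings are assumptions for illustration only.

from handler import EndpointHandler  # assumed import path

# Constructor arguments (if any) are not shown in this diff; assumed argument-free here.
handler = EndpointHandler()

# Branch 1: a single non-empty user-provided document is fact-checked directly.
doc_payload = {
    'inputs': {
        'docs': ["The Eiffel Tower is located in Paris and was completed in 1889."],
        'claims': ["The Eiffel Tower was finished in 1889."],
    }
}
outputs = handler(doc_payload)
print(outputs['ranked_docs'][0], outputs['scores'][0])

# Branch 2: an empty document triggers the new web-retrieval path,
# which also returns the ranked source URLs.
web_payload = {
    'inputs': {
        'docs': [''],
        'claims': ["The Eiffel Tower was finished in 1889."],
    }
}
outputs = handler(web_payload)
print(list(zip(outputs['ranked_urls'], outputs['scores'])))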
web_retrieval.py ADDED
@@ -0,0 +1,153 @@
+# Some functions are adapted from https://github.com/yuxiaw/Factcheck-GPT
+
+import concurrent.futures
+import requests
+import bs4
+import re
+from typing import List, Tuple
+import itertools
+import numpy as np
+from time import time
+
+
+def is_tag_visible(element: bs4.element) -> bool:
+    """Determines if an HTML element is visible.
+
+    Args:
+        element: A BeautifulSoup element to check the visibility of.
+    Returns:
+        Whether the element is visible.
+    """
+    if element.parent.name in [
+        "style",
+        "script",
+        "head",
+        "title",
+        "meta",
+        "[document]",
+    ] or isinstance(element, bs4.element.Comment):
+        return False
+    return True
+
+
+def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+    """Scrapes a URL for all text information.
+
+    Args:
+        url: URL of webpage to scrape.
+        timeout: Timeout of the requests call.
+    Returns:
+        web_text: The visible text of the scraped URL.
+        url: URL input.
+    """
+    # Scrape the URL
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as _:
+        return None, url
+
+    # Extract out all text from the tags
+    try:
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+        texts = soup.findAll(string=True)
+        # Filter out invisible text from the page.
+        visible_text = filter(is_tag_visible, texts)
+    except Exception as _:
+        return None, url
+
+    # Returns all the text concatenated as a string.
+    web_text = " ".join(t.strip() for t in visible_text).strip()
+    # Clean up spacing.
+    web_text = " ".join(web_text.split())
+    return web_text, url
+
+
+def search_google(query: str, num_web_pages: int = 10, timeout: int = 6, save_url: str = '') -> List[str]:
+    """Searches the query using Google.
+    Args:
+        query: Search query.
+        num_web_pages: the number of web pages to request.
+        save_url: path to save returned urls, such as 'urls.txt'
+    Returns:
+        search_results: A list of the top URLs relevant to the query.
+    """
+    # Set headers: Google returns different web pages depending on the agent device,
+    # so use a desktop user-agent.
+    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
+    headers = {'User-Agent': USER_AGENT}
+
+    # Set language:
+    # - &hl=XX sets the Google interface language
+    # - &lr=lang_XX sets the preferred language of the search results
+    # Use English; otherwise Google returns many translated pages that cannot be opened correctly.
+    lang = "en"
+
+    # Scrape Google results
+    urls = []
+    for page in range(0, num_web_pages, 10):
+        # `page` follows Google's pagination: result page 2 -> start=10
+        # url = "https://www.google.com/search?q={}&start={}".format(query, page)
+        url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(query, lang, lang, page)
+        r = requests.get(url, headers=headers, timeout=timeout)
+        # Collect all URLs with a regular expression
+        # TODO: how to keep only the top-k returned pages?
+        urls += re.findall('href="(https?://.*?)"', r.text)
+
+    # Use a set to remove repeated URLs
+    urls = list(set(urls))
+
+    # Save all URLs into a txt file
+    if not save_url == "":
+        with open(save_url, 'w') as file:
+            for url in urls:
+                file.write(url + '\n')
+    return urls
+
+
+def order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False):
+
+    """
+    Orders the documents, scores, and URLs based on the scores in descending order.
+
+    allow_duplicated_urls:
+        - If False, the function will return the highest scored chunk per doc + scores + urls.
+        - If True, the function will return all the chunks per doc + scores + urls.
+    """
+
+    # Flatten the used_chunk and support_prob_per_chunk lists
+    flattened_docs = [doc for chunk in used_chunk for doc in chunk]
+    flattened_scores = [score for chunk in support_prob_per_chunk for score in chunk]
+
+    # Create a list of tuples containing the doc, score, and corresponding URL
+    doc_score_url = list(zip(flattened_docs, flattened_scores, np.repeat(urls, [len(chunk) for chunk in used_chunk])))
+
+    # Sort the list based on the scores in descending order
+    ranked_doc_score_url = sorted(doc_score_url, key=lambda x: x[1], reverse=True)
+
+    # Unzip the sorted list to get the ranked docs, scores, and URLs
+    ranked_docs, scores, ranked_urls = zip(*ranked_doc_score_url)
+
+    if allow_duplicated_urls:
+        return ranked_docs, scores, ranked_urls
+
+    else:
+
+        filtered_docs = []
+        filtered_scores = []
+        filtered_urls = []
+        seen_urls = set()
+
+        for doc, score, url in zip(ranked_docs, scores, ranked_urls):
+            if url not in seen_urls:
+                filtered_docs.append(doc)
+                filtered_scores.append(score)
+                filtered_urls.append(url)
+                seen_urls.add(url)
+
+        # Update the variables with the filtered results
+        ranked_docs = filtered_docs
+        scores = filtered_scores
+        ranked_urls = filtered_urls
+
+        return ranked_docs, scores, ranked_urls
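
For reference, a standalone sketch of the retrieval half of this module: query Google, scrape the returned pages in parallel, and keep the first few usable results, mirroring search_relevant_docs in handler.py. The query string is made up, and the output depends on whatever Google returns at run time.

import concurrent.futures
import itertools

from web_retrieval import search_google, scrape_url

query = "When was the Eiffel Tower completed?"
urls = search_google(query, num_web_pages=10, timeout=6)

# Scrape every URL in parallel; scrape_url returns (text, url), with text=None on failure.
with concurrent.futures.ThreadPoolExecutor() as executor:
    scraped = executor.map(scrape_url, urls, itertools.repeat(10))

# Drop failed fetches and PDF links, then keep the first five documents.
docs_and_urls = [(text, url) for text, url in scraped if text and ".pdf" not in url][:5]
for text, url in docs_and_urls:
    print(url, text[:80])

Ranking the scraped documents with order_doc_score_url still requires the chunk-level support probabilities produced by the MiniCheck scorer, as wired up in handler.py; with allow_duplicated_urls=False it keeps only the best-scoring chunk per URL.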