Liyan06 committed
Commit 1104bf8 · 1 Parent(s): a496016

add web search

Files changed (2)
  1. handler.py +50 -8
  2. web_retrieval.py +153 -0
handler.py CHANGED
@@ -1,4 +1,5 @@
 from minicheck_web.minicheck import MiniCheck
+from web_retrieval import *
 
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
@@ -24,17 +25,58 @@ class EndpointHandler():
 
     def __call__(self, data):
 
-        _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
-
         # Using user-provided document to do fact-checking
-        if data['inputs']['docs'] != "":
+        if len(data['inputs']['docs']) == 1 and data['inputs']['docs'][0] != '':
+            _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
             ranked_docs, scores = sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk)
-        else:
-            raise NotImplementedError("Currently, only user-provided document is supported.")
-
-        outputs = {
-            'ranked_docs': ranked_docs,
-            'scores': scores
-        }
-
-        return outputs
+
+            outputs = {
+                'ranked_docs': ranked_docs,
+                'scores': scores
+            }
+
+        else:
+            assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
+
+            claim = data['inputs']['claims'][0]
+            ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim)
+
+            outputs = {
+                'ranked_docs': ranked_docs,
+                'scores': scores,
+                'ranked_urls': ranked_urls
+            }
+
+        return outputs
+
+
+    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
+
+        search_results = search_google(claim, timeout=timeout)
+
+        print('Searching webpages...')
+        start = time()
+        with concurrent.futures.ThreadPoolExecutor() as e:
+            scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
+        end = time()
+        print(f"Finished searching in {round((end - start), 1)} seconds.\n")
+        scraped_results = [(r[0][:50000], r[1]) for r in scraped_results if r[0] and '��' not in r[0] and ".pdf" not in r[1]]
+
+        retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
+
+        print('Scoring webpages...')
+        start = time()
+        retrieved_data = {
+            'inputs': {
+                'docs': list(retrieved_docs),
+                'claims': [claim]*len(retrieved_docs)
+            }
+        }
+        _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=retrieved_data)
+        end = time()
+        num_chunks = len([item for items in used_chunk for item in items])
+        print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
+
+        ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
+
+        return ranked_docs, scores, ranked_urls
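
For reference, a minimal sketch of how the two branches of the updated __call__ might be exercised. The EndpointHandler name and the {'inputs': {'docs': [...], 'claims': [...]}} payload shape are taken from handler.py above; the import path, the argument-free constructor (the __init__ is not shown in this diff), and the claim/document strings are assumptions for illustration only.

from handler import EndpointHandler  # assumed import path

# Constructor arguments (if any) are not shown in this diff; assumed argument-free here.
handler = EndpointHandler()

# Branch 1: a single non-empty user-provided document is fact-checked directly.
doc_payload = {
    'inputs': {
        'docs': ["The Eiffel Tower is located in Paris and was completed in 1889."],
        'claims': ["The Eiffel Tower was finished in 1889."],
    }
}
outputs = handler(doc_payload)
print(outputs['ranked_docs'][0], outputs['scores'][0])

# Branch 2: an empty document triggers the new web-retrieval path,
# which also returns the ranked source URLs.
web_payload = {
    'inputs': {
        'docs': [''],
        'claims': ["The Eiffel Tower was finished in 1889."],
    }
}
outputs = handler(web_payload)
print(list(zip(outputs['ranked_urls'], outputs['scores'])))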
web_retrieval.py ADDED
@@ -0,0 +1,153 @@
+# Some functions are adapted from https://github.com/yuxiaw/Factcheck-GPT
+
+import concurrent.futures
+import requests
+import bs4
+import re
+from typing import List, Tuple
+import itertools
+import numpy as np
+from time import time
+
+
+def is_tag_visible(element: bs4.element) -> bool:
+    """Determines if an HTML element is visible.
+
+    Args:
+        element: A BeautifulSoup element to check the visibility of.
+    Returns:
+        Whether the element is visible.
+    """
+    if element.parent.name in [
+        "style",
+        "script",
+        "head",
+        "title",
+        "meta",
+        "[document]",
+    ] or isinstance(element, bs4.element.Comment):
+        return False
+    return True
+
+
+def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+    """Scrapes a URL for all text information.
+
+    Args:
+        url: URL of webpage to scrape.
+        timeout: Timeout of the requests call.
+    Returns:
+        web_text: The visible text of the scraped URL.
+        url: URL input.
+    """
+    # Scrape the URL
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as _:
+        return None, url
+
+    # Extract out all text from the tags
+    try:
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+        texts = soup.findAll(string=True)
+        # Filter out invisible text from the page.
+        visible_text = filter(is_tag_visible, texts)
+    except Exception as _:
+        return None, url
+
+    # Returns all the text concatenated as a string.
+    web_text = " ".join(t.strip() for t in visible_text).strip()
+    # Clean up spacing.
+    web_text = " ".join(web_text.split())
+    return web_text, url
+
+
+def search_google(query: str, num_web_pages: int = 10, timeout: int = 6, save_url: str = '') -> List[str]:
+    """Searches the query using Google.
+    Args:
+        query: Search query.
+        num_web_pages: the number of web pages to request.
+        save_url: path to save returned urls, such as 'urls.txt'
+    Returns:
+        search_results: A list of the top URLs relevant to the query.
+    """
+    # Set headers: Google returns different web pages depending on the agent device,
+    # so use a desktop user-agent.
+    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
+    headers = {'User-Agent': USER_AGENT}
+
+    # Set language:
+    # - &hl=XX sets the Google interface language
+    # - &lr=lang_XX sets the preferred language of the search results
+    # Use English; otherwise Google returns many translated pages that cannot be opened correctly.
+    lang = "en"
+
+    # Scrape Google results
+    urls = []
+    for page in range(0, num_web_pages, 10):
+        # `page` follows Google's pagination: result page 2 -> start=10
+        # url = "https://www.google.com/search?q={}&start={}".format(query, page)
+        url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(query, lang, lang, page)
+        r = requests.get(url, headers=headers, timeout=timeout)
+        # Collect all URLs with a regular expression
+        # TODO: how to keep only the top-k returned pages?
+        urls += re.findall('href="(https?://.*?)"', r.text)
+
+    # Use a set to remove repeated URLs
+    urls = list(set(urls))
+
+    # Save all URLs into a txt file
+    if not save_url == "":
+        with open(save_url, 'w') as file:
+            for url in urls:
+                file.write(url + '\n')
+    return urls
+
+
+def order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False):
+
+    """
+    Orders the documents, scores, and URLs based on the scores in descending order.
+
+    allow_duplicated_urls:
+        - If False, the function will return the highest scored chunk per doc + scores + urls.
+        - If True, the function will return all the chunks per doc + scores + urls.
+    """
+
+    # Flatten the used_chunk and support_prob_per_chunk lists
+    flattened_docs = [doc for chunk in used_chunk for doc in chunk]
+    flattened_scores = [score for chunk in support_prob_per_chunk for score in chunk]
+
+    # Create a list of tuples containing the doc, score, and corresponding URL
+    doc_score_url = list(zip(flattened_docs, flattened_scores, np.repeat(urls, [len(chunk) for chunk in used_chunk])))
+
+    # Sort the list based on the scores in descending order
+    ranked_doc_score_url = sorted(doc_score_url, key=lambda x: x[1], reverse=True)
+
+    # Unzip the sorted list to get the ranked docs, scores, and URLs
+    ranked_docs, scores, ranked_urls = zip(*ranked_doc_score_url)
+
+    if allow_duplicated_urls:
+        return ranked_docs, scores, ranked_urls
+
+    else:
+
+        filtered_docs = []
+        filtered_scores = []
+        filtered_urls = []
+        seen_urls = set()
+
+        for doc, score, url in zip(ranked_docs, scores, ranked_urls):
+            if url not in seen_urls:
+                filtered_docs.append(doc)
+                filtered_scores.append(score)
+                filtered_urls.append(url)
+                seen_urls.add(url)
+
+        # Update the variables with the filtered results
+        ranked_docs = filtered_docs
+        scores = filtered_scores
+        ranked_urls = filtered_urls
+
+        return ranked_docs, scores, ranked_urls
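
For reference, a standalone sketch of the retrieval half of this module: query Google, scrape the returned pages in parallel, and keep the first few usable results, mirroring search_relevant_docs in handler.py. The query string is made up, and the output depends on whatever Google returns at run time.

import concurrent.futures
import itertools

from web_retrieval import search_google, scrape_url

query = "When was the Eiffel Tower completed?"
urls = search_google(query, num_web_pages=10, timeout=6)

# Scrape every URL in parallel; scrape_url returns (text, url), with text=None on failure.
with concurrent.futures.ThreadPoolExecutor() as executor:
    scraped = executor.map(scrape_url, urls, itertools.repeat(10))

# Drop failed fetches and PDF links, then keep the first five documents.
docs_and_urls = [(text, url) for text, url in scraped if text and ".pdf" not in url][:5]
for text, url in docs_and_urls:
    print(url, text[:80])

Ranking the scraped documents with order_doc_score_url still requires the chunk-level support probabilities produced by the MiniCheck scorer, as wired up in handler.py; with allow_duplicated_urls=False it keeps only the best-scoring chunk per URL.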