Liyan06 committed · Commit 1104bf8 · Parent(s): a496016

add web search

Changed files:
- handler.py +50 -8
- web_retrieval.py +153 -0
handler.py CHANGED
@@ -1,4 +1,5 @@
 from minicheck_web.minicheck import MiniCheck
+from web_retrieval import *
 
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
@@ -24,17 +25,58 @@ class EndpointHandler():
 
     def __call__(self, data):
 
-        _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
-
         # Using user-provided document to do fact-checking
-        if data['inputs']['docs'] !=
+        if len(data['inputs']['docs']) == 1 and data['inputs']['docs'][0] != '':
+            _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=data)
             ranked_docs, scores = sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk)
-
-
-
-        outputs = {
+
+            outputs = {
                 'ranked_docs': ranked_docs,
                 'scores': scores
+            }
+
+        else:
+            assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
+
+            claim = data['inputs']['claims'][0]
+            ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim)
+
+            outputs = {
+                'ranked_docs': ranked_docs,
+                'scores': scores,
+                'ranked_urls': ranked_urls
+            }
+
+        return outputs
+
+
+    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
+
+        search_results = search_google(claim, timeout=timeout)
+
+        print('Searching webpages...')
+        start = time()
+        with concurrent.futures.ThreadPoolExecutor() as e:
+            scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
+        end = time()
+        print(f"Finished searching in {round((end - start), 1)} seconds.\n")
+        scraped_results = [(r[0][:50000], r[1]) for r in scraped_results if r[0] and '��' not in r[0] and ".pdf" not in r[1]]
+
+        retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
+
+        print('Scoring webpages...')
+        start = time()
+        retrieved_data = {
+            'inputs': {
+                'docs': list(retrieved_docs),
+                'claims': [claim]*len(retrieved_docs)
+            }
         }
+        _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=retrieved_data)
+        end = time()
+        num_chunks = len([item for items in used_chunk for item in items])
+        print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
+
+        ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
 
-        return
+        return ranked_docs, scores, ranked_urls
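For quick reference, below is a minimal sketch of the two request shapes the updated __call__ now distinguishes. The 'inputs'/'docs'/'claims' payload keys and the output keys come from the diff above; the no-argument EndpointHandler() construction and the example texts are illustrative assumptions, not part of the commit.

# Sketch only: payload keys taken from handler.py, everything else assumed.
handler = EndpointHandler()   # assumed constructor; see the full handler.py

# 1) Fact-check against a user-provided document (existing behavior):
#    a single non-empty doc keeps the original scoring path.
doc_request = {
    'inputs': {
        'docs': ["The Titanic sank in 1912 after hitting an iceberg."],
        'claims': ["The Titanic sank in 1912."]
    }
}
outputs = handler(doc_request)   # {'ranked_docs': ..., 'scores': ...}

# 2) No usable document: an empty doc string (or an empty docs list) falls
#    through to the new web-retrieval branch, which allows exactly one claim.
web_request = {
    'inputs': {
        'docs': [''],
        'claims': ["The Titanic sank in 1912."]
    }
}
outputs = handler(web_request)   # adds 'ranked_urls' alongside 'ranked_docs' and 'scores'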
web_retrieval.py ADDED
@@ -0,0 +1,153 @@
+# Some functions are adapted from https://github.com/yuxiaw/Factcheck-GPT
+
+import concurrent.futures
+import requests
+import bs4
+import re
+from typing import List, Tuple
+import itertools
+import numpy as np
+from time import time
+
+
+def is_tag_visible(element: bs4.element) -> bool:
+    """Determines if an HTML element is visible.
+
+    Args:
+        element: A BeautifulSoup element to check the visibility of.
+    Returns:
+        Whether the element is visible.
+    """
+    if element.parent.name in [
+        "style",
+        "script",
+        "head",
+        "title",
+        "meta",
+        "[document]",
+    ] or isinstance(element, bs4.element.Comment):
+        return False
+    return True
+
+
+def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+    """Scrapes a URL for all text information.
+
+    Args:
+        url: URL of webpage to scrape.
+        timeout: Timeout of the requests call.
+    Returns:
+        web_text: The visible text of the scraped URL.
+        url: URL input.
+    """
+    # Scrape the URL
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as _:
+        return None, url
+
+    # Extract out all text from the tags
+    try:
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+        texts = soup.findAll(string=True)
+        # Filter out invisible text from the page.
+        visible_text = filter(is_tag_visible, texts)
+    except Exception as _:
+        return None, url
+
+    # Returns all the text concatenated as a string.
+    web_text = " ".join(t.strip() for t in visible_text).strip()
+    # Clean up spacing.
+    web_text = " ".join(web_text.split())
+    return web_text, url
+
+
+def search_google(query:str, num_web_pages:int=10, timeout:int=6, save_url:str='') -> List[str]:
+    """Searches the query using Google.
+    Args:
+        query: Search query.
+        num_web_pages: the number of web pages to request.
+        save_url: path to save returned urls, such as 'urls.txt'
+    Returns:
+        search_results: A list of the top URLs relevant to the query.
+    """
+    # set headers: Google returns different web-pages according to agent device
+    # desktop user-agent
+    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
+    headers = {'User-Agent': USER_AGENT}
+
+    # set language
+    # set the Google interface language, use &hl=XX
+    # set the preferred language of the search results, use &lr=lang_XX
+    # set language as en, otherwise it will return many translation web pages to Arabic that can't be opened correctly.
+    lang = "en"
+
+    # scrape google results
+    urls = []
+    for page in range(0, num_web_pages, 10):
+        # here page is google search's bottom page meaning, click 2 -> start=10
+        # url = "https://www.google.com/search?q={}&start={}".format(query, page)
+        url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(query, lang, lang, page)
+        r = requests.get(url, headers=headers, timeout=timeout)
+        # collect all urls by regular expression
+        # how to do if I just want to have the returned top-k pages?
+        urls += re.findall('href="(https?://.*?)"', r.text)
+
+    # set to remove repeated urls
+    urls = list(set(urls))
+
+    # save all url into a txt file
+    if not save_url == "":
+        with open(save_url, 'w') as file:
+            for url in urls:
+                file.write(url + '\n')
+    return urls
+
+
+def order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False):
+
+    """
+    Orders the documents, scores, and URLs based on the scores in descending order.
+
+    allow_duplicated_urls:
+        - If False, the function will return the highest scored chunk per doc + scores + urls.
+        - If True, the function will return all the chunks per doc + scores + urls.
+    """
+
+    # Flatten the used_chunk and support_prob_per_chunk lists
+    flattened_docs = [doc for chunk in used_chunk for doc in chunk]
+    flattened_scores = [score for chunk in support_prob_per_chunk for score in chunk]
+
+    # Create a list of tuples containing the doc, score, and corresponding URL
+    doc_score_url = list(zip(flattened_docs, flattened_scores, np.repeat(urls, [len(chunk) for chunk in used_chunk])))
+
+    # Sort the list based on the scores in descending order
+    ranked_doc_score_url = sorted(doc_score_url, key=lambda x: x[1], reverse=True)
+
+    # Unzip the sorted list to get the ranked docs, scores, and URLs
+    ranked_docs, scores, ranked_urls = zip(*ranked_doc_score_url)
+
+    if allow_duplicated_urls:
+        return ranked_docs, scores, ranked_urls
+
+    else:
+
+        filtered_docs = []
+        filtered_scores = []
+        filtered_urls = []
+        seen_urls = set()
+
+        for doc, score, url in zip(ranked_docs, scores, ranked_urls):
+            if url not in seen_urls:
+                filtered_docs.append(doc)
+                filtered_scores.append(score)
+                filtered_urls.append(url)
+                seen_urls.add(url)
+
+        # Update the variables with the filtered results
+        ranked_docs = filtered_docs
+        scores = filtered_scores
+        ranked_urls = filtered_urls
+
+    return ranked_docs, scores, ranked_urls
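For quick reference, below is a minimal standalone sketch of how the helpers in web_retrieval.py fit together, mirroring search_relevant_docs in handler.py. Function names and signatures come from the file above; the example claim and the placeholder chunk scores are assumptions (in the handler, the scores come from the MiniCheck scorer, not from fixed values).

# Sketch only: search, scrape, then rank with placeholder scores.
import concurrent.futures
import itertools

from web_retrieval import search_google, scrape_url, order_doc_score_url

claim = "The Titanic sank in 1912."                        # illustrative claim
urls = search_google(claim, num_web_pages=10, timeout=6)   # top Google result URLs

# Scrape the result pages concurrently, as search_relevant_docs does.
with concurrent.futures.ThreadPoolExecutor() as executor:
    scraped = list(executor.map(scrape_url, urls, itertools.repeat(10)))
scraped = [(text, url) for text, url in scraped if text]   # drop failed fetches

retrieved_docs, kept_urls = zip(*scraped[:5])

# order_doc_score_url expects one list of chunks (and chunk scores) per document;
# here each page is treated as a single chunk with a placeholder score.
used_chunk = [[doc] for doc in retrieved_docs]
support_prob_per_chunk = [[0.5] for _ in retrieved_docs]   # placeholder, not MiniCheck scores
ranked_docs, scores, ranked_urls = order_doc_score_url(
    used_chunk, support_prob_per_chunk, kept_urls, allow_duplicated_urls=False
)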