Commit 4caa7f7 · John Yang committed · Parent: 14fbfba

Refactored caching of scrape requests

Files changed:
- app.py (+54, -31)
- predict_help.py (+26, -9)
app.py CHANGED

@@ -1,5 +1,5 @@
 import gradio as gr
-import torch
+import time, torch
 from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
 
 from webshop_lite import dict_to_fake_html
@@ -90,8 +90,6 @@ def predict(obs, info):
     else:
         return "search[" + bart_predict(process_goal(obs)) + "]"
 
-NUM_PROD_LIMIT = 10
-
 def run_episode(goal, verbose=True):
     """
     Interact with amazon to find a product given input goal.
@@ -101,9 +99,10 @@ def run_episode(goal, verbose=True):
     obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
     info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
     product_map = {}
-
+    title_to_asin_map = {}
+    search_results_cache = {}
     visited_asins, clicked_options = set(), set()
-
+    sub_page_type, page_type, page_num = None, None, None
     search_terms, prod_title, asin, num_prods, = None, None, None, None
     options = {}
 
@@ -125,9 +124,9 @@ def run_episode(goal, verbose=True):
         if action.startswith('click[item -'):
             prod_title = action_content[len("item -"):].strip()
             found = False
-            for …
-                if prod_title == …
-                    asin = …
+            for key in title_to_asin_map:
+                if prod_title == key:
+                    asin = title_to_asin_map[key]
                     page_type = Page.ITEM_PAGE
                     visited_asins.add(asin)
                     found = True
@@ -157,7 +156,12 @@ def run_episode(goal, verbose=True):
             page_type = Page.SEARCH
 
         elif action == 'click[buy now]':
-            …
+            asin_url = f"https://www.amazon.com/dp/{asin}"
+            return_value = "Product URL: " + asin_url
+            if len(clicked_options) > 0:
+                options_str = ', '.join(list(clicked_options))
+                return_value += "\nSelected Options: " + options_str
+            return return_value
 
         elif prev_page_type == Page.ITEM_PAGE:
             found = False
@@ -178,24 +182,28 @@ def run_episode(goal, verbose=True):
 
         # URL -> Real HTML -> Dict of Info
         if page_type == Page.RESULTS:
-            if search_terms …
-                …
-                asins = parse_results(search_terms, page_num)
-                num_prods = len(asins)
-                for asin_ in asins[:NUM_PROD_LIMIT]:
-                    product_map[asin_] = parse_item_page(asin_)
-                if search_terms not in page_to_product_map_memo:
-                    page_to_product_map_memo[search_terms] = {}
-                page_to_product_map_memo[search_terms][page_num] = product_map
+            if search_terms in search_results_cache:
+                data = search_results_cache[search_terms]
             else:
-                … (8 removed lines, content not recoverable)
+                begin = time.time()
+                data = parse_results(search_terms, page_num)
+                end = time.time()
+                print("Parsing search results took", end-begin, "seconds")
+
+                search_results_cache[search_terms] = data
+            num_prods = len(data)
+            for d in data:
+                title_to_asin_map[d['Title']] = d['asin']
+        elif page_type == Page.ITEM_PAGE or page_type == Page.SUB_PAGE:
+            if asin in product_map:
+                print("Loading cached item page for", asin)
+                data = product_map[asin]
+            else:
+                begin = time.time()
+                data = parse_item_page(asin)
+                end = time.time()
+                print("Parsing item page took", end-begin, "seconds")
+                product_map[asin] = data
         elif page_type == Page.SEARCH:
             if verbose:
                 print("Executing search")
@@ -203,24 +211,39 @@
             info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
             continue
         else:
-            raise Exception("Page of type `", page_type, …
+            raise Exception("Page of type `", page_type, "` not found")
 
         # Dict of Info -> Fake HTML -> Text Observation
+        begin = time.time()
         html_str = dict_to_fake_html(data, page_type, asin, sub_page_type, options, product_map, goal)
         obs = convert_html_to_text(html_str, simple=False, clicked_options=clicked_options, visited_asins=visited_asins)
-        …
+        end = time.time()
+        print("[Page Info -> WebShop HTML -> Observation] took", end-begin, "seconds")
+
         # Dict of Info -> Valid Action State (Info)
-        …
+        begin = time.time()
+        prod_arg = product_map if page_type == Page.ITEM_PAGE else data
+        info = convert_dict_to_actions(page_type, prod_arg, asin, page_num, num_prods)
+        end = time.time()
+        print("Extracting available actions took", end-begin, "seconds")
 
         if i == 99:
-            …
+            asin_url = f"https://www.amazon.com/dp/{asin}"
+            return_value = "Product URL: " + asin_url
+            if len(clicked_options) > 0:
+                options_str = ', '.join(list(clicked_options))
+                return_value += "\nSelected Options: " + options_str
+            return return_value
 
 gr.Interface(fn=run_episode,\
     inputs=gr.inputs.Textbox(lines=7, label="Input Text"),\
     outputs="text",\
     examples=[
         "Please select a 1 pound, certified organic sea salt shaker in the flavor triple blend flakes, and price lower than 40.00 dollars",
-        "I want to find a gold floor lamp with a glass shade and a nickel finish that i can use for my living room, and price lower than 270.00 dollars"
+        "I want to find a gold floor lamp with a glass shade and a nickel finish that i can use for my living room, and price lower than 270.00 dollars",
+        "I'm trying to find white bluetooth speakers that are not only water resistant but also come with stereo sound",
+        "I'm looking for a kids toothbrush for ages 6 to 12 that will help with teeth whitening and is easy to use",
+        "I need some cute heart-shaped glittery cupcake picks as a gift to bring to a baby shower",
     ],\
    title="WebShop",\
    article="<p style='padding-top:15px;text-align:center;'>To learn more about this project, check out the <a href='https://webshop-pnlp.github.io/' target='_blank'>project page</a>!</p>",\
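Taken together, the app.py changes swap eager scraping for lazy, memoized scraping: the old loop scraped an item page for every result ASIN up front (product_map[asin_] = parse_item_page(asin_)) and memoized whole product maps per query and page in page_to_product_map_memo, while the new loop caches raw search results per query in search_results_cache and only scrapes an item page when that item is actually visited, caching it by ASIN in product_map. A minimal, self-contained sketch of the same memoization pattern; scrape_search here is a made-up stand-in for the real parse_results:

import time

def scrape_search(search_terms):
    # Hypothetical stand-in for predict_help.parse_results.
    time.sleep(0.1)  # simulate network latency
    return [{"asin": "B000TEST01", "Title": "Sea Salt Shaker", "Price": "12.99"}]

search_results_cache = {}  # search terms -> list of product dicts

def get_search_results(search_terms):
    # First call for a query scrapes; repeats are served from memory.
    if search_terms not in search_results_cache:
        begin = time.time()
        search_results_cache[search_terms] = scrape_search(search_terms)
        print("Scraping took", time.time() - begin, "seconds")
    return search_results_cache[search_terms]

get_search_results("sea salt")  # scrapes, ~0.1 s
get_search_results("sea salt")  # cache hit, no scrape

The same pattern, keyed by ASIN instead of query string, backs the new Page.ITEM_PAGE branch above.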
predict_help.py CHANGED

@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
 from bs4.element import Comment
 from enum import Enum
+import time
 from urllib.parse import urlencode
 
 import json, requests, torch
@@ -17,6 +18,7 @@ class Page(Enum):
 HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
 DEBUG_HTML = "temp.html"
 VERBOSE = True
+NUM_PROD_LIMIT = 10
 
 API = '85956985fae328bfe5a759a2984448d2'
 def get_url(url):
@@ -30,9 +32,8 @@ def parse_results(query, page_num=None):
     if page_num is not None:
         url += "&page=" + str(page_num)
     if VERBOSE:
-        print(" …
-    webpage = requests.get( …
-    asins = []
+        print("Search Results URL:", url)
+    webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
     products = soup.findAll('div', {'data-component-type': 's-search-result'})
     if products is None:
@@ -40,11 +41,23 @@ def parse_results(query, page_num=None):
         temp.write(str(soup))
         temp.close()
         raise Exception("Couldn't find search results page, outputted html for inspection")
-    …
-    …
+    results = []
+
+    for product in products[:NUM_PROD_LIMIT]:
+        asin = product['data-asin']
+        title = product.find("h2", {'class': "a-size-mini"})
+        price_div = product.find("div", {'class': 's-price-instructions-style'})
+        price = price_div.find("span", {'class': 'a-offscreen'})
+
+        result = {
+            'asin': asin,
+            'Title': title.text.strip(),
+            'Price': price.text.strip().strip("$")
+        }
+        results.append(result)
     if VERBOSE:
-        print("Scraped", len( …
-    return …
+        print("Scraped", len(results), "products")
+    return results
 
 # Scrape information of each product
 def parse_item_page(asin):
@@ -52,7 +65,12 @@ def parse_item_page(asin):
     product_dict["asin"] = asin
 
     url = f"https://www.amazon.com/dp/{asin}"
-    …
+    if VERBOSE:
+        print("Item Page URL:", url)
+    begin = time.time()
+    webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
+    end = time.time()
+    print("Item page scraping took", end-begin, "seconds")
     soup = BeautifulSoup(webpage.content, "html.parser")
 
     # Title
@@ -103,7 +121,6 @@ def parse_item_page(asin):
 
     # Main Image
     try:
-        body = soup.find("body")
         imgtag = soup.find("img", {"id":"landingImage"})
         imageurl = dict(imgtag.attrs)["src"]
     except AttributeError:
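With this commit parse_results returns structured dicts (asin, Title, Price) instead of a bare list of ASINs, which is what lets app.py build title_to_asin_map without scraping every item page up front. A self-contained sketch of the same BeautifulSoup extraction against canned HTML; the markup below is invented for illustration and only mirrors the class names queried above:

from bs4 import BeautifulSoup

# Invented stand-in for one Amazon search-result card.
html = """
<div data-component-type="s-search-result" data-asin="B000TEST01">
  <h2 class="a-size-mini">Organic Sea Salt Shaker</h2>
  <div class="s-price-instructions-style">
    <span class="a-offscreen">$12.99</span>
  </div>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
results = []
for product in soup.find_all("div", {"data-component-type": "s-search-result"}):
    results.append({
        "asin": product["data-asin"],  # attribute on the result card div
        "Title": product.find("h2", {"class": "a-size-mini"}).text.strip(),
        "Price": product.find("span", {"class": "a-offscreen"}).text.strip().strip("$"),
    })
print(results)  # [{'asin': 'B000TEST01', 'Title': 'Organic Sea Salt Shaker', 'Price': '12.99'}]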