Commit 9544646 · John Yang · Parent: 7a50274
Finished webshop transfer impl

Files changed:
- .gitignore       +3 -1
- app.py           +51 -15
- predict_help.py  +114 -3
.gitignore
CHANGED
@@ -1 +1,3 @@
-*.pyc
+*.pyc
+
+.DS_Store
app.py
CHANGED
@@ -1,9 +1,16 @@
 import gradio as gr
-import time, torch
+import json, time, torch
 from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
 
 from webshop_lite import dict_to_fake_html
-from predict_help import Page, convert_dict_to_actions, convert_html_to_text, parse_results, parse_item_page
+from predict_help import (
+    Page, convert_dict_to_actions, convert_html_to_text,
+    parse_results_amz, parse_item_page_amz,
+    parse_results_ws, parse_item_page_ws,
+    WEBSHOP_URL, WEBSHOP_SESSION
+)
+
+ENVIRONMENTS = ['amazon', 'webshop', 'ebay']
 
 # load IL models
 bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
@@ -90,12 +97,15 @@ def predict(obs, info):
     else:
         return "search[" + bart_predict(process_goal(obs)) + "]"
 
-def run_episode(goal, verbose=True):
+def run_episode(goal, verbose=True, env='webshop'):
     """
     Interact with amazon to find a product given input goal.
    Input: text goal
     Output: a url of found item on amazon.
     """
+    if env not in ENVIRONMENTS:
+        print(f"[ERROR] Environment {env} not recognized")
+
     obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
     info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
     product_map = {}
@@ -156,11 +166,21 @@ def run_episode(goal, verbose=True):
             page_type = Page.SEARCH
 
         elif action == 'click[buy now]':
-            asin_url = f"https://www.amazon.com/dp/{asin}"
-            return_value = "Product URL: " + asin_url
-            if len(clicked_options) > 0:
-                options_str = ', '.join(list(clicked_options))
-                return_value += "\nSelected Options: " + options_str
+            return_value = None
+            if env == 'amazon':
+                asin_url = f"https://www.amazon.com/dp/{asin}"
+                return_value = "Product URL: " + asin_url
+                if len(clicked_options) > 0:
+                    options_str = ', '.join(list(clicked_options))
+                    return_value += "\nSelected Options: " + options_str
+            if env == 'webshop':
+                query_str = "+".join(search_terms.split())
+                options_str = json.dumps(options)
+                asin_url = (
+                    f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
+                    f'{asin}/{query_str}/{page_num}/{options_str}'
+                )
+                return_value = "Product URL: " + asin_url
             return return_value
 
         elif prev_page_type == Page.ITEM_PAGE:
@@ -186,7 +206,10 @@ def run_episode(goal, verbose=True):
                 data = search_results_cache[search_terms]
             else:
                 begin = time.time()
-                data = parse_results(search_terms, page_num)
+                if env == 'amazon':
+                    data = parse_results_amz(search_terms, page_num)
+                if env == 'webshop':
+                    data = parse_results_ws(search_terms, page_num)
                 end = time.time()
                 print("Parsing search results took", end-begin, "seconds")
 
@@ -200,7 +223,10 @@ def run_episode(goal, verbose=True):
                 data = product_map[asin]
             else:
                 begin = time.time()
-                data = parse_item_page(asin)
+                if env == 'amazon':
+                    data = parse_item_page_amz(asin)
+                if env == 'webshop':
+                    data = parse_item_page_ws(asin, search_terms, page_num, options)
                 end = time.time()
                 print("Parsing item page took", end-begin, "seconds")
                 product_map[asin] = data
@@ -228,11 +254,21 @@ def run_episode(goal, verbose=True):
         print("Extracting available actions took", end-begin, "seconds")
 
         if i == 99:
-            asin_url = f"https://www.amazon.com/dp/{asin}"
-            return_value = "Product URL: " + asin_url
-            if len(clicked_options) > 0:
-                options_str = ', '.join(list(clicked_options))
-                return_value += "\nSelected Options: " + options_str
+            return_value = None
+            if env == 'amazon':
+                asin_url = f"https://www.amazon.com/dp/{asin}"
+                return_value = "Product URL: " + asin_url
+                if len(clicked_options) > 0:
+                    options_str = ', '.join(list(clicked_options))
+                    return_value += "\nSelected Options: " + options_str
+            if env == 'webshop':
+                query_str = "+".join(search_terms.split())
+                options_str = json.dumps(options)
+                asin_url = (
+                    f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
+                    f'{asin}/{query_str}/{page_num}/{options_str}'
+                )
+                return_value = "Product URL: " + asin_url
             return return_value
 
 gr.Interface(fn=run_episode,\
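Note: a minimal usage sketch of the new env-aware entry point; this is not part of the commit. The goal string and the direct import are illustrative assumptions, and a reachable WebShop instance at WEBSHOP_URL is assumed.

# Hypothetical driver for run_episode as modified above (not in the commit).
from app import run_episode

# env must be one of ENVIRONMENTS; note 'ebay' is listed but not yet handled.
result = run_episode("i want a red dress under 50 dollars", verbose=True, env='webshop')
print(result)  # e.g. "Product URL: <WEBSHOP_URL>/item_page/<WEBSHOP_SESSION>/..."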
predict_help.py
CHANGED
@@ -19,6 +19,8 @@ HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (K
 DEBUG_HTML = "temp.html"
 VERBOSE = True
 NUM_PROD_LIMIT = 10
+WEBSHOP_URL = "http://3.83.245.205:3000"
+WEBSHOP_SESSION = "abc"
 
 API = '85956985fae328bfe5a759a2984448d2'
 def get_url(url):
@@ -26,13 +28,122 @@ def get_url(url):
     proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
     return proxy_url
 
+def parse_results_ws(query, page_num=None):
+    query_string = '+'.join(query.split())
+    page_num = 1 if page_num is None else page_num
+    url = (
+        f'{WEBSHOP_URL}/search_results/{WEBSHOP_SESSION}/'
+        f'{query_string}/{page_num}'
+    )
+    if VERBOSE:
+        print(f"Search Results URL: {url}")
+    webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
+    soup = BeautifulSoup(webpage.content, 'html.parser')
+    products = soup.findAll('div', {'class': 'list-group-item'})
+
+    results = []
+    for product in products:
+        asin = product.find('a', {'class': 'product-link'})
+        title = product.find('h4', {'class': 'product-title'})
+        price = product.find('h5', {'class': 'product-price'})
+
+        if "\n" in title.text:
+            title = title.text.split("\n")[0].strip()
+        else:
+            title = title.text.strip().strip("\n")
+
+        if "to" in price.text:
+            # Parse if price presented as range
+            prices = price.text.split(" to ")
+            price = [float(p.strip().strip("\n$")) for p in prices]
+        else:
+            price = float(price.text.strip().strip("\n$"))
+
+        results.append({
+            "asin": asin.text,
+            "Title": title,
+            "Price": price
+        })
+
+    if VERBOSE:
+        print(f"Scraped {len(results)} products")
+    return results
+
+def parse_item_page_ws(asin, query, page_num, options):
+    product_dict = {}
+    product_dict["asin"] = asin
+
+    query_string = '+'.join(query.split())
+    options_string = json.dumps(options)
+    url = (
+        f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
+        f'{asin}/{query_string}/{page_num}/{options_string}'
+    )
+    if VERBOSE:
+        print("Item Page URL: ", url)
+    webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
+    soup = BeautifulSoup(webpage.content, 'html.parser')
+
+    # Title, Price, Rating, and MainImage
+    product_dict["Title"] = soup.find('h2').text
+
+    h4_headers = soup.findAll("h4")
+    for header in h4_headers:
+        text = header.text
+        if "Price" in text:
+            product_dict["Price"] = text.split(":")[1].strip().strip("$")
+        elif "Rating" in text:
+            product_dict["Rating"] = text.split(":")[1].strip()
+
+    product_dict["MainImage"] = soup.find('img')['src']
+
+    # Options
+    options, options_to_image = {}, {}
+    option_blocks = soup.findAll("div", {'class': 'radio-toolbar'})
+    for block in option_blocks:
+        name = block.find("input")["name"]
+        labels = block.findAll("label")
+        inputs = block.findAll("input")
+        opt_list = []
+        for label, input in zip(labels, inputs):
+            opt = label.text
+            opt_img_path = input["onclick"].split("href=")[1].strip('\';')
+            opt_img_url = f'{WEBSHOP_URL}{opt_img_path}'
+
+            opt_list.append(opt)
+            options_to_image[opt] = opt_img_url
+        options[name] = opt_list
+    product_dict["options"] = options
+    product_dict["option_to_image"] = options_to_image
+
+    # Description
+    url = (
+        f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
+        f'{asin}/{query_string}/{page_num}/Description/{options_string}'
+    )
+    webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
+    soup = BeautifulSoup(webpage.content, 'html.parser')
+    product_dict["Description"] = soup.find(name="p", attrs={'class': 'product-info'}).text.strip()
+
+    # Features
+    url = (
+        f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
+        f'{asin}/{query_string}/{page_num}/Features/{options_string}'
+    )
+    webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
+    soup = BeautifulSoup(webpage.content, 'html.parser')
+    bullets = soup.find(name="ul").findAll(name="li")
+    product_dict["BulletPoints"] = '\n'.join([b.text.strip() for b in bullets])
+
+    return product_dict
+
 # Query -> Search Result ASINs
-def parse_results(query, page_num=None):
+def parse_results_amz(query, page_num=None):
     url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
     if page_num is not None:
         url += "&page=" + str(page_num)
     if VERBOSE:
-        print("Search Results URL: ", url)
+        print(f"Search Results URL: {url}")
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
     products = soup.findAll('div', {'data-component-type': 's-search-result'})
@@ -60,7 +171,7 @@ def parse_results(query, page_num=None):
     return results
 
 # Scrape information of each product
-def parse_item_page(asin):
+def parse_item_page_amz(asin):
     product_dict = {}
     product_dict["asin"] = asin
 
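Note: a minimal smoke test for the new WebShop parsers; this is not part of the commit and assumes the WebShop server at WEBSHOP_URL is up and serving session WEBSHOP_SESSION. The query string is a made-up example.

# Hypothetical round-trip check of parse_results_ws and parse_item_page_ws (not in the commit).
from predict_help import parse_results_ws, parse_item_page_ws

results = parse_results_ws("red dress", page_num=1)
if results:
    # Re-fetch the first hit's item page; no options selected yet, so pass {}.
    item = parse_item_page_ws(results[0]["asin"], "red dress", 1, {})
    print(item["Title"], item.get("Price"))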