John Yang committed on
Commit
9544646
·
1 Parent(s): 7a50274

Finished webshop transfer impl

Browse files
Files changed (3) hide show
  1. .gitignore +3 -1
  2. app.py +51 -15
  3. predict_help.py +114 -3
.gitignore CHANGED
@@ -1 +1,3 @@
1
- *.pyc
 
 
 
1
+ *.pyc
2
+
3
+ .DS_Store
app.py CHANGED
@@ -1,9 +1,16 @@
1
  import gradio as gr
2
- import time, torch
3
  from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
4
 
5
  from webshop_lite import dict_to_fake_html
6
- from predict_help import convert_dict_to_actions, convert_html_to_text, parse_results, parse_item_page, Page
 
 
 
 
 
 
 
7
 
8
  # load IL models
9
  bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
@@ -90,12 +97,15 @@ def predict(obs, info):
90
  else:
91
  return "search[" + bart_predict(process_goal(obs)) + "]"
92
 
93
- def run_episode(goal, verbose=True):
94
  """
95
  Interact with amazon to find a product given input goal.
96
  Input: text goal
97
  Output: a url of found item on amazon.
98
  """
 
 
 
99
  obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
100
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
101
  product_map = {}
@@ -156,11 +166,21 @@ def run_episode(goal, verbose=True):
156
  page_type = Page.SEARCH
157
 
158
  elif action == 'click[buy now]':
159
- asin_url = f"https://www.amazon.com/dp/{asin}"
160
- return_value = "Product URL: " + asin_url
161
- if len(clicked_options) > 0:
162
- options_str = ', '.join(list(clicked_options))
163
- return_value += "\nSelected Options: " + options_str
 
 
 
 
 
 
 
 
 
 
164
  return return_value
165
 
166
  elif prev_page_type == Page.ITEM_PAGE:
@@ -186,7 +206,10 @@ def run_episode(goal, verbose=True):
186
  data = search_results_cache[search_terms]
187
  else:
188
  begin = time.time()
189
- data = parse_results(search_terms, page_num)
 
 
 
190
  end = time.time()
191
  print("Parsing search results took", end-begin, "seconds")
192
 
@@ -200,7 +223,10 @@ def run_episode(goal, verbose=True):
200
  data = product_map[asin]
201
  else:
202
  begin = time.time()
203
- data = parse_item_page(asin)
 
 
 
204
  end = time.time()
205
  print("Parsing item page took", end-begin, "seconds")
206
  product_map[asin] = data
@@ -228,11 +254,21 @@ def run_episode(goal, verbose=True):
228
  print("Extracting available actions took", end-begin, "seconds")
229
 
230
  if i == 99:
231
- asin_url = f"https://www.amazon.com/dp/{asin}"
232
- return_value = "Product URL: " + asin_url
233
- if len(clicked_options) > 0:
234
- options_str = ', '.join(list(clicked_options))
235
- return_value += "\nSelected Options: " + options_str
 
 
 
 
 
 
 
 
 
 
236
  return return_value
237
 
238
  gr.Interface(fn=run_episode,\
 
1
  import gradio as gr
2
+ import json, time, torch
3
  from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
4
 
5
  from webshop_lite import dict_to_fake_html
6
+ from predict_help import (
7
+ Page, convert_dict_to_actions, convert_html_to_text,
8
+ parse_results_amz, parse_item_page_amz,
9
+ parse_results_ws, parse_item_page_ws,
10
+ WEBSHOP_URL, WEBSHOP_SESSION
11
+ )
12
+
13
+ ENVIRONMENTS = ['amazon', 'webshop', 'ebay']
14
 
15
  # load IL models
16
  bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
 
97
  else:
98
  return "search[" + bart_predict(process_goal(obs)) + "]"
99
 
100
+ def run_episode(goal, verbose=True, env='webshop'):
101
  """
102
  Interact with amazon to find a product given input goal.
103
  Input: text goal
104
  Output: a url of found item on amazon.
105
  """
106
+ if env not in ENVIRONMENTS:
107
+ print(f"[ERROR] Environment {env} not recognized")
108
+
109
  obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
110
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
111
  product_map = {}
 
166
  page_type = Page.SEARCH
167
 
168
  elif action == 'click[buy now]':
169
+ return_value = None
170
+ if env == 'amazon':
171
+ asin_url = f"https://www.amazon.com/dp/{asin}"
172
+ return_value = "Product URL: " + asin_url
173
+ if len(clicked_options) > 0:
174
+ options_str = ', '.join(list(clicked_options))
175
+ return_value += "\nSelected Options: " + options_str
176
+ if env == 'webshop':
177
+ query_str = "+".join(search_terms.split())
178
+ options_str = json.dumps(options)
179
+ asin_url = (
180
+ f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
181
+ f'{asin}/{query_str}/{page_num}/{options_str}'
182
+ )
183
+ return_value = "Product URL: " + asin_url
184
  return return_value
185
 
186
  elif prev_page_type == Page.ITEM_PAGE:
 
206
  data = search_results_cache[search_terms]
207
  else:
208
  begin = time.time()
209
+ if env == 'amazon':
210
+ data = parse_results_amz(search_terms, page_num)
211
+ if env == 'webshop':
212
+ data = parse_results_ws(search_terms, page_num)
213
  end = time.time()
214
  print("Parsing search results took", end-begin, "seconds")
215
 
 
223
  data = product_map[asin]
224
  else:
225
  begin = time.time()
226
+ if env == 'amazon':
227
+ data = parse_item_page_amz(asin)
228
+ if env == 'webshop':
229
+ data = parse_item_page_ws(asin, search_terms, page_num, options)
230
  end = time.time()
231
  print("Parsing item page took", end-begin, "seconds")
232
  product_map[asin] = data
 
254
  print("Extracting available actions took", end-begin, "seconds")
255
 
256
  if i == 99:
257
+ return_value = None
258
+ if env == 'amazon':
259
+ asin_url = f"https://www.amazon.com/dp/{asin}"
260
+ return_value = "Product URL: " + asin_url
261
+ if len(clicked_options) > 0:
262
+ options_str = ', '.join(list(clicked_options))
263
+ return_value += "\nSelected Options: " + options_str
264
+ if env == 'webshop':
265
+ query_str = "+".join(search_terms.split())
266
+ options_str = json.dumps(options)
267
+ asin_url = (
268
+ f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
269
+ f'{asin}/{query_str}/{page_num}/{options_str}'
270
+ )
271
+ return_value = "Product URL: " + asin_url
272
  return return_value
273
 
274
  gr.Interface(fn=run_episode,\
predict_help.py CHANGED
@@ -19,6 +19,8 @@ HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (K
19
  DEBUG_HTML = "temp.html"
20
  VERBOSE = True
21
  NUM_PROD_LIMIT = 10
 
 
22
 
23
  API = '85956985fae328bfe5a759a2984448d2'
24
  def get_url(url):
@@ -26,13 +28,122 @@ def get_url(url):
26
  proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
27
  return proxy_url
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # Query -> Search Result ASINs
30
- def parse_results(query, page_num=None):
31
  url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
32
  if page_num is not None:
33
  url += "&page=" + str(page_num)
34
  if VERBOSE:
35
- print("Search Results URL:", url)
36
  webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
37
  soup = BeautifulSoup(webpage.content, 'html.parser')
38
  products = soup.findAll('div', {'data-component-type': 's-search-result'})
@@ -60,7 +171,7 @@ def parse_results(query, page_num=None):
60
  return results
61
 
62
  # Scrape information of each product
63
- def parse_item_page(asin):
64
  product_dict = {}
65
  product_dict["asin"] = asin
66
 
 
19
  DEBUG_HTML = "temp.html"
20
  VERBOSE = True
21
  NUM_PROD_LIMIT = 10
22
+ WEBSHOP_URL = "http://3.83.245.205:3000"
23
+ WEBSHOP_SESSION = "abc"
24
 
25
  API = '85956985fae328bfe5a759a2984448d2'
26
  def get_url(url):
 
28
  proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
29
  return proxy_url
30
 
31
def parse_results_ws(query, page_num=None):
    """Scrape a WebShop search-results page for `query`.

    Args:
        query: Free-text search string; whitespace is collapsed into '+'
            to build the WebShop URL path segment.
        page_num: 1-indexed results page; defaults to the first page.

    Returns:
        A list of dicts with keys "asin", "Title", and "Price" (a float,
        or a [low, high] list of floats when the page shows a price range).
    """
    query_string = '+'.join(query.split())
    page_num = 1 if page_num is None else page_num
    url = (
        f'{WEBSHOP_URL}/search_results/{WEBSHOP_SESSION}/'
        f'{query_string}/{page_num}'
    )
    if VERBOSE:
        print(f"Search Results URL: {url}")
    webpage = requests.get(
        url,
        headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'},
    )
    soup = BeautifulSoup(webpage.content, 'html.parser')
    products = soup.findAll('div', {'class': 'list-group-item'})

    results = []
    for product in products:
        asin = product.find('a', {'class': 'product-link'})
        title = product.find('h4', {'class': 'product-title'})
        price = product.find('h5', {'class': 'product-price'})

        # BUG FIX: the original tested `"\n" in title` against the bs4 Tag
        # itself (which checks the tag's *child nodes*), not its text, so the
        # take-first-line branch effectively never ran. Compare against the
        # extracted text instead; `strip()` already removes newlines, so the
        # old redundant `.strip().strip("\n")` is collapsed.
        title_text = title.text
        if "\n" in title_text:
            title_text = title_text.split("\n")[0].strip()
        else:
            title_text = title_text.strip()

        if "to" in price.text:
            # Price presented as a "<low> to <high>" range
            prices = price.text.split(" to ")
            price_val = [float(p.strip().strip("\n$")) for p in prices]
        else:
            price_val = float(price.text.strip().strip("\n$"))

        results.append({
            "asin": asin.text,
            "Title": title_text,
            "Price": price_val,
        })

    if VERBOSE:
        print(f"Scraped {len(results)} products")
    return results
71
+
72
def parse_item_page_ws(asin, query, page_num, options):
    """Scrape a WebShop item page plus its Description/Features sub-pages.

    Args:
        asin: Product identifier used in the WebShop URL path.
        query: Search query that led to this item (re-encoded with '+').
        page_num: Search-results page number the item was found on.
        options: Dict of currently selected options; serialized as JSON
            into the URL so WebShop renders the right variant.

    Returns:
        Dict with keys "asin", "Title", "Price", "Rating", "MainImage",
        "options" (option-group name -> list of option labels),
        "option_to_image" (option label -> image URL), "Description",
        and "BulletPoints".
    """
    product_dict = {"asin": asin}

    query_string = '+'.join(query.split())
    options_string = json.dumps(options)
    # Build the request headers once; reused for all three requests below.
    headers = {'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}

    url = (
        f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
        f'{asin}/{query_string}/{page_num}/{options_string}'
    )
    if VERBOSE:
        print("Item Page URL: ", url)
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(webpage.content, 'html.parser')

    # Title, Price, Rating, and MainImage
    product_dict["Title"] = soup.find('h2').text

    for h4_header in soup.findAll("h4"):
        text = h4_header.text
        if "Price" in text:
            product_dict["Price"] = text.split(":")[1].strip().strip("$")
        elif "Rating" in text:
            product_dict["Rating"] = text.split(":")[1].strip()

    product_dict["MainImage"] = soup.find('img')['src']

    # Options. Locals renamed from the original: `option_map` no longer
    # shadows the `options` parameter, and `inp` no longer clobbers the
    # `input` builtin.
    option_map, options_to_image = {}, {}
    for block in soup.findAll("div", {'class': 'radio-toolbar'}):
        name = block.find("input")["name"]
        labels = block.findAll("label")
        inputs = block.findAll("input")
        opt_list = []
        for label, inp in zip(labels, inputs):
            opt = label.text
            # The onclick handler embeds the option image path after "href="
            opt_img_path = inp["onclick"].split("href=")[1].strip('\';')
            opt_list.append(opt)
            options_to_image[opt] = f'{WEBSHOP_URL}{opt_img_path}'
        option_map[name] = opt_list
    product_dict["options"] = option_map
    product_dict["option_to_image"] = options_to_image

    # Description sub-page
    url = (
        f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
        f'{asin}/{query_string}/{page_num}/Description/{options_string}'
    )
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(webpage.content, 'html.parser')
    product_dict["Description"] = soup.find(name="p", attrs={'class': 'product-info'}).text.strip()

    # Features sub-page
    url = (
        f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
        f'{asin}/{query_string}/{page_num}/Features/{options_string}'
    )
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(webpage.content, 'html.parser')
    bullets = soup.find(name="ul").findAll(name="li")
    product_dict["BulletPoints"] = '\n'.join([b.text.strip() for b in bullets])

    return product_dict
139
+
140
  # Query -> Search Result ASINs
141
+ def parse_results_amz(query, page_num=None):
142
  url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
143
  if page_num is not None:
144
  url += "&page=" + str(page_num)
145
  if VERBOSE:
146
+ print(f"Search Results URL: ${url}")
147
  webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
148
  soup = BeautifulSoup(webpage.content, 'html.parser')
149
  products = soup.findAll('div', {'data-component-type': 's-search-result'})
 
171
  return results
172
 
173
  # Scrape information of each product
174
+ def parse_item_page_amz(asin):
175
  product_dict = {}
176
  product_dict["asin"] = asin
177