John Yang committed on
Commit
4caa7f7
·
1 Parent(s): 14fbfba

Refactored caching of scrape requests

Browse files
Files changed (2) hide show
  1. app.py +54 -31
  2. predict_help.py +26 -9
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- import torch
3
  from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
4
 
5
  from webshop_lite import dict_to_fake_html
@@ -90,8 +90,6 @@ def predict(obs, info):
90
  else:
91
  return "search[" + bart_predict(process_goal(obs)) + "]"
92
 
93
- NUM_PROD_LIMIT = 10
94
-
95
  def run_episode(goal, verbose=True):
96
  """
97
  Interact with amazon to find a product given input goal.
@@ -101,9 +99,10 @@ def run_episode(goal, verbose=True):
101
  obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
102
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
103
  product_map = {}
104
- page_to_product_map_memo = {}
 
105
  visited_asins, clicked_options = set(), set()
106
- arg, sub_page_type, page_type, page_num = None, None, None, None
107
  search_terms, prod_title, asin, num_prods, = None, None, None, None
108
  options = {}
109
 
@@ -125,9 +124,9 @@ def run_episode(goal, verbose=True):
125
  if action.startswith('click[item -'):
126
  prod_title = action_content[len("item -"):].strip()
127
  found = False
128
- for value in product_map.values():
129
- if prod_title == value["Title"]:
130
- asin = value["asin"]
131
  page_type = Page.ITEM_PAGE
132
  visited_asins.add(asin)
133
  found = True
@@ -157,7 +156,12 @@ def run_episode(goal, verbose=True):
157
  page_type = Page.SEARCH
158
 
159
  elif action == 'click[buy now]':
160
- return asin
 
 
 
 
 
161
 
162
  elif prev_page_type == Page.ITEM_PAGE:
163
  found = False
@@ -178,24 +182,28 @@ def run_episode(goal, verbose=True):
178
 
179
  # URL -> Real HTML -> Dict of Info
180
  if page_type == Page.RESULTS:
181
- if search_terms not in page_to_product_map_memo or page_num not in page_to_product_map_memo[search_terms]:
182
- product_map = {}
183
- asins = parse_results(search_terms, page_num)
184
- num_prods = len(asins)
185
- for asin_ in asins[:NUM_PROD_LIMIT]:
186
- product_map[asin_] = parse_item_page(asin_)
187
- if search_terms not in page_to_product_map_memo:
188
- page_to_product_map_memo[search_terms] = {}
189
- page_to_product_map_memo[search_terms][page_num] = product_map
190
  else:
191
- if verbose:
192
- print("Loaded memoized search results (" + str(page_num) + ")...")
193
- product_map = page_to_product_map_memo[search_terms][page_num]
194
- if verbose:
195
- print("Product Map Length:", len(product_map))
196
- data = list(product_map.values())
197
- elif page_type == Page.ITEM_PAGE or page_type == Page.SUB_PAGE:
198
- data = product_map
 
 
 
 
 
 
 
 
 
 
 
199
  elif page_type == Page.SEARCH:
200
  if verbose:
201
  print("Executing search")
@@ -203,24 +211,39 @@ def run_episode(goal, verbose=True):
203
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
204
  continue
205
  else:
206
- raise Exception("Page of type `", page_type,value, "` not found")
207
 
208
  # Dict of Info -> Fake HTML -> Text Observation
 
209
  html_str = dict_to_fake_html(data, page_type, asin, sub_page_type, options, product_map, goal)
210
  obs = convert_html_to_text(html_str, simple=False, clicked_options=clicked_options, visited_asins=visited_asins)
211
-
 
 
212
  # Dict of Info -> Valid Action State (Info)
213
- info = convert_dict_to_actions(page_type, data, asin, page_num, num_prods)
 
 
 
 
214
 
215
  if i == 99:
216
- return asin
 
 
 
 
 
217
 
218
  gr.Interface(fn=run_episode,\
219
  inputs=gr.inputs.Textbox(lines=7, label="Input Text"),\
220
  outputs="text",\
221
  examples=[
222
  "Please select a 1 pound, certified organic sea salt shaker in the flavor triple blend flakes, and price lower than 40.00 dollars",
223
- "I want to find a gold floor lamp with a glass shade and a nickel finish that i can use for my living room, and price lower than 270.00 dollars"
 
 
 
224
  ],\
225
  title="WebShop",\
226
  article="<p style='padding-top:15px;text-align:center;'>To learn more about this project, check out the <a href='https://webshop-pnlp.github.io/' target='_blank'>project page</a>!</p>",\
 
1
  import gradio as gr
2
+ import time, torch
3
  from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
4
 
5
  from webshop_lite import dict_to_fake_html
 
90
  else:
91
  return "search[" + bart_predict(process_goal(obs)) + "]"
92
 
 
 
93
  def run_episode(goal, verbose=True):
94
  """
95
  Interact with amazon to find a product given input goal.
 
99
  obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
100
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
101
  product_map = {}
102
+ title_to_asin_map = {}
103
+ search_results_cache = {}
104
  visited_asins, clicked_options = set(), set()
105
+ sub_page_type, page_type, page_num = None, None, None
106
  search_terms, prod_title, asin, num_prods, = None, None, None, None
107
  options = {}
108
 
 
124
  if action.startswith('click[item -'):
125
  prod_title = action_content[len("item -"):].strip()
126
  found = False
127
+ for key in title_to_asin_map:
128
+ if prod_title == key:
129
+ asin = title_to_asin_map[key]
130
  page_type = Page.ITEM_PAGE
131
  visited_asins.add(asin)
132
  found = True
 
156
  page_type = Page.SEARCH
157
 
158
  elif action == 'click[buy now]':
159
+ asin_url = f"https://www.amazon.com/dp/{asin}"
160
+ return_value = "Product URL: " + asin_url
161
+ if len(clicked_options) > 0:
162
+ options_str = ', '.join(list(clicked_options))
163
+ return_value += "\nSelected Options: " + options_str
164
+ return return_value
165
 
166
  elif prev_page_type == Page.ITEM_PAGE:
167
  found = False
 
182
 
183
  # URL -> Real HTML -> Dict of Info
184
  if page_type == Page.RESULTS:
185
+ if search_terms in search_results_cache:
186
+ data = search_results_cache[search_terms]
 
 
 
 
 
 
 
187
  else:
188
+ begin = time.time()
189
+ data = parse_results(search_terms, page_num)
190
+ end = time.time()
191
+ print("Parsing search results took", end-begin, "seconds")
192
+
193
+ search_results_cache[search_terms] = data
194
+ num_prods = len(data)
195
+ for d in data:
196
+ title_to_asin_map[d['Title']] = d['asin']
197
+ elif page_type == Page.ITEM_PAGE or page_type == Page.SUB_PAGE:
198
+ if asin in product_map:
199
+ print("Loading cached item page for", asin)
200
+ data = product_map[asin]
201
+ else:
202
+ begin = time.time()
203
+ data = parse_item_page(asin)
204
+ end = time.time()
205
+ print("Parsing item page took", end-begin, "seconds")
206
+ product_map[asin] = data
207
  elif page_type == Page.SEARCH:
208
  if verbose:
209
  print("Executing search")
 
211
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
212
  continue
213
  else:
214
+ raise Exception("Page of type `", page_type, "` not found")
215
 
216
  # Dict of Info -> Fake HTML -> Text Observation
217
+ begin = time.time()
218
  html_str = dict_to_fake_html(data, page_type, asin, sub_page_type, options, product_map, goal)
219
  obs = convert_html_to_text(html_str, simple=False, clicked_options=clicked_options, visited_asins=visited_asins)
220
+ end = time.time()
221
+ print("[Page Info -> WebShop HTML -> Observation] took", end-begin, "seconds")
222
+
223
  # Dict of Info -> Valid Action State (Info)
224
+ begin = time.time()
225
+ prod_arg = product_map if page_type == Page.ITEM_PAGE else data
226
+ info = convert_dict_to_actions(page_type, prod_arg, asin, page_num, num_prods)
227
+ end = time.time()
228
+ print("Extracting available actions took", end-begin, "seconds")
229
 
230
  if i == 99:
231
+ asin_url = f"https://www.amazon.com/dp/{asin}"
232
+ return_value = "Product URL: " + asin_url
233
+ if len(clicked_options) > 0:
234
+ options_str = ', '.join(list(clicked_options))
235
+ return_value += "\nSelected Options: " + options_str
236
+ return return_value
237
 
238
  gr.Interface(fn=run_episode,\
239
  inputs=gr.inputs.Textbox(lines=7, label="Input Text"),\
240
  outputs="text",\
241
  examples=[
242
  "Please select a 1 pound, certified organic sea salt shaker in the flavor triple blend flakes, and price lower than 40.00 dollars",
243
+ "I want to find a gold floor lamp with a glass shade and a nickel finish that i can use for my living room, and price lower than 270.00 dollars",
244
+ "I'm trying to find white bluetooth speakers that are not only water resistant but also come with stereo sound",
245
+ "I'm looking for a kids toothbrush for ages 6 to 12 that will help with teeth whitening and is easy to use",
246
+ "I need some cute heart-shaped glittery cupcake picks as a gift to bring to a baby shower",
247
  ],\
248
  title="WebShop",\
249
  article="<p style='padding-top:15px;text-align:center;'>To learn more about this project, check out the <a href='https://webshop-pnlp.github.io/' target='_blank'>project page</a>!</p>",\
predict_help.py CHANGED
@@ -1,6 +1,7 @@
1
  from bs4 import BeautifulSoup
2
  from bs4.element import Comment
3
  from enum import Enum
 
4
  from urllib.parse import urlencode
5
 
6
  import json, requests, torch
@@ -17,6 +18,7 @@ class Page(Enum):
17
  HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
18
  DEBUG_HTML = "temp.html"
19
  VERBOSE = True
 
20
 
21
  API = '85956985fae328bfe5a759a2984448d2'
22
  def get_url(url):
@@ -30,9 +32,8 @@ def parse_results(query, page_num=None):
30
  if page_num is not None:
31
  url += "&page=" + str(page_num)
32
  if VERBOSE:
33
- print("Action URL: ", get_url(url))
34
- webpage = requests.get(get_url(url), headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
35
- asins = []
36
  soup = BeautifulSoup(webpage.content, 'html.parser')
37
  products = soup.findAll('div', {'data-component-type': 's-search-result'})
38
  if products is None:
@@ -40,11 +41,23 @@ def parse_results(query, page_num=None):
40
  temp.write(str(soup))
41
  temp.close()
42
  raise Exception("Couldn't find search results page, outputted html for inspection")
43
- for product in products:
44
- asins.append(product['data-asin'])
 
 
 
 
 
 
 
 
 
 
 
 
45
  if VERBOSE:
46
- print("Scraped", len(asins), "products")
47
- return asins
48
 
49
  # Scrape information of each product
50
  def parse_item_page(asin):
@@ -52,7 +65,12 @@ def parse_item_page(asin):
52
  product_dict["asin"] = asin
53
 
54
  url = f"https://www.amazon.com/dp/{asin}"
55
- webpage = requests.get(get_url(url), headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
 
 
 
 
 
56
  soup = BeautifulSoup(webpage.content, "html.parser")
57
 
58
  # Title
@@ -103,7 +121,6 @@ def parse_item_page(asin):
103
 
104
  # Main Image
105
  try:
106
- body = soup.find("body")
107
  imgtag = soup.find("img", {"id":"landingImage"})
108
  imageurl = dict(imgtag.attrs)["src"]
109
  except AttributeError:
 
1
  from bs4 import BeautifulSoup
2
  from bs4.element import Comment
3
  from enum import Enum
4
+ import time
5
  from urllib.parse import urlencode
6
 
7
  import json, requests, torch
 
18
  HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
19
  DEBUG_HTML = "temp.html"
20
  VERBOSE = True
21
+ NUM_PROD_LIMIT = 10
22
 
23
  API = '85956985fae328bfe5a759a2984448d2'
24
  def get_url(url):
 
32
  if page_num is not None:
33
  url += "&page=" + str(page_num)
34
  if VERBOSE:
35
+ print("Search Results URL:", url)
36
+ webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
 
37
  soup = BeautifulSoup(webpage.content, 'html.parser')
38
  products = soup.findAll('div', {'data-component-type': 's-search-result'})
39
  if products is None:
 
41
  temp.write(str(soup))
42
  temp.close()
43
  raise Exception("Couldn't find search results page, outputted html for inspection")
44
+ results = []
45
+
46
+ for product in products[:NUM_PROD_LIMIT]:
47
+ asin = product['data-asin']
48
+ title = product.find("h2", {'class': "a-size-mini"})
49
+ price_div = product.find("div", {'class': 's-price-instructions-style'})
50
+ price = price_div.find("span", {'class': 'a-offscreen'})
51
+
52
+ result = {
53
+ 'asin': asin,
54
+ 'Title': title.text.strip(),
55
+ 'Price': price.text.strip().strip("$")
56
+ }
57
+ results.append(result)
58
  if VERBOSE:
59
+ print("Scraped", len(results), "products")
60
+ return results
61
 
62
  # Scrape information of each product
63
  def parse_item_page(asin):
 
65
  product_dict["asin"] = asin
66
 
67
  url = f"https://www.amazon.com/dp/{asin}"
68
+ if VERBOSE:
69
+ print("Item Page URL:", url)
70
+ begin = time.time()
71
+ webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
72
+ end = time.time()
73
+ print("Item page scraping took", end-begin, "seconds")
74
  soup = BeautifulSoup(webpage.content, "html.parser")
75
 
76
  # Title
 
121
 
122
  # Main Image
123
  try:
 
124
  imgtag = soup.find("img", {"id":"landingImage"})
125
  imageurl = dict(imgtag.attrs)["src"]
126
  except AttributeError: