John Yang committed on
Commit
4caa7f7
·
1 Parent(s): 14fbfba

Refactored caching of scrape requests

Browse files
Files changed (2) hide show
  1. app.py +54 -31
  2. predict_help.py +26 -9
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- import torch
3
  from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
4
 
5
  from webshop_lite import dict_to_fake_html
@@ -90,8 +90,6 @@ def predict(obs, info):
90
  else:
91
  return "search[" + bart_predict(process_goal(obs)) + "]"
92
 
93
- NUM_PROD_LIMIT = 10
94
-
95
  def run_episode(goal, verbose=True):
96
  """
97
  Interact with amazon to find a product given input goal.
@@ -101,9 +99,10 @@ def run_episode(goal, verbose=True):
101
  obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
102
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
103
  product_map = {}
104
- page_to_product_map_memo = {}
 
105
  visited_asins, clicked_options = set(), set()
106
- arg, sub_page_type, page_type, page_num = None, None, None, None
107
  search_terms, prod_title, asin, num_prods, = None, None, None, None
108
  options = {}
109
 
@@ -125,9 +124,9 @@ def run_episode(goal, verbose=True):
125
  if action.startswith('click[item -'):
126
  prod_title = action_content[len("item -"):].strip()
127
  found = False
128
- for value in product_map.values():
129
- if prod_title == value["Title"]:
130
- asin = value["asin"]
131
  page_type = Page.ITEM_PAGE
132
  visited_asins.add(asin)
133
  found = True
@@ -157,7 +156,12 @@ def run_episode(goal, verbose=True):
157
  page_type = Page.SEARCH
158
 
159
  elif action == 'click[buy now]':
160
- return asin
 
 
 
 
 
161
 
162
  elif prev_page_type == Page.ITEM_PAGE:
163
  found = False
@@ -178,24 +182,28 @@ def run_episode(goal, verbose=True):
178
 
179
  # URL -> Real HTML -> Dict of Info
180
  if page_type == Page.RESULTS:
181
- if search_terms not in page_to_product_map_memo or page_num not in page_to_product_map_memo[search_terms]:
182
- product_map = {}
183
- asins = parse_results(search_terms, page_num)
184
- num_prods = len(asins)
185
- for asin_ in asins[:NUM_PROD_LIMIT]:
186
- product_map[asin_] = parse_item_page(asin_)
187
- if search_terms not in page_to_product_map_memo:
188
- page_to_product_map_memo[search_terms] = {}
189
- page_to_product_map_memo[search_terms][page_num] = product_map
190
  else:
191
- if verbose:
192
- print("Loaded memoized search results (" + str(page_num) + ")...")
193
- product_map = page_to_product_map_memo[search_terms][page_num]
194
- if verbose:
195
- print("Product Map Length:", len(product_map))
196
- data = list(product_map.values())
197
- elif page_type == Page.ITEM_PAGE or page_type == Page.SUB_PAGE:
198
- data = product_map
 
 
 
 
 
 
 
 
 
 
 
199
  elif page_type == Page.SEARCH:
200
  if verbose:
201
  print("Executing search")
@@ -203,24 +211,39 @@ def run_episode(goal, verbose=True):
203
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
204
  continue
205
  else:
206
- raise Exception("Page of type `", page_type,value, "` not found")
207
 
208
  # Dict of Info -> Fake HTML -> Text Observation
 
209
  html_str = dict_to_fake_html(data, page_type, asin, sub_page_type, options, product_map, goal)
210
  obs = convert_html_to_text(html_str, simple=False, clicked_options=clicked_options, visited_asins=visited_asins)
211
-
 
 
212
  # Dict of Info -> Valid Action State (Info)
213
- info = convert_dict_to_actions(page_type, data, asin, page_num, num_prods)
 
 
 
 
214
 
215
  if i == 99:
216
- return asin
 
 
 
 
 
217
 
218
  gr.Interface(fn=run_episode,\
219
  inputs=gr.inputs.Textbox(lines=7, label="Input Text"),\
220
  outputs="text",\
221
  examples=[
222
  "Please select a 1 pound, certified organic sea salt shaker in the flavor triple blend flakes, and price lower than 40.00 dollars",
223
- "I want to find a gold floor lamp with a glass shade and a nickel finish that i can use for my living room, and price lower than 270.00 dollars"
 
 
 
224
  ],\
225
  title="WebShop",\
226
  article="<p style='padding-top:15px;text-align:center;'>To learn more about this project, check out the <a href='https://webshop-pnlp.github.io/' target='_blank'>project page</a>!</p>",\
 
1
  import gradio as gr
2
+ import time, torch
3
  from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
4
 
5
  from webshop_lite import dict_to_fake_html
 
90
  else:
91
  return "search[" + bart_predict(process_goal(obs)) + "]"
92
 
 
 
93
  def run_episode(goal, verbose=True):
94
  """
95
  Interact with amazon to find a product given input goal.
 
99
  obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
100
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
101
  product_map = {}
102
+ title_to_asin_map = {}
103
+ search_results_cache = {}
104
  visited_asins, clicked_options = set(), set()
105
+ sub_page_type, page_type, page_num = None, None, None
106
  search_terms, prod_title, asin, num_prods, = None, None, None, None
107
  options = {}
108
 
 
124
  if action.startswith('click[item -'):
125
  prod_title = action_content[len("item -"):].strip()
126
  found = False
127
+ for key in title_to_asin_map:
128
+ if prod_title == key:
129
+ asin = title_to_asin_map[key]
130
  page_type = Page.ITEM_PAGE
131
  visited_asins.add(asin)
132
  found = True
 
156
  page_type = Page.SEARCH
157
 
158
  elif action == 'click[buy now]':
159
+ asin_url = f"https://www.amazon.com/dp/{asin}"
160
+ return_value = "Product URL: " + asin_url
161
+ if len(clicked_options) > 0:
162
+ options_str = ', '.join(list(clicked_options))
163
+ return_value += "\nSelected Options: " + options_str
164
+ return return_value
165
 
166
  elif prev_page_type == Page.ITEM_PAGE:
167
  found = False
 
182
 
183
  # URL -> Real HTML -> Dict of Info
184
  if page_type == Page.RESULTS:
185
+ if search_terms in search_results_cache:
186
+ data = search_results_cache[search_terms]
 
 
 
 
 
 
 
187
  else:
188
+ begin = time.time()
189
+ data = parse_results(search_terms, page_num)
190
+ end = time.time()
191
+ print("Parsing search results took", end-begin, "seconds")
192
+
193
+ search_results_cache[search_terms] = data
194
+ num_prods = len(data)
195
+ for d in data:
196
+ title_to_asin_map[d['Title']] = d['asin']
197
+ elif page_type == Page.ITEM_PAGE or page_type == Page.SUB_PAGE:
198
+ if asin in product_map:
199
+ print("Loading cached item page for", asin)
200
+ data = product_map[asin]
201
+ else:
202
+ begin = time.time()
203
+ data = parse_item_page(asin)
204
+ end = time.time()
205
+ print("Parsing item page took", end-begin, "seconds")
206
+ product_map[asin] = data
207
  elif page_type == Page.SEARCH:
208
  if verbose:
209
  print("Executing search")
 
211
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
212
  continue
213
  else:
214
+ raise Exception("Page of type `", page_type, "` not found")
215
 
216
  # Dict of Info -> Fake HTML -> Text Observation
217
+ begin = time.time()
218
  html_str = dict_to_fake_html(data, page_type, asin, sub_page_type, options, product_map, goal)
219
  obs = convert_html_to_text(html_str, simple=False, clicked_options=clicked_options, visited_asins=visited_asins)
220
+ end = time.time()
221
+ print("[Page Info -> WebShop HTML -> Observation] took", end-begin, "seconds")
222
+
223
  # Dict of Info -> Valid Action State (Info)
224
+ begin = time.time()
225
+ prod_arg = product_map if page_type == Page.ITEM_PAGE else data
226
+ info = convert_dict_to_actions(page_type, prod_arg, asin, page_num, num_prods)
227
+ end = time.time()
228
+ print("Extracting available actions took", end-begin, "seconds")
229
 
230
  if i == 99:
231
+ asin_url = f"https://www.amazon.com/dp/{asin}"
232
+ return_value = "Product URL: " + asin_url
233
+ if len(clicked_options) > 0:
234
+ options_str = ', '.join(list(clicked_options))
235
+ return_value += "\nSelected Options: " + options_str
236
+ return return_value
237
 
238
  gr.Interface(fn=run_episode,\
239
  inputs=gr.inputs.Textbox(lines=7, label="Input Text"),\
240
  outputs="text",\
241
  examples=[
242
  "Please select a 1 pound, certified organic sea salt shaker in the flavor triple blend flakes, and price lower than 40.00 dollars",
243
+ "I want to find a gold floor lamp with a glass shade and a nickel finish that i can use for my living room, and price lower than 270.00 dollars",
244
+ "I'm trying to find white bluetooth speakers that are not only water resistant but also come with stereo sound",
245
+ "I'm looking for a kids toothbrush for ages 6 to 12 that will help with teeth whitening and is easy to use",
246
+ "I need some cute heart-shaped glittery cupcake picks as a gift to bring to a baby shower",
247
  ],\
248
  title="WebShop",\
249
  article="<p style='padding-top:15px;text-align:center;'>To learn more about this project, check out the <a href='https://webshop-pnlp.github.io/' target='_blank'>project page</a>!</p>",\
predict_help.py CHANGED
@@ -1,6 +1,7 @@
1
  from bs4 import BeautifulSoup
2
  from bs4.element import Comment
3
  from enum import Enum
 
4
  from urllib.parse import urlencode
5
 
6
  import json, requests, torch
@@ -17,6 +18,7 @@ class Page(Enum):
17
  HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
18
  DEBUG_HTML = "temp.html"
19
  VERBOSE = True
 
20
 
21
  API = '85956985fae328bfe5a759a2984448d2'
22
  def get_url(url):
@@ -30,9 +32,8 @@ def parse_results(query, page_num=None):
30
  if page_num is not None:
31
  url += "&page=" + str(page_num)
32
  if VERBOSE:
33
- print("Action URL: ", get_url(url))
34
- webpage = requests.get(get_url(url), headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
35
- asins = []
36
  soup = BeautifulSoup(webpage.content, 'html.parser')
37
  products = soup.findAll('div', {'data-component-type': 's-search-result'})
38
  if products is None:
@@ -40,11 +41,23 @@ def parse_results(query, page_num=None):
40
  temp.write(str(soup))
41
  temp.close()
42
  raise Exception("Couldn't find search results page, outputted html for inspection")
43
- for product in products:
44
- asins.append(product['data-asin'])
 
 
 
 
 
 
 
 
 
 
 
 
45
  if VERBOSE:
46
- print("Scraped", len(asins), "products")
47
- return asins
48
 
49
  # Scrape information of each product
50
  def parse_item_page(asin):
@@ -52,7 +65,12 @@ def parse_item_page(asin):
52
  product_dict["asin"] = asin
53
 
54
  url = f"https://www.amazon.com/dp/{asin}"
55
- webpage = requests.get(get_url(url), headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
 
 
 
 
 
56
  soup = BeautifulSoup(webpage.content, "html.parser")
57
 
58
  # Title
@@ -103,7 +121,6 @@ def parse_item_page(asin):
103
 
104
  # Main Image
105
  try:
106
- body = soup.find("body")
107
  imgtag = soup.find("img", {"id":"landingImage"})
108
  imageurl = dict(imgtag.attrs)["src"]
109
  except AttributeError:
 
1
  from bs4 import BeautifulSoup
2
  from bs4.element import Comment
3
  from enum import Enum
4
+ import time
5
  from urllib.parse import urlencode
6
 
7
  import json, requests, torch
 
18
  HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
19
  DEBUG_HTML = "temp.html"
20
  VERBOSE = True
21
+ NUM_PROD_LIMIT = 10
22
 
23
  API = '85956985fae328bfe5a759a2984448d2'
24
  def get_url(url):
 
32
  if page_num is not None:
33
  url += "&page=" + str(page_num)
34
  if VERBOSE:
35
+ print("Search Results URL:", url)
36
+ webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
 
37
  soup = BeautifulSoup(webpage.content, 'html.parser')
38
  products = soup.findAll('div', {'data-component-type': 's-search-result'})
39
  if products is None:
 
41
  temp.write(str(soup))
42
  temp.close()
43
  raise Exception("Couldn't find search results page, outputted html for inspection")
44
+ results = []
45
+
46
+ for product in products[:NUM_PROD_LIMIT]:
47
+ asin = product['data-asin']
48
+ title = product.find("h2", {'class': "a-size-mini"})
49
+ price_div = product.find("div", {'class': 's-price-instructions-style'})
50
+ price = price_div.find("span", {'class': 'a-offscreen'})
51
+
52
+ result = {
53
+ 'asin': asin,
54
+ 'Title': title.text.strip(),
55
+ 'Price': price.text.strip().strip("$")
56
+ }
57
+ results.append(result)
58
  if VERBOSE:
59
+ print("Scraped", len(results), "products")
60
+ return results
61
 
62
  # Scrape information of each product
63
  def parse_item_page(asin):
 
65
  product_dict["asin"] = asin
66
 
67
  url = f"https://www.amazon.com/dp/{asin}"
68
+ if VERBOSE:
69
+ print("Item Page URL:", url)
70
+ begin = time.time()
71
+ webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
72
+ end = time.time()
73
+ print("Item page scraping took", end-begin, "seconds")
74
  soup = BeautifulSoup(webpage.content, "html.parser")
75
 
76
  # Title
 
121
 
122
  # Main Image
123
  try:
 
124
  imgtag = soup.find("img", {"id":"landingImage"})
125
  imageurl = dict(imgtag.attrs)["src"]
126
  except AttributeError: