John Yang committed on
Commit
9544646
·
1 Parent(s): 7a50274

Finished webshop transfer impl

Browse files
Files changed (3) hide show
  1. .gitignore +3 -1
  2. app.py +51 -15
  3. predict_help.py +114 -3
.gitignore CHANGED
@@ -1 +1,3 @@
1
- *.pyc
 
 
 
1
+ *.pyc
2
+
3
+ .DS_Store
app.py CHANGED
@@ -1,9 +1,16 @@
1
  import gradio as gr
2
- import time, torch
3
  from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
4
 
5
  from webshop_lite import dict_to_fake_html
6
- from predict_help import convert_dict_to_actions, convert_html_to_text, parse_results, parse_item_page, Page
 
 
 
 
 
 
 
7
 
8
  # load IL models
9
  bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
@@ -90,12 +97,15 @@ def predict(obs, info):
90
  else:
91
  return "search[" + bart_predict(process_goal(obs)) + "]"
92
 
93
- def run_episode(goal, verbose=True):
94
  """
95
  Interact with amazon to find a product given input goal.
96
  Input: text goal
97
  Output: a url of found item on amazon.
98
  """
 
 
 
99
  obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
100
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
101
  product_map = {}
@@ -156,11 +166,21 @@ def run_episode(goal, verbose=True):
156
  page_type = Page.SEARCH
157
 
158
  elif action == 'click[buy now]':
159
- asin_url = f"https://www.amazon.com/dp/{asin}"
160
- return_value = "Product URL: " + asin_url
161
- if len(clicked_options) > 0:
162
- options_str = ', '.join(list(clicked_options))
163
- return_value += "\nSelected Options: " + options_str
 
 
 
 
 
 
 
 
 
 
164
  return return_value
165
 
166
  elif prev_page_type == Page.ITEM_PAGE:
@@ -186,7 +206,10 @@ def run_episode(goal, verbose=True):
186
  data = search_results_cache[search_terms]
187
  else:
188
  begin = time.time()
189
- data = parse_results(search_terms, page_num)
 
 
 
190
  end = time.time()
191
  print("Parsing search results took", end-begin, "seconds")
192
 
@@ -200,7 +223,10 @@ def run_episode(goal, verbose=True):
200
  data = product_map[asin]
201
  else:
202
  begin = time.time()
203
- data = parse_item_page(asin)
 
 
 
204
  end = time.time()
205
  print("Parsing item page took", end-begin, "seconds")
206
  product_map[asin] = data
@@ -228,11 +254,21 @@ def run_episode(goal, verbose=True):
228
  print("Extracting available actions took", end-begin, "seconds")
229
 
230
  if i == 99:
231
- asin_url = f"https://www.amazon.com/dp/{asin}"
232
- return_value = "Product URL: " + asin_url
233
- if len(clicked_options) > 0:
234
- options_str = ', '.join(list(clicked_options))
235
- return_value += "\nSelected Options: " + options_str
 
 
 
 
 
 
 
 
 
 
236
  return return_value
237
 
238
  gr.Interface(fn=run_episode,\
 
1
  import gradio as gr
2
+ import json, time, torch
3
  from transformers import BartTokenizer, BartForConditionalGeneration, AutoModel, AutoTokenizer
4
 
5
  from webshop_lite import dict_to_fake_html
6
+ from predict_help import (
7
+ Page, convert_dict_to_actions, convert_html_to_text,
8
+ parse_results_amz, parse_item_page_amz,
9
+ parse_results_ws, parse_item_page_ws,
10
+ WEBSHOP_URL, WEBSHOP_SESSION
11
+ )
12
+
13
+ ENVIRONMENTS = ['amazon', 'webshop', 'ebay']
14
 
15
  # load IL models
16
  bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
 
97
  else:
98
  return "search[" + bart_predict(process_goal(obs)) + "]"
99
 
100
+ def run_episode(goal, verbose=True, env='webshop'):
101
  """
102
  Interact with amazon to find a product given input goal.
103
  Input: text goal
104
  Output: a url of found item on amazon.
105
  """
106
+ if env not in ENVIRONMENTS:
107
+ print(f"[ERROR] Environment {env} not recognized")
108
+
109
  obs = "Amazon Shopping Game\nInstruction:" + goal + "\n[button] search [button]"
110
  info = {'valid': ['search[stuff]'], 'image_feat': torch.zeros(512)}
111
  product_map = {}
 
166
  page_type = Page.SEARCH
167
 
168
  elif action == 'click[buy now]':
169
+ return_value = None
170
+ if env == 'amazon':
171
+ asin_url = f"https://www.amazon.com/dp/{asin}"
172
+ return_value = "Product URL: " + asin_url
173
+ if len(clicked_options) > 0:
174
+ options_str = ', '.join(list(clicked_options))
175
+ return_value += "\nSelected Options: " + options_str
176
+ if env == 'webshop':
177
+ query_str = "+".join(search_terms.split())
178
+ options_str = json.dumps(options)
179
+ asin_url = (
180
+ f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
181
+ f'{asin}/{query_str}/{page_num}/{options_str}'
182
+ )
183
+ return_value = "Product URL: " + asin_url
184
  return return_value
185
 
186
  elif prev_page_type == Page.ITEM_PAGE:
 
206
  data = search_results_cache[search_terms]
207
  else:
208
  begin = time.time()
209
+ if env == 'amazon':
210
+ data = parse_results_amz(search_terms, page_num)
211
+ if env == 'webshop':
212
+ data = parse_results_ws(search_terms, page_num)
213
  end = time.time()
214
  print("Parsing search results took", end-begin, "seconds")
215
 
 
223
  data = product_map[asin]
224
  else:
225
  begin = time.time()
226
+ if env == 'amazon':
227
+ data = parse_item_page_amz(asin)
228
+ if env == 'webshop':
229
+ data = parse_item_page_ws(asin, search_terms, page_num, options)
230
  end = time.time()
231
  print("Parsing item page took", end-begin, "seconds")
232
  product_map[asin] = data
 
254
  print("Extracting available actions took", end-begin, "seconds")
255
 
256
  if i == 99:
257
+ return_value = None
258
+ if env == 'amazon':
259
+ asin_url = f"https://www.amazon.com/dp/{asin}"
260
+ return_value = "Product URL: " + asin_url
261
+ if len(clicked_options) > 0:
262
+ options_str = ', '.join(list(clicked_options))
263
+ return_value += "\nSelected Options: " + options_str
264
+ if env == 'webshop':
265
+ query_str = "+".join(search_terms.split())
266
+ options_str = json.dumps(options)
267
+ asin_url = (
268
+ f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
269
+ f'{asin}/{query_str}/{page_num}/{options_str}'
270
+ )
271
+ return_value = "Product URL: " + asin_url
272
  return return_value
273
 
274
  gr.Interface(fn=run_episode,\
predict_help.py CHANGED
@@ -19,6 +19,8 @@ HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (K
19
  DEBUG_HTML = "temp.html"
20
  VERBOSE = True
21
  NUM_PROD_LIMIT = 10
 
 
22
 
23
  API = '85956985fae328bfe5a759a2984448d2'
24
  def get_url(url):
@@ -26,13 +28,122 @@ def get_url(url):
26
  proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
27
  return proxy_url
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # Query -> Search Result ASINs
30
- def parse_results(query, page_num=None):
31
  url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
32
  if page_num is not None:
33
  url += "&page=" + str(page_num)
34
  if VERBOSE:
35
- print("Search Results URL:", url)
36
  webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
37
  soup = BeautifulSoup(webpage.content, 'html.parser')
38
  products = soup.findAll('div', {'data-component-type': 's-search-result'})
@@ -60,7 +171,7 @@ def parse_results(query, page_num=None):
60
  return results
61
 
62
  # Scrape information of each product
63
- def parse_item_page(asin):
64
  product_dict = {}
65
  product_dict["asin"] = asin
66
 
 
19
  DEBUG_HTML = "temp.html"
20
  VERBOSE = True
21
  NUM_PROD_LIMIT = 10
22
+ WEBSHOP_URL = "http://3.83.245.205:3000"
23
+ WEBSHOP_SESSION = "abc"
24
 
25
  API = '85956985fae328bfe5a759a2984448d2'
26
  def get_url(url):
 
28
  proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
29
  return proxy_url
30
 
31
def parse_results_ws(query, page_num=None):
    """Scrape a WebShop search-results page for `query`.

    Args:
        query: Free-text search string; whitespace is collapsed into '+'
            to build the WebShop URL path segment.
        page_num: 1-indexed results page; defaults to the first page.

    Returns:
        A list of dicts with keys "asin", "Title", and "Price" (a float,
        or a [low, high] list of floats when the page shows a price range).
    """
    query_string = '+'.join(query.split())
    page_num = 1 if page_num is None else page_num
    url = (
        f'{WEBSHOP_URL}/search_results/{WEBSHOP_SESSION}/'
        f'{query_string}/{page_num}'
    )
    if VERBOSE:
        print(f"Search Results URL: {url}")
    webpage = requests.get(
        url,
        headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'},
    )
    soup = BeautifulSoup(webpage.content, 'html.parser')
    products = soup.findAll('div', {'class': 'list-group-item'})

    results = []
    for product in products:
        asin = product.find('a', {'class': 'product-link'})
        title = product.find('h4', {'class': 'product-title'})
        price = product.find('h5', {'class': 'product-price'})

        # BUG FIX: the original tested `"\n" in title` against the bs4 Tag
        # itself (which checks the tag's *child nodes*), not its text, so the
        # take-first-line branch effectively never ran. Compare against the
        # extracted text instead; `strip()` already removes newlines, so the
        # old redundant `.strip().strip("\n")` is collapsed.
        title_text = title.text
        if "\n" in title_text:
            title_text = title_text.split("\n")[0].strip()
        else:
            title_text = title_text.strip()

        if "to" in price.text:
            # Price presented as a "<low> to <high>" range
            prices = price.text.split(" to ")
            price_val = [float(p.strip().strip("\n$")) for p in prices]
        else:
            price_val = float(price.text.strip().strip("\n$"))

        results.append({
            "asin": asin.text,
            "Title": title_text,
            "Price": price_val,
        })

    if VERBOSE:
        print(f"Scraped {len(results)} products")
    return results
71
+
72
def parse_item_page_ws(asin, query, page_num, options):
    """Scrape a WebShop item page plus its Description/Features sub-pages.

    Args:
        asin: Product identifier used in the WebShop URL path.
        query: Search query that led to this item (re-encoded with '+').
        page_num: Search-results page number the item was found on.
        options: Dict of currently selected options; serialized as JSON
            into the URL so WebShop renders the right variant.

    Returns:
        Dict with keys "asin", "Title", "Price", "Rating", "MainImage",
        "options" (option-group name -> list of option labels),
        "option_to_image" (option label -> image URL), "Description",
        and "BulletPoints".
    """
    product_dict = {"asin": asin}

    query_string = '+'.join(query.split())
    options_string = json.dumps(options)
    # Build the request headers once; reused for all three requests below.
    headers = {'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}

    url = (
        f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/'
        f'{asin}/{query_string}/{page_num}/{options_string}'
    )
    if VERBOSE:
        print("Item Page URL: ", url)
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(webpage.content, 'html.parser')

    # Title, Price, Rating, and MainImage
    product_dict["Title"] = soup.find('h2').text

    for h4_header in soup.findAll("h4"):
        text = h4_header.text
        if "Price" in text:
            product_dict["Price"] = text.split(":")[1].strip().strip("$")
        elif "Rating" in text:
            product_dict["Rating"] = text.split(":")[1].strip()

    product_dict["MainImage"] = soup.find('img')['src']

    # Options. Locals renamed from the original: `option_map` no longer
    # shadows the `options` parameter, and `inp` no longer clobbers the
    # `input` builtin.
    option_map, options_to_image = {}, {}
    for block in soup.findAll("div", {'class': 'radio-toolbar'}):
        name = block.find("input")["name"]
        labels = block.findAll("label")
        inputs = block.findAll("input")
        opt_list = []
        for label, inp in zip(labels, inputs):
            opt = label.text
            # The onclick handler embeds the option image path after "href="
            opt_img_path = inp["onclick"].split("href=")[1].strip('\';')
            opt_list.append(opt)
            options_to_image[opt] = f'{WEBSHOP_URL}{opt_img_path}'
        option_map[name] = opt_list
    product_dict["options"] = option_map
    product_dict["option_to_image"] = options_to_image

    # Description sub-page
    url = (
        f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
        f'{asin}/{query_string}/{page_num}/Description/{options_string}'
    )
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(webpage.content, 'html.parser')
    product_dict["Description"] = soup.find(name="p", attrs={'class': 'product-info'}).text.strip()

    # Features sub-page
    url = (
        f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
        f'{asin}/{query_string}/{page_num}/Features/{options_string}'
    )
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(webpage.content, 'html.parser')
    bullets = soup.find(name="ul").findAll(name="li")
    product_dict["BulletPoints"] = '\n'.join([b.text.strip() for b in bullets])

    return product_dict
139
+
140
  # Query -> Search Result ASINs
141
+ def parse_results_amz(query, page_num=None):
142
  url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
143
  if page_num is not None:
144
  url += "&page=" + str(page_num)
145
  if VERBOSE:
146
+ print(f"Search Results URL: ${url}")
147
  webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
148
  soup = BeautifulSoup(webpage.content, 'html.parser')
149
  products = soup.findAll('div', {'data-component-type': 's-search-result'})
 
171
  return results
172
 
173
  # Scrape information of each product
174
+ def parse_item_page_amz(asin):
175
  product_dict = {}
176
  product_dict["asin"] = asin
177