John Yang committed
Commit 7a50274 · 1 parent: 4b9c9b6

Revert to working version

Files changed (2):
  1. .gitignore +1 -7
  2. predict_help.py +11 -11
.gitignore CHANGED

@@ -1,7 +1 @@
-*.csv
-*.pyc
-*.txt
-
-.DS_Store
-
-run.py
+*.pyc
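After the revert, only compiled Python files are ignored; CSVs, text files, `.DS_Store`, and `run.py` become visible to `git status` again. A minimal sketch of what each rule set matches, assuming plain glob patterns (no `/` rules); the filenames are hypothetical:

```python
# Hypothetical check of the old vs. new ignore rules using fnmatch,
# which uses the same glob syntax as simple .gitignore patterns.
from fnmatch import fnmatch

OLD_PATTERNS = ["*.csv", "*.pyc", "*.txt", ".DS_Store", "run.py"]
NEW_PATTERNS = ["*.pyc"]

for name in ["results.csv", "predict_help.pyc", "notes.txt", "run.py"]:  # hypothetical files
    before = any(fnmatch(name, p) for p in OLD_PATTERNS)
    after = any(fnmatch(name, p) for p in NEW_PATTERNS)
    print(f"{name}: ignored before={before}, after={after}")
```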
predict_help.py CHANGED
@@ -17,6 +17,7 @@ class Page(Enum):
 
 HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
 DEBUG_HTML = "temp.html"
+VERBOSE = True
 NUM_PROD_LIMIT = 10
 
 API = '85956985fae328bfe5a759a2984448d2'
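This hunk adds a module-level `VERBOSE` constant that the later hunks read in place of the per-function `verbose=True` parameters. One practical consequence, sketched below under the assumption that the module is imported as `predict_help`: callers now toggle logging for the whole module at once rather than per call.

```python
# Hedged sketch: with a module-level flag, logging is toggled once for
# every function in the module instead of at each call site.
import predict_help

predict_help.VERBOSE = False  # silence all URL/timing prints module-wide
results = predict_help.parse_results("usb c cable")  # hypothetical query

predict_help.VERBOSE = True   # re-enable logging
item = predict_help.parse_item_page(results[0]["asin"])
```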
@@ -26,11 +27,11 @@ def get_url(url):
     return proxy_url
 
 # Query -> Search Result ASINs
-def parse_results(query, page_num=None, verbose=True):
+def parse_results(query, page_num=None):
     url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
     if page_num is not None:
         url += "&page=" + str(page_num)
-    if verbose:
+    if VERBOSE:
         print("Search Results URL:", url)
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
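`parse_results` builds the search URL with `query.replace(" ", "+")`, which only handles spaces. As a hedged aside (not something this commit changes), the standard library's `urllib.parse.quote_plus` would also escape reserved characters:

```python
# Hedged alternative sketch: quote_plus escapes reserved characters that
# a bare space-to-plus replacement passes through unchanged.
from urllib.parse import quote_plus

query = "red shoes & socks"  # hypothetical query containing '&'
print("https://www.amazon.com/s?k=" + query.replace(" ", "+"))  # ...k=red+shoes+&+socks
print("https://www.amazon.com/s?k=" + quote_plus(query))        # ...k=red+shoes+%26+socks
```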
@@ -51,26 +52,25 @@ def parse_results(query, page_num=None, verbose=True):
         result = {
             'asin': asin,
             'Title': title.text.strip(),
-            'Price': price.text.strip().strip("$") if isinstance(price, str) else price
+            'Price': price.text.strip().strip("$")
         }
         results.append(result)
-    if verbose:
+    if VERBOSE:
         print("Scraped", len(results), "products")
     return results
 
 # Scrape information of each product
-def parse_item_page(asin, verbose=True):
+def parse_item_page(asin):
     product_dict = {}
     product_dict["asin"] = asin
 
     url = f"https://www.amazon.com/dp/{asin}"
-    if verbose:
+    if VERBOSE:
         print("Item Page URL:", url)
     begin = time.time()
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     end = time.time()
-    if verbose:
-        print("Item page scraping took", end-begin, "seconds")
+    print("Item page scraping took", end-begin, "seconds")
     soup = BeautifulSoup(webpage.content, "html.parser")
 
     # Title
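The reverted `Price` expression drops the `isinstance(price, str)` guard, so `price` is assumed to be a BeautifulSoup tag with a `.text` attribute; note also that the timing print in `parse_item_page` becomes unconditional in this version. The chained strips are worth reading carefully: `str.strip("$")` removes every leading and trailing `$`, not just one. A small self-contained illustration with a made-up price string:

```python
# Illustration only: strip() trims whitespace, then strip("$") trims any
# leading/trailing dollar signs from the already-whitespace-trimmed text.
raw = "  $1,299.00  "            # hypothetical scraped price text
print(raw.strip().strip("$"))    # -> 1,299.00
print("$$19.99$".strip("$"))     # -> 19.99 (all edge '$' removed)
```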
@@ -195,9 +195,9 @@ def convert_dict_to_actions(page_type, products=None, asin=None, page_num=None,
     if page_type == Page.RESULTS:
         info["valid"] = ['click[back to search]']
         if products is None or page_num is None or num_prods is None:
-            print("Page Num:", page_num)
-            print("# of Products:", num_prods)
-            print("Products: ", products)
+            print(page_num)
+            print(num_prods)
+            print(products)
             raise Exception('Provide `products`, `num_prods`, `page_num` to get `results` valid actions')
         # Decide whether to add `next >` as clickable based on # of search results
         if num_prods > 10:
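The last hunk only reverts the debug output to bare `print(...)` calls before the exception; the surrounding logic is unchanged: on a results page, `next >` becomes clickable only when more than 10 products were scraped. A minimal sketch of that decision, using the hard-coded page size from the hunk (the function name and exact action strings outside the shown context are assumptions):

```python
# Hedged sketch of the results-page action logic shown in this hunk:
# `next >` is offered only when more than one page of products exists.
def results_valid_actions(products, page_num, num_prods):
    valid = ['click[back to search]']
    if products is None or page_num is None or num_prods is None:
        raise Exception('Provide `products`, `num_prods`, `page_num` to get `results` valid actions')
    if num_prods > 10:                # hard-coded page size, as in the hunk
        valid.append('click[next >]')  # assumed action string
    return valid

print(results_valid_actions(products=[], page_num=1, num_prods=23))
# -> ['click[back to search]', 'click[next >]']
```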