Spaces: Runtime error

John Yang committed · Commit 7a50274 · Parent(s): 4b9c9b6

Revert to working version

Browse files:
- .gitignore +1 -7
- predict_help.py +11 -11
.gitignore CHANGED
```diff
@@ -1,7 +1 @@
-*.
-*.pyc
-*.txt
-
-.DS_Store
-
-run.py
+*.pyc
```
predict_help.py CHANGED
```diff
@@ -17,6 +17,7 @@ class Page(Enum):
 
 HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'
 DEBUG_HTML = "temp.html"
+VERBOSE = True
 NUM_PROD_LIMIT = 10
 
 API = '85956985fae328bfe5a759a2984448d2'
```
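This hunk replaces the per-function `verbose` keyword arguments (removed in the hunks below) with a single module-level `VERBOSE` flag. A minimal sketch of how a caller might toggle it, assuming the file is importable as a module named `predict_help` (attribute assignment is the standard way to override a module constant):

```python
import predict_help

# Silence the scraper's progress prints for a batch run;
# every `if VERBOSE:` guard in the module reads this flag at call time.
predict_help.VERBOSE = False

results = predict_help.parse_results("wireless earbuds")
```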
```diff
@@ -26,11 +27,11 @@ def get_url(url):
     return proxy_url
 
 # Query -> Search Result ASINs
-def parse_results(query, page_num=None, verbose=True):
+def parse_results(query, page_num=None):
     url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
     if page_num is not None:
         url += "&page=" + str(page_num)
-    if verbose:
+    if VERBOSE:
         print("Search Results URL:", url)
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
```
```diff
@@ -51,26 +52,25 @@ def parse_results(query, page_num=None, verbose=True):
         result = {
             'asin': asin,
             'Title': title.text.strip(),
-            'Price': price.text.strip().strip("$")
+            'Price': price.text.strip().strip("$")
         }
         results.append(result)
-    if verbose:
+    if VERBOSE:
         print("Scraped", len(results), "products")
     return results
 
 # Scrape information of each product
-def parse_item_page(asin, verbose=True):
+def parse_item_page(asin):
     product_dict = {}
     product_dict["asin"] = asin
 
     url = f"https://www.amazon.com/dp/{asin}"
-    if verbose:
+    if VERBOSE:
         print("Item Page URL:", url)
     begin = time.time()
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     end = time.time()
-    if verbose:
-        print("Item page scraping took", end-begin, "seconds")
+    print("Item page scraping took", end-begin, "seconds")
     soup = BeautifulSoup(webpage.content, "html.parser")
 
     # Title
```
```diff
@@ -195,9 +195,9 @@ def convert_dict_to_actions(page_type, products=None, asin=None, page_num=None,
     if page_type == Page.RESULTS:
         info["valid"] = ['click[back to search]']
         if products is None or page_num is None or num_prods is None:
-            print(
-            print(
-            print(
+            print(page_num)
+            print(num_prods)
+            print(products)
             raise Exception('Provide `products`, `num_prods`, `page_num` to get `results` valid actions')
         # Decide whether to add `next >` as clickable based on # of search results
         if num_prods > 10:
```
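The `num_prods > 10` check gates whether a next-page action is offered on a results page. A minimal sketch of that branch under stated assumptions: only `click[back to search]` appears verbatim in this hunk, so the `click[next >]` action string is inferred from the comment, and `results_page_actions` is a hypothetical helper, not the module's API:

```python
def results_page_actions(num_prods):
    # Hypothetical helper mirroring the Page.RESULTS branch above.
    valid = ['click[back to search]']
    # Per the comment in the diff: add `next >` as clickable only when
    # more search results exist than one page displays.
    if num_prods > 10:
        valid.append('click[next >]')
    return valid
```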