Spaces:
Sleeping
Sleeping
Update scraper.py
Browse files- scraper.py +71 -71
scraper.py
CHANGED
@@ -1,71 +1,71 @@
|
|
1 |
-
from selenium import webdriver
|
2 |
-
from selenium.webdriver.common.by import By
|
3 |
-
import undetected_chromedriver as uc
|
4 |
-
import re
|
5 |
-
import logging
|
6 |
-
import os
|
7 |
-
import time
|
8 |
-
import random
|
9 |
-
from config import SCRAPER_TIMEOUT, CHROME_DRIVER_PATH
|
10 |
-
|
11 |
-
|
12 |
-
def get_text(url, n_words=15):
    """Fetch *url* in headless Chrome and return sentence-like lines of text.

    Loads the page with undetected-chromedriver, reads the visible <body>
    text, and keeps only lines that have at least ``n_words`` words, begin
    with a word character, and end with terminal punctuation.

    Args:
        url: Page to scrape.
        n_words: Minimum word count for a line to be kept.

    Returns:
        tuple[str, str]: ``(scraped_text, error_message)`` — joined sentences
        with an empty error on success, or an empty string plus the first
        line of the error message on failure.
    """
    try:
        driver = None
        logging.warning(f"Initiated Scraping {url}")
        ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        opts = uc.ChromeOptions()
        for flag in (
            "--headless",
            f"user-agent={ua}",
            "--blink-settings=imagesEnabled=false",
            "--disable-images",
            "--disable-blink-features=AutomationControlled",
            "--disable-dev-shm-usage",
        ):
            opts.add_argument(flag)

        # options.add_argument("--disable-extensions")
        # options.add_argument("--autoplay-policy=no-user-gesture-required")
        # options.add_argument("--disable-infobars")
        # options.add_argument("--disable-gpu")

        driver = uc.Chrome(version_main=127, options=opts, driver_executable_path=CHROME_DRIVER_PATH)
        # Short random pause before the first request.
        time.sleep(random.uniform(0.5, 1.5))
        driver.set_page_load_timeout(SCRAPER_TIMEOUT)
        driver.set_script_timeout(SCRAPER_TIMEOUT)
        driver.implicitly_wait(3)
        driver.get(url)

        page_text = driver.find_element(By.TAG_NAME, "body").text
        kept = []
        for raw in page_text.split("\n"):
            candidate = raw.strip()
            long_enough = len(candidate.split()) >= n_words
            # Sentence-shaped: starts with a word char, ends with something
            # other than a word char, ')' or whitespace.
            sentence_shaped = len(re.findall(r"^\w.+[^\w\)\s]$", candidate)) > 0
            if long_enough and sentence_shaped:
                kept.append(candidate)

        driver.close()
        driver.quit()
        logging.warning("Closed Webdriver")
        logging.warning("Successfully scraped text")
        if len(kept) < 3:
            raise Exception("Found nothing to scrape.")
        return "\n".join(kept), ""
    except Exception as e:
        logging.warning(str(e))
        if driver:
            driver.close()
            driver.quit()
            logging.warning("Closed Webdriver")
        return "", str(e).split('\n')[0]
|
58 |
-
|
59 |
-
|
60 |
-
def scrape_text(url, n_words=15, max_retries=2):
    """Call :func:`get_text` up to *max_retries* times until text is scraped.

    Args:
        url: Page to scrape.
        n_words: Minimum word count per kept line, forwarded to ``get_text``.
        max_retries: Maximum number of scraping attempts.

    Returns:
        tuple[str, str]: ``(scraped_text, error_message)`` from the last
        attempt, or ``("", err)`` if an unexpected exception escapes.
    """
    text, error = "", ""
    try:
        attempt = 0
        # Stop as soon as an attempt yields non-empty text.
        while attempt < max_retries and not text:
            text, error = get_text(url=url, n_words=n_words)
            attempt += 1
        return text, error
    except Exception as exc:
        return "", str(exc).split('\n')[0]
|
|
|
1 |
+
from selenium import webdriver
|
2 |
+
from selenium.webdriver.common.by import By
|
3 |
+
import undetected_chromedriver as uc
|
4 |
+
import re
|
5 |
+
import logging
|
6 |
+
import os
|
7 |
+
import time
|
8 |
+
import random
|
9 |
+
from config import SCRAPER_TIMEOUT, CHROME_DRIVER_PATH
|
10 |
+
|
11 |
+
|
12 |
+
def get_text(url, n_words=15):
    """Scrape sentence-like text from *url* using undetected-chromedriver.

    Loads the page, grabs the visible <body> text, and keeps only lines that
    look like real sentences: at least ``n_words`` words, starting with a
    word character and ending with something other than a word character,
    ``)`` or whitespace.

    Args:
        url: Page to scrape.
        n_words: Minimum word count for a line to be kept.

    Returns:
        tuple[str, str]: ``(scraped_text, error_message)`` — on success the
        error message is empty; on failure the text is empty and the error
        holds the first line of the exception message.
    """
    driver = None
    try:
        logging.warning(f"Initiated Scraping {url}")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        options = uc.ChromeOptions()
        # options.add_argument("--headless")
        options.add_argument(f"user-agent={user_agent}")
        options.add_argument("--blink-settings=imagesEnabled=false")
        options.add_argument("--disable-images")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--disable-dev-shm-usage")

        # options.add_argument("--disable-extensions")
        # options.add_argument("--autoplay-policy=no-user-gesture-required")
        # options.add_argument("--disable-infobars")
        # options.add_argument("--disable-gpu")

        driver = uc.Chrome(version_main=127, options=options, driver_executable_path=CHROME_DRIVER_PATH)
        # Short random pause before the first request.
        time.sleep(random.uniform(0.5, 1.5))
        driver.set_page_load_timeout(SCRAPER_TIMEOUT)
        driver.set_script_timeout(SCRAPER_TIMEOUT)
        driver.implicitly_wait(3)
        driver.get(url)

        body_text = driver.find_element(By.TAG_NAME, "body").text
        # Compile once instead of re.findall per line: starts with a word
        # char, ends with terminal punctuation (not word char, ')' or space).
        sentence_re = re.compile(r"^\w.+[^\w\)\s]$")
        sentence_list = []
        for raw in body_text.split("\n"):
            sent = raw.strip()
            if len(sent.split()) >= n_words and sentence_re.match(sent):
                sentence_list.append(sent)

        logging.warning("Successfully scraped text")
        if len(sentence_list) < 3:
            # Too little content usually means a block page or an empty shell.
            raise Exception("Found nothing to scrape.")
        return "\n".join(sentence_list), ""
    except Exception as e:
        logging.warning(str(e))
        return "", str(e).split('\n')[0]
    finally:
        # BUG FIX: the original called driver.close() + driver.quit() on the
        # success path and then AGAIN in the except handler, so the
        # "Found nothing to scrape." path re-closed an already-quit driver —
        # that second close() raises and escapes the handler, losing the
        # intended ("", err_msg) return. A single guarded quit() in finally
        # tears down every window and the driver process exactly once.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                logging.warning("Failed to quit webdriver cleanly")
            else:
                logging.warning("Closed Webdriver")
|
58 |
+
|
59 |
+
|
60 |
+
def scrape_text(url, n_words=15, max_retries=2):
    """Retry wrapper around :func:`get_text`.

    Attempts the scrape up to *max_retries* times, stopping early once any
    attempt returns non-empty text.

    Args:
        url: Page to scrape.
        n_words: Minimum word count per kept line, forwarded to ``get_text``.
        max_retries: Maximum number of scraping attempts.

    Returns:
        tuple[str, str]: ``(scraped_text, error_message)`` from the last
        attempt, or ``("", err)`` if an unexpected exception escapes.
    """
    text, error = "", ""
    try:
        for _ in range(max_retries):
            if text:
                break  # previous attempt succeeded
            text, error = get_text(url=url, n_words=n_words)
        return text, error
    except Exception as exc:
        return "", str(exc).split('\n')[0]
|