ksvmuralidhar commited on
Commit
58c1821
·
verified ·
1 Parent(s): 83d8595

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +71 -71
scraper.py CHANGED
@@ -1,71 +1,71 @@
1
- from selenium import webdriver
2
- from selenium.webdriver.common.by import By
3
- import undetected_chromedriver as uc
4
- import re
5
- import logging
6
- import os
7
- import time
8
- import random
9
- from config import SCRAPER_TIMEOUT, CHROME_DRIVER_PATH
10
-
11
-
12
def get_text(url, n_words=15):
    """Scrape the visible body text of `url` and keep sentence-like lines.

    A line is kept when it has at least `n_words` whitespace-separated words,
    starts with a word character, and ends with a non-word, non-')' character
    (a cheap heuristic for prose sentences ending in punctuation).

    Args:
        url: Page to scrape.
        n_words: Minimum word count for a line to be kept.

    Returns:
        tuple[str, str]: ``(text, error)``. On success: the kept lines joined
        with newlines and ``""``. On any failure (including fewer than 3 kept
        lines): ``""`` and the first line of the exception message.
    """
    driver = None
    try:
        logging.warning(f"Initiated Scraping {url}")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        options = uc.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument(f"user-agent={user_agent}")
        options.add_argument("--blink-settings=imagesEnabled=false")
        options.add_argument("--disable-images")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--disable-dev-shm-usage")

        driver = uc.Chrome(version_main=127, options=options, driver_executable_path=CHROME_DRIVER_PATH)
        # Small random pause before driving the browser (bot-detection jitter).
        time.sleep(random.uniform(0.5, 1.5))
        driver.set_page_load_timeout(SCRAPER_TIMEOUT)
        driver.set_script_timeout(SCRAPER_TIMEOUT)
        driver.implicitly_wait(3)
        driver.get(url)
        body_text = driver.find_element(By.TAG_NAME, "body").text
        sentence_list = [
            line for line in (raw.strip() for raw in body_text.split("\n"))
            if len(line.split()) >= n_words
            and re.search(r"^\w.+[^\w\)\s]$", line)
        ]
        logging.warning("Successfully scraped text")
        if len(sentence_list) < 3:
            # Too little prose to be useful; surface as an error return.
            raise Exception("Found nothing to scrape.")
        return "\n".join(sentence_list), ""
    except Exception as e:
        logging.warning(str(e))
        return "", str(e).split('\n')[0]
    finally:
        # Release the browser exactly once, on every path. The original code
        # closed the driver in both the success path and the except handler,
        # so the "<3 sentences" path closed an already-quit session, and a
        # failing close() skipped quit() entirely (process leak).
        if driver:
            try:
                driver.close()
            except Exception:
                pass  # session may already be gone; quit() below still runs
            driver.quit()
            logging.warning("Closed Webdriver")
58
-
59
-
60
def scrape_text(url, n_words=15, max_retries=2):
    """Retry wrapper: call get_text() up to `max_retries` times.

    Stops as soon as a non-empty text comes back and returns the
    ``(text, error)`` pair from the last attempt.
    """
    text, error = "", ""
    try:
        for _attempt in range(max_retries):
            if text != "":
                break
            text, error = get_text(url=url, n_words=n_words)
        return text, error
    except Exception as e:
        return "", str(e).split('\n')[0]
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver.common.by import By
3
+ import undetected_chromedriver as uc
4
+ import re
5
+ import logging
6
+ import os
7
+ import time
8
+ import random
9
+ from config import SCRAPER_TIMEOUT, CHROME_DRIVER_PATH
10
+
11
+
12
def get_text(url, n_words=15):
    """Scrape the visible body text of `url` and keep sentence-like lines.

    A line is kept when it has at least `n_words` whitespace-separated words,
    starts with a word character, and ends with a non-word, non-')' character
    (a cheap heuristic for prose sentences ending in punctuation).

    Args:
        url: Page to scrape.
        n_words: Minimum word count for a line to be kept.

    Returns:
        tuple[str, str]: ``(text, error)``. On success: the kept lines joined
        with newlines and ``""``. On any failure (including fewer than 3 kept
        lines): ``""`` and the first line of the exception message.
    """
    driver = None
    try:
        logging.warning(f"Initiated Scraping {url}")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        options = uc.ChromeOptions()
        # Headless deliberately disabled in this revision.
        # options.add_argument("--headless")
        options.add_argument(f"user-agent={user_agent}")
        options.add_argument("--blink-settings=imagesEnabled=false")
        options.add_argument("--disable-images")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--disable-dev-shm-usage")

        driver = uc.Chrome(version_main=127, options=options, driver_executable_path=CHROME_DRIVER_PATH)
        # Small random pause before driving the browser (bot-detection jitter).
        time.sleep(random.uniform(0.5, 1.5))
        driver.set_page_load_timeout(SCRAPER_TIMEOUT)
        driver.set_script_timeout(SCRAPER_TIMEOUT)
        driver.implicitly_wait(3)
        driver.get(url)
        body_text = driver.find_element(By.TAG_NAME, "body").text
        sentence_list = [
            line for line in (raw.strip() for raw in body_text.split("\n"))
            if len(line.split()) >= n_words
            and re.search(r"^\w.+[^\w\)\s]$", line)
        ]
        logging.warning("Successfully scraped text")
        if len(sentence_list) < 3:
            # Too little prose to be useful; surface as an error return.
            raise Exception("Found nothing to scrape.")
        return "\n".join(sentence_list), ""
    except Exception as e:
        logging.warning(str(e))
        return "", str(e).split('\n')[0]
    finally:
        # Release the browser exactly once, on every path. The original code
        # closed the driver in both the success path and the except handler,
        # so the "<3 sentences" path closed an already-quit session, and a
        # failing close() skipped quit() entirely (process leak).
        if driver:
            try:
                driver.close()
            except Exception:
                pass  # session may already be gone; quit() below still runs
            driver.quit()
            logging.warning("Closed Webdriver")
58
+
59
+
60
def scrape_text(url, n_words=15, max_retries=2):
    """Retry wrapper: call get_text() up to `max_retries` times.

    Stops as soon as a non-empty text comes back and returns the
    ``(text, error)`` pair from the last attempt.
    """
    text, error = "", ""
    try:
        for _attempt in range(max_retries):
            if text != "":
                break
            text, error = get_text(url=url, n_words=n_words)
        return text, error
    except Exception as e:
        return "", str(e).split('\n')[0]