import os
import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import getFiles.getGithub as getGithub
import getFiles.getKaggle as getKaggle

# Maximum number of direct-download links clicked per search. Results that
# resolve to Kaggle, GitHub or Hugging Face are collected instead of clicked
# and do not count against this limit.
times = 15

_SEARCH_URL = "https://datasetsearch.research.google.com/"
_DATASET_PATH_PATTERN = r'datasets\/(.*)'


def _build_driver(download_folder):
    """Return a headless Chrome driver that saves downloads to *download_folder*."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run without a visible browser window
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_folder,  # Custom download folder
        "download.prompt_for_download": False,  # Don't ask for confirmation
        "download.directory_upgrade": True,  # Allow downloading into the custom folder
        "safebrowsing.enabled": True,  # Avoid safe-browsing warnings during download
    })
    return webdriver.Chrome(options=chrome_options)


def _scan_results(driver, query, kag, git, hug):
    """Submit *query* on the already-loaded search page and walk the results.

    Appends to *kag*, *git* and *hug* in place; any other result's first
    download entry is clicked, up to ``times`` clicks.
    """
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
    )
    search = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "input[aria-label='Dataset Search']")
        )
    )
    search.send_keys(query)
    search.send_keys(Keys.RETURN)

    # Wait for the result page scaffolding before collecting result items.
    for locator in (
        (By.CSS_SELECTOR, "div[jscontroller]"),
        (By.TAG_NAME, "c-wiz"),
        (By.CSS_SELECTOR, "ol.VAt4"),
    ):
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))

    count = 0
    for link in driver.find_elements(By.CSS_SELECTOR, "li.UnWQ5"):
        if count == times:
            break
        link.click()
        time.sleep(2)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.eEUDce"))
        )

        downloads = driver.find_elements(By.CSS_SELECTOR, "li.dy4aPc")
        if not downloads:
            # No download entry for this result: skip it rather than abort the scan.
            continue
        dataset = downloads[0]

        anchors = dataset.find_elements(By.TAG_NAME, "a")
        url = anchors[0].get_attribute("href") if anchors else None
        if url is None:
            continue

        if "kaggle" in url:
            match = re.search(_DATASET_PATH_PATTERN, url)
            print(match)
            if match is not None:
                string = match.group(1)
                print("This is " + string)
                kag.append(string)
            continue
        if "github" in url:
            print("This is " + url)
            git.append(url)
            continue
        if "huggingface" in url and "turkish" not in url and "spanish" not in url:
            match = re.search(_DATASET_PATH_PATTERN, url)
            if match is not None:
                string = match.group(1)
                print("Again " + string)
                hug.append(string)
            continue

        # Any other host: click the entry itself (presumably starting a
        # download into the configured folder) and count it against the limit.
        dataset.click()
        count += 1
        time.sleep(5)

    # Final pause before the caller quits the browser (presumably to let
    # pending downloads finish).
    time.sleep(5)


def googleDatasets(query):
    """Search Google Dataset Search for *query* and collect dataset sources.

    For every search result the first download entry is inspected:

    * Kaggle URLs contribute the text after ``datasets/`` to the first list.
    * GitHub URLs are appended unchanged to the second list.
    * Hugging Face URLs (except those containing "turkish" or "spanish")
      contribute the text after ``datasets/`` to the third list.
    * Anything else is clicked directly, at most ``times`` times, with
      Chrome configured to download into ``./downloads/<query>``.

    Errors raised while scanning are printed and the partial results are
    returned; the browser is always closed.

    Args:
        query: Search text; also used as the download sub-folder name.

    Returns:
        A tuple ``(kag, git, hug)`` of lists of strings.
    """
    # Chrome expects an absolute path for its download directory preference.
    download_folder = os.path.abspath("./downloads/" + query)
    os.makedirs(download_folder, exist_ok=True)

    kag = []
    git = []
    hug = []

    driver = _build_driver(download_folder)
    try:
        # Navigation failures still propagate to the caller, but the browser
        # is now closed by the ``finally`` below instead of being leaked.
        driver.get(_SEARCH_URL)
        try:
            _scan_results(driver, query, kag, git, hug)
        except Exception as e:
            # Best-effort scrape: report the problem and return what we have.
            print("Error:", e)
    finally:
        driver.quit()
    return kag, git, hug


# Example usage:
# googleDatasets("house predictions")