import os
import re
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import getFiles.getGithub as getGithub
import getFiles.getKaggle as getKaggle


# Default cap on how many direct-download links googleDatasets() clicks for
# one query; links that are only collected (Kaggle, GitHub, Hugging Face) do
# not count towards it.
times = 15


# Captures everything after "datasets/" in a Kaggle or Hugging Face URL.
_DATASET_SLUG_RE = re.compile(r'datasets\/(.*)')


def _classify_dataset_url(url):
    """Return ``(source, value)`` describing where a dataset link points.

    ``source`` is one of:

    * ``"kaggle"`` / ``"huggingface"`` -- ``value`` is the text following
      ``datasets/`` in the URL;
    * ``"github"`` -- ``value`` is the URL itself;
    * ``"direct"`` -- any other link; ``value`` is the URL;
    * ``None`` -- a Kaggle/Hugging Face link without a ``datasets/`` part,
      which the caller should skip.
    """
    if "kaggle" in url:
        found = _DATASET_SLUG_RE.search(url)
        return ("kaggle", found.group(1)) if found else (None, None)
    if "github" in url:
        return "github", url
    # Hugging Face URLs containing "turkish" or "spanish" are deliberately not
    # collected here and end up classified as "direct".
    # NOTE(review): confirm they should be clicked rather than skipped.
    if "huggingface" in url and "turkish" not in url and "spanish" not in url:
        found = _DATASET_SLUG_RE.search(url)
        return ("huggingface", found.group(1)) if found else (None, None)
    return "direct", url


def googleDatasets(query, max_downloads=None):
    """Search Google Dataset Search for *query* and triage the results.

    For every search result the first download link is inspected. Kaggle,
    GitHub and Hugging Face links are collected and returned; any other link
    is clicked so that headless Chrome downloads it into
    ``./downloads/<query>``.

    Args:
        query: Search text typed into the Dataset Search box; also used as the
            name of the download sub-folder.
        max_downloads: Maximum number of direct-download links to click.
            Defaults to the module-level ``times``.

    Returns:
        A ``(kag, git, hug)`` tuple of lists: Kaggle dataset slugs, GitHub
        URLs and Hugging Face dataset slugs, in the order encountered.

    Errors raised while interacting with the page are printed and whatever was
    collected up to that point is returned. Failures to start the browser or
    to open the site propagate to the caller.
    """
    limit = times if max_downloads is None else max_downloads

    # Chrome expects an absolute path for download.default_directory; a
    # relative one is not reliably honoured.
    download_folder = os.path.abspath("./downloads/" + query)
    os.makedirs(download_folder, exist_ok=True)

    kag = []
    git = []
    hug = []
    count = 0

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_folder,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    })
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Kept outside the inner handler so a navigation failure still reaches
        # the caller, while the outer ``finally`` guarantees the browser quits.
        driver.get("https://datasetsearch.research.google.com/")

        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
            )
            search = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "input[aria-label='Dataset Search']")
                )
            )
            search.send_keys(query)
            search.send_keys(Keys.RETURN)

            # Wait for the results page and its result list to be present.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div[jscontroller]"))
            )
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
            )
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "ol.VAt4"))
            )

            for link in driver.find_elements(By.CSS_SELECTOR, "li.UnWQ5"):
                if count >= limit:
                    break

                link.click()
                time.sleep(2)
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "ul.eEUDce"))
                    )
                except TimeoutException:
                    # This result never showed a download list; try the next.
                    continue

                downloads = driver.find_elements(By.CSS_SELECTOR, "li.dy4aPc")
                if not downloads:
                    continue
                dataset = downloads[0]

                # find_elements returns [] instead of raising when the entry
                # has no anchor, so one odd result cannot abort the whole run.
                anchors = dataset.find_elements(By.TAG_NAME, "a")
                url = anchors[0].get_attribute("href") if anchors else None
                if not url:
                    continue

                source, value = _classify_dataset_url(url)
                if source is None:
                    continue
                if source == "kaggle":
                    print("This is " + value)
                    kag.append(value)
                    continue
                if source == "github":
                    print("This is " + value)
                    git.append(value)
                    continue
                if source == "huggingface":
                    print("Again " + value)
                    hug.append(value)
                    continue

                # Any other host: click the entry to trigger the download and
                # pause before moving on to the next result.
                dataset.click()
                count += 1
                time.sleep(5)

            # Final pause before the browser is closed.
            time.sleep(5)

        except Exception as e:
            # Best-effort scrape: report the problem and return what we have.
            print("Error:", e)

    finally:
        driver.quit()

    return kag, git, hug