Dataset / getFiles /getGoogle.py
vansh9878's picture
files added
825e978
raw
history blame
3.99 kB
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import getFiles.getKaggle as getKaggle
import getFiles.getGithub as getGithub
import os
# Cap on how many result entries googleDatasets() clicks through for a direct
# download before it stops scanning the result list.
times = 15
def googleDatasets(query, max_downloads=None):
    """Search Google Dataset Search for *query* and sort the hits by host.

    A headless Chrome session opens the Dataset Search page, submits the
    query and walks the result list. For each result the first entry of the
    download list is inspected:

    * Kaggle links contribute the text after ``datasets/`` to the first list.
    * GitHub links contribute the full URL to the second list.
    * Hugging Face links (unless the URL contains ``turkish`` or ``spanish``)
      contribute the text after ``datasets/`` to the third list.
    * Any other link is clicked so the browser downloads it into
      ``./downloads/<query>``; only these clicks count towards the limit.

    Args:
        query: Search text; also used as the download sub-folder name.
        max_downloads: Maximum number of direct-download clicks. ``None``
            (the default) uses the module-level ``times`` value.

    Returns:
        A ``(kaggle, github, huggingface)`` tuple of lists as described above.
        Errors raised while scraping are printed and whatever was collected
        up to that point is returned.
    """
    limit = times if max_downloads is None else max_downloads

    # Chrome's "download.default_directory" preference expects an absolute
    # path, so resolve the folder before handing it to the browser.
    download_folder = os.path.abspath("./downloads/" + query)
    os.makedirs(download_folder, exist_ok=True)

    kag = []
    git = []
    hug = []
    count = 0

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run without a visible browser window
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_folder,  # custom download folder
        "download.prompt_for_download": False,  # don't ask for confirmation
        "download.directory_upgrade": True,  # allow downloading into the custom folder
        "safebrowsing.enabled": True,  # avoid warnings during download
    })
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Navigation happens inside the try so a failure here still reaches
        # driver.quit() in the finally clause.
        driver.get("https://datasetsearch.research.google.com/")

        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
        )
        search = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "input[aria-label='Dataset Search']")
            )
        )
        search.send_keys(query)
        search.send_keys(Keys.RETURN)

        # Wait for the results page to render before collecting entries.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[jscontroller]"))
        )
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
        )
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ol.VAt4"))
        )

        links = driver.find_elements(By.CSS_SELECTOR, "li.UnWQ5")
        for link in links:
            if count >= limit:
                break

            link.click()
            time.sleep(2)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "ul.eEUDce"))
            )

            downloads = driver.find_elements(By.CSS_SELECTOR, "li.dy4aPc")
            if not downloads:
                # A result without download entries must not abort the scan.
                continue
            dataset = downloads[0]

            anchors = dataset.find_elements(By.TAG_NAME, "a")
            if not anchors:
                continue
            url = anchors[0].get_attribute("href")
            if url is None:
                continue

            if "kaggle" in url:
                match = re.search(r'datasets\/(.*)', url)
                print(match)
                if match is None:
                    # Kaggle URL that is not a dataset page: skip it.
                    continue
                string = match.group(1)
                print("This is " + string)
                kag.append(string)
                continue

            if "github" in url:
                print("This is " + url)
                git.append(url)
                continue

            # NOTE(review): Hugging Face URLs containing "turkish" or
            # "spanish" fall through to the direct-download click below, as
            # in the original code; confirm that this is intended.
            if "huggingface" in url and "turkish" not in url and "spanish" not in url:
                match = re.search(r'datasets\/(.*)', url)
                if match is None:
                    continue
                string = match.group(1)
                print("Again " + string)
                hug.append(string)
                continue

            # Any other host: click the entry so Chrome downloads it.
            dataset.click()
            count += 1
            time.sleep(5)

        # Pause once more before the browser is closed.
        time.sleep(5)
    except Exception as e:
        print("Error:", e)
    finally:
        driver.quit()

    return kag, git, hug
#
# googleDatasets("house predictions")