Dataset / getFiles /getGoogle.py
vansh9878's picture
files added
825e978
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import getFiles.getKaggle as getKaggle
import getFiles.getGithub as getGithub
import os
# Upper bound on how many direct-download links googleDatasets() clicks
# during a single search.
times = 15
def _extract_dataset_slug(url):
    """Return the text following ``datasets/`` in *url*, or None if absent."""
    found = re.search(r"datasets/(.*)", url)
    return found.group(1) if found else None


def googleDatasets(query):
    """Search Google Dataset Search for *query* and sort the results by host.

    For every search result, the first download link is inspected:

    * Kaggle links contribute the path after ``datasets/`` to the first list.
    * GitHub links contribute the full URL to the second list.
    * Hugging Face links (whose URL contains neither "turkish" nor "spanish")
      contribute the path after ``datasets/`` to the third list.
    * Any other link is clicked in the browser so Chrome downloads it into
      ``./downloads/<query>``; at most ``times`` such clicks are made.

    Errors are printed rather than raised; whatever was collected before the
    error is still returned.

    Args:
        query: Search text; also used as the download sub-folder name.

    Returns:
        A ``(kaggle_slugs, github_urls, huggingface_slugs)`` tuple of lists.
    """
    # Function-scope import: selenium is already a dependency of this module.
    from selenium.common.exceptions import WebDriverException

    # Chrome expects an absolute path for download.default_directory, so the
    # relative "./downloads/<query>" location is resolved up front.
    download_folder = os.path.abspath("./downloads/" + query)
    os.makedirs(download_folder, exist_ok=True)

    kag = []
    git = []
    hug = []
    count = 0  # number of direct-download links clicked so far

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run without a visible window
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_folder,  # custom download folder
        "download.prompt_for_download": False,  # no confirmation dialog
        "download.directory_upgrade": True,  # allow the custom folder
        "safebrowsing.enabled": True,  # avoid download warnings
    })

    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Navigation is inside the try so the browser is always quit, even
        # when the initial page load fails.
        driver.get("https://datasetsearch.research.google.com/")
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
        )
        search = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "input[aria-label='Dataset Search']")
            )
        )
        search.send_keys(query)
        search.send_keys(Keys.RETURN)

        # Wait for the results page and its result list to be present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[jscontroller]"))
        )
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
        )
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ol.VAt4"))
        )

        results = driver.find_elements(By.CSS_SELECTOR, "li.UnWQ5")
        for result in results:
            if count == times:
                break

            # Open the result's detail pane and read its first download link.
            # A failure here only skips this result instead of ending the run.
            try:
                result.click()
                time.sleep(2)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "ul.eEUDce"))
                )
                entries = driver.find_elements(By.CSS_SELECTOR, "li.dy4aPc")
                if not entries:
                    continue
                dataset = entries[0]
                url = dataset.find_element(By.TAG_NAME, "a").get_attribute("href")
            except WebDriverException as err:
                print("Error:", err)
                continue

            if url is None:
                continue

            if "kaggle" in url:
                slug = _extract_dataset_slug(url)
                if slug is not None:
                    print("This is " + slug)
                    kag.append(slug)
                continue

            if "github" in url:
                print("This is " + url)
                git.append(url)
                continue

            # NOTE(review): Hugging Face URLs containing "turkish" or
            # "spanish" are not collected and fall through to the click
            # below, as before - confirm that this is intended.
            if "huggingface" in url and "turkish" not in url and "spanish" not in url:
                slug = _extract_dataset_slug(url)
                if slug is not None:
                    print("Again " + slug)
                    hug.append(slug)
                continue

            # Any other host: click the entry so the browser downloads it.
            try:
                dataset.click()
            except WebDriverException as err:
                print("Error:", err)
                continue
            count += 1
            time.sleep(5)

        # Pause before the browser is closed (presumably so the last
        # download can finish - TODO confirm).
        time.sleep(5)
    except Exception as e:
        print("Error:", e)
    finally:
        driver.quit()

    return kag, git, hug
#
# googleDatasets("house predictions")