Spaces:

vansh9878
/

Dataset

Runtime error

App Files Files Community

Dataset / getFiles /getGoogle.py

vansh9878

files added

825e978 9 days ago

raw

history blame contribute delete

3.99 kB

	import re
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.chrome.options import Options
	import time
	import getFiles.getKaggle as getKaggle
	import getFiles.getGithub as getGithub
	import os

	# Upper bound on how many direct-download entries googleDatasets clicks in one
	# call; results routed to the Kaggle/GitHub/Hugging Face lists do not count.
	times=15


	def googleDatasets(query):
	    """Search Google Dataset Search for *query* and sort the results by host.

	    Each result in the list is opened and the link of its first download
	    entry is inspected:

	    * Kaggle links: the text after ``datasets/`` is collected.
	    * GitHub links: the full URL is collected.
	    * Hugging Face links (unless the URL contains ``turkish`` or
	      ``spanish``): the text after ``datasets/`` is collected.
	    * Anything else: the download entry is clicked, with Chrome configured to
	      save into ``./downloads/<query>``. At most ``times`` such clicks are
	      performed.

	    Any exception raised while driving the browser is printed and whatever
	    has been collected so far is returned; the browser is always closed.

	    Args:
	        query: Search text; also used as the download sub-folder name.

	    Returns:
	        A ``(kag, git, hug)`` tuple of lists: Kaggle dataset identifiers,
	        GitHub URLs and Hugging Face dataset identifiers.
	    """
	    # Chrome expects an absolute path for "download.default_directory"; a
	    # relative one is not reliably honoured.
	    download_folder = os.path.abspath("./downloads/" + query)
	    os.makedirs(download_folder, exist_ok=True)

	    kag = []
	    git = []
	    hug = []
	    count = 0

	    chrome_options = Options()
	    chrome_options.add_argument("--headless")  # run without a visible window
	    chrome_options.add_experimental_option("prefs", {
	        "download.default_directory": download_folder,  # custom download folder
	        "download.prompt_for_download": False,  # no confirmation dialog
	        "download.directory_upgrade": True,  # allow the custom folder
	        "safebrowsing.enabled": True,  # avoid warnings during download
	    })
	    driver = webdriver.Chrome(options=chrome_options)

	    try:
	        # Loading the page happens inside the try so that a failed navigation
	        # still reaches driver.quit() instead of leaking the Chrome process.
	        driver.get("https://datasetsearch.research.google.com/")

	        WebDriverWait(driver, 20).until(
	            EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
	        )

	        search = WebDriverWait(driver, 20).until(
	            EC.element_to_be_clickable(
	                (By.CSS_SELECTOR, "input[aria-label='Dataset Search']")
	            )
	        )

	        search.send_keys(query)
	        search.send_keys(Keys.RETURN)

	        # Wait for the results page to render before reading the result list.
	        WebDriverWait(driver, 10).until(
	            EC.presence_of_element_located((By.CSS_SELECTOR, "div[jscontroller]"))
	        )
	        WebDriverWait(driver, 10).until(
	            EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
	        )
	        WebDriverWait(driver, 10).until(
	            EC.presence_of_element_located((By.CSS_SELECTOR, "ol.VAt4"))
	        )

	        links = driver.find_elements(By.CSS_SELECTOR, "li.UnWQ5")
	        for link in links:
	            if count == times:
	                break

	            link.click()
	            time.sleep(2)
	            WebDriverWait(driver, 10).until(
	                EC.presence_of_element_located((By.CSS_SELECTOR, "ul.eEUDce"))
	            )

	            downloads = driver.find_elements(By.CSS_SELECTOR, "li.dy4aPc")
	            if not downloads:
	                # No download entry for this result: skip it rather than
	                # aborting the whole scrape with an IndexError.
	                continue
	            dataset = downloads[0]

	            anchors = dataset.find_elements(By.TAG_NAME, "a")
	            url = anchors[0].get_attribute("href") if anchors else None
	            if url is None:
	                # Nothing to classify or follow for this result.
	                continue

	            if "kaggle" in url:
	                match = re.search(r'datasets\/(.*)', url)
	                print(match)
	                if match is None:
	                    continue
	                string = match.group(1)
	                print("This is " + string)
	                kag.append(string)
	                continue

	            if "github" in url:
	                print("This is " + url)
	                git.append(url)
	                continue

	            if "huggingface" in url and "turkish" not in url and "spanish" not in url:
	                match = re.search(r'datasets\/(.*)', url)
	                if match is None:
	                    continue
	                string = match.group(1)
	                print("Again " + string)
	                hug.append(string)
	                continue

	            # Every other host (including the excluded Hugging Face URLs)
	            # is treated as a direct download.
	            dataset.click()
	            count += 1
	            time.sleep(5)

	        # NOTE(review): presumably gives the last download time to finish
	        # before the browser is closed - confirm the intended duration.
	        time.sleep(5)

	    except Exception as e:
	        print("Error:", e)

	    finally:
	        driver.quit()

	    return kag, git, hug
	#
	# googleDatasets("house predictions")