Spaces:

vansh9878
/

Dataset

Runtime error

File size: 3,986 Bytes

825e978

import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import getFiles.getKaggle as getKaggle
import getFiles.getGithub as getGithub
import os

# Upper bound on how many search results googleDatasets() will download by
# clicking the result's first download entry. Results that are routed to the
# Kaggle / GitHub / Hugging Face lists instead do not count towards this limit.
times=15


# Captures everything after "datasets/" in a Kaggle or Hugging Face dataset URL.
_DATASET_SLUG_RE = re.compile(r'datasets\/(.*)')


def _dataset_slug(url):
    """Return the part of *url* following ``datasets/``, or ``None`` if absent."""
    match = _DATASET_SLUG_RE.search(url)
    return match.group(1) if match else None


def _collect_sources(driver, query, kag, git, hug):
    """Run the search on an already-loaded page and fill the lists in place.

    Best-effort: any exception raised while driving the page (timeouts,
    stale elements, ...) is printed and ends the scrape; whatever was
    collected up to that point stays in ``kag``, ``git`` and ``hug``.

    Args:
        driver: Selenium WebDriver with the Dataset Search page loaded.
        query: Search text typed into the search box.
        kag: List receiving Kaggle dataset slugs.
        git: List receiving GitHub URLs.
        hug: List receiving Hugging Face dataset slugs.
    """
    count = 0  # number of results downloaded by clicking, capped by `times`
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
        )
        search = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "input[aria-label='Dataset Search']")
            )
        )
        search.send_keys(query)
        search.send_keys(Keys.RETURN)

        # Wait for the results page and its result list to be present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[jscontroller]"))
        )
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
        )
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ol.VAt4"))
        )

        links = driver.find_elements(By.CSS_SELECTOR, "li.UnWQ5")
        for link in links:
            if count == times:
                break
            link.click()
            time.sleep(2)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "ul.eEUDce"))
            )

            downloads = driver.find_elements(By.CSS_SELECTOR, "li.dy4aPc")
            if not downloads:
                # Result without any download entry: skip it instead of
                # aborting the whole scrape on an IndexError.
                continue
            dataset = downloads[0]
            anchors = dataset.find_elements(By.TAG_NAME, "a")
            url = anchors[0].get_attribute("href") if anchors else None
            if url is None:
                continue

            if "kaggle" in url:
                slug = _dataset_slug(url)
                if slug is not None:
                    print("This is " + slug)
                    kag.append(slug)
                continue
            if "github" in url:
                print("This is " + url)
                git.append(url)
                continue
            if ("huggingface" in url and "turkish" not in url
                    and "spanish" not in url):
                slug = _dataset_slug(url)
                if slug is not None:
                    print("Again " + slug)
                    hug.append(slug)
                continue

            # Any other host: trigger the download directly in the browser.
            dataset.click()
            count += 1
            time.sleep(5)
        # Grace period after the last click before the browser is closed.
        time.sleep(5)
    except Exception as e:
        print("Error:", e)


def googleDatasets(query):
    """Search Google Dataset Search for *query* and collect dataset sources.

    Each result's first download entry is inspected. Kaggle, GitHub and
    Hugging Face links are collected for the caller; Hugging Face URLs
    containing "turkish" or "spanish" are not collected. Every other link
    is clicked so Chrome downloads it into ``./downloads/<query>``, for at
    most ``times`` results.

    Args:
        query: Search text; also used as the download sub-folder name.

    Returns:
        A tuple ``(kag, git, hug)``: Kaggle dataset slugs, GitHub URLs and
        Hugging Face dataset slugs, in the order they were found.

    Raises:
        Whatever ``webdriver.Chrome`` or ``driver.get`` raise. Errors that
        occur later, while scraping, are printed and not propagated.
    """
    # Chrome expects an absolute path for its download directory preference.
    download_folder = os.path.abspath("./downloads/" + query)
    os.makedirs(download_folder, exist_ok=True)

    kag = []
    git = []
    hug = []

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run without a visible window
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_folder,  # custom download folder
        "download.prompt_for_download": False,  # don't ask for confirmation
        "download.directory_upgrade": True,  # allow the custom folder
        "safebrowsing.enabled": True,  # avoid warnings during download
    })
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # A navigation failure propagates to the caller; the finally clause
        # still closes the browser so no Chrome process is left behind.
        driver.get("https://datasetsearch.research.google.com/")
        _collect_sources(driver, query, kag, git, hug)
    finally:
        driver.quit()

    return kag, git, hug
# 
# googleDatasets("house predictions")