File size: 3,986 Bytes
825e978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import getFiles.getKaggle as getKaggle
import getFiles.getGithub as getGithub
import os

# Upper bound on how many direct-download clicks googleDatasets performs
# in a single call; the result loop stops once this many have been triggered.
times = 15


def _build_chrome_options(download_folder):
    """Return headless Chrome options that save downloads into *download_folder*."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run without a visible browser window
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_folder,  # Custom download folder (absolute path)
        "download.prompt_for_download": False,  # Don't ask for confirmation to download
        "download.directory_upgrade": True,  # Allow downloading into the custom folder
        "safebrowsing.enabled": True,  # Enable safe browsing (to avoid warnings during download)
    })
    return chrome_options


def _dataset_slug(url):
    """Return the text following ``datasets/`` in *url*, or None if it is absent."""
    match = re.search(r'datasets\/(.*)', url)
    return match.group(1) if match else None


def _collect_results(driver, query, kag, git, hug):
    """Run *query* on the already-loaded search page and sort the result links.

    Kaggle and Hugging Face dataset slugs are appended to *kag* / *hug*,
    GitHub URLs to *git*; any other download entry is clicked directly.
    Selenium errors (e.g. wait timeouts) propagate to the caller.
    """
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "c-wiz"))
    )
    search = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[aria-label='Dataset Search']"))
    )
    search.send_keys(query)
    search.send_keys(Keys.RETURN)

    # Wait for the result page to render before collecting the result items.
    for locator in (
        (By.CSS_SELECTOR, "div[jscontroller]"),
        (By.TAG_NAME, "c-wiz"),
        (By.CSS_SELECTOR, "ol.VAt4"),
    ):
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))

    count = 0  # number of direct-download clicks performed so far
    for link in driver.find_elements(By.CSS_SELECTOR, "li.UnWQ5"):
        if count == times:
            break
        link.click()
        time.sleep(2)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.eEUDce"))
        )

        # Only the first download entry of each result is considered.
        downloads = driver.find_elements(By.CSS_SELECTOR, "li.dy4aPc")
        if not downloads:
            continue  # result has no download entry; skip it instead of aborting
        dataset = downloads[0]
        anchors = dataset.find_elements(By.TAG_NAME, "a")
        if not anchors:
            continue
        url = anchors[0].get_attribute("href")
        if url is None:
            continue

        if "kaggle" in url:
            slug = _dataset_slug(url)
            if slug is not None:
                print("This is " + slug)
                kag.append(slug)
            continue
        if "github" in url:
            print("This is " + url)
            git.append(url)
            continue
        if "huggingface" in url and "turkish" not in url and "spanish" not in url:
            slug = _dataset_slug(url)
            if slug is not None:
                print("Again " + slug)
                hug.append(slug)
            continue

        # Any other source: click the entry to start a browser download.
        dataset.click()
        count += 1
        time.sleep(5)
    time.sleep(5)  # grace period before the caller closes the browser


def googleDatasets(query):
    """Search Google Dataset Search for *query* and collect dataset sources.

    Opens https://datasetsearch.research.google.com/ in headless Chrome,
    submits *query*, and inspects the first download entry of each result.
    Entries whose link is neither Kaggle, GitHub nor an accepted Hugging Face
    URL are clicked so the browser downloads them into ``./downloads/<query>``;
    at most ``times`` such clicks are made.

    Args:
        query: Search text; also used as the download sub-folder name.

    Returns:
        A tuple ``(kag, git, hug)`` of lists: Kaggle dataset slugs, GitHub
        URLs, and Hugging Face dataset slugs (the part of the URL after
        ``datasets/``). Errors raised while scraping are printed and whatever
        was collected up to that point is returned.
    """
    # Chrome expects an absolute path for its download directory preference.
    download_folder = os.path.abspath("./downloads/" + query)
    os.makedirs(download_folder, exist_ok=True)

    kag = []
    git = []
    hug = []

    driver = webdriver.Chrome(options=_build_chrome_options(download_folder))
    try:
        # A navigation failure still propagates, but the browser is now closed.
        driver.get("https://datasetsearch.research.google.com/")
        try:
            _collect_results(driver, query, kag, git, hug)
        except Exception as e:
            # Best-effort scrape: report the problem and return partial results.
            print("Error:", e)
    finally:
        driver.quit()

    return kag, git, hug
# 
# googleDatasets("house predictions")