"""Download OpenML datasets matching a natural-language query.

Flow: extract search keywords from the user prompt via an LLM, resolve
matching dataset page URLs (delegated to ``openai_openml``), then drive a
headless Chrome instance to find each page's download link and fetch the
file with ``requests`` into ``./input_folder/<prompt>/``.
"""

import os

import openml
import requests
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import openai_openml as oo
from langchain_folder.llm_helper import llm

load_dotenv()

# Module-level list of dataset page URLs; populated by fetch_dataset_urls()
# and consumed by openDataset(). NOTE(review): shared mutable state — callers
# elsewhere may rely on it, so the global is kept.
url_list = []

api_key = os.getenv('openml_api')
openml.config.apikey = api_key


def extract_keywords(query):
    """Return the search keywords the LLM extracts from *query* (a str)."""
    prompt = PromptTemplate.from_template(""" You are an assistant whose job is to extract the keywords from the query and return it: Query = "{query}" For example, if the query is Generate a list of links to datasets related to house price prediction your response should be -> "house price". Note that the query might not always be related to house price predictions, it can be related to other things as well. 
 return only the keywords do not return anything else """)
    rendered_prompt = prompt.format(query=query)
    response = llm.invoke(rendered_prompt)
    return response.content


def fetch_dataset_urls(query, limit=4):
    """Populate the module-global ``url_list`` with dataset page URLs for *query*.

    Resolution is delegated to ``openai_openml.openDataset``; *limit* is kept
    for backward compatibility but is currently unused by that path.
    """
    print(f"Searching for datasets related to: {query}")
    global url_list
    url_list = oo.openDataset(query)


def openDataset(user_prompt):
    """Find and download datasets matching *user_prompt*.

    Extracts keywords, resolves dataset page URLs, then uses headless Chrome
    to locate each page's download anchor and saves the files under
    ``./input_folder/<user_prompt>/``. Per-URL failures are logged and
    skipped (best-effort), matching the original behavior.
    """
    extracted_keywords = extract_keywords(user_prompt)
    print(extracted_keywords)
    fetch_dataset_urls(extracted_keywords)

    download_folder = "./input_folder/" + user_prompt
    # exist_ok avoids the exists()/makedirs() race of the naive check.
    os.makedirs(download_folder, exist_ok=True)

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run without a visible browser window
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_folder,  # custom download folder
        "download.prompt_for_download": False,          # no confirmation dialog
        "download.directory_upgrade": True,             # allow the custom folder
        "safebrowsing.enabled": True                    # avoid download warnings
    })

    driver = webdriver.Chrome(options=chrome_options)
    try:
        for url in url_list:
            driver.get(url)
            try:
                # Wait up to 10 s for the download anchor to appear in the DOM.
                download_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "a[aria-label='Download dataset']"))
                )
                actual_download_url = download_button.get_attribute("href")
                # Filename from the last two URL path segments, e.g. "<id>_<name>".
                filename = actual_download_url.split("/")[-2] + "_" + actual_download_url.split("/")[-1]
                file_path = os.path.join(download_folder, filename)

                print(f"⬇️ Downloading from {actual_download_url}")
                # Timeout so a stalled server cannot hang the whole run.
                response = requests.get(actual_download_url, timeout=60)
                with open(file_path, "wb") as f:
                    f.write(response.content)
                print(f"✅ Saved to {file_path}\n")
            except Exception as e:
                # Best-effort: report and continue with the remaining URLs.
                print(f"❌ Failed to fetch or download from {url}: {e}")
    finally:
        # Always release the browser — the original leaked the driver process.
        driver.quit()

# openDataset("stock market predictions")