import os
import logging
import shutil
import nltk
import pandas as pd
from tqdm import tqdm
import gradio as gr
from datasets import Dataset
from transformers import pipeline
from huggingface_hub import HfApi

# ---------------------- Logging Setup ----------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

# ---------------------- NLTK Setup ----------------------
def download_nltk():
    # Fetch only the resources this script actually uses: the "words" corpus and
    # the "punkt"/"punkt_tab" models required by nltk.word_tokenize.
    for resource in ("words", "punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)
    logging.info("NLTK resources downloaded.")

download_nltk()

# ---------------------- Data Preparation ----------------------
def get_all_words():
    from nltk.corpus import words as nltk_words
    all_words = nltk_words.words()
    logging.info(f"Got {len(all_words)} words from NLTK.")
    return all_words

def generate_meaning(word, generator):
    prompt = f"Define the word '{word}' in one concise sentence."
    try:
        result = generator(prompt, max_length=50)[0]["generated_text"]
        return result.strip()
    except Exception as e:
        logging.error(f"Error generating meaning for '{word}': {e}")
        return ""

def process_words(model_name, limit=None):
    logging.info("Initializing Hugging Face text2text-generation pipeline...")
    generator = pipeline("text2text-generation", model=model_name, device=-1)  # device=-1 -> CPU
    words_list = get_all_words()
    if limit:
        words_list = words_list[:limit]
    data = []
    for word in tqdm(words_list, desc="Processing words"):
        tokens = nltk.word_tokenize(word)
        meaning = generate_meaning(word, generator)
        data.append({
            "tokenizer": tokens,
            "words": word,
            "meaning": meaning
        })
    logging.info("Finished processing words.")
    return data

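# Usage sketch (hypothetical, not executed on import): a small limit and a
# lighter model such as "google/flan-t5-small" keep a local smoke test fast.
#   sample = process_words("google/flan-t5-small", limit=5)
#   # each record: {"tokenizer": [...tokens...], "words": <word>, "meaning": <generated sentence>}
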
def save_to_csv(data, filename="output.csv"):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    logging.info(f"Saved CSV to {filename}.")
    return filename

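# Optional alternative (a sketch, not wired into the UI): the records could also
# be published directly with the `datasets` library imported above, skipping the
# intermediate CSV. Assumes an authenticated Hub session (e.g. an HF_TOKEN
# environment variable or a cached `huggingface-cli login`).
def push_with_datasets(data, repo_id="katsukiai/DeepFocus-X3"):
    ds = Dataset.from_list(data)  # build a Dataset straight from the record dicts
    ds.push_to_hub(repo_id)       # creates or updates the dataset repo on the Hub
    logging.info(f"Pushed {len(ds)} records to {repo_id} via push_to_hub.")
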
# ---------------------- Push to Hugging Face ----------------------
def push_dataset(csv_file, repo_id="katsukiai/DeepFocus-X3"):
    # Clone the dataset repository into its own working directory on first use.
    repo_local_dir = repo_id.split("/")[-1]
    if not os.path.exists(repo_local_dir):
        os.system(f"git clone https://huggingface.co/datasets/{repo_id} {repo_local_dir}")
        logging.info("Repository cloned locally.")
    shutil.copy(csv_file, os.path.join(repo_local_dir, os.path.basename(csv_file)))
    current_dir = os.getcwd()
    os.chdir(repo_local_dir)
    # Pushing assumes git credentials for the Hub are already configured
    # (e.g. a write token in the remote URL or a git credential helper).
    os.system("git add .")
    os.system('git commit -m "Update dataset"')
    os.system("git push")
    os.chdir(current_dir)
    logging.info("Pushed dataset to Hugging Face repository.")

def generate_all(model_name, word_limit):
    # An empty or non-numeric limit means "process every word".
    try:
        word_limit = int(word_limit)
    except (TypeError, ValueError):
        word_limit = None
    data = process_words(model_name, limit=word_limit)
    csv_file = save_to_csv(data)
    push_dataset(csv_file)
    return csv_file

# ---------------------- Gradio Interface Functions ----------------------
def run_generate(model_name, word_limit):
    output_csv = generate_all(model_name, word_limit)
    return f"Generated and pushed CSV: {output_csv}"

def about_tab_content():
    about_text = (
        "## DeepFocus-X3 Dataset Generator\n\n"
        "This tool downloads all available words from the NLTK corpus, "
        "generates concise meanings using a Hugging Face text-to-text generation model, "
        "and converts the data into a CSV file. Finally, it pushes the CSV to the "
        "[katsukiai/DeepFocus-X3](https://huggingface.co/datasets/katsukiai/DeepFocus-X3) repository."
    )
    return about_text

def settings_tab_content():
    settings_text = (
        "**Current Settings**\n\n"
        "- Model: `google/flan-t5-xl`\n"
        "- Word Limit: 50 (leave empty to process all words)\n"
        "\nYou can update these settings in the Generate tab."
    )
    return settings_text

# ---------------------- Gradio App ----------------------
with gr.Blocks() as demo:
    gr.Markdown("## DeepFocus-X3 Dataset Generator")
    with gr.Tabs():
        # About Tab
        with gr.Tab("About"):
            gr.Markdown(about_tab_content())
        # Generate All Tab
        with gr.Tab("Generate all"):
            model_name_input = gr.Textbox(value="google/flan-t5-xl", label="Hugging Face Model Name for Meanings")
            word_limit_input = gr.Textbox(value="50", label="Word Limit (Leave empty for all)")
            generate_button = gr.Button("Generate and Push Dataset")
            generate_output = gr.Textbox(label="Output")
            generate_button.click(run_generate, inputs=[model_name_input, word_limit_input], outputs=generate_output)
        # Settings Tab
        with gr.Tab("Settings"):
            gr.Markdown(settings_tab_content())

demo.launch()