# DeepFocusTrain / app.py
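"""Gradio app that builds the DeepFocus-X3 dataset.

Pipeline: fetch the word list from the NLTK corpus, generate a one-sentence
meaning for each word with a Hugging Face text2text model, write the rows to
a CSV file, and push that CSV to the katsukiai/DeepFocus-X3 dataset repo.
"""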
import os
import logging

import nltk
import pandas as pd
from tqdm import tqdm
import gradio as gr
from transformers import pipeline
from huggingface_hub import HfApi
# ---------------------- Logging Setup ----------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
)
# ---------------------- NLTK Setup ----------------------
def download_nltk():
    # "all" guarantees that both the "words" corpus and the "punkt" tokenizer
    # models used below are present; downloading only those two packages
    # would make startup much faster.
    nltk.download("all")
    logging.info("NLTK resources downloaded.")

download_nltk()
# ---------------------- Data Preparation ----------------------
def get_all_words():
    from nltk.corpus import words as nltk_words
    all_words = nltk_words.words()
    logging.info(f"Got {len(all_words)} words from NLTK.")
    return all_words
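# Note: the full "words" corpus holds roughly 236,000 entries, so a run with
# no word limit issues roughly that many generation calls; the limit exposed
# in the UI below exists to keep test runs cheap.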
def generate_meaning(word, generator):
    prompt = f"Define the word '{word}' in one concise sentence."
    try:
        result = generator(prompt, max_length=50)[0]["generated_text"]
        return result.strip()
    except Exception as e:
        logging.error(f"Error generating meaning for '{word}': {e}")
        return ""
def process_words(model_name, limit=None):
    logging.info("Initializing Hugging Face text2text-generation pipeline...")
    generator = pipeline("text2text-generation", model=model_name, device=-1)
    words_list = get_all_words()
    if limit:
        words_list = words_list[:limit]
    data = []
    for word in tqdm(words_list, desc="Processing words"):
        tokens = nltk.word_tokenize(word)
        meaning = generate_meaning(word, generator)
        data.append({
            "tokenizer": tokens,
            "words": word,
            "meaning": meaning,
        })
    logging.info("Finished processing words.")
    return data
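# Performance sketch (an assumption, not part of the original flow):
# transformers pipelines accept a list of prompts, so the loop above could
# generate meanings in batches instead of one call per word, e.g.:
#     prompts = [f"Define the word '{w}' in one concise sentence." for w in words_list]
#     outputs = generator(prompts, max_length=50, batch_size=16)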
def save_to_csv(data, filename="output.csv"):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    logging.info(f"Saved CSV to {filename}.")
    return filename
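# Alternative sketch (an assumption about intent; the original imported
# datasets.Dataset without using it): the same rows could be pushed as a
# Hub-native dataset rather than a raw CSV:
#     from datasets import Dataset
#     Dataset.from_pandas(pd.DataFrame(data)).push_to_hub("katsukiai/DeepFocus-X3")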
# ---------------------- Push to Hugging Face ----------------------
def push_dataset(csv_file, repo_id="katsukiai/DeepFocus-X3"):
    # Upload through the Hub API instead of git shell-outs: cloning into "."
    # never runs (the path always exists), after which copying the CSV onto
    # itself raises shutil.SameFileError.
    api = HfApi()
    api.upload_file(
        path_or_fileobj=csv_file,
        path_in_repo=os.path.basename(csv_file),
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Update dataset",
    )
    logging.info("Pushed dataset to Hugging Face repository.")
def generate_all(model_name, word_limit):
    try:
        word_limit = int(word_limit)
    except (TypeError, ValueError):
        # An empty or non-numeric limit means "process every word".
        word_limit = None
    data = process_words(model_name, limit=word_limit)
    csv_file = save_to_csv(data)
    push_dataset(csv_file)
    return csv_file
# ---------------------- Gradio Interface Functions ----------------------
def run_generate(model_name, word_limit):
    output_csv = generate_all(model_name, word_limit)
    return f"Generated and pushed CSV: {output_csv}"
def about_tab_content():
    about_text = (
        "## DeepFocus-X3 Dataset Generator\n\n"
        "This tool downloads all available words from the NLTK corpus, "
        "generates concise meanings using a Hugging Face text-to-text generation model, "
        "and converts the data into a CSV file. Finally, it pushes the CSV to the "
        "[katsukiai/DeepFocus-X3](https://huggingface.co/datasets/katsukiai/DeepFocus-X3) repository."
    )
    return about_text
def settings_tab_content():
    settings_text = (
        "**Current Settings**\n\n"
        "- Model: `google/flan-t5-xl`\n"
        "- Word Limit: 50 (set to empty to process all words)\n"
        "\nYou can update these settings in the Generate tab."
    )
    return settings_text
# ---------------------- Gradio App ----------------------
with gr.Blocks() as demo:
    gr.Markdown("## DeepFocus-X3 Dataset Generator")
    with gr.Tabs():
        # About Tab
        with gr.Tab("About"):
            gr.Markdown(about_tab_content())
        # Generate All Tab
        with gr.Tab("Generate all"):
            model_name_input = gr.Textbox(value="google/flan-t5-xl", label="Hugging Face Model Name for Meanings")
            word_limit_input = gr.Textbox(value="50", label="Word Limit (Leave empty for all)")
            generate_button = gr.Button("Generate and Push Dataset")
            generate_output = gr.Textbox(label="Output")
            generate_button.click(run_generate, inputs=[model_name_input, word_limit_input], outputs=generate_output)
        # Settings Tab
        with gr.Tab("Settings"):
            gr.Markdown(settings_tab_content())

demo.launch()
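# When run locally, `python app.py` serves the UI on http://127.0.0.1:7860
# by default; in a Hugging Face Space, launch() is invoked the same way.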