# DeepFocusTrain / app.py
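"""Gradio app that builds the DeepFocus-X3 dataset.

Pipeline: fetch the word list from the NLTK corpus, generate a one-sentence
meaning for each word with a Hugging Face text2text model, write the rows to
a CSV file, and push that CSV to the katsukiai/DeepFocus-X3 dataset repo.
"""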
import os
import logging

import nltk
import pandas as pd
from tqdm import tqdm
import gradio as gr
from transformers import pipeline
from huggingface_hub import HfApi
# ---------------------- Logging Setup ----------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
)
# ---------------------- NLTK Setup ----------------------
def download_nltk():
    # "all" guarantees that both the "words" corpus and the "punkt" tokenizer
    # models used below are present; downloading only those two packages
    # would make startup much faster.
    nltk.download("all")
    logging.info("NLTK resources downloaded.")

download_nltk()
# ---------------------- Data Preparation ----------------------
def get_all_words():
    from nltk.corpus import words as nltk_words
    all_words = nltk_words.words()
    logging.info(f"Got {len(all_words)} words from NLTK.")
    return all_words
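# Note: the full "words" corpus holds roughly 236,000 entries, so a run with
# no word limit issues roughly that many generation calls; the limit exposed
# in the UI below exists to keep test runs cheap.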
def generate_meaning(word, generator):
    prompt = f"Define the word '{word}' in one concise sentence."
    try:
        result = generator(prompt, max_length=50)[0]["generated_text"]
        return result.strip()
    except Exception as e:
        logging.error(f"Error generating meaning for '{word}': {e}")
        return ""
def process_words(model_name, limit=None):
    logging.info("Initializing Hugging Face text2text-generation pipeline...")
    generator = pipeline("text2text-generation", model=model_name, device=-1)
    words_list = get_all_words()
    if limit:
        words_list = words_list[:limit]
    data = []
    for word in tqdm(words_list, desc="Processing words"):
        tokens = nltk.word_tokenize(word)
        meaning = generate_meaning(word, generator)
        data.append({
            "tokenizer": tokens,
            "words": word,
            "meaning": meaning,
        })
    logging.info("Finished processing words.")
    return data
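# Performance sketch (an assumption, not part of the original flow):
# transformers pipelines accept a list of prompts, so the loop above could
# generate meanings in batches instead of one call per word, e.g.:
#     prompts = [f"Define the word '{w}' in one concise sentence." for w in words_list]
#     outputs = generator(prompts, max_length=50, batch_size=16)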
def save_to_csv(data, filename="output.csv"):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    logging.info(f"Saved CSV to {filename}.")
    return filename
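# Alternative sketch (an assumption about intent; the original imported
# datasets.Dataset without using it): the same rows could be pushed as a
# Hub-native dataset rather than a raw CSV:
#     from datasets import Dataset
#     Dataset.from_pandas(pd.DataFrame(data)).push_to_hub("katsukiai/DeepFocus-X3")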
# ---------------------- Push to Hugging Face ----------------------
def push_dataset(csv_file, repo_id="katsukiai/DeepFocus-X3"):
    # Upload through the Hub API instead of git shell-outs: cloning into "."
    # never runs (the path always exists), after which copying the CSV onto
    # itself raises shutil.SameFileError.
    api = HfApi()
    api.upload_file(
        path_or_fileobj=csv_file,
        path_in_repo=os.path.basename(csv_file),
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Update dataset",
    )
    logging.info("Pushed dataset to Hugging Face repository.")
def generate_all(model_name, word_limit):
    try:
        word_limit = int(word_limit)
    except (TypeError, ValueError):
        # An empty or non-numeric limit means "process every word".
        word_limit = None
    data = process_words(model_name, limit=word_limit)
    csv_file = save_to_csv(data)
    push_dataset(csv_file)
    return csv_file
# ---------------------- Gradio Interface Functions ----------------------
def run_generate(model_name, word_limit):
    output_csv = generate_all(model_name, word_limit)
    return f"Generated and pushed CSV: {output_csv}"
def about_tab_content():
    about_text = (
        "## DeepFocus-X3 Dataset Generator\n\n"
        "This tool downloads all available words from the NLTK corpus, "
        "generates concise meanings using a Hugging Face text-to-text generation model, "
        "and converts the data into a CSV file. Finally, it pushes the CSV to the "
        "[katsukiai/DeepFocus-X3](https://huggingface.co/datasets/katsukiai/DeepFocus-X3) repository."
    )
    return about_text
def settings_tab_content():
    settings_text = (
        "**Current Settings**\n\n"
        "- Model: `google/flan-t5-xl`\n"
        "- Word Limit: 50 (set to empty to process all words)\n"
        "\nYou can update these settings in the Generate tab."
    )
    return settings_text
# ---------------------- Gradio App ----------------------
with gr.Blocks() as demo:
    gr.Markdown("## DeepFocus-X3 Dataset Generator")
    with gr.Tabs():
        # About Tab
        with gr.Tab("About"):
            gr.Markdown(about_tab_content())
        # Generate All Tab
        with gr.Tab("Generate all"):
            model_name_input = gr.Textbox(value="google/flan-t5-xl", label="Hugging Face Model Name for Meanings")
            word_limit_input = gr.Textbox(value="50", label="Word Limit (Leave empty for all)")
            generate_button = gr.Button("Generate and Push Dataset")
            generate_output = gr.Textbox(label="Output")
            generate_button.click(run_generate, inputs=[model_name_input, word_limit_input], outputs=generate_output)
        # Settings Tab
        with gr.Tab("Settings"):
            gr.Markdown(settings_tab_content())

demo.launch()
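# When run locally, `python app.py` serves the UI on http://127.0.0.1:7860
# by default; in a Hugging Face Space, launch() is invoked the same way.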