Spaces:

universeofml
/

DeepFocusTrain

Runtime error

App Files Files Community

katsukiai committed on Mar 6

Commit

1726149

verified ·

1 Parent(s): ffd44b8

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -127

app.py CHANGED Viewed

@@ -1,135 +1,138 @@
-import json
-import logging
 import os
-import datetime
 import gradio as gr
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import HfApi
-# Set up logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-logger = logging.getLogger(__name__)
-# List of 37 popular models
-MODEL_LIST = [
-    "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl",
-    "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b",
-    "mistralai/Mistral-7B-Instruct-v0.1", "mistralai/Mixtral-8x7B-Instruct",
-    "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-13b-chat-hf",
-    "microsoft/DialoGPT-small", "microsoft/DialoGPT-medium", "microsoft/DialoGPT-large",
-    "bigscience/bloom-560m", "bigscience/bloomz-560m",
-    "EleutherAI/gpt-neo-125m", "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B",
-    "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neox-20b",
-    "huggingfaceh4/starchat-alpha", "huggingfaceh4/zephyr-7b-alpha",
-    "deepseek-ai/deepseek-coder-1.3b", "deepseek-ai/deepseek-coder-6.7b",
-    "deepseek-ai/deepseek-v3", "databricks/dolly-v2-7b", "cerebras/Cerebras-GPT-1.3B",
-    "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct",
-    "google/gemma-2b", "google/gemma-7b", "google/flan-t5-large",
-    "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-2-7b-chat"
-]
-# Function to load selected model
-def load_model(model_name):
-    logger.info(f"Loading model: {model_name} (CPU mode)")
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    tokenizer.pad_token = tokenizer.eos_token  # Avoid padding token errors
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    return tokenizer, model
-# Function to process text with selected model
-def process_text(model_name, text):
-    tokenizer, model = load_model(model_name)
-    logger.info(f"Processing text with {model_name}...")
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
-    outputs = model.generate(**inputs, max_length=200)
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-# Function to convert text to JSON
-def text_to_json(text):
-    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-    filename = f"output_{timestamp}.json"
-    with open(filename, "w") as f:
-        json.dump([{"text": text}], f, indent=4)
-    logger.info(f"JSON file created: {filename}")
-    return filename
-# Function to generate JSON and upload to Hugging Face
-def generate_and_upload(model_name, text):
     try:
-        if not text.strip():
-            raise ValueError("Text input is empty.")
-        logger.info(f"Received text input for model {model_name}")
-        # Process text
-        processed_text = process_text(model_name, text)
-        logger.info(f"Processed text: {processed_text}")
-        # Convert to JSON
-        json_file = text_to_json(processed_text)
-        # Get Hugging Face API token
-        token = os.getenv("HUGGINGFACE_API_TOKEN")
-        if not token:
-            raise ValueError("Hugging Face API token not found. Please set HUGGINGFACE_API_TOKEN.")
-        # Upload file to Hugging Face
-        api = HfApi()
-        repo_id = "katsukiai/DeepFocus-X3"
-        upload_info = api.upload_file(
-            path_or_fileobj=json_file,
-            path_in_repo=f"convert/{os.path.basename(json_file)}",
-            repo_id=repo_id,
-            repo_type="dataset",
-            token=token
-        )
-        logger.info(f"File uploaded successfully: {upload_info}")
-        # Delete local JSON file after upload
-        os.remove(json_file)
-        logger.info(f"Deleted local file: {json_file}")
-        return f"Upload successful! Filename: {os.path.basename(json_file)}", None
     except Exception as e:
-        logger.error(f"Error: {e}")
-        return f"Error: {str(e)}", None
-# Create Gradio UI
 with gr.Blocks() as demo:
-    with gr.Tab("About"):
-        gr.Markdown("""
-        # Text Processor with Selectable Model (CPU)
-        - Choose from **37 popular transformer models**
-        - Processes text and converts to JSON
-        - Uploads to Hugging Face
-        ## Instructions:
-        1. Select a model from the dropdown.
-        2. Enter text in the "Generate" tab.
-        3. Click "Generate and Upload."
-        4. Download JSON if needed.
-        5. Check upload status.
-        ## Requirements:
-        - **Runs on CPU** (No GPU required).
-        - **Hugging Face API Token** (`HUGGINGFACE_API_TOKEN`) must be set.
-        """)
-    with gr.Tab("Generate"):
-        model_selector = gr.Dropdown(choices=MODEL_LIST, value="gpt2", label="Choose Model")
-        text_input = gr.Textbox(label="Enter text")
-        output_message = gr.Textbox(label="Status message")
-        json_file_downloader = gr.File(label="Download JSON", interactive=True)
-        generate_button = gr.Button("Generate and Upload")
-        generate_button.click(
-            fn=generate_and_upload,
-            inputs=[model_selector, text_input],
-            outputs=[output_message, json_file_downloader]
-        )
-# Launch Gradio app
-demo.launch()

 import os
+import logging
+import csv
+import shutil
+import nltk
+import pandas as pd
+from tqdm import tqdm
 import gradio as gr
+from datasets import Dataset
+from transformers import pipeline
 from huggingface_hub import HfApi
+# ---------------------- Logging Setup ----------------------
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[logging.StreamHandler()]
+)
+# ---------------------- NLTK Setup ----------------------
+def download_nltk():
+    nltk.download("words")
+    nltk.download("punkt")
+    logging.info("NLTK resources downloaded.")
+download_nltk()
+# ---------------------- Data Preparation ----------------------
+def get_all_words():
+    from nltk.corpus import words as nltk_words
+    all_words = nltk_words.words()
+    logging.info(f"Got {len(all_words)} words from NLTK.")
+    return all_words
+def generate_meaning(word, generator):
+    prompt = f"Define the word '{word}' in one concise sentence."
     try:
+        result = generator(prompt, max_length=50)[0]["generated_text"]
+        return result.strip()
     except Exception as e:
+        logging.error(f"Error generating meaning for '{word}': {e}")
+        return ""
+def process_words(model_name, limit=None):
+    logging.info("Initializing Hugging Face text2text-generation pipeline...")
+    generator = pipeline("text2text-generation", model=model_name, device=-1)
+    words_list = get_all_words()
+    if limit:
+        words_list = words_list[:limit]
+    data = []
+    for word in tqdm(words_list, desc="Processing words"):
+        tokens = nltk.word_tokenize(word)
+        meaning = generate_meaning(word, generator)
+        data.append({
+            "tokenizer": tokens,
+            "words": word,
+            "meaning": meaning
+        })
+    logging.info("Finished processing words.")
+    return data
+def save_to_csv(data, filename="output.csv"):
+    df = pd.DataFrame(data)
+    df.to_csv(filename, index=False)
+    logging.info(f"Saved CSV to {filename}.")
+    return filename
+# ---------------------- Push to Hugging Face ----------------------
+def push_dataset(csv_file, repo_id="katsukiai/DeepFocus-X3"):
+    repo_local_dir = "DeepFocus-X3_repo"
+    if not os.path.exists(repo_local_dir):
+        os.system(f"git clone https://huggingface.co/{repo_id} {repo_local_dir}")
+        logging.info("Repository cloned locally.")
+    shutil.copy(csv_file, os.path.join(repo_local_dir, csv_file))
+    current_dir = os.getcwd()
+    os.chdir(repo_local_dir)
+    os.system("git add .")
+    os.system('git commit -m "Update dataset"')
+    os.system("git push")
+    os.chdir(current_dir)
+    logging.info("Pushed dataset to Hugging Face repository.")
+def generate_all(model_name, word_limit):
+    try:
+        word_limit = int(word_limit)
+    except Exception:
+        word_limit = None
+    data = process_words(model_name, limit=word_limit)
+    csv_file = save_to_csv(data)
+    push_dataset(csv_file)
+    return csv_file
+# ---------------------- Gradio Interface Functions ----------------------
+def run_generate(model_name, word_limit):
+    output_csv = generate_all(model_name, word_limit)
+    return f"Generated and pushed CSV: {output_csv}"
+def about_tab_content():
+    about_text = (
+        "## DeepFocus-X3 Dataset Generator\n\n"
+        "This tool downloads all available words from the NLTK corpus, "
+        "generates concise meanings using a Hugging Face text-to-text generation model, "
+        "and converts the data into a CSV file. Finally, it pushes the CSV to the "
+        "[katsukiai/DeepFocus-X3](https://huggingface.co/datasets/katsukiai/DeepFocus-X3) repository."
+    )
+    return about_text
+def settings_tab_content():
+    settings_text = (
+        "**Current Settings**\n\n"
+        "- Model: `google/flan-t5-xl`\n"
+        "- Word Limit: 50 (set to empty to process all words)\n"
+        "\nYou can update these settings in the Generate tab."
+    )
+    return settings_text
+# ---------------------- Gradio App ----------------------
 with gr.Blocks() as demo:
+    gr.Markdown("## DeepFocus-X3 Dataset Generator")
+    with gr.Tabs():
+        # About tab: renders the Markdown returned by about_tab_content().
+        with gr.Tab("About"):
+            gr.Markdown(about_tab_content())
+        # Generate tab: model name and word limit inputs; the button passes both to run_generate.
+        with gr.Tab("Generate all"):
+            model_name_input = gr.Textbox(value="google/flan-t5-xl", label="Hugging Face Model Name for Means")
+            word_limit_input = gr.Textbox(value="50", label="Word Limit (Leave empty for all)")
+            generate_button = gr.Button("Generate and Push Dataset")
+            generate_output = gr.Textbox(label="Output")  # shows the string returned by run_generate
+            generate_button.click(run_generate, inputs=[model_name_input, word_limit_input], outputs=generate_output)
+        # Settings tab: renders the fixed text returned by settings_tab_content().
+        with gr.Tab("Settings"):
+            gr.Markdown(settings_tab_content())
+demo.launch()  # runs at import time; this script has no __main__ guard