Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,91 +1,100 @@
|
|
|
|
1 |
import json
|
2 |
import logging
|
3 |
-
import
|
4 |
-
import
|
|
|
5 |
import gradio as gr
|
6 |
-
from
|
|
|
|
|
|
|
7 |
|
8 |
-
#
|
9 |
-
|
|
|
10 |
logger = logging.getLogger(__name__)
|
11 |
|
12 |
-
#
|
13 |
-
|
14 |
-
|
15 |
-
data = [{"text": line} for line in lines]
|
16 |
-
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
17 |
-
filename = f"output_{timestamp}.json"
|
18 |
-
with open(filename, "w") as f:
|
19 |
-
json.dump(data, f, indent=4)
|
20 |
-
return filename
|
21 |
|
22 |
-
#
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
)
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
55 |
|
56 |
-
#
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
with gr.Tab("About"):
|
59 |
gr.Markdown("""
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
## Instructions
|
64 |
-
1. Enter your text in the "Generate" tab.
|
65 |
-
2. Click the "Generate and Upload" button.
|
66 |
-
3. Download the JSON file if desired.
|
67 |
-
4. Check the message for upload status.
|
68 |
-
|
69 |
-
## Requirements
|
70 |
-
- Hugging Face API token set as environment variable `HUGGINGFACE_API_TOKEN`.
|
71 |
-
|
72 |
-
## Obtaining Hugging Face API Token
|
73 |
-
1. Log in to your Hugging Face account.
|
74 |
-
2. Go to your profile settings.
|
75 |
-
3. Generate a new token or use an existing one.
|
76 |
-
4. Set the token as an environment variable named `HUGGINGFACE_API_TOKEN`.
|
77 |
-
|
78 |
-
## Setting Environment Variable
|
79 |
-
- **Windows**: Set it in System Properties > Advanced > Environment Variables.
|
80 |
-
- **macOS/Linux**: Add `export HUGGINGFACE_API_TOKEN=your_token` to your shell profile (e.g., `.bashrc`, `.zshrc`).
|
81 |
""")
|
82 |
|
83 |
-
with gr.Tab("Generate"):
|
84 |
-
text_input = gr.Textbox(label="
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
import json
|
3 |
import logging
|
4 |
+
import nltk
|
5 |
+
from nltk import word_tokenize, pos_tag
|
6 |
+
from tqdm import tqdm
|
7 |
import gradio as gr
|
8 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
9 |
+
from datasets import Dataset
|
10 |
+
from huggingface_hub import HfApi
|
11 |
+
import shutil
|
12 |
|
13 |
+
# Setup logging first so a missing token can be reported through the logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Mirror HUGGINGFACE_API_TOKEN into HF_TOKEN for the hub push later on.
# BUG FIX: the original did os.environ["HF_TOKEN"] = os.getenv(...), which
# raises TypeError at import time when the variable is unset (environ values
# must be str). Only assign when the token actually exists.
_hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
if _hf_token is not None:
    os.environ["HF_TOKEN"] = _hf_token
else:
    logger.warning("HUGGINGFACE_API_TOKEN is not set; pushing to the Hub will fail.")

# Download NLTK data (tokenizer models and POS tagger) needed by prepare_dataset.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
+
# Load DeepSeek-R1 model and tokenizer.
# NOTE(review): DeepSeek-R1 is a very large checkpoint; loading it with plain
# AutoModelForCausalLM and no device/dtype arguments needs substantial memory —
# confirm the Space hardware can hold it (the page header shows "Runtime error").
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Paths: output directory for generated JSONL files, created eagerly at import
# time so convert_to_jsonl can write into it without checking.
converted_dir = "converted/"
os.makedirs(converted_dir, exist_ok=True)
|
30 |
+
|
31 |
+
# Training dataset preparation
|
32 |
+
def prepare_dataset(text_data):
    """Tokenize and POS-tag every non-blank line of *text_data*.

    Returns a list of records, one per line, shaped
    ``{"tokenizer": tokens, "words": tagged words, "meaning": POS tags}``.
    """
    logger.info("Preparing dataset...")
    records = []
    for line in tqdm(text_data.split('\n'), desc="Tokenizing"):
        # Skip blank/whitespace-only lines entirely.
        if not line.strip():
            continue
        tokens = word_tokenize(line)
        tagged = pos_tag(tokens)
        records.append({
            "tokenizer": tokens,
            "words": [word for word, _ in tagged],
            "meaning": [tag for _, tag in tagged],
        })
    return records
|
43 |
+
|
44 |
+
# Convert to JSONL
|
45 |
+
def convert_to_jsonl(dataset, output_file):
    """Write each record of *dataset* as one JSON object per line (JSONL).

    Args:
        dataset: iterable of JSON-serializable dicts.
        output_file: destination path; overwritten if it already exists.
    """
    logger.info(f"Converting to JSONL: {output_file}")
    # FIX: open with an explicit utf-8 encoding — the platform default
    # (locale-dependent) can fail on non-ASCII tokens produced by NLTK.
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in tqdm(dataset, desc="Writing JSONL"):
            f.write(json.dumps(entry) + '\n')
|
50 |
+
|
51 |
+
# Push to HuggingFace
|
52 |
+
def push_to_hf(dataset_path):
    """Load the JSONL file at *dataset_path* and push it to the Hub dataset
    ``katsukiai/DeepFocus-X3``.

    Raises:
        KeyError: if ``HF_TOKEN`` was never set (token env var missing at startup).
    """
    logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
    # FIX: removed unused local `api = HfApi()` — it was never referenced;
    # Dataset.push_to_hub talks to the Hub directly with the token below.
    dataset = Dataset.from_json(dataset_path)
    dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
    logger.info("Dataset pushed successfully")
|
58 |
|
59 |
+
# Generate text using DeepSeek-R1
|
60 |
+
def generate_text(input_text):
    """Run the module-level DeepSeek-R1 model on *input_text* and return the
    decoded text of the first (and only) generated sequence."""
    # Truncate the prompt to the model's 1024-token input budget.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    generated = model.generate(**encoded, max_length=2048, num_return_sequences=1)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
|
64 |
+
|
65 |
+
# Gradio conversion function
|
66 |
+
def gradio_convert(text):
    """Gradio callback: tokenize *text*, write it as JSONL, push to the Hub,
    and return the dataset as pretty-printed JSON for display."""
    logger.info("Processing text with Gradio...")
    # Inputs over 100 characters are first expanded by the model;
    # shorter inputs pass through unchanged.
    if len(text) > 100:
        long_text = generate_text(text)
    else:
        long_text = text
    dataset = prepare_dataset(long_text)
    output_file = os.path.join(converted_dir, "output.jsonl")
    convert_to_jsonl(dataset, output_file)
    push_to_hf(output_file)
    return json.dumps(dataset, indent=2)
|
74 |
+
|
75 |
+
# Gradio Interface: two tabs — static "About" help text, and the
# "Generate all" tab that drives the full pipeline via gradio_convert.
with gr.Blocks(title="Text to JSON Converter") as demo:
    gr.Markdown("# Text to JSON Converter")

    with gr.Tab("About"):
        gr.Markdown("""
This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-R1 for long text generation.
The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
Format: {"tokenizer": tokens, "words": words, "meaning": means}
""")

    with gr.Tab("Generate all"):
        # One textbox in, one textbox out; the button runs
        # generate -> tokenize/tag -> JSONL -> push to Hub.
        text_input = gr.Textbox(label="Input Text", lines=10)
        output_json = gr.Textbox(label="JSON Output", lines=10)
        convert_btn = gr.Button("Convert & Push")
        convert_btn.click(
            fn=gradio_convert,
            inputs=text_input,
            outputs=output_json
        )

# Launch Gradio app.
# launch() blocks until the server stops, so everything below runs at shutdown.
demo.launch()

# Cleanup (optional)
# NOTE(review): this deletes the converted/ output directory once the app
# exits — confirm the copy pushed to the Hub is the only one needed.
shutil.rmtree(converted_dir, ignore_errors=True)
|