# DeepFocusTrain / app.py
# Hugging Face Space file (author: katsukiai — "Update app.py", commit c4e7910 verified, 3.59 kB).
# The lines above were page metadata from the HF web viewer, preserved here as comments
# so this file is valid Python.
import os
import json
import logging
import nltk
from nltk import word_tokenize, pos_tag
from tqdm import tqdm
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
from huggingface_hub import HfApi
import shutil
# --- Logging, environment, NLTK data, model, and output paths ---

# Configure logging before anything that might want to log.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Propagate the Space secret to the name used elsewhere in this app.
# os.getenv returns None when the secret is unset, and assigning None into
# os.environ raises TypeError — so only set HF_TOKEN when the secret exists.
_hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token
else:
    logger.warning("HUGGINGFACE_API_TOKEN is not set; pushing to the Hub will fail.")

# Download the NLTK data needed by word_tokenize / pos_tag.
# NLTK >= 3.9 renamed these resources (punkt -> punkt_tab,
# averaged_perceptron_tagger -> averaged_perceptron_tagger_eng); request both
# names so the app works across NLTK versions (unknown names are reported,
# not raised, by nltk.download).
for _resource in ("punkt", "punkt_tab",
                  "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
    nltk.download(_resource)

# Load DeepSeek-R1 model and tokenizer.
# NOTE(review): trust_remote_code executes code shipped with the model repo —
# acceptable here only because the repo is explicitly pinned/trusted.
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Output directory for generated JSONL files.
converted_dir = "converted/"
os.makedirs(converted_dir, exist_ok=True)
# Training dataset preparation
def prepare_dataset(text_data):
    """Tokenize and POS-tag every non-blank line of *text_data*.

    Each line becomes one record:
    ``{"tokenizer": tokens, "words": words, "meaning": POS tags}``
    where *words* is rebuilt from the tagged pairs (and therefore mirrors
    *tokens*) and *meaning* holds the corresponding Penn Treebank tags.

    Returns the list of records, in input-line order.
    """
    logger.info("Preparing dataset...")
    records = []
    for line in tqdm(text_data.split('\n'), desc="Tokenizing"):
        if not line.strip():
            continue  # ignore blank / whitespace-only lines
        tokens = word_tokenize(line)
        tagged = pos_tag(tokens)
        records.append({
            "tokenizer": tokens,
            "words": [word for word, _tag in tagged],
            "meaning": [tag for _word, tag in tagged],
        })
    return records
# Convert to JSONL
def convert_to_jsonl(dataset, output_file):
    """Write *dataset* (a list of JSON-serializable dicts) to *output_file*
    in JSON Lines format — one ``json.dumps`` object per line.

    Opens the file with an explicit UTF-8 encoding; the original relied on
    the platform default codec, which crashes on non-ASCII tokens wherever
    that default is not UTF-8 (e.g. cp1252 on Windows).
    """
    logger.info(f"Converting to JSONL: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in tqdm(dataset, desc="Writing JSONL"):
            f.write(json.dumps(entry) + '\n')
# Push to HuggingFace
def push_to_hf(dataset_path):
    """Load the JSONL file at *dataset_path* and push it to the Hub
    dataset repo ``katsukiai/DeepFocus-X3``.

    Raises:
        KeyError: if ``HF_TOKEN`` is not present in the environment.
    """
    logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
    # Dataset.push_to_hub performs the upload itself; the HfApi() instance
    # previously constructed here was never used, so it has been removed.
    dataset = Dataset.from_json(dataset_path)
    dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
    logger.info("Dataset pushed successfully")
# Generate text using DeepSeek-R1
def generate_text(input_text):
    """Run DeepSeek-R1 on *input_text* and return the decoded output.

    The prompt is truncated to at most 1024 tokens. Generation uses
    ``max_length=2048``, i.e. a cap on the TOTAL sequence length
    (prompt + generated tokens), not on new tokens alone.

    NOTE(review): decoding ``outputs[0]`` from a causal-LM ``generate``
    call typically returns the prompt followed by the continuation —
    confirm callers expect the prompt to be included.
    """
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Gradio conversion function
def gradio_convert(text):
    """End-to-end pipeline behind the "Convert & Push" button.

    Texts longer than 100 characters are first expanded with
    :func:`generate_text`; the result (or the raw input) is tokenized and
    tagged, written to ``converted/output.jsonl``, pushed to the Hub, and
    returned to the UI as pretty-printed JSON.
    """
    logger.info("Processing text with Gradio...")
    if len(text) > 100:
        source_text = generate_text(text)
    else:
        source_text = text
    records = prepare_dataset(source_text)
    jsonl_path = os.path.join(converted_dir, "output.jsonl")
    convert_to_jsonl(records, jsonl_path)
    push_to_hf(jsonl_path)
    return json.dumps(records, indent=2)
# --- Gradio interface ---
# Two tabs: a static "About" description and the converter itself.
with gr.Blocks(title="Text to JSON Converter") as demo:
    gr.Markdown("# Text to JSON Converter")
    with gr.Tab("About"):
        gr.Markdown("""
        This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-R1 for long text generation.
        The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
        Format: {"tokenizer": tokens, "words": words, "meaning": means}
        """)
    with gr.Tab("Generate all"):
        text_input = gr.Textbox(label="Input Text", lines=10)
        output_json = gr.Textbox(label="JSON Output", lines=10)
        convert_btn = gr.Button("Convert & Push")
        # Wire the button to the full convert-and-push pipeline.
        convert_btn.click(
            fn=gradio_convert,
            inputs=text_input,
            outputs=output_json
        )
# Launch Gradio app (blocks until the server shuts down).
demo.launch()
# Cleanup (optional).
# NOTE(review): this only runs after launch() returns, i.e. when the server
# stops — in a long-running Space it is effectively never reached.
shutil.rmtree(converted_dir, ignore_errors=True)