import os
import json
import logging
import shutil

import nltk
from nltk import word_tokenize, pos_tag
from tqdm import tqdm
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
from huggingface_hub import HfApi
# Setup environment and logging
hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token  # assigning None here would raise a TypeError
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
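# Assumption about the runtime's NLTK version: NLTK 3.9+ renamed these resources, so if
# word_tokenize/pos_tag raise a LookupError, also download the newer names:
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')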
# Load DeepSeek-R1 model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
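# Note (not in the original script): DeepSeek-R1 is a very large mixture-of-experts checkpoint
# and will not fit in the memory of a typical Space with default settings. One possible
# workaround is to load with device_map="auto"/torch_dtype="auto" (needs the accelerate
# package) or to swap in a smaller distilled checkpoint, e.g.:
# model = AutoModelForCausalLM.from_pretrained(
#     "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", trust_remote_code=True,
#     device_map="auto", torch_dtype="auto",
# )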
# Paths
converted_dir = "converted/"
os.makedirs(converted_dir, exist_ok=True)
# Training dataset preparation
def prepare_dataset(text_data):
    logger.info("Preparing dataset...")
    dataset = []
    for text in tqdm(text_data.split('\n'), desc="Tokenizing"):
        if text.strip():
            tokens = word_tokenize(text)
            tagged = pos_tag(tokens)
            words = [word for word, _ in tagged]
            means = [tag for _, tag in tagged]
            dataset.append({"tokenizer": tokens, "words": words, "meaning": means})
    return dataset
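# Illustrative shape of one entry produced above (tags come from NLTK's pos_tag):
# {"tokenizer": ["Hello", "world"], "words": ["Hello", "world"], "meaning": ["NNP", "NN"]}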
# Convert to JSONL
def convert_to_jsonl(dataset, output_file):
    logger.info(f"Converting to JSONL: {output_file}")
    with open(output_file, 'w') as f:
        for entry in tqdm(dataset, desc="Writing JSONL"):
            f.write(json.dumps(entry) + '\n')
# Push to HuggingFace
def push_to_hf(dataset_path):
    logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
    api = HfApi()
    dataset = Dataset.from_json(dataset_path)
    dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ.get("HF_TOKEN"))
    logger.info("Dataset pushed successfully")
# Generate text using DeepSeek-R1
def generate_text(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
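# Note: max_length counts prompt tokens plus generated tokens, so a 1024-token prompt leaves
# at most ~1024 new tokens here; max_new_tokens is the more explicit knob if that matters, e.g.
# outputs = model.generate(**inputs, max_new_tokens=1024, num_return_sequences=1)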
# Gradio conversion function
def gradio_convert(text):
    logger.info("Processing text with Gradio...")
    long_text = generate_text(text) if len(text) > 100 else text
    dataset = prepare_dataset(long_text)
    output_file = os.path.join(converted_dir, "output.jsonl")
    convert_to_jsonl(dataset, output_file)
    push_to_hf(output_file)
    return json.dumps(dataset, indent=2)
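# Note: pretty-printed JSON in a Textbox can get unwieldy for long inputs; a gr.JSON output
# component is a possible alternative if rendering becomes a problem.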
# Gradio Interface
with gr.Blocks(title="Text to JSON Converter") as demo:
    gr.Markdown("# Text to JSON Converter")
    with gr.Tab("About"):
        gr.Markdown("""
        This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-R1 for long text generation.
        The output is saved in the 'converted/' folder and pushed to the HuggingFace dataset 'katsukiai/DeepFocus-X3'.
        Format: {"tokenizer": tokens, "words": words, "meaning": means}
        """)
    with gr.Tab("Generate all"):
        text_input = gr.Textbox(label="Input Text", lines=10)
        output_json = gr.Textbox(label="JSON Output", lines=10)
        convert_btn = gr.Button("Convert & Push")
        convert_btn.click(
            fn=gradio_convert,
            inputs=text_input,
            outputs=output_json
        )
# Launch Gradio app
demo.launch()

# Cleanup (optional) -- runs only after the Gradio server shuts down
shutil.rmtree(converted_dir, ignore_errors=True)