"""Text -> JSONL converter.

Tokenizes input text with NLTK, optionally expands it with DeepSeek-R1,
writes the result as JSONL under ``converted/`` and pushes it to the
HuggingFace dataset ``katsukiai/DeepFocus-X3``, behind a Gradio UI.
"""

import json
import logging
import os
import shutil

import gradio as gr
import nltk
from datasets import Dataset
from nltk import pos_tag, word_tokenize
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- Environment & logging --------------------------------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# os.getenv() returns None when the variable is unset, and assigning None
# into os.environ raises TypeError at import time — guard it instead.
_hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token
else:
    logger.warning("HUGGINGFACE_API_TOKEN is not set; pushing to the Hub will fail.")

# NLTK resources required by word_tokenize / pos_tag.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# --- Model ------------------------------------------------------------------
# Load DeepSeek-R1 model and tokenizer once, at module scope.
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Output directory for converted JSONL files.
converted_dir = "converted/"
os.makedirs(converted_dir, exist_ok=True)


def prepare_dataset(text_data):
    """Tokenize and POS-tag each non-empty line of *text_data*.

    Returns a list of dicts with keys ``"tokenizer"`` (the tokens),
    ``"words"`` (the words from the tagged pairs — identical to the
    tokens) and ``"meaning"`` (the POS tags).
    """
    logger.info("Preparing dataset...")
    dataset = []
    for text in tqdm(text_data.split('\n'), desc="Tokenizing"):
        if not text.strip():
            continue  # skip blank lines
        tokens = word_tokenize(text)
        tagged = pos_tag(tokens)
        words = [word for word, _ in tagged]
        means = [tag for _, tag in tagged]
        dataset.append({"tokenizer": tokens, "words": words, "meaning": means})
    return dataset


def convert_to_jsonl(dataset, output_file):
    """Write *dataset* (a list of dicts) to *output_file*, one JSON object per line."""
    logger.info("Converting to JSONL: %s", output_file)
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in tqdm(dataset, desc="Writing JSONL"):
            f.write(json.dumps(entry) + '\n')


def push_to_hf(dataset_path):
    """Upload the JSONL file at *dataset_path* to the katsukiai/DeepFocus-X3 dataset."""
    logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
    dataset = Dataset.from_json(dataset_path)
    # .get() avoids a KeyError when the token env var was never set; the
    # push itself will then fail with a clear auth error from the Hub.
    dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ.get("HF_TOKEN"))
    logger.info("Dataset pushed successfully")


def generate_text(input_text):
    """Generate a continuation of *input_text* with DeepSeek-R1.

    The prompt is truncated to 1024 tokens; generation is capped at a
    total length of 2048 tokens.
    """
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def gradio_convert(text):
    """Gradio callback: tokenize *text*, write JSONL, push to the Hub.

    Returns the prepared dataset pretty-printed as a JSON string.
    """
    logger.info("Processing text with Gradio...")
    # Only run the expensive model generation for inputs over 100 chars.
    long_text = generate_text(text) if len(text) > 100 else text
    dataset = prepare_dataset(long_text)
    output_file = os.path.join(converted_dir, "output.jsonl")
    convert_to_jsonl(dataset, output_file)
    push_to_hf(output_file)
    return json.dumps(dataset, indent=2)


# --- Gradio Interface -------------------------------------------------------
with gr.Blocks(title="Text to JSON Converter") as demo:
    gr.Markdown("# Text to JSON Converter")
    with gr.Tab("About"):
        gr.Markdown("""
        This tool converts text to JSONL format using NLTK for tokenization
        and DeepSeek-R1 for long text generation.
        The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
        Format: {"tokenizer": tokens, "words": words, "meaning": means}
        """)
    with gr.Tab("Generate all"):
        text_input = gr.Textbox(label="Input Text", lines=10)
        output_json = gr.Textbox(label="JSON Output", lines=10)
        convert_btn = gr.Button("Convert & Push")
        convert_btn.click(fn=gradio_convert, inputs=text_input, outputs=output_json)


if __name__ == "__main__":
    # Guarded entry point: importing this module no longer starts the server.
    demo.launch()
    # Best-effort cleanup once the server exits.
    shutil.rmtree(converted_dir, ignore_errors=True)