File size: 3,590 Bytes
86c0663
ec8a358
 
86c0663
 
 
ec8a358
86c0663
 
 
 
ec8a358
86c0663
 
 
ec8a358
 
86c0663
 
 
ec8a358
86c0663
 
c4e7910
 
86c0663
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec8a358
86c0663
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec8a358
 
86c0663
 
 
ec8a358
 
86c0663
 
 
 
 
 
 
 
 
 
 
c4e7910
86c0663
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import json
import logging
import nltk
from nltk import word_tokenize, pos_tag
from tqdm import tqdm
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
from huggingface_hub import HfApi
import shutil

# Setup environment and logging.
# Fail fast with a clear message when the token is missing: assigning None
# into os.environ would otherwise raise a cryptic "TypeError: str expected".
_hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
if _hf_token is None:
    raise RuntimeError("HUGGINGFACE_API_TOKEN environment variable is not set")
os.environ["HF_TOKEN"] = _hf_token
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Download NLTK data (tokenizer models and the POS tagger used below).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load DeepSeek-R1 model and tokenizer (this repo requires trust_remote_code).
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Output directory for generated JSONL files.
converted_dir = "converted/"
os.makedirs(converted_dir, exist_ok=True)

# Training dataset preparation
def prepare_dataset(text_data):
    """Tokenize and POS-tag every non-empty line of *text_data*.

    Args:
        text_data: Raw text; processed line by line (split on '\\n').

    Returns:
        A list of dicts, one per non-empty line, with keys:
        "tokenizer" (the NLTK tokens), "words" (the same tokens taken
        from the tagged pairs) and "meaning" (their POS tags).
    """
    logger.info("Preparing dataset...")
    entries = []
    for line in tqdm(text_data.split('\n'), desc="Tokenizing"):
        # Skip blank / whitespace-only lines.
        if not line.strip():
            continue
        tokens = word_tokenize(line)
        tagged = pos_tag(tokens)
        words = [pair[0] for pair in tagged]
        means = [pair[1] for pair in tagged]
        entries.append({"tokenizer": tokens, "words": words, "meaning": means})
    return entries

# Convert to JSONL
def convert_to_jsonl(dataset, output_file):
    """Write each dataset entry as one JSON object per line (JSONL).

    Args:
        dataset: Iterable of JSON-serializable dicts.
        output_file: Path of the JSONL file to create (overwritten if present).
    """
    logger.info(f"Converting to JSONL: {output_file}")
    # Explicit UTF-8 avoids locale-dependent default encodings (which can
    # crash on non-ASCII tokens); ensure_ascii=False keeps those tokens
    # human-readable instead of \uXXXX escapes.
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in tqdm(dataset, desc="Writing JSONL"):
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

# Push to HuggingFace
def push_to_hf(dataset_path):
    """Load a JSONL file and push it to the 'katsukiai/DeepFocus-X3' Hub dataset.

    Args:
        dataset_path: Path to a JSONL file readable by ``Dataset.from_json``.
    """
    logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
    # The unused ``HfApi()`` instance was removed; push_to_hub handles the
    # upload on its own using the token from the environment.
    dataset = Dataset.from_json(dataset_path)
    dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
    logger.info("Dataset pushed successfully")

# Generate text using DeepSeek-R1
def generate_text(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Gradio conversion function
def gradio_convert(text):
    """Gradio callback: expand/tokenize *text*, write JSONL, push to the Hub.

    Args:
        text: User-supplied input text.

    Returns:
        The prepared dataset serialized as pretty-printed JSON (indent=2).
    """
    logger.info("Processing text with Gradio...")
    # Short inputs are used verbatim; inputs over 100 characters are
    # expanded by the model first.
    if len(text) > 100:
        source_text = generate_text(text)
    else:
        source_text = text
    dataset = prepare_dataset(source_text)
    output_file = os.path.join(converted_dir, "output.jsonl")
    convert_to_jsonl(dataset, output_file)
    push_to_hf(output_file)
    return json.dumps(dataset, indent=2)

# Gradio Interface
with gr.Blocks(title="Text to JSON Converter") as demo:
    gr.Markdown("# Text to JSON Converter")
    
    with gr.Tab("About"):
        gr.Markdown("""
        This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-R1 for long text generation.
        The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
        Format: {"tokenizer": tokens, "words": words, "meaning": means}
        """)
    
    with gr.Tab("Generate all"):
        text_input = gr.Textbox(label="Input Text", lines=10)
        output_json = gr.Textbox(label="JSON Output", lines=10)
        convert_btn = gr.Button("Convert & Push")
        convert_btn.click(
            fn=gradio_convert,
            inputs=text_input,
            outputs=output_json
        )

# Launch Gradio app
demo.launch()

# Cleanup (optional)
shutil.rmtree(converted_dir, ignore_errors=True)