Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,91 +1,100 @@
|
|
|
|
1 |
import json
|
2 |
import logging
|
3 |
-
import
|
4 |
-
import
|
|
|
5 |
import gradio as gr
|
6 |
-
from
|
|
|
|
|
|
|
7 |
|
8 |
-
#
|
9 |
-
|
|
|
10 |
logger = logging.getLogger(__name__)
|
11 |
|
12 |
-
#
|
13 |
-
|
14 |
-
|
15 |
-
data = [{"text": line} for line in lines]
|
16 |
-
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
17 |
-
filename = f"output_{timestamp}.json"
|
18 |
-
with open(filename, "w") as f:
|
19 |
-
json.dump(data, f, indent=4)
|
20 |
-
return filename
|
21 |
|
22 |
-
#
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
)
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
55 |
|
56 |
-
#
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
with gr.Tab("About"):
|
59 |
gr.Markdown("""
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
## Instructions
|
64 |
-
1. Enter your text in the "Generate" tab.
|
65 |
-
2. Click the "Generate and Upload" button.
|
66 |
-
3. Download the JSON file if desired.
|
67 |
-
4. Check the message for upload status.
|
68 |
-
|
69 |
-
## Requirements
|
70 |
-
- Hugging Face API token set as environment variable `HUGGINGFACE_API_TOKEN`.
|
71 |
-
|
72 |
-
## Obtaining Hugging Face API Token
|
73 |
-
1. Log in to your Hugging Face account.
|
74 |
-
2. Go to your profile settings.
|
75 |
-
3. Generate a new token or use an existing one.
|
76 |
-
4. Set the token as an environment variable named `HUGGINGFACE_API_TOKEN`.
|
77 |
-
|
78 |
-
## Setting Environment Variable
|
79 |
-
- **Windows**: Set it in System Properties > Advanced > Environment Variables.
|
80 |
-
- **macOS/Linux**: Add `export HUGGINGFACE_API_TOKEN=your_token` to your shell profile (e.g., `.bashrc`, `.zshrc`).
|
81 |
""")
|
82 |
|
83 |
-
with gr.Tab("Generate"):
|
84 |
-
text_input = gr.Textbox(label="
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
import json
|
3 |
import logging
|
4 |
+
import nltk
|
5 |
+
from nltk import word_tokenize, pos_tag
|
6 |
+
from tqdm import tqdm
|
7 |
import gradio as gr
|
8 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
9 |
+
from datasets import Dataset
|
10 |
+
from huggingface_hub import HfApi
|
11 |
+
import shutil
|
12 |
|
13 |
+
# Setup logging first so a missing token can be reported through the logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Mirror HUGGINGFACE_API_TOKEN into HF_TOKEN for the hub push later on.
# BUG FIX: the original did os.environ["HF_TOKEN"] = os.getenv(...), which
# raises TypeError at import time when the variable is unset (environ values
# must be str). Only assign when the token actually exists.
_hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
if _hf_token is not None:
    os.environ["HF_TOKEN"] = _hf_token
else:
    logger.warning("HUGGINGFACE_API_TOKEN is not set; pushing to the Hub will fail.")

# Download NLTK data (tokenizer models and POS tagger) needed by prepare_dataset.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
+
# Load DeepSeek-R1 model and tokenizer.
# NOTE(review): DeepSeek-R1 is a very large checkpoint; loading it with plain
# AutoModelForCausalLM and no device/dtype arguments needs substantial memory —
# confirm the Space hardware can hold it (the page header shows "Runtime error").
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Paths: output directory for generated JSONL files, created eagerly at import
# time so convert_to_jsonl can write into it without checking.
converted_dir = "converted/"
os.makedirs(converted_dir, exist_ok=True)
|
30 |
+
|
31 |
+
# Training dataset preparation
|
32 |
+
def prepare_dataset(text_data):
    """Tokenize and POS-tag every non-blank line of *text_data*.

    Returns a list of records, one per line, shaped
    ``{"tokenizer": tokens, "words": tagged words, "meaning": POS tags}``.
    """
    logger.info("Preparing dataset...")
    records = []
    for line in tqdm(text_data.split('\n'), desc="Tokenizing"):
        # Skip blank/whitespace-only lines entirely.
        if not line.strip():
            continue
        tokens = word_tokenize(line)
        tagged = pos_tag(tokens)
        records.append({
            "tokenizer": tokens,
            "words": [word for word, _ in tagged],
            "meaning": [tag for _, tag in tagged],
        })
    return records
|
43 |
+
|
44 |
+
# Convert to JSONL
|
45 |
+
def convert_to_jsonl(dataset, output_file):
    """Write each record of *dataset* as one JSON object per line (JSONL).

    Args:
        dataset: iterable of JSON-serializable dicts.
        output_file: destination path; overwritten if it already exists.
    """
    logger.info(f"Converting to JSONL: {output_file}")
    # FIX: open with an explicit utf-8 encoding — the platform default
    # (locale-dependent) can fail on non-ASCII tokens produced by NLTK.
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in tqdm(dataset, desc="Writing JSONL"):
            f.write(json.dumps(entry) + '\n')
|
50 |
+
|
51 |
+
# Push to HuggingFace
|
52 |
+
def push_to_hf(dataset_path):
    """Load the JSONL file at *dataset_path* and push it to the Hub dataset
    ``katsukiai/DeepFocus-X3``.

    Raises:
        KeyError: if ``HF_TOKEN`` was never set (token env var missing at startup).
    """
    logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
    # FIX: removed unused local `api = HfApi()` — it was never referenced;
    # Dataset.push_to_hub talks to the Hub directly with the token below.
    dataset = Dataset.from_json(dataset_path)
    dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
    logger.info("Dataset pushed successfully")
|
58 |
|
59 |
+
# Generate text using DeepSeek-R1
|
60 |
+
def generate_text(input_text):
    """Run the module-level DeepSeek-R1 model on *input_text* and return the
    decoded text of the first (and only) generated sequence."""
    # Truncate the prompt to the model's 1024-token input budget.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    generated = model.generate(**encoded, max_length=2048, num_return_sequences=1)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
|
64 |
+
|
65 |
+
# Gradio conversion function
|
66 |
+
def gradio_convert(text):
    """Gradio callback: tokenize *text*, write it as JSONL, push to the Hub,
    and return the dataset as pretty-printed JSON for display."""
    logger.info("Processing text with Gradio...")
    # Inputs over 100 characters are first expanded by the model;
    # shorter inputs pass through unchanged.
    if len(text) > 100:
        long_text = generate_text(text)
    else:
        long_text = text
    dataset = prepare_dataset(long_text)
    output_file = os.path.join(converted_dir, "output.jsonl")
    convert_to_jsonl(dataset, output_file)
    push_to_hf(output_file)
    return json.dumps(dataset, indent=2)
|
74 |
+
|
75 |
+
# Gradio Interface: two tabs — static "About" help text, and the
# "Generate all" tab that drives the full pipeline via gradio_convert.
with gr.Blocks(title="Text to JSON Converter") as demo:
    gr.Markdown("# Text to JSON Converter")

    with gr.Tab("About"):
        gr.Markdown("""
This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-R1 for long text generation.
The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
Format: {"tokenizer": tokens, "words": words, "meaning": means}
""")

    with gr.Tab("Generate all"):
        # One textbox in, one textbox out; the button runs
        # generate -> tokenize/tag -> JSONL -> push to Hub.
        text_input = gr.Textbox(label="Input Text", lines=10)
        output_json = gr.Textbox(label="JSON Output", lines=10)
        convert_btn = gr.Button("Convert & Push")
        convert_btn.click(
            fn=gradio_convert,
            inputs=text_input,
            outputs=output_json
        )

# Launch Gradio app.
# launch() blocks until the server stops, so everything below runs at shutdown.
demo.launch()

# Cleanup (optional)
# NOTE(review): this deletes the converted/ output directory once the app
# exits — confirm the copy pushed to the Hub is the only one needed.
shutil.rmtree(converted_dir, ignore_errors=True)
|