Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -4,14 +4,15 @@ import logging
|
|
4 |
import nltk
|
5 |
from nltk import word_tokenize, pos_tag
|
6 |
from tqdm import tqdm
|
7 |
-
import gradio as gr
|
8 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
9 |
from datasets import Dataset
|
10 |
from huggingface_hub import HfApi
|
11 |
import shutil
|
|
|
12 |
|
13 |
# Setup environment and logging
|
14 |
-
os.environ["HF_TOKEN"] = os.getenv("
|
15 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
16 |
logger = logging.getLogger(__name__)
|
17 |
|
@@ -19,10 +20,15 @@ logger = logging.getLogger(__name__)
|
|
19 |
nltk.download('punkt')
|
20 |
nltk.download('averaged_perceptron_tagger')
|
21 |
|
22 |
-
# Load DeepSeek-
|
23 |
-
model_name = "deepseek-ai/DeepSeek-
|
24 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
25 |
-
model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
# Paths
|
28 |
converted_dir = "converted/"
|
@@ -56,15 +62,15 @@ def push_to_hf(dataset_path):
|
|
56 |
dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
|
57 |
logger.info("Dataset pushed successfully")
|
58 |
|
59 |
-
# Generate text using DeepSeek-
|
60 |
def generate_text(input_text):
|
61 |
-
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
|
62 |
outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
|
63 |
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
64 |
|
65 |
-
#
|
66 |
-
def gradio_convert(text):
|
67 |
-
logger.info("Processing text with Gradio...")
|
68 |
long_text = generate_text(text) if len(text) > 100 else text
|
69 |
dataset = prepare_dataset(long_text)
|
70 |
output_file = os.path.join(converted_dir, "output.jsonl")
|
@@ -72,29 +78,29 @@ def gradio_convert(text):
|
|
72 |
push_to_hf(output_file)
|
73 |
return json.dumps(dataset, indent=2)
|
74 |
|
75 |
-
#
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
82 |
The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
|
83 |
Format: {"tokenizer": tokens, "words": words, "meaning": means}
|
84 |
""")
|
85 |
-
|
86 |
-
with gr.Tab("Generate all"):
|
87 |
-
text_input = gr.Textbox(label="Input Text", lines=10)
|
88 |
-
output_json = gr.Textbox(label="JSON Output", lines=10)
|
89 |
-
convert_btn = gr.Button("Convert & Push")
|
90 |
-
convert_btn.click(
|
91 |
-
fn=gradio_convert,
|
92 |
-
inputs=text_input,
|
93 |
-
outputs=output_json
|
94 |
-
)
|
95 |
|
96 |
-
|
97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
|
99 |
# Cleanup (optional)
|
100 |
shutil.rmtree(converted_dir, ignore_errors=True)
|
|
|
4 |
import nltk
|
5 |
from nltk import word_tokenize, pos_tag
|
6 |
from tqdm import tqdm
|
7 |
+
import streamlit as st
|
8 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
9 |
from datasets import Dataset
|
10 |
from huggingface_hub import HfApi
|
11 |
import shutil
|
12 |
+
import torch
|
13 |
|
14 |
# Setup environment and logging
|
15 |
+
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN", "your_hf_token_here")
|
16 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
17 |
logger = logging.getLogger(__name__)
|
18 |
|
|
|
20 |
nltk.download('punkt')
|
21 |
nltk.download('averaged_perceptron_tagger')
|
22 |
|
23 |
+
# Load DeepSeek-V3 model and tokenizer
|
24 |
+
model_name = "deepseek-ai/DeepSeek-V3" # Updated to V3
|
25 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
26 |
+
model = AutoModelForCausalLM.from_pretrained(
|
27 |
+
model_name,
|
28 |
+
trust_remote_code=True,
|
29 |
+
torch_dtype=torch.float32, # CPU compatibility
|
30 |
+
device_map="cpu"
|
31 |
+
)
|
32 |
|
33 |
# Paths
|
34 |
converted_dir = "converted/"
|
|
|
62 |
dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
|
63 |
logger.info("Dataset pushed successfully")
|
64 |
|
65 |
+
# Generate text using DeepSeek-V3
|
66 |
def generate_text(input_text):
|
67 |
+
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to("cpu")
|
68 |
outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
|
69 |
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
70 |
|
71 |
+
# Streamlit conversion function
|
72 |
+
def convert_text(text):
|
73 |
+
logger.info("Processing text with Streamlit...")
|
74 |
long_text = generate_text(text) if len(text) > 100 else text
|
75 |
dataset = prepare_dataset(long_text)
|
76 |
output_file = os.path.join(converted_dir, "output.jsonl")
|
|
|
78 |
push_to_hf(output_file)
|
79 |
return json.dumps(dataset, indent=2)
|
80 |
|
81 |
+
# Streamlit Interface
|
82 |
+
def main():
|
83 |
+
st.title("Text to JSON Converter")
|
84 |
+
|
85 |
+
# Tabs using Streamlit expander
|
86 |
+
tab = st.sidebar.selectbox("Select Tab", ["About", "Generate all"])
|
87 |
+
|
88 |
+
if tab == "About":
|
89 |
+
st.markdown("""
|
90 |
+
This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-V3 for long text generation.
|
91 |
The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
|
92 |
Format: {"tokenizer": tokens, "words": words, "meaning": means}
|
93 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
+
elif tab == "Generate all":
|
96 |
+
text_input = st.text_area("Input Text", height=200)
|
97 |
+
if st.button("Convert & Push"):
|
98 |
+
with st.spinner("Processing..."):
|
99 |
+
result = convert_text(text_input)
|
100 |
+
st.text_area("JSON Output", value=result, height=200)
|
101 |
+
|
102 |
+
if __name__ == "__main__":
|
103 |
+
main()
|
104 |
|
105 |
# Cleanup (optional)
|
106 |
shutil.rmtree(converted_dir, ignore_errors=True)
|