katsukiai committed on
Commit
ad7dde5
·
verified ·
1 Parent(s): 3ad6413

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -29
app.py CHANGED
@@ -4,14 +4,15 @@ import logging
4
  import nltk
5
  from nltk import word_tokenize, pos_tag
6
  from tqdm import tqdm
7
- import gradio as gr
8
  from transformers import AutoTokenizer, AutoModelForCausalLM
9
  from datasets import Dataset
10
  from huggingface_hub import HfApi
11
  import shutil
 
12
 
13
  # Setup environment and logging
14
- os.environ["HF_TOKEN"] = os.getenv("HUGGINGFACE_API_TOKEN")
15
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
16
  logger = logging.getLogger(__name__)
17
 
@@ -19,10 +20,15 @@ logger = logging.getLogger(__name__)
19
  nltk.download('punkt')
20
  nltk.download('averaged_perceptron_tagger')
21
 
22
- # Load DeepSeek-R1 model and tokenizer
23
- model_name = "deepseek-ai/DeepSeek-R1"
24
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
25
- model = AutoModelForCausalLM.from_pretrained(model_name,trust_remote_code=True)
 
 
 
 
 
26
 
27
  # Paths
28
  converted_dir = "converted/"
@@ -56,15 +62,15 @@ def push_to_hf(dataset_path):
56
  dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
57
  logger.info("Dataset pushed successfully")
58
 
59
- # Generate text using DeepSeek-R1
60
  def generate_text(input_text):
61
- inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
62
  outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
63
  return tokenizer.decode(outputs[0], skip_special_tokens=True)
64
 
65
- # Gradio conversion function
66
- def gradio_convert(text):
67
- logger.info("Processing text with Gradio...")
68
  long_text = generate_text(text) if len(text) > 100 else text
69
  dataset = prepare_dataset(long_text)
70
  output_file = os.path.join(converted_dir, "output.jsonl")
@@ -72,29 +78,29 @@ def gradio_convert(text):
72
  push_to_hf(output_file)
73
  return json.dumps(dataset, indent=2)
74
 
75
- # Gradio Interface
76
- with gr.Blocks(title="Text to JSON Converter") as demo:
77
- gr.Markdown("# Text to JSON Converter")
78
-
79
- with gr.Tab("About"):
80
- gr.Markdown("""
81
- This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-R1 for long text generation.
 
 
 
82
  The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
83
  Format: {"tokenizer": tokens, "words": words, "meaning": means}
84
  """)
85
-
86
- with gr.Tab("Generate all"):
87
- text_input = gr.Textbox(label="Input Text", lines=10)
88
- output_json = gr.Textbox(label="JSON Output", lines=10)
89
- convert_btn = gr.Button("Convert & Push")
90
- convert_btn.click(
91
- fn=gradio_convert,
92
- inputs=text_input,
93
- outputs=output_json
94
- )
95
 
96
- # Launch Gradio app
97
- demo.launch()
 
 
 
 
 
 
 
98
 
99
  # Cleanup (optional)
100
  shutil.rmtree(converted_dir, ignore_errors=True)
 
4
  import nltk
5
  from nltk import word_tokenize, pos_tag
6
  from tqdm import tqdm
7
+ import streamlit as st
8
  from transformers import AutoTokenizer, AutoModelForCausalLM
9
  from datasets import Dataset
10
  from huggingface_hub import HfApi
11
  import shutil
12
+ import torch
13
 
14
  # Setup environment and logging
15
+ os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN", "your_hf_token_here")
16
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
17
  logger = logging.getLogger(__name__)
18
 
 
20
  nltk.download('punkt')
21
  nltk.download('averaged_perceptron_tagger')
22
 
23
+ # Load DeepSeek-V3 model and tokenizer
24
+ model_name = "deepseek-ai/DeepSeek-V3" # Updated to V3
25
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
26
+ model = AutoModelForCausalLM.from_pretrained(
27
+ model_name,
28
+ trust_remote_code=True,
29
+ torch_dtype=torch.float32, # CPU compatibility
30
+ device_map="cpu"
31
+ )
32
 
33
  # Paths
34
  converted_dir = "converted/"
 
62
  dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
63
  logger.info("Dataset pushed successfully")
64
 
65
+ # Generate text using DeepSeek-V3
66
  def generate_text(input_text):
67
+ inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to("cpu")
68
  outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
69
  return tokenizer.decode(outputs[0], skip_special_tokens=True)
70
 
71
+ # Streamlit conversion function
72
+ def convert_text(text):
73
+ logger.info("Processing text with Streamlit...")
74
  long_text = generate_text(text) if len(text) > 100 else text
75
  dataset = prepare_dataset(long_text)
76
  output_file = os.path.join(converted_dir, "output.jsonl")
 
78
  push_to_hf(output_file)
79
  return json.dumps(dataset, indent=2)
80
 
81
+ # Streamlit Interface
82
+ def main():
83
+ st.title("Text to JSON Converter")
84
+
85
+ # Tabs using Streamlit expander
86
+ tab = st.sidebar.selectbox("Select Tab", ["About", "Generate all"])
87
+
88
+ if tab == "About":
89
+ st.markdown("""
90
+ This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-V3 for long text generation.
91
  The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
92
  Format: {"tokenizer": tokens, "words": words, "meaning": means}
93
  """)
 
 
 
 
 
 
 
 
 
 
94
 
95
+ elif tab == "Generate all":
96
+ text_input = st.text_area("Input Text", height=200)
97
+ if st.button("Convert & Push"):
98
+ with st.spinner("Processing..."):
99
+ result = convert_text(text_input)
100
+ st.text_area("JSON Output", value=result, height=200)
101
+
102
+ if __name__ == "__main__":
103
+ main()
104
 
105
  # Cleanup (optional)
106
  shutil.rmtree(converted_dir, ignore_errors=True)