katsukiai commited on
Commit
86c0663
·
verified ·
1 Parent(s): 6a80db8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -79
app.py CHANGED
@@ -1,91 +1,100 @@
 
1
  import json
2
  import logging
3
- import os
4
- import datetime
 
5
  import gradio as gr
6
- from huggingface_hub import HfApi, HfFolder
 
 
 
7
 
8
# Set up module-level logging (INFO and above go to stderr by default)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
11
 
12
- # Define the function to convert text to JSON
13
- def text_to_json(text):
14
- lines = text.strip().split('\n')
15
- data = [{"text": line} for line in lines]
16
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
17
- filename = f"output_{timestamp}.json"
18
- with open(filename, "w") as f:
19
- json.dump(data, f, indent=4)
20
- return filename
21
 
22
# Define the function to generate and upload the JSON file
def generate_and_upload(text):
    """Convert *text* to a JSON file and upload it to katsukiai/DeepFocus-X3.

    Returns a (status message, local JSON file path) pair on success, or
    (error message, None) on any failure.
    """
    try:
        if not text:
            raise ValueError("Text input is empty.")

        logger.info(f"Received text input: {text}")

        # Convert text to JSON and save to file
        json_file = text_to_json(text)
        logger.info(f"JSON file created: {json_file}")

        # Authenticate with Hugging Face Hub.
        # BUG FIX: os.environ['HUGGINGFACE_API_TOKEN'] raised KeyError before
        # the "if token is None" check could ever run, so the friendly error
        # message below was unreachable. .get() makes it reachable.
        token = os.environ.get('HUGGINGFACE_API_TOKEN')
        if token is None:
            raise ValueError("Hugging Face API token not found. Please set HUGGINGFACE_API_TOKEN environment variable.")

        api = HfApi()
        # Upload the file to the dataset repository
        repo_id = "katsukiai/DeepFocus-X3"
        upload_info = api.upload_file(
            path_or_fileobj=json_file,
            path_in_repo=os.path.basename(json_file),
            repo_id=repo_id,
            repo_type="dataset",
            token=token
        )
        logger.info(f"Upload info: {upload_info}")
        message = f"Upload successful! Filename: {os.path.basename(json_file)}"
        return message, json_file
    except Exception as e:
        logger.error(f"Error uploading file: {e}")
        return f"Error: {e}", None
 
 
 
55
 
56
# Create the Gradio interface: an About tab with usage docs and a Generate tab
# wired to generate_and_upload (text in -> status message + downloadable file).
with gr.Blocks() as demo:
    with gr.Tab("About"):
        gr.Markdown("""
    # Text to JSON uploader
    This app allows you to input text, convert it to JSON format, and upload it to the Hugging Face dataset repository.

    ## Instructions
    1. Enter your text in the "Generate" tab.
    2. Click the "Generate and Upload" button.
    3. Download the JSON file if desired.
    4. Check the message for upload status.

    ## Requirements
    - Hugging Face API token set as environment variable `HUGGINGFACE_API_TOKEN`.

    ## Obtaining Hugging Face API Token
    1. Log in to your Hugging Face account.
    2. Go to your profile settings.
    3. Generate a new token or use an existing one.
    4. Set the token as an environment variable named `HUGGINGFACE_API_TOKEN`.

    ## Setting Environment Variable
    - **Windows**: Set it in System Properties > Advanced > Environment Variables.
    - **macOS/Linux**: Add `export HUGGINGFACE_API_TOKEN=your_token` to your shell profile (e.g., `.bashrc`, `.zshrc`).
    """)

    with gr.Tab("Generate"):
        text_input = gr.Textbox(label="Enter text")
        output_message = gr.Textbox(label="Status message")
        # interactive=False: the file component is output-only (download link)
        json_file_downloader = gr.File(label="Download JSON", interactive=False)
        generate_button = gr.Button("Generate and Upload")
        # click returns (message, file_path); file_path=None leaves the
        # download component empty on error
        generate_button.click(fn=generate_and_upload, inputs=text_input, outputs=[output_message, json_file_downloader])

# Launch the Gradio app (blocks until the server is stopped)
demo.launch()
 
 
 
 
 
 
 
1
+ import os
2
  import json
3
  import logging
4
+ import nltk
5
+ from nltk import word_tokenize, pos_tag
6
+ from tqdm import tqdm
7
  import gradio as gr
8
+ from transformers import AutoTokenizer, AutoModelForCausalLM
9
+ from datasets import Dataset
10
+ from huggingface_hub import HfApi
11
+ import shutil
12
 
13
# Setup environment and logging.
# BUG FIX: assigning os.getenv(...) directly into os.environ raises
# TypeError at import time when HUGGINGFACE_API_TOKEN is unset (environ
# values must be str) — guard the assignment so the app stays importable.
_hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token
else:
    logging.getLogger(__name__).warning(
        "HUGGINGFACE_API_TOKEN is not set; pushing to the Hub will fail.")
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Download NLTK data needed by word_tokenize / pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
 
 
 
 
 
 
21
 
22
# Load DeepSeek-R1 model and tokenizer at import time.
# NOTE(review): DeepSeek-R1 is an extremely large checkpoint; loading it with
# a plain from_pretrained at module import requires substantial hardware —
# confirm this is intended for the deployment target.
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Paths: all generated JSONL output goes under converted/
converted_dir = "converted/"
os.makedirs(converted_dir, exist_ok=True)
30
+
31
# Training dataset preparation
def prepare_dataset(text_data):
    """Tokenize and POS-tag every non-empty line of *text_data*.

    Returns a list with one dict per non-empty line, shaped
    {"tokenizer": tokens, "words": words, "meaning": tags}.
    """
    logger.info("Preparing dataset...")
    records = []
    for line in tqdm(text_data.split('\n'), desc="Tokenizing"):
        if not line.strip():
            continue
        tokens = word_tokenize(line)
        entry = {"tokenizer": tokens, "words": [], "meaning": []}
        # pos_tag yields (word, tag) pairs; split them into parallel lists
        for word, tag in pos_tag(tokens):
            entry["words"].append(word)
            entry["meaning"].append(tag)
        records.append(entry)
    return records
43
+
44
# Convert to JSONL
def convert_to_jsonl(dataset, output_file):
    """Write *dataset* (a list of dicts) to *output_file*, one JSON object per line.

    Overwrites *output_file* if it already exists.
    """
    logger.info(f"Converting to JSONL: {output_file}")
    # encoding is explicit: the default locale encoding is platform-dependent
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in tqdm(dataset, desc="Writing JSONL"):
            f.write(json.dumps(entry) + '\n')
50
+
51
# Push to HuggingFace
def push_to_hf(dataset_path):
    """Load the JSONL file at *dataset_path* and push it to katsukiai/DeepFocus-X3.

    Raises ValueError when no HF_TOKEN is available; propagates any Hub errors.
    """
    logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
    # FIX: the original created an unused HfApi() instance, and a bare
    # os.environ["HF_TOKEN"] lookup would raise an opaque KeyError when the
    # token is missing — fail with a clear message instead.
    token = os.environ.get("HF_TOKEN")
    if token is None:
        raise ValueError("HF_TOKEN is not set; cannot push to the Hub.")
    dataset = Dataset.from_json(dataset_path)
    dataset.push_to_hub("katsukiai/DeepFocus-X3", token=token)
    logger.info("Dataset pushed successfully")
58
 
59
# Generate text using DeepSeek-R1
def generate_text(input_text):
    """Generate a continuation of *input_text* with the module-level causal LM.

    Input is truncated to 1024 tokens; generation is capped at 2048 tokens total.
    """
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    generated = model.generate(**encoded, max_length=2048, num_return_sequences=1)
    return tokenizer.decode(generated[0], skip_special_tokens=True)
64
+
65
# Gradio conversion function
def gradio_convert(text):
    """Full pipeline behind the UI button.

    Optionally expands *text* with the model (only when longer than 100 chars),
    tokenizes/tags it, writes converted/output.jsonl, pushes it to the Hub, and
    returns the dataset as pretty-printed JSON for display.
    """
    logger.info("Processing text with Gradio...")
    # Short inputs are used verbatim; long ones are expanded by the model first.
    source_text = text if len(text) <= 100 else generate_text(text)
    records = prepare_dataset(source_text)
    jsonl_path = os.path.join(converted_dir, "output.jsonl")
    convert_to_jsonl(records, jsonl_path)
    push_to_hf(jsonl_path)
    return json.dumps(records, indent=2)
74
+
75
# Gradio Interface: About tab with usage notes, "Generate all" tab wired to
# gradio_convert (text in -> pretty-printed JSON out, with a Hub push as a
# side effect).
with gr.Blocks(title="Text to JSON Converter") as demo:
    gr.Markdown("# Text to JSON Converter")

    with gr.Tab("About"):
        gr.Markdown("""
        This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-R1 for long text generation.
        The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
        Format: {"tokenizer": tokens, "words": words, "meaning": means}
        """)

    with gr.Tab("Generate all"):
        text_input = gr.Textbox(label="Input Text", lines=10)
        output_json = gr.Textbox(label="JSON Output", lines=10)
        convert_btn = gr.Button("Convert & Push")
        convert_btn.click(
            fn=gradio_convert,
            inputs=text_input,
            outputs=output_json
        )

# Launch Gradio app (blocks until the server is stopped)
demo.launch()

# Cleanup (optional)
# NOTE(review): this only runs after demo.launch() returns, i.e. when the
# server shuts down — it then deletes the converted/ output folder.
shutil.rmtree(converted_dir, ignore_errors=True)