katsukiai committed on
Commit
0506cec
·
verified ·
1 Parent(s): ba51acd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -80
app.py CHANGED
@@ -1,95 +1,97 @@
1
- import os
2
  import json
3
  import logging
4
- import nltk
5
- from tqdm import tqdm
6
  import gradio as gr
7
- from transformers import pipeline
8
-
9
- # Setup logging
10
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
11
- logger = logging.getLogger()
12
-
13
- # Download NLTK data
14
- nltk.download('punkt')
15
- nltk.download('averaged_perceptron_tagger')
16
- nltk.download('wordnet')
17
 
18
- # Initialize DeepSeek AI pipeline for long text processing
19
- deepseek_pipeline = pipeline('text-generation', model='DeepSeekAI/DeepFocus-X3')
20
 
21
- # Function to process text and convert to JSONL
22
def text_to_jsonl(text):
    """Split ``text`` into sentences and build one record per sentence.

    Args:
        text: Raw input text.

    Returns:
        A list of dicts, one per sentence, with the keys ``"tokenizer"``
        (the ``nltk.pos_tag`` output for the sentence's words), ``"words"``
        (the ``nltk.word_tokenize`` output) and ``"meaning"`` (for each word,
        the definition of its first WordNet synset, or ``'No definition'``
        when the word has no synsets).
    """
    wordnet = nltk.corpus.wordnet
    jsonl_data = []

    for sentence in tqdm(nltk.sent_tokenize(text), desc="Processing sentences"):
        words = nltk.word_tokenize(sentence)
        pos_tags = nltk.pos_tag(words)

        meanings = []
        for word in words:
            # Look the word up once and reuse the result; the lookup used to
            # run twice per word (once for the test, once for the value).
            synsets = wordnet.synsets(word)
            meanings.append(synsets[0].definition() if synsets else 'No definition')

        jsonl_data.append({
            "tokenizer": pos_tags,
            "words": words,
            "meaning": meanings,
        })

    return jsonl_data
38
-
39
- # Function to push data to Hugging Face dataset repository
40
def push_to_huggingface(jsonl_data, repo_name='katsukiai/DeepFocus-X3'):
    """Write ``jsonl_data`` to ``data.jsonl`` and upload it to a Hub dataset repo.

    Args:
        jsonl_data: Iterable of JSON-serializable records; each one becomes
            a single line of the output file.
        repo_name: Id of the dataset repository. It is created if it does not
            exist yet (``exist_ok=True``) and then receives the upload.

    Side effects:
        Leaves ``data.jsonl`` in the current working directory and logs the
        target repository once the upload call returns.
    """
    # Only HfApi is used; the unused huggingface_hub / DatasetCard imports
    # were dropped.
    from huggingface_hub import HfApi

    api = HfApi()
    repo_id = repo_name

    # Create or get the dataset repository
    api.create_repo(repo_id, repo_type="dataset", private=False, exist_ok=True)

    # Write JSONL data to a file; the explicit encoding keeps the output
    # independent of the platform's default locale.
    jsonl_file_path = "data.jsonl"
    with open(jsonl_file_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item) + "\n" for item in jsonl_data)

    # Upload the file to the repository
    api.upload_file(
        path_or_fileobj=jsonl_file_path,
        path_in_repo="data.jsonl",
        repo_id=repo_id,
        repo_type="dataset",
    )
    logger.info(f"Data pushed to {repo_id}")
64
 
65
- # Gradio interface
66
def generate_jsonl(text):
    """Gradio callback: convert ``text`` to records, push them, report status.

    Returns the fixed status string shown in the output textbox.
    """
    push_to_huggingface(text_to_jsonl(text))
    return "Data processed and pushed to Hugging Face"
70
 
71
- # Define Gradio interface
72
def gradio_interface():
    """Build the two-tab Gradio UI ("About" and "Generate") and launch it."""
    with gr.Blocks() as demo:
        gr.Markdown("# Text to JSONL Converter and Hugging Face Pusher")

        # Static description of the tool; no callbacks are attached here.
        with gr.Tab("About"):
            gr.Markdown("""
            ## About
            This tool converts text into JSONL format with detailed information about each word, including its tokenizer and meaning.
            It then pushes the processed data to a Hugging Face dataset repository.
            """)

        with gr.Tab("Generate"):
            with gr.Row():
                input_text = gr.Textbox(label="Input Text", lines=5)
                output_text = gr.Textbox(label="Output Status", lines=1)

            generate_button = gr.Button("Generate and Push")
            # Clicking passes the input box to generate_jsonl and writes its
            # return value into the status box.
            generate_button.click(fn=generate_jsonl, inputs=input_text, outputs=output_text)

    demo.launch()
92
 
93
- # Run the Gradio interface
94
- if __name__ == "__main__":
95
- gradio_interface()
 
 
1
  import json
2
  import logging
3
+ import os
4
+ import datetime
5
  import gradio as gr
6
+ from huggingface_hub import HfApi, HfFolder
7
+ # Set up logging
8
+ logging.basicConfig(level=logging.INFO)
 
 
 
 
 
 
 
9
 
10
+ logger = logging.getLogger(__name__)
 
11
 
12
+ # Define the function to convert text to JSON
13
def text_to_json(text):
    """Write ``text`` to a timestamped JSON file, one record per line of text.

    Args:
        text: Input text. Leading and trailing whitespace is stripped, then
            the text is split on newlines.

    Returns:
        The relative path of the file written, of the form
        ``converted/output_<YYYYmmdd_HHMMSS>.json``. The file holds a JSON
        list of ``{"text": <line>}`` objects.

    Note:
        The timestamp has one-second resolution, so two calls within the
        same second produce the same filename and the later call overwrites
        the earlier file.
    """
    lines = text.strip().split('\n')
    data = [{"text": line} for line in lines]
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"converted/output_{timestamp}.json"
    # The output directory is not guaranteed to exist; without this the
    # open() below raises FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    return filename
21
 
22
+ # Define the function to generate and upload the JSON file
23
def generate_and_upload(text):
    """Convert ``text`` to a JSON file and upload it to the dataset repository.

    Args:
        text: Raw text from the UI, handed to ``text_to_json``.

    Returns:
        A ``(message, path)`` tuple. On success ``message`` reports the
        uploaded filename and ``path`` is the generated JSON file. On any
        failure ``message`` is ``"Error: <reason>"`` and ``path`` is ``None``.
        Exceptions are reported through the message and never propagate to
        the caller.
    """
    try:
        # Treat None, "" and whitespace-only input alike: none of them
        # yields a meaningful record.
        if not text or not text.strip():
            raise ValueError("Text input is empty.")

        logger.info(f"Received text input: {text}")

        # Check the token before writing anything to disk. .get() returns
        # None when the variable is unset; indexing os.environ raised
        # KeyError instead, which made this check unreachable and showed the
        # user a bare "'HUGGINGFACE_API_TOKEN'" error.
        token = os.environ.get('HUGGINGFACE_API_TOKEN')
        if not token:
            raise ValueError("Hugging Face API token not found. Please set HUGGINGFACE_API_TOKEN environment variable.")

        # Convert text to JSON and save to file
        json_file = text_to_json(text)
        logger.info(f"JSON file created: {json_file}")

        # Upload the file to the dataset repository
        api = HfApi()
        repo_id = "katsukiai/DeepFocus-X3"
        upload_info = api.upload_file(
            path_or_fileobj=json_file,
            path_in_repo=os.path.basename(json_file),
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )
        logger.info(f"Upload info: {upload_info}")
        message = f"Upload successful! Filename: {os.path.basename(json_file)}"
        return message, json_file
    except Exception as e:
        # UI boundary: turn every failure into a status message so the
        # Gradio callback always returns its two outputs.
        logger.error(f"Error uploading file: {e}")
        return f"Error: {e}", None
55
 
 
 
 
 
 
56
 
 
 
 
 
 
 
 
 
57
 
 
 
 
 
 
58
 
59
+ # Create the Gradio interface
60
with gr.Blocks() as demo:
    # Static usage notes; nothing in this tab is wired to a callback.
    with gr.Tab("About"):
        gr.Markdown("""
        # Text to JSON uploader
        This app allows you to input text, convert it to JSON format, and upload it to the Hugging Face dataset repository.

        ## Instructions
        1. Enter your text in the "Generate" tab.
        2. Click the "Generate and Upload" button.
        3. Download the JSON file if desired.
        4. Check the message for upload status.

        ## Requirements
        - Hugging Face API token set as environment variable `HUGGINGFACE_API_TOKEN`.

        ## Obtaining Hugging Face API Token
        1. Log in to your Hugging Face account.
        2. Go to your profile settings.
        3. Generate a new token or use an existing one.
        4. Set the token as an environment variable named `HUGGINGFACE_API_TOKEN`.

        ## Setting Environment Variable
        - **Windows**: Set it in System Properties > Advanced > Environment Variables.
        - **macOS/Linux**: Add `export HUGGINGFACE_API_TOKEN=your_token` to your shell profile (e.g., `.bashrc`, `.zshrc`).
        """)

    with gr.Tab("Generate"):
        text_input = gr.Textbox(label="Enter text")
        output_message = gr.Textbox(label="Status message")
        json_file_downloader = gr.File(label="Download JSON", interactive=False)
        generate_button = gr.Button("Generate and Upload")
        # generate_and_upload returns (status message, file path or None);
        # the two values map onto the two outputs in order.
        generate_button.click(fn=generate_and_upload, inputs=text_input, outputs=[output_message, json_file_downloader])

# Launch the Gradio app
demo.launch()
95
+
96
 
 
97