katsukiai committed on
Commit
1726149
·
verified ·
1 Parent(s): ffd44b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -127
app.py CHANGED
@@ -1,135 +1,138 @@
1
- import json
2
- import logging
3
  import os
4
- import datetime
 
 
 
 
 
5
  import gradio as gr
6
- import torch
7
- from transformers import AutoModelForCausalLM, AutoTokenizer
8
  from huggingface_hub import HfApi
9
 
10
- # Set up logging
11
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
12
- logger = logging.getLogger(__name__)
13
-
14
- # List of 37 popular models
15
- MODEL_LIST = [
16
- "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl",
17
- "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b",
18
- "mistralai/Mistral-7B-Instruct-v0.1", "mistralai/Mixtral-8x7B-Instruct",
19
- "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-13b-chat-hf",
20
- "microsoft/DialoGPT-small", "microsoft/DialoGPT-medium", "microsoft/DialoGPT-large",
21
- "bigscience/bloom-560m", "bigscience/bloomz-560m",
22
- "EleutherAI/gpt-neo-125m", "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B",
23
- "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neox-20b",
24
- "huggingfaceh4/starchat-alpha", "huggingfaceh4/zephyr-7b-alpha",
25
- "deepseek-ai/deepseek-coder-1.3b", "deepseek-ai/deepseek-coder-6.7b",
26
- "deepseek-ai/deepseek-v3", "databricks/dolly-v2-7b", "cerebras/Cerebras-GPT-1.3B",
27
- "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct",
28
- "google/gemma-2b", "google/gemma-7b", "google/flan-t5-large",
29
- "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-2-7b-chat"
30
- ]
31
-
32
- # Function to load selected model
33
- def load_model(model_name):
34
- logger.info(f"Loading model: {model_name} (CPU mode)")
35
- tokenizer = AutoTokenizer.from_pretrained(model_name)
36
- tokenizer.pad_token = tokenizer.eos_token # Avoid padding token errors
37
- model = AutoModelForCausalLM.from_pretrained(model_name)
38
- return tokenizer, model
39
-
40
- # Function to process text with selected model
41
- def process_text(model_name, text):
42
- tokenizer, model = load_model(model_name)
43
- logger.info(f"Processing text with {model_name}...")
44
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
45
- outputs = model.generate(**inputs, max_length=200)
46
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
47
-
48
- # Function to convert text to JSON
49
- def text_to_json(text):
50
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
51
- filename = f"output_{timestamp}.json"
52
- with open(filename, "w") as f:
53
- json.dump([{"text": text}], f, indent=4)
54
- logger.info(f"JSON file created: {filename}")
55
- return filename
56
-
57
- # Function to generate JSON and upload to Hugging Face
58
- def generate_and_upload(model_name, text):
59
  try:
60
- if not text.strip():
61
- raise ValueError("Text input is empty.")
62
-
63
- logger.info(f"Received text input for model {model_name}")
64
-
65
- # Process text
66
- processed_text = process_text(model_name, text)
67
- logger.info(f"Processed text: {processed_text}")
68
-
69
- # Convert to JSON
70
- json_file = text_to_json(processed_text)
71
-
72
- # Get Hugging Face API token
73
- token = os.getenv("HUGGINGFACE_API_TOKEN")
74
- if not token:
75
- raise ValueError("Hugging Face API token not found. Please set HUGGINGFACE_API_TOKEN.")
76
-
77
- # Upload file to Hugging Face
78
- api = HfApi()
79
- repo_id = "katsukiai/DeepFocus-X3"
80
- upload_info = api.upload_file(
81
- path_or_fileobj=json_file,
82
- path_in_repo=f"convert/{os.path.basename(json_file)}",
83
- repo_id=repo_id,
84
- repo_type="dataset",
85
- token=token
86
- )
87
-
88
- logger.info(f"File uploaded successfully: {upload_info}")
89
-
90
- # Delete local JSON file after upload
91
- os.remove(json_file)
92
- logger.info(f"Deleted local file: {json_file}")
93
-
94
- return f"Upload successful! Filename: {os.path.basename(json_file)}", None
95
-
96
  except Exception as e:
97
- logger.error(f"Error: {e}")
98
- return f"Error: {str(e)}", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- # Create Gradio UI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  with gr.Blocks() as demo:
102
- with gr.Tab("About"):
103
- gr.Markdown("""
104
- # Text Processor with Selectable Model (CPU)
105
- - Choose from **37 popular transformer models**
106
- - Processes text and converts to JSON
107
- - Uploads to Hugging Face
108
-
109
- ## Instructions:
110
- 1. Select a model from the dropdown.
111
- 2. Enter text in the "Generate" tab.
112
- 3. Click "Generate and Upload."
113
- 4. Download JSON if needed.
114
- 5. Check upload status.
115
-
116
- ## Requirements:
117
- - **Runs on CPU** (No GPU required).
118
- - **Hugging Face API Token** (`HUGGINGFACE_API_TOKEN`) must be set.
119
- """)
120
-
121
- with gr.Tab("Generate"):
122
- model_selector = gr.Dropdown(choices=MODEL_LIST, value="gpt2", label="Choose Model")
123
- text_input = gr.Textbox(label="Enter text")
124
- output_message = gr.Textbox(label="Status message")
125
- json_file_downloader = gr.File(label="Download JSON", interactive=True)
126
- generate_button = gr.Button("Generate and Upload")
127
-
128
- generate_button.click(
129
- fn=generate_and_upload,
130
- inputs=[model_selector, text_input],
131
- outputs=[output_message, json_file_downloader]
132
- )
133
-
134
- # Launch Gradio app
135
- demo.launch()
 
 
 
1
  import os
2
+ import logging
3
+ import csv
4
+ import shutil
5
+ import nltk
6
+ import pandas as pd
7
+ from tqdm import tqdm
8
  import gradio as gr
9
+ from datasets import Dataset
10
+ from transformers import pipeline
11
  from huggingface_hub import HfApi
12
 
13
+ # ---------------------- Logging Setup ----------------------
14
+ logging.basicConfig(
15
+ level=logging.INFO,
16
+ format="%(asctime)s [%(levelname)s] %(message)s",
17
+ handlers=[logging.StreamHandler()]
18
+ )
19
+
20
+ # ---------------------- NLTK Setup ----------------------
21
+ def download_nltk():
22
+ nltk.download("words")
23
+ nltk.download("punkt")
24
+ logging.info("NLTK resources downloaded.")
25
+
26
+ download_nltk()
27
+
28
+ # ---------------------- Data Preparation ----------------------
29
+ def get_all_words():
30
+ from nltk.corpus import words as nltk_words
31
+ all_words = nltk_words.words()
32
+ logging.info(f"Got {len(all_words)} words from NLTK.")
33
+ return all_words
34
+
35
+ def generate_meaning(word, generator):
36
+ prompt = f"Define the word '{word}' in one concise sentence."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  try:
38
+ result = generator(prompt, max_length=50)[0]["generated_text"]
39
+ return result.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  except Exception as e:
41
+ logging.error(f"Error generating meaning for '{word}': {e}")
42
+ return ""
43
+
44
+ def process_words(model_name, limit=None):
45
+ logging.info("Initializing Hugging Face text2text-generation pipeline...")
46
+ generator = pipeline("text2text-generation", model=model_name, device=-1)
47
+ words_list = get_all_words()
48
+ if limit:
49
+ words_list = words_list[:limit]
50
+ data = []
51
+ for word in tqdm(words_list, desc="Processing words"):
52
+ tokens = nltk.word_tokenize(word)
53
+ meaning = generate_meaning(word, generator)
54
+ data.append({
55
+ "tokenizer": tokens,
56
+ "words": word,
57
+ "meaning": meaning
58
+ })
59
+ logging.info("Finished processing words.")
60
+ return data
61
+
62
+ def save_to_csv(data, filename="output.csv"):
63
+ df = pd.DataFrame(data)
64
+ df.to_csv(filename, index=False)
65
+ logging.info(f"Saved CSV to {filename}.")
66
+ return filename
67
 
68
+ # ---------------------- Push to Hugging Face ----------------------
69
+ def push_dataset(csv_file, repo_id="katsukiai/DeepFocus-X3"):
70
+ repo_local_dir = "DeepFocus-X3_repo"
71
+ if not os.path.exists(repo_local_dir):
72
+ os.system(f"git clone https://huggingface.co/{repo_id} {repo_local_dir}")
73
+ logging.info("Repository cloned locally.")
74
+ shutil.copy(csv_file, os.path.join(repo_local_dir, csv_file))
75
+ current_dir = os.getcwd()
76
+ os.chdir(repo_local_dir)
77
+ os.system("git add .")
78
+ os.system('git commit -m "Update dataset"')
79
+ os.system("git push")
80
+ os.chdir(current_dir)
81
+ logging.info("Pushed dataset to Hugging Face repository.")
82
+
83
+ def generate_all(model_name, word_limit):
84
+ try:
85
+ word_limit = int(word_limit)
86
+ except Exception:
87
+ word_limit = None
88
+ data = process_words(model_name, limit=word_limit)
89
+ csv_file = save_to_csv(data)
90
+ push_dataset(csv_file)
91
+ return csv_file
92
+
93
+ # ---------------------- Gradio Interface Functions ----------------------
94
+ def run_generate(model_name, word_limit):
95
+ output_csv = generate_all(model_name, word_limit)
96
+ return f"Generated and pushed CSV: {output_csv}"
97
+
98
+ def about_tab_content():
99
+ about_text = (
100
+ "## DeepFocus-X3 Dataset Generator\n\n"
101
+ "This tool downloads all available words from the NLTK corpus, "
102
+ "generates concise meanings using a Hugging Face text-to-text generation model, "
103
+ "and converts the data into a CSV file. Finally, it pushes the CSV to the "
104
+ "[katsukiai/DeepFocus-X3](https://huggingface.co/datasets/katsukiai/DeepFocus-X3) repository."
105
+ )
106
+ return about_text
107
+
108
+ def settings_tab_content():
109
+ settings_text = (
110
+ "**Current Settings**\n\n"
111
+ "- Model: `google/flan-t5-xl`\n"
112
+ "- Word Limit: 50 (set to empty to process all words)\n"
113
+ "\nYou can update these settings in the Generate tab."
114
+ )
115
+ return settings_text
116
+
117
+ # ---------------------- Gradio App ----------------------
118
  with gr.Blocks() as demo:
119
+ gr.Markdown("## DeepFocus-X3 Dataset Generator")
120
+
121
+ with gr.Tabs():
122
+ # About Tab
123
+ with gr.Tab("About"):
124
+ gr.Markdown(about_tab_content())
125
+
126
+ # Generate All Tab
127
+ with gr.Tab("Generate all"):
128
+ model_name_input = gr.Textbox(value="google/flan-t5-xl", label="Hugging Face Model Name for Means")
129
+ word_limit_input = gr.Textbox(value="50", label="Word Limit (Leave empty for all)")
130
+ generate_button = gr.Button("Generate and Push Dataset")
131
+ generate_output = gr.Textbox(label="Output")
132
+ generate_button.click(run_generate, inputs=[model_name_input, word_limit_input], outputs=generate_output)
133
+
134
+ # Settings Tab
135
+ with gr.Tab("Settings"):
136
+ gr.Markdown(settings_tab_content())
137
+
138
+ demo.launch()