katsukiai committed on
Commit
df438e3
·
verified ·
1 Parent(s): a0d9318

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -128
app.py CHANGED
@@ -1,138 +1,92 @@
1
  import os
2
- import logging
3
- import csv
4
- import shutil
5
  import nltk
6
- import pandas as pd
 
7
  from tqdm import tqdm
8
  import gradio as gr
9
- from datasets import Dataset
10
  from transformers import pipeline
11
- from huggingface_hub import HfApi
12
-
13
# ---------------------- Logging Setup ----------------------
# Root logger: INFO and above, timestamped, emitted through a StreamHandler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
19
-
20
- # ---------------------- NLTK Setup ----------------------
21
def download_nltk():
    """Fetch the NLTK resources this script actually uses.

    The script only calls ``nltk.word_tokenize`` and reads
    ``nltk.corpus.words``, so the tokenizer models and the ``words`` corpus
    are requested instead of the complete ``"all"`` collection.
    """
    # "punkt_tab" is the tokenizer resource name used by newer NLTK releases;
    # nltk.download() reports unknown identifiers without raising, so asking
    # for both names is harmless on older releases.
    for resource in ("punkt", "punkt_tab", "words"):
        nltk.download(resource)
    logging.info("NLTK resources downloaded.")


download_nltk()
27
-
28
- # ---------------------- Data Preparation ----------------------
29
def get_all_words():
    """Return the word list of NLTK's ``words`` corpus and log how many there are."""
    # Imported inside the function, as in the original, so the corpus module
    # is only touched when the word list is requested.
    from nltk.corpus import words as corpus_words

    vocabulary = corpus_words.words()
    logging.info(f"Got {len(vocabulary)} words from NLTK.")
    return vocabulary
34
-
35
def generate_meaning(word, generator):
    """Ask ``generator`` for a one-sentence definition of ``word``.

    Args:
        word: The word to define; interpolated into the prompt.
        generator: Callable invoked as ``generator(prompt, max_length=50)``.
            Its result is indexed as ``[0]["generated_text"]``.

    Returns:
        The generated text with surrounding whitespace stripped, or ``""``
        when anything in the generation step raises (the error is logged).
    """
    request = f"Define the word '{word}' in one concise sentence."
    try:
        outputs = generator(request, max_length=50)
        meaning = outputs[0]["generated_text"].strip()
    except Exception as e:
        # Best effort: one failing word must not abort the whole run.
        logging.error(f"Error generating meaning for '{word}': {e}")
        meaning = ""
    return meaning
43
-
44
def process_words(model_name, limit=None):
    """Build one record per NLTK word using a text2text-generation pipeline.

    Args:
        model_name: Model identifier passed to ``pipeline(...)``.
        limit: If truthy, only the first ``limit`` words are processed;
            ``None`` or ``0`` processes the whole list.

    Returns:
        A list of dicts with keys ``"tokenizer"`` (``nltk.word_tokenize`` of
        the word), ``"words"`` (the word) and ``"meaning"`` (the result of
        ``generate_meaning``).
    """
    logging.info("Initializing Hugging Face text2text-generation pipeline...")
    generator = pipeline("text2text-generation", model=model_name, device=-1)

    vocabulary = get_all_words()
    selected = vocabulary[:limit] if limit else vocabulary

    records = [
        {
            "tokenizer": nltk.word_tokenize(entry),
            "words": entry,
            "meaning": generate_meaning(entry, generator),
        }
        for entry in tqdm(selected, desc="Processing words")
    ]
    logging.info("Finished processing words.")
    return records
61
 
def save_to_csv(data, filename="output.csv"):
    """Write ``data`` to ``filename`` as CSV via pandas and return the path.

    Args:
        data: Anything ``pandas.DataFrame`` accepts; the caller in this file
            passes a list of record dicts.
        filename: Destination path. Defaults to ``"output.csv"``.

    Returns:
        ``filename``, unchanged.
    """
    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(data).to_csv(filename, index=False)
    logging.info(f"Saved CSV to {filename}.")
    return filename
67
-
68
- # ---------------------- Push to Hugging Face ----------------------
69
def push_dataset(csv_file, repo_id="katsukiai/DeepFocus-X3"):
    """Commit ``csv_file`` in the local repository checkout and push it.

    Args:
        csv_file: Path of the CSV file to publish.
        repo_id: Hub repository id, used only if the checkout has to be cloned.

    The git steps stay best-effort, as before: a failing ``add`` or
    ``commit`` (for example "nothing to commit") is logged and the push is
    still attempted. Success is only logged when ``git push`` exits with 0.
    """
    repo_local_dir = "."
    if not os.path.exists(repo_local_dir):
        clone_url = f"https://huggingface.co/{repo_id}"
        if _run_git(["clone", clone_url, repo_local_dir], cwd=None) != 0:
            logging.error("Could not clone %s; dataset not pushed.", clone_url)
            return
        logging.info("Repository cloned locally.")

    destination = os.path.join(repo_local_dir, csv_file)
    # With repo_local_dir == "." the destination normally *is* csv_file, and
    # shutil.copy raises SameFileError when asked to copy a file onto itself.
    already_in_place = os.path.exists(destination) and os.path.samefile(csv_file, destination)
    if not already_in_place:
        shutil.copy(csv_file, destination)

    # cwd= replaces the os.chdir()/os.chdir() pair, so the process working
    # directory is never left changed when a step fails.
    _run_git(["add", "."], cwd=repo_local_dir)
    _run_git(["commit", "-m", "Update dataset"], cwd=repo_local_dir)
    if _run_git(["push"], cwd=repo_local_dir) == 0:
        logging.info("Pushed dataset to Hugging Face repository.")
    else:
        logging.error("git push failed; dataset was not pushed.")


def _run_git(arguments, cwd):
    """Run ``git <arguments>`` without a shell and return its exit code (-1 if git cannot be started)."""
    import subprocess  # local import: only the git helper needs it

    try:
        completed = subprocess.run(["git", *arguments], cwd=cwd)
    except OSError as error:
        logging.error("Could not run git %s: %s", arguments[0], error)
        return -1
    if completed.returncode != 0:
        logging.warning("git %s exited with code %d", arguments[0], completed.returncode)
    return completed.returncode
82
-
83
def generate_all(model_name, word_limit):
    """Generate the dataset, save it as CSV, push it, and return the CSV path.

    Args:
        model_name: Model identifier forwarded to ``process_words``.
        word_limit: Value convertible with ``int()``; anything that fails the
            conversion (for example an empty string) means "no limit".
    """
    try:
        limit = int(word_limit)
    except Exception:
        limit = None

    output_path = save_to_csv(process_words(model_name, limit=limit))
    push_dataset(output_path)
    return output_path
92
-
93
- # ---------------------- Gradio Interface Functions ----------------------
94
def run_generate(model_name, word_limit):
    """Click handler: run ``generate_all`` and return a status line naming the CSV."""
    return "Generated and pushed CSV: {}".format(generate_all(model_name, word_limit))
97
-
98
def about_tab_content():
    """Return the Markdown text shown in the "About" tab."""
    fragments = (
        "## DeepFocus-X3 Dataset Generator\n\n",
        "This tool downloads all available words from the NLTK corpus, ",
        "generates concise meanings using a Hugging Face text-to-text generation model, ",
        "and converts the data into a CSV file. Finally, it pushes the CSV to the ",
        "[katsukiai/DeepFocus-X3](https://huggingface.co/datasets/katsukiai/DeepFocus-X3) repository.",
    )
    return "".join(fragments)
107
 
108
def settings_tab_content():
    """Return the Markdown text shown in the "Settings" tab."""
    lines = [
        "**Current Settings**",
        "",
        "- Model: `google/flan-t5-xl`",
        "- Word Limit: 50 (set to empty to process all words)",
        "",
        "You can update these settings in the Generate tab.",
    ]
    return "\n".join(lines)
116
-
117
- # ---------------------- Gradio App ----------------------
118
with gr.Blocks() as demo:
    gr.Markdown("## DeepFocus-X3 Dataset Generator")

    with gr.Tabs():
        # About Tab: static description of the tool.
        with gr.Tab("About"):
            gr.Markdown(about_tab_content())

        # Generate All Tab: two text inputs, one button, one status box.
        with gr.Tab("Generate all"):
            model_name_input = gr.Textbox(
                label="Hugging Face Model Name for Means",
                value="google/flan-t5-xl",
            )
            word_limit_input = gr.Textbox(
                label="Word Limit (Leave empty for all)",
                value="50",
            )
            generate_button = gr.Button("Generate and Push Dataset")
            generate_output = gr.Textbox(label="Output")
            # The click passes both textbox values to run_generate and shows
            # the returned status string in the output box.
            generate_button.click(
                run_generate,
                inputs=[model_name_input, word_limit_input],
                outputs=generate_output,
            )

        # Settings Tab: static summary of the default settings.
        with gr.Tab("Settings"):
            gr.Markdown(settings_tab_content())

demo.launch()
 
 
 
1
  import os
 
 
 
2
  import nltk
3
+ import csv
4
+ import logging
5
  from tqdm import tqdm
6
  import gradio as gr
 
7
  from transformers import pipeline
8
+ from huggingface_hub import HfApi, upload_file
9
+
10
# Setup Logging: INFO and above are written to app.log.
logging.basicConfig(
    filename='app.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Download NLTK Data
nltk.download('punkt')

# Constants
HF_REPO = "katsukiai/DeepFocus-X3"
TOKENIZER = 'bert-base-uncased'
MODELS = [
    "bert-base-uncased",
    "gpt2",
    "roberta-base",
    "distilbert-base-uncased",
    "albert-base-v2",
]  # Add more models as needed

# Initialize Models: one feature-extraction pipeline per entry of MODELS,
# keyed by the model name.
models = {name: pipeline('feature-extraction', model=name) for name in MODELS}
23
+
24
+ # Functions
25
def process_text(text):
    """Tokenize ``text`` and compute one scalar per (word, model) pair.

    Args:
        text: Raw text, tokenized with ``nltk.word_tokenize``.

    Returns:
        A dict with keys ``"tokenizer"`` (all tokens in order), ``"words"``
        (the distinct tokens, in order of first appearance) and ``"meaning"``
        (``{word: {model_name: mean feature value or None}}``). ``None``
        marks a model that failed for that word; the error is logged.
    """
    tokens = nltk.word_tokenize(text)
    # dict.fromkeys de-duplicates like set() did, but keeps a stable order so
    # repeated runs produce the same output.
    words = list(dict.fromkeys(tokens))
    means = {}
    for word in tqdm(words, desc="Processing Words"):
        word_means = {}
        for model_name, model in models.items():
            try:
                word_means[model_name] = _mean_feature(model(word)[0])
            except Exception as e:
                logging.error(f"Error processing word {word} with model {model_name}: {e}")
                word_means[model_name] = None
        means[word] = word_means
    return {"tokenizer": tokens, "words": words, "meaning": means}


def _mean_feature(features):
    """Return the mean of every number in ``features`` (tensor-like or nested lists)."""
    # Tensor/array outputs keep the original ``.mean().item()`` path.
    if hasattr(features, "mean"):
        return features.mean().item()

    # The feature-extraction pipeline returns plain nested lists by default,
    # which have no ``.mean()``; walk them iteratively instead.
    total = 0.0
    count = 0
    pending = [features]
    while pending:
        item = pending.pop()
        if isinstance(item, (list, tuple)):
            pending.extend(item)
        else:
            total += float(item)
            count += 1
    if count == 0:
        raise ValueError("model returned no features")
    return total / count
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
def save_to_csv(data, filename="output.csv"):
    """Write one CSV row per entry of ``data['words']``.

    Args:
        data: Mapping with keys ``'words'`` (iterable of words),
            ``'tokenizer'`` (written as-is into every row) and ``'meaning'``
            (mapping indexed by word; its value is written via ``str()``).
        filename: Destination path, opened in UTF-8. Defaults to
            ``"output.csv"``.
    """
    columns = ["word", "tokenizer", "meanings"]
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(
            {
                "word": entry,
                "tokenizer": data['tokenizer'],
                "meanings": str(data['meaning'][entry]),
            }
            for entry in data['words']
        )
51
+
52
def train_dataset():
    """Process the built-in placeholder text and save the result to the default CSV."""
    placeholder = "Your long text goes here..."
    save_to_csv(process_text(placeholder))
    logging.info("Dataset processed and saved to CSV.")
57
+
58
def generate_report():
    """Return the full contents of ``app.log``.

    Returns:
        The log text, or an empty string when ``app.log`` does not exist, so
        the caller gets an empty report instead of a ``FileNotFoundError``.
    """
    try:
        with open('app.log', 'r') as log_file:
            return log_file.read()
    except FileNotFoundError:
        return ""
62
+
63
+ # Gradio Interface
64
def generate_all(text):
    """Process ``text``, write the result to the default CSV, and return a status message."""
    save_to_csv(process_text(text))
    return "Processed data saved to output.csv"
68
+
69
# gr.Interface wraps a single function and has no ``tab_titles`` argument, so
# the two-tab layout is built from one Interface per function and combined
# with gr.TabbedInterface. ``iface`` keeps its name and still has launch().
_generate_tab = gr.Interface(
    fn=generate_all,
    inputs="text",
    outputs="text",
    description="Generate processed data and view logs.",
)
# generate_report takes no arguments, hence no input components.
_logs_tab = gr.Interface(
    fn=generate_report,
    inputs=None,
    outputs="text",
)
iface = gr.TabbedInterface(
    [_generate_tab, _logs_tab],
    tab_names=["Generate All", "Logs"],
    title="DeepFocus-X3",
)
 
 
 
 
 
 
 
 
 
77
 
78
+ # Run and Push to HuggingFace
79
def run_and_push():
    """Build the placeholder dataset CSV and upload it to the ``HF_REPO`` repository."""
    train_dataset()

    # exist_ok=True makes repository creation a no-op when it already exists.
    HfApi().create_repo(repo_id=HF_REPO, private=False, exist_ok=True)

    # NOTE(review): no repo_type is passed to either call, so the library
    # default applies; confirm this is the intended kind of repository.
    csv_name = "output.csv"
    upload_file(
        repo_id=HF_REPO,
        path_in_repo=csv_name,
        path_or_fileobj=csv_name,
    )
    logging.info("Dataset pushed to HuggingFace.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
if __name__ == "__main__":
    # launch() blocks the main thread while the server is running, so the
    # build-and-push step has to run first; placed after launch() it would
    # only start once the server had shut down.
    try:
        run_and_push()
    except Exception:
        # A failed push (for example missing credentials) must not prevent
        # the UI from starting; the traceback goes to the log.
        logging.exception("Initial dataset push failed.")
    iface.launch()