# DeepFocusTrain / app.py
import csv
import logging

import nltk
import numpy as np
from tqdm import tqdm
import gradio as gr
from transformers import pipeline
from huggingface_hub import HfApi, upload_file
# Setup Logging
logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Download NLTK Data (word_tokenize only needs the Punkt models; the 'all'
# bundle pulls in far more corpora than this app ever uses)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # required by newer NLTK releases
# Constants
HF_REPO = "katsukiai/DeepFocus-X3"
TOKENIZER = 'bert-base-uncased'  # currently unused; tokenization below is done with NLTK
MODELS = ["bert-base-uncased", "gpt2", "roberta-base", "distilbert-base-uncased", "albert-base-v2"] # Add more models as needed
# Initialize Models
models = {model: pipeline('feature-extraction', model=model) for model in MODELS}
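# Loading all five feature-extraction pipelines eagerly is memory-heavy.
# A lazy alternative (a sketch, assuming the same MODELS list) would cache
# each pipeline on first use instead:
#
#   from functools import lru_cache
#
#   @lru_cache(maxsize=None)
#   def get_model(name):
#       return pipeline('feature-extraction', model=name)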
# Functions
def process_text(text):
    """Tokenize the text and compute a mean feature value per unique word for each model."""
    tokens = nltk.word_tokenize(text)
    words = list(set(tokens))
    means = {}
    for word in tqdm(words, desc="Processing Words"):
        word_means = {}
        for model_name, model in models.items():
            try:
                # The feature-extraction pipeline returns nested Python lists,
                # so reduce with numpy rather than calling .mean() on a list.
                output = model(word)
                word_means[model_name] = float(np.mean(output[0]))
            except Exception as e:
                logging.error(f"Error processing word {word} with model {model_name}: {e}")
                word_means[model_name] = None
        means[word] = word_means
    return {"tokenizer": tokens, "words": words, "meaning": means}
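# Illustrative output of process_text (the float values are made up; real
# values depend on each model's weights, and set() makes the word order arbitrary):
#
#   process_text("hello world")
#   -> {"tokenizer": ["hello", "world"],
#       "words": ["world", "hello"],
#       "meaning": {"world": {"bert-base-uncased": 0.0123, "gpt2": -0.4567, ...},
#                   "hello": {...}}}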
def save_to_csv(data, filename="output.csv"):
    """Write one row per unique word; the full token list is repeated in each row."""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["word", "tokenizer", "meanings"])
        writer.writeheader()
        for word in data['words']:
            writer.writerow({
                "word": word,
                "tokenizer": data['tokenizer'],
                "meanings": str(data['meaning'][word])
            })
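# Illustrative CSV layout produced above (values made up):
#
#   word,tokenizer,meanings
#   world,"['hello', 'world']","{'bert-base-uncased': 0.0123, 'gpt2': -0.4567, ...}"
#   hello,"['hello', 'world']","{'bert-base-uncased': -0.0089, ...}"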
def train_dataset():
    """Run the full pipeline on a hard-coded sample text and save the result to CSV."""
    text = "Your long text goes here..."
    data = process_text(text)
    save_to_csv(data)
    logging.info("Dataset processed and saved to CSV.")
def generate_report():
    """Return the raw contents of the application log."""
    with open('app.log', 'r') as log_file:
        return log_file.read()
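# For long-running apps the log can grow large; a variant that returns only
# the last N lines (a sketch, with N chosen arbitrarily) would be:
#
#   def generate_report(n=200):
#       with open('app.log', 'r') as log_file:
#           return ''.join(log_file.readlines()[-n:])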
# Gradio Interface
def generate_all(text):
    data = process_text(text)
    save_to_csv(data)
    return "Processed data saved to output.csv"

# gr.Interface takes a single function and has no tab_titles argument; to get
# tabs, build one Interface per function and combine them with gr.TabbedInterface.
generate_iface = gr.Interface(
    fn=generate_all,
    inputs="text",
    outputs="text",
    description="Generate processed data.",
)
logs_iface = gr.Interface(
    fn=generate_report,
    inputs=None,
    outputs="text",
    description="View logs.",
)
iface = gr.TabbedInterface(
    [generate_iface, logs_iface],
    tab_names=["Generate All", "Logs"],
    title="DeepFocus-X3",
)
# Run and Push to HuggingFace
def run_and_push():
    """Process the sample text, save it to CSV, and upload the CSV to the Hub."""
    train_dataset()
    api = HfApi()
    api.create_repo(repo_id=HF_REPO, private=False, exist_ok=True)
    upload_file(
        path_or_fileobj="output.csv",
        path_in_repo="output.csv",
        repo_id=HF_REPO
    )
    logging.info("Dataset pushed to HuggingFace.")
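# Note: create_repo and upload_file need HuggingFace credentials, e.g. a token
# cached by `huggingface-cli login` or provided via the HF_TOKEN environment
# variable; the upload fails without write access to HF_REPO.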
if __name__ == "__main__":
    # launch() blocks, so push the dataset first; otherwise run_and_push()
    # would only execute after the Gradio server shuts down.
    run_and_push()
    iface.launch()