File size: 2,815 Bytes
0506cec
1726149
df438e3
 
1726149
ba51acd
1726149
df438e3
 
 
 
 
 
3dafe4f
df438e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1726149
 
df438e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1726149
df438e3
 
 
 
 
 
 
 
 
1726149
df438e3
1726149
df438e3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import nltk
import csv
import logging
from tqdm import tqdm
import gradio as gr
from transformers import pipeline
from huggingface_hub import HfApi, upload_file

# --- Module setup ---------------------------------------------------------

# Log to a file; generate_report() reads this same file back for the UI.
logging.basicConfig(
    filename='app.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Download only the tokenizer data that nltk.word_tokenize() needs.
# The original nltk.download('all') fetches every NLTK corpus (several GB)
# on each cold start, which is wasteful for a tokenize-only workload.
nltk.download('punkt', quiet=True)

# Constants
HF_REPO = "katsukiai/DeepFocus-X3"   # Hugging Face repo the CSV is pushed to
TOKENIZER = 'bert-base-uncased'      # NOTE(review): unused in this file — confirm before removing
MODELS = ["bert-base-uncased", "gpt2", "roberta-base", "distilbert-base-uncased", "albert-base-v2"]  # Add more models as needed

# One feature-extraction pipeline per model name, built eagerly at import.
models = {model: pipeline('feature-extraction', model=model) for model in MODELS}

# Functions
def process_text(text):
    tokens = nltk.word_tokenize(text)
    words = list(set(tokens))
    means = {}
    for word in tqdm(words, desc="Processing Words"):
        word_means = {}
        for model_name, model in models.items():
            try:
                output = model(word)
                word_means[model_name] = output[0].mean().item()
            except Exception as e:
                logging.error(f"Error processing word {word} with model {model_name}: {e}")
                word_means[model_name] = None
        means[word] = word_means
    return {"tokenizer": tokens, "words": words, "meaning": means}

def save_to_csv(data, filename="output.csv"):
    """Write one CSV row per unique word: the word, the full token list,
    and the stringified per-model meanings dict."""
    fieldnames = ["word", "tokenizer", "meanings"]
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {
                "word": word,
                "tokenizer": data['tokenizer'],
                "meanings": str(data['meaning'][word]),
            }
            for word in data['words']
        )

def train_dataset():
    """Process the built-in sample text and persist the result to output.csv."""
    sample = "Your long text goes here..."
    save_to_csv(process_text(sample))
    logging.info("Dataset processed and saved to CSV.")

def generate_report():
    """Return the full contents of the application log file (app.log)."""
    with open('app.log', 'r') as handle:
        return handle.read()

# Gradio Interface
def generate_all(text):
    """Process *text*, write the result to output.csv, and return a status line."""
    save_to_csv(process_text(text))
    return "Processed data saved to output.csv"

# Gradio UI. BUG FIX: gr.Interface accepts neither a list of callables for
# `fn` nor a `tab_titles` keyword — the original construction raised at
# import time. gr.TabbedInterface is the supported way to show one tab per
# function; generate_report takes no arguments, so its tab has no inputs.
iface = gr.TabbedInterface(
    [
        gr.Interface(
            fn=generate_all,
            inputs="text",
            outputs="text",
            description="Generate processed data.",
        ),
        gr.Interface(
            fn=generate_report,
            inputs=None,
            outputs="text",
            description="View logs.",
        ),
    ],
    tab_names=["Generate All", "Logs"],
    title="DeepFocus-X3",
)

# Run and Push to HuggingFace
def run_and_push():
    train_dataset()
    api = HfApi()
    api.create_repo(repo_id=HF_REPO, private=False, exist_ok=True)
    upload_file(
        path_or_fileobj="output.csv",
        path_in_repo="output.csv",
        repo_id=HF_REPO
    )
    logging.info("Dataset pushed to HuggingFace.")

if __name__ == "__main__":
    # BUG FIX: iface.launch() blocks until the server shuts down, so the
    # original order never reached run_and_push(); build and push first.
    run_and_push()
    iface.launch()