import os
import csv
import json
import logging
import gradio as gr
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import HfApi, login
from datasets import Dataset
from datetime import datetime

# Download only the NLTK resources this app actually uses (tokenizer data +
# WordNet), rather than the full 'all' collection.
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases
nltk.download('wordnet')
nltk.download('omw-1.4')

# Setup logging
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(log_dir, f"app_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Error logging to Hugging Face
error_dir = "errors"
os.makedirs(error_dir, exist_ok=True)
error_log_file = os.path.join(error_dir, f"errors_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")

def log_error(error_msg):
    with open(error_log_file, 'a') as f:
        f.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - ERROR - {error_msg}\n")
    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=error_log_file,
            path_in_repo=f"errors_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
            repo_id="katsukiai/errors",
            repo_type="dataset"
        )
    except Exception as e:
        logging.error(f"Failed to upload error log: {str(e)}")
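# Note: the local error file accumulates every error raised by this process,
# and each upload stores it under a fresh timestamped name, so the remote
# copies overlap. Write access to the katsukiai/errors dataset is assumed.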

# Load Hugging Face models. DeepSeek-Coder is a decoder-only (causal) model and
# must be loaded with AutoModelForCausalLM; AutoModelForSeq2SeqLM would raise an
# error. It is loaded here for long-text processing but is not used below.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-6.7b-instruct")
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-6.7b-instruct")
# FLAN-T5 is an encoder-decoder model, so it fits the text2text-generation pipeline.
meaning_generator = pipeline("text2text-generation", model="google/flan-t5-large")
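# Sizing note: a 6.7B-parameter model needs roughly 13 GB of memory in fp16
# (about twice that in fp32), so loading it assumes a machine with ample RAM
# or a GPU. flan-t5-large (~780M parameters) is far lighter.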

# Hugging Face login. A real token must come from the environment; the original
# fallback (a random hex string) is not a valid token and login would fail.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    logging.warning("HF_TOKEN is not set; uploads to Hugging Face will fail.")

# Dataset preparation
dataset_dir = "dataset"
os.makedirs(dataset_dir, exist_ok=True)
csv_file = os.path.join(dataset_dir, "deepfocus_data.csv")

def process_text_to_csv(input_text):
    try:
        tokens = word_tokenize(input_text.lower())
        # Deduplicate, then sort so the output order is deterministic across runs.
        words = sorted(set(tokens))
        data = []
        for word in tqdm(words, desc="Processing words"):
            meanings = []
            synsets = wordnet.synsets(word)
            if synsets:
                meanings = [syn.definition() for syn in synsets[:3]]
            else:
                try:
                    generated_meaning = meaning_generator(f"Define the word '{word}'", max_length=100)[0]['generated_text']
                    meanings.append(generated_meaning)
                except Exception as e:
                    log_error(f"Meaning generation failed for '{word}': {str(e)}")
            # Join lists into strings so the CSV cells hold plain text rather
            # than Python list reprs (every row repeats the full token list).
            data.append({"tokenizer": " ".join(tokens), "words": word, "meaning": "; ".join(meanings)})
        
        # Save to CSV
        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=["tokenizer", "words", "meaning"])
            writer.writeheader()
            writer.writerows(data)
        
        logging.info(f"Dataset saved to {csv_file}")
        return data
    except Exception as e:
        log_error(f"Error in process_text_to_csv: {str(e)}")
        raise
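# Illustrative example (not real output): for the input "The cat sat", the row
# for "cat" would look roughly like
#   {"tokenizer": "the cat sat", "words": "cat", "meaning": "feline mammal ..."}
# with the actual definitions depending on WordNet coverage and the FLAN-T5 fallback.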

def upload_to_huggingface():
    try:
        dataset = Dataset.from_csv(csv_file)
        dataset.push_to_hub("katsukiai/DeepFocus-X3", token=HF_TOKEN)
        logging.info("Dataset uploaded to Hugging Face")
    except Exception as e:
        log_error(f"Error uploading to Hugging Face: {str(e)}")
        raise
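# push_to_hub creates the katsukiai/DeepFocus-X3 dataset repo if it does not
# already exist, and replaces the default split's data files on later pushes.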

def generate_output(input_text):
    try:
        data = process_text_to_csv(input_text)
        upload_to_huggingface()
        return json.dumps(data, indent=2)
    except Exception as e:
        log_error(f"Error in generate_output: {str(e)}")
        return f"Error: {str(e)}"

def view_logs():
    try:
        log_files = os.listdir(log_dir)
        log_content = ""
        for log_file in sorted(log_files):  # timestamped names sort chronologically
            with open(os.path.join(log_dir, log_file), 'r') as f:
                log_content += f"\n\n--- {log_file} ---\n\n{f.read()}"
        return log_content
    except Exception as e:
        log_error(f"Error in view_logs: {str(e)}")
        return f"Error: {str(e)}"

# Gradio Interface
with gr.Blocks(title="DeepFocus-X3") as demo:
    gr.Markdown("# DeepFocus-X3")
    
    with gr.Tabs():
        with gr.TabItem("About"):
            gr.Markdown("""
            ## About DeepFocus-X3
            This application tokenizes input text, extracts the unique words, generates a meaning for each, and uploads the resulting dataset to Hugging Face.
            - Uses NLTK for tokenization and WordNet for dictionary definitions.
            - Falls back to Google FLAN-T5 to generate a definition when WordNet has no entry; DeepSeek-Coder is loaded for long-text processing but is not used in the current flow.
            - Logs all activity and errors, with error logs uploaded to Hugging Face.
            """)
        
        with gr.TabItem("Generate all"):
            input_text = gr.Textbox(label="Input Text", lines=10)
            output_json = gr.Textbox(label="Output JSON", lines=10)
            generate_btn = gr.Button("Generate and Upload")
            generate_btn.click(fn=generate_output, inputs=input_text, outputs=output_json)
        
        with gr.TabItem("Logs"):
            gr.Markdown("## Report using Logs")
            log_output = gr.Textbox(label="Log Content", lines=20)
            view_logs_btn = gr.Button("View Logs")
            view_logs_btn.click(fn=view_logs, inputs=None, outputs=log_output)

# Launch Gradio app
demo.launch()
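
# To run locally (a write-enabled Hugging Face token is assumed; hf_xxx is a
# placeholder):
#   HF_TOKEN=hf_xxx python app.py
# Gradio serves the UI on http://127.0.0.1:7860 by default.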