Spaces:
Runtime error
Runtime error
File size: 5,263 Bytes
0506cec df438e3 fb2776a df438e3 ba51acd 6252133 fb2776a 6252133 fb2776a 8eb49e4 fb2776a df438e3 ad4152f df438e3 fb2776a df438e3 fb2776a 8eb49e4 fb2776a df438e3 fb2776a 1726149 fb2776a 5eafbe3 fb2776a df438e3 fb2776a f4e74d7 fb2776a 5eafbe3 fb2776a 1726149 fb2776a 6252133 fb2776a 1726149 fb2776a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import os
import csv
import json
import logging
import gradio as gr
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import HfApi, Repository, login
from datasets import Dataset
import pandas as pd
from datetime import datetime
import secrets
# Fetch only the NLTK resources this script uses rather than the complete
# 'all' collection: the Punkt models back word_tokenize and the WordNet
# corpus backs wordnet.synsets.
# NOTE(review): 'punkt_tab' is the resource name newer NLTK releases look up
# and 'omw-1.4' is required by some WordNet reader versions — confirm against
# the NLTK version this app is deployed with.
for _nltk_resource in ("punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(_nltk_resource)

# Application log: one timestamped file per process start, stored in logs/.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(log_dir, f"app_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Error log: a separate timestamped file in errors/, appended to by log_error().
error_dir = "errors"
os.makedirs(error_dir, exist_ok=True)
error_log_file = os.path.join(error_dir, f"errors_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
def log_error(error_msg):
    """Record an error message locally and mirror the error log to the Hub.

    The message is appended, with a timestamp, to the file named by the
    module-level ``error_log_file``. The whole file is then uploaded to the
    ``katsukiai/errors`` dataset repository under a freshly timestamped name.

    This function is called from ``except`` blocks, so it is best-effort:
    neither a failed write nor a failed upload is allowed to raise and mask
    the error that is being reported. Failures are noted in the main log.

    Args:
        error_msg: Human-readable description of the error.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        # Explicit UTF-8: messages can embed arbitrary words from user input,
        # which a locale-dependent default encoding may be unable to encode.
        with open(error_log_file, 'a', encoding='utf-8') as f:
            f.write(f"{timestamp} - ERROR - {error_msg}\n")
    except OSError as e:
        # Keep the original message in the main log so it is not lost.
        logging.error(f"Failed to write error log: {str(e)}; original error: {error_msg}")
        return

    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=error_log_file,
            path_in_repo=f"errors_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
            repo_id="katsukiai/errors",
            repo_type="dataset"
        )
    except Exception as e:
        logging.error(f"Failed to upload error log: {str(e)}")
# NOTE(review): no function in the visible part of this file references
# `tokenizer` or `model`; confirm they are needed before paying the cost of
# loading this checkpoint at start-up.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-6.7b-instruct")
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-6.7b-instruct")

# Used by process_text_to_csv() to define words that WordNet does not know.
meaning_generator = pipeline("text2text-generation", model="google/flan-t5-large")

# Read the Hub token from the environment. A made-up fallback value can never
# be a valid credential, so when the variable is missing (or empty) HF_TOKEN is
# None and the explicit login is skipped instead of being attempted with a
# random string.
HF_TOKEN = os.getenv("HF_TOKEN") or None
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    logging.warning("HF_TOKEN is not set; skipping Hugging Face login")

# Location of the CSV that process_text_to_csv() writes and
# upload_to_huggingface() reads.
dataset_dir = "dataset"
os.makedirs(dataset_dir, exist_ok=True)
csv_file = os.path.join(dataset_dir, "deepfocus_data.csv")
def process_text_to_csv(input_text):
    """Tokenize text, look up a meaning for each unique token, and save a CSV.

    The lower-cased text is tokenized with ``word_tokenize``. For every
    distinct token, up to three WordNet definitions are collected; when
    WordNet has no synsets for the token, ``meaning_generator`` is asked for a
    single definition instead. The rows are written to ``csv_file``
    (overwriting it) and also returned.

    Args:
        input_text: The raw text to process.

    Returns:
        A list of dicts, one per distinct token, with the keys
        ``"tokenizer"`` (the full token list of the input), ``"words"`` (the
        token) and ``"meaning"`` (a list of definition strings, possibly
        empty when generation failed).

    Raises:
        Exception: Any failure outside the per-word generation step is
            reported through ``log_error`` and re-raised unchanged.
    """
    try:
        tokens = word_tokenize(input_text.lower())
        # De-duplicate while keeping first-occurrence order. Iterating a set
        # of strings yields an order that can differ between interpreter
        # runs, which made the row order of the dataset non-reproducible.
        words = list(dict.fromkeys(tokens))
        data = []
        for word in tqdm(words, desc="Processing words"):
            synsets = wordnet.synsets(word)
            if synsets:
                meanings = [syn.definition() for syn in synsets[:3]]
            else:
                meanings = []
                try:
                    generated_meaning = meaning_generator(f"Define the word '{word}'", max_length=100)[0]['generated_text']
                    meanings.append(generated_meaning)
                except Exception as e:
                    # A failed generation only costs this word its meaning;
                    # the row is still emitted with an empty list.
                    log_error(f"Meaning generation failed for '{word}': {str(e)}")
            data.append({"tokenizer": tokens, "words": word, "meaning": meanings})

        # List-valued cells are stringified by the csv module when written.
        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=["tokenizer", "words", "meaning"])
            writer.writeheader()
            writer.writerows(data)
        logging.info(f"Dataset saved to {csv_file}")
        return data
    except Exception as e:
        log_error(f"Error in process_text_to_csv: {str(e)}")
        raise
def upload_to_huggingface():
    """Publish the saved CSV as the ``katsukiai/DeepFocus-X3`` Hub dataset.

    Loads ``csv_file`` into a ``Dataset`` and pushes it using ``HF_TOKEN``.

    Raises:
        Exception: Any failure is reported through ``log_error`` and then
            re-raised unchanged.
    """
    try:
        Dataset.from_csv(csv_file).push_to_hub("katsukiai/DeepFocus-X3", token=HF_TOKEN)
        logging.info("Dataset uploaded to Hugging Face")
    except Exception as exc:
        log_error(f"Error uploading to Hugging Face: {str(exc)}")
        raise
def generate_output(input_text):
    """Run the full pipeline for the "Generate and Upload" button.

    Processes the text into rows, uploads the resulting dataset, and renders
    the rows as indented JSON.

    Args:
        input_text: Text entered by the user.

    Returns:
        The rows as a JSON string, or ``"Error: <message>"`` if any step
        failed (the failure is also reported through ``log_error``).
    """
    try:
        rows = process_text_to_csv(input_text)
        upload_to_huggingface()
        rendered = json.dumps(rows, indent=2)
    except Exception as exc:
        failure = str(exc)
        log_error(f"Error in generate_output: {failure}")
        return f"Error: {failure}"
    return rendered
def view_logs():
    """Return the contents of every file in ``log_dir`` as one string.

    Each file is rendered as a ``--- <filename> ---`` header followed by its
    text. Files are listed in sorted name order so the output is stable
    (``os.listdir`` makes no ordering promise). Entries that are not regular
    files are skipped, and undecodable bytes are replaced rather than
    aborting the whole view.

    Returns:
        The concatenated log text (empty if there are no log files), or
        ``"Error: <message>"`` if the logs could not be read; in that case
        the failure is also reported through ``log_error``.
    """
    try:
        sections = []
        for log_file in sorted(os.listdir(log_dir)):
            path = os.path.join(log_dir, log_file)
            if not os.path.isfile(path):
                continue
            with open(path, 'r', errors='replace') as f:
                sections.append(f"\n\n--- {log_file} ---\n\n{f.read()}")
        return "".join(sections)
    except Exception as e:
        log_error(f"Error in view_logs: {str(e)}")
        return f"Error: {str(e)}"
# Gradio UI: three tabs — a static description, the generate/upload form, and
# a viewer for the application logs.
with gr.Blocks(title="DeepFocus-X3") as demo:
    gr.Markdown("# DeepFocus-X3")
    with gr.Tabs():
        with gr.TabItem("About"):
            gr.Markdown("""
## About DeepFocus-X3
This application processes text, tokenizes it, extracts unique words, generates meanings, and uploads the dataset to Hugging Face.
- Uses NLTK for tokenization and WordNet for meanings.
- Leverages DeepSeek AI for long text processing and Google FLAN-T5 for meaning generation.
- Logs all activities and errors, with error logs uploaded to Hugging Face.
""")
        with gr.TabItem("Generate all"):
            input_text = gr.Textbox(label="Input Text", lines=10)
            output_json = gr.Textbox(label="Output JSON", lines=10)
            generate_btn = gr.Button("Generate and Upload")
            # generate_output returns either the JSON rows or an "Error: ..." string.
            generate_btn.click(fn=generate_output, inputs=input_text, outputs=output_json)
        with gr.TabItem("Logs"):
            gr.Markdown("## Report using Logs")
            log_output = gr.Textbox(label="Log Content", lines=20)
            view_logs_btn = gr.Button("View Logs")
            view_logs_btn.click(fn=view_logs, inputs=None, outputs=log_output)

# The stray trailing "|" after this call was not valid Python and is removed.
demo.launch()