Fold / app.py
Jahadu's picture
Update app.py
da013d9 verified
import gradio as gr
from transformers import pipeline
from translatepy import Translator
import logging
import random
import time
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import os
from typing import Dict, Optional
from functools import lru_cache
# Logging setup: every record is sent both to the console stream and to the
# file ``app.log``, using one shared timestamped format.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_LOG_HANDLERS = [logging.StreamHandler(), logging.FileHandler('app.log')]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_LOG_HANDLERS)

# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)
# Application settings; the two directories may be overridden via environment
# variables of the same name.
class Config:
    """Namespace of application-wide settings, evaluated once at import time."""

    # Directory added to NLTK's search path and used as its download target.
    NLTK_DATA = os.environ.get('NLTK_DATA', '/home/user/nltk_data')
    # Default cache directory handed to TextHumanizer for model files.
    CACHE_DIR = os.environ.get('CACHE_DIR', '/home/user/model_cache')
    # Upper bound (in characters) on the text accepted for humanization.
    MAX_TEXT_LENGTH = 10_000
    # Size (in characters) of the slices fed to the AI-text detector.
    CHUNK_SIZE = 500
# Make sure the NLTK tokenizer data needed by sent_tokenize/word_tokenize exists
def setup_nltk():
    """Register the NLTK data directory and fetch missing tokenizer resources.

    Creates ``Config.NLTK_DATA`` if needed, adds it to ``nltk.data.path`` (once)
    and downloads each required tokenizer package that ``nltk.data.find``
    cannot locate.

    Raises:
        Exception: any error raised while preparing the directory or looking
            up/downloading a package is logged and re-raised unchanged.
    """
    try:
        os.makedirs(Config.NLTK_DATA, exist_ok=True)
        # Avoid growing the search path when this function is called repeatedly.
        if Config.NLTK_DATA not in nltk.data.path:
            nltk.data.path.append(Config.NLTK_DATA)

        # 'punkt' is the legacy pickled tokenizer model; recent NLTK releases
        # load 'punkt_tab' instead, so both are requested to work on either.
        required_packages = ('punkt', 'punkt_tab')
        for package in required_packages:
            try:
                nltk.data.find(f'tokenizers/{package}')
            except LookupError:
                downloaded = nltk.download(
                    package, download_dir=Config.NLTK_DATA, quiet=True
                )
                # nltk.download reports failure through its return value, not
                # by raising; surface that instead of ignoring it. This stays
                # a warning because only one of the two packages is needed by
                # any given NLTK version.
                if not downloaded:
                    logger.warning("NLTK package %r could not be downloaded", package)
    except Exception as e:
        logger.error(f"NLTK setup failed: {str(e)}")
        raise
class TextHumanizer:
    """Rewrite text in a requested tone and layer randomized edits on top.

    An instance owns two ``transformers`` pipelines (a text classifier used as
    an AI-text detector and a text2text generator used for rewriting), a
    ``translatepy`` translator, the per-tone prompt templates and the pattern
    tables consumed by ``_add_human_touches``.
    """

    # Detector labels (lower-cased) that are counted as machine-written.
    # NOTE(review): "artificial" is the label the original code tested for;
    # "fake" is what the roberta-base-openai-detector model card appears to
    # emit -- confirm against the deployed model's ``id2label`` mapping.
    _AI_LABELS = frozenset({"artificial", "fake"})

    def __init__(self, cache_dir: str = Config.CACHE_DIR):
        """Create the cache directory and load models, translator and patterns.

        Args:
            cache_dir: directory in which downloaded model files are cached.

        Raises:
            Exception: any failure during setup is logged and re-raised.
        """
        try:
            os.makedirs(cache_dir, exist_ok=True)
            # Pipelines are created through _init_pipeline, which retries
            # with exponential backoff.
            self.detector = self._init_pipeline(
                "text-classification",
                "roberta-base-openai-detector",
                cache_dir
            )
            self.humanizer = self._init_pipeline(
                "text2text-generation",
                "facebook/bart-large-cnn",
                cache_dir
            )
            self.translator = Translator()
            # Move prompts to a separate configuration file in production
            self.tone_prompts = {
                "Casual": [
                    "Rewrite this casually as if you're texting a friend: {text}",
                    "Make this sound like natural conversation: {text}",
                    "Convert this to everyday spoken English: {text}"
                ],
                "Business": [
                    "Rephrase this in professional corporate language: {text}",
                    "Transform this into formal business communication: {text}",
                    "Rewrite for a professional email: {text}"
                ],
                "Academic": [
                    "Rephrase this in scholarly academic language: {text}",
                    "Convert to academic paper style: {text}",
                    "Rewrite for a research publication: {text}"
                ],
                "Creative": [
                    "Transform this into vivid, imaginative writing: {text}",
                    "Rewrite with creative metaphors and sensory details: {text}",
                    "Convert to engaging storytelling style: {text}"
                ]
            }
            self.human_patterns = self._load_patterns()
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            raise

    @staticmethod
    def _init_pipeline(task: str, model: str, cache_dir: str, max_retries: int = 3):
        """Build a CPU ``transformers`` pipeline, retrying on failure.

        Args:
            task: pipeline task name.
            model: model identifier to load.
            cache_dir: directory used to cache the downloaded model files.
            max_retries: total number of attempts (must be >= 1).

        Returns:
            The constructed pipeline object.

        Raises:
            ValueError: if ``max_retries`` is smaller than 1.
            Exception: the error of the last attempt once retries run out.
        """
        if max_retries < 1:
            # Without this guard the loop body never runs and None is returned.
            raise ValueError("max_retries must be at least 1")
        for attempt in range(max_retries):
            try:
                # cache_dir must travel through model_kwargs so it reaches
                # from_pretrained(); as a bare keyword it would be treated as
                # a call-time pipeline parameter instead.
                return pipeline(
                    task,
                    model=model,
                    model_kwargs={"cache_dir": cache_dir},
                    device=-1,
                )
            except Exception as e:
                if attempt == max_retries - 1:
                    raise
                logger.warning(f"Pipeline initialization attempt {attempt + 1} failed: {str(e)}")
                time.sleep(2 ** attempt)  # Exponential backoff

    @staticmethod
    def _load_patterns():
        """Return the filler words, contraction map and sentence rewriters."""
        return {
            'fillers': ["well", "you know", "actually", "I mean", "basically",
                        "to be honest", "kind of", "sort of", "like"],
            'contractions': {
                "cannot": "can't",
                "could not": "couldn't",
                "would not": "wouldn't",
                "is not": "isn't",
                "do not": "don't",
                "will not": "won't",
                "should not": "shouldn't",
                "have not": "haven't"
            },
            # Each entry maps one sentence to a variant; the last three only
            # act on sentences ending with a period.
            'sentence_variants': [
                lambda s: s.lower(),
                lambda s: s.capitalize(),
                lambda s: s[:-1] + ", which is interesting." if s.endswith('.') else s,
                lambda s: s[:-1] + ", you know?" if s.endswith('.') else s,
                lambda s: s[:-1] + "..." if s.endswith('.') else s
            ]
        }

    # Deliberately NOT wrapped in lru_cache: the output is randomized, so a
    # cache would replay one frozen result per input, and caching a bound
    # method also keeps every instance alive for the cache's lifetime.
    def _add_human_touches(self, text: str) -> str:
        """Apply randomized fillers, sentence splits, contractions and variants.

        Args:
            text: the text to modify.

        Returns:
            The modified text. If any step raises, the error is logged and the
            text as it stood at that point is returned.
        """
        try:
            sentences = sent_tokenize(text)
            modified_sentences = []
            for sent in sentences:
                # 40% chance: prepend a filler and lower-case the sentence.
                if random.random() < 0.4:
                    filler = random.choice(self.human_patterns['fillers'])
                    sent = f"{filler}, {sent.lower()}"
                # 30% chance for sentences over 12 words: break them in two
                # near the middle (offset by up to two tokens either way).
                if len(sent.split()) > 12 and random.random() < 0.3:
                    words = word_tokenize(sent)
                    split_point = len(words) // 2 + random.randint(-2, 2)
                    modified_sentences.extend([
                        ' '.join(words[:split_point]) + ',',
                        ' '.join(words[split_point:])
                    ])
                else:
                    modified_sentences.append(sent)

            # Replace formal phrases with contractions (space-delimited
            # matches only).
            text = ' '.join(modified_sentences)
            for formal, casual in self.human_patterns['contractions'].items():
                text = text.replace(f" {formal} ", f" {casual} ")

            # Re-tokenize and give each sentence a 70% chance of a variant.
            final_sentences = []
            for sent in sent_tokenize(text):
                if random.random() < 0.7:
                    sent = random.choice(self.human_patterns['sentence_variants'])(sent)
                final_sentences.append(sent)
            return ' '.join(final_sentences)
        except Exception as e:
            logger.error(f"Humanization error: {str(e)}")
            return text

    def detect_ai_text(self, text: str) -> float:
        """Score how machine-written ``text`` looks according to the detector.

        The text is cut into ``Config.CHUNK_SIZE``-character slices; slices
        with fewer than 50 non-padding characters are skipped. The result is
        the mean detector score over the slices labelled as AI-written.

        Args:
            text: the text to score.

        Returns:
            The mean score, or 0.0 when the text is blank, no slice is
            labelled as AI-written, or detection raises (error is logged).
        """
        try:
            if not text.strip():
                return 0.0
            size = Config.CHUNK_SIZE
            scores = []
            for start in range(0, len(text), size):
                chunk = text[start:start + size]
                if len(chunk.strip()) < 50:  # Skip very short chunks
                    continue
                result = self.detector(chunk)[0]
                # Case-insensitive match against every known "AI" label; a
                # single hard-coded spelling silently yields 0.0 whenever the
                # model names its classes differently.
                if str(result['label']).lower() in self._AI_LABELS:
                    scores.append(result['score'])
            return sum(scores) / len(scores) if scores else 0.0
        except Exception as e:
            logger.error(f"Detection error: {str(e)}")
            return 0.0

    def humanize_text(self, text: str, tone: str, translate_to: Optional[str] = None) -> str:
        """Rewrite ``text`` in ``tone``, humanize it and optionally translate it.

        Args:
            text: input text; must be non-empty and at most
                ``Config.MAX_TEXT_LENGTH`` characters long.
            tone: one of the keys of ``self.tone_prompts``.
            translate_to: ``None``/``"None"`` for no translation, otherwise a
                string whose first whitespace-separated token is the target
                language code (e.g. ``"es (Spanish)"``).

        Returns:
            The processed text.

        Raises:
            ValueError: for invalid text length, an unknown tone, or a failed
                translation.
            Exception: any other failure is logged and re-raised.
        """
        try:
            if not text or len(text) > Config.MAX_TEXT_LENGTH:
                raise ValueError(f"Text must be between 1 and {Config.MAX_TEXT_LENGTH} characters")
            if tone not in self.tone_prompts:
                # Report a readable error instead of a bare KeyError.
                raise ValueError(
                    f"Unknown tone {tone!r}; expected one of {sorted(self.tone_prompts)}"
                )

            start_time = time.time()
            original_score = self.detect_ai_text(text)
            logger.info(f"Initial AI score: {original_score:.2f}")

            prompt = random.choice(self.tone_prompts[tone]).format(text=text)
            generated = self.humanizer(
                prompt,
                max_length=min(len(text) * 2, 1024),
                # temperature/top_p only take effect when sampling is on.
                do_sample=True,
                temperature=0.9,
                top_p=0.95,
                num_beams=4,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3
            )[0]['generated_text']

            humanized = self._add_human_touches(generated)
            final_score = self.detect_ai_text(humanized)

            # Run a second pass when the detector score did not drop below
            # 80% of the original score.
            if final_score > original_score * 0.8:
                logger.info("Applying additional humanization pass")
                humanized = self._add_human_touches(humanized)

            if translate_to and translate_to != "None":
                try:
                    lang_code = translate_to.split()[0]
                    humanized = self.translator.translate(humanized, lang_code).result
                except Exception as e:
                    logger.error(f"Translation failed: {str(e)}")
                    # Chain the cause so the original traceback is kept.
                    raise ValueError(f"Translation failed: {str(e)}") from e

            processing_time = time.time() - start_time
            logger.info(f"Processing completed in {processing_time:.2f} seconds")
            return humanized
        except Exception as e:
            logger.error(f"Humanization failed: {str(e)}")
            raise
def create_interface():
    """Build the Gradio UI around a ``TextHumanizer`` and launch it.

    Loads the humanizer, prepares NLTK data, wires a text box and two
    dropdowns to a JSON output and starts the server on 0.0.0.0:7860 with
    sharing enabled. Any failure is logged and re-raised.
    """
    try:
        humanizer = TextHumanizer()
        setup_nltk()

        def process_text(text: str, tone: str, translate_to: str) -> Dict:
            """Run one request and wrap the outcome in a JSON-friendly dict."""
            try:
                # Guard clause: whitespace-only input is rejected up front.
                if not text.strip():
                    return {
                        "data": ["Please enter some text to process"],
                        "success": False,
                        "error": "Empty input"
                    }

                started = time.time()
                rewritten = humanizer.humanize_text(text, tone, translate_to)
                elapsed = time.time() - started

                stats = {
                    "processing_time": round(elapsed, 2),
                    "characters_processed": len(text),
                    "words_processed": len(text.split())
                }
                return {"data": [rewritten], "success": True, "metrics": stats}
            except Exception as exc:
                logger.error(f"Text processing failed: {str(exc)}")
                return {"data": [], "success": False, "error": str(exc)}

        # Dropdown entries look like "es (Spanish)"; "None" disables translation.
        languages = [
            ("da", "Danish"), ("no", "Norwegian"),
            ("sv", "Swedish"), ("es", "Spanish"),
            ("fr", "French"), ("de", "German")
        ]
        language_choices = ["None"] + [f"{code} ({label})" for code, label in languages]

        # Components are created in the same order as they appear in the UI.
        text_input = gr.Textbox(
            label="Input Text",
            lines=5,
            placeholder="Enter text to humanize..."
        )
        tone_input = gr.Dropdown(
            choices=list(humanizer.tone_prompts),
            label="Writing Style",
            value="Casual"
        )
        language_input = gr.Dropdown(
            choices=language_choices,
            label="Translate to",
            value="None"
        )
        json_output = gr.JSON()

        sample_rows = [
            ["Large language models demonstrate remarkable capabilities in natural language understanding tasks.", "Casual", "None"],
            ["The implementation requires careful consideration of multiple interdependent factors.", "Business", "es (Spanish)"]
        ]

        iface = gr.Interface(
            fn=process_text,
            inputs=[text_input, tone_input, language_input],
            outputs=json_output,
            title="Advanced AI Text Humanizer",
            description="Transform AI-generated text into more natural, human-like writing",
            examples=sample_rows,
            flagging_mode=None
        )
        iface.launch(server_name="0.0.0.0", server_port=7860, share=True)
    except Exception as exc:
        logger.error(f"Interface creation failed: {str(exc)}")
        raise
# Script entry point: build and launch the Gradio app only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    create_interface()