import logging
import os
import random
import time
from functools import lru_cache
from typing import Dict, Optional

import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import pipeline
from translatepy import Translator
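
# Assumed third-party dependencies (not pinned anywhere in this file):
# gradio, transformers (plus a backend such as torch), translatepy, and nltk.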

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('app.log')
    ]
)
logger = logging.getLogger(__name__)


class Config:
    NLTK_DATA = os.getenv('NLTK_DATA', '/home/user/nltk_data')
    CACHE_DIR = os.getenv('CACHE_DIR', '/home/user/model_cache')
    MAX_TEXT_LENGTH = 10000
    CHUNK_SIZE = 500
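
# The defaults above assume a Hugging Face Spaces-style container where
# /home/user is writable; elsewhere they can be overridden via environment
# variables, e.g.:
#   export NLTK_DATA=/tmp/nltk_data
#   export CACHE_DIR=/tmp/model_cache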


def setup_nltk():
    """Ensure the NLTK tokenizer data used by sent_tokenize/word_tokenize is available."""
    try:
        os.makedirs(Config.NLTK_DATA, exist_ok=True)
        nltk.data.path.append(Config.NLTK_DATA)
        # 'punkt' backs the tokenizers; newer NLTK releases also look for the
        # 'punkt_tab' resource, so try to fetch both.
        required_packages = ['punkt', 'punkt_tab']
        for package in required_packages:
            try:
                nltk.data.find(f'tokenizers/{package}')
            except LookupError:
                nltk.download(package, download_dir=Config.NLTK_DATA, quiet=True)
    except Exception as e:
        logger.error(f"NLTK setup failed: {str(e)}")
        raise


class TextHumanizer:
    def __init__(self, cache_dir: str = Config.CACHE_DIR):
        """Load the detector and rewriting pipelines, the translator, and the tone prompts."""
        try:
            os.makedirs(cache_dir, exist_ok=True)

            # Classifier used to estimate how "AI-like" a piece of text reads.
            self.detector = self._init_pipeline(
                "text-classification",
                "roberta-base-openai-detector",
                cache_dir
            )

            # Seq2seq model used to rewrite the input according to the tone prompt.
            self.humanizer = self._init_pipeline(
                "text2text-generation",
                "facebook/bart-large-cnn",
                cache_dir
            )

            self.translator = Translator()

            self.tone_prompts = {
                "Casual": [
                    "Rewrite this casually as if you're texting a friend: {text}",
                    "Make this sound like natural conversation: {text}",
                    "Convert this to everyday spoken English: {text}"
                ],
                "Business": [
                    "Rephrase this in professional corporate language: {text}",
                    "Transform this into formal business communication: {text}",
                    "Rewrite for a professional email: {text}"
                ],
                "Academic": [
                    "Rephrase this in scholarly academic language: {text}",
                    "Convert to academic paper style: {text}",
                    "Rewrite for a research publication: {text}"
                ],
                "Creative": [
                    "Transform this into vivid, imaginative writing: {text}",
                    "Rewrite with creative metaphors and sensory details: {text}",
                    "Convert to engaging storytelling style: {text}"
                ]
            }

            self.human_patterns = self._load_patterns()

        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            raise

    @staticmethod
    def _init_pipeline(task: str, model: str, cache_dir: str, max_retries: int = 3):
        """Initialize a transformers pipeline with exponential-backoff retries."""
        for attempt in range(max_retries):
            try:
                # cache_dir is forwarded via model_kwargs so it reaches
                # from_pretrained(); device=-1 keeps inference on the CPU.
                return pipeline(
                    task,
                    model=model,
                    model_kwargs={"cache_dir": cache_dir},
                    device=-1
                )
            except Exception as e:
                if attempt == max_retries - 1:
                    raise
                logger.warning(f"Pipeline initialization attempt {attempt + 1} failed: {str(e)}")
                time.sleep(2 ** attempt)

    @staticmethod
    def _load_patterns():
        """Load human-like patterns: filler words, contractions, and sentence variants."""
        return {
            'fillers': ["well", "you know", "actually", "I mean", "basically",
                        "to be honest", "kind of", "sort of", "like"],
            'contractions': {
                "cannot": "can't",
                "could not": "couldn't",
                "would not": "wouldn't",
                "is not": "isn't",
                "do not": "don't",
                "will not": "won't",
                "should not": "shouldn't",
                "have not": "haven't"
            },
            'sentence_variants': [
                lambda s: s.lower(),
                lambda s: s.capitalize(),
                lambda s: s[:-1] + ", which is interesting." if s.endswith('.') else s,
                lambda s: s[:-1] + ", you know?" if s.endswith('.') else s,
                lambda s: s[:-1] + "..." if s.endswith('.') else s
            ]
        }

    @lru_cache(maxsize=1000)
    def _add_human_touches(self, text: str) -> str:
        """Apply several layers of human-like edits (fillers, splits, contractions).

        Results are memoized, so repeated inputs return the same output even
        though the individual edits are randomized.
        """
        try:
            sentences = sent_tokenize(text)

            modified_sentences = []
            for sent in sentences:
                # Occasionally open a sentence with a conversational filler.
                if random.random() < 0.4:
                    filler = random.choice(self.human_patterns['fillers'])
                    sent = f"{filler}, {sent.lower()}"

                # Occasionally split long sentences roughly in half.
                if len(sent.split()) > 12 and random.random() < 0.3:
                    words = word_tokenize(sent)
                    split_point = len(words) // 2 + random.randint(-2, 2)
                    modified_sentences.extend([
                        ' '.join(words[:split_point]) + ',',
                        ' '.join(words[split_point:])
                    ])
                else:
                    modified_sentences.append(sent)

            # Swap formal phrasings for contractions.
            text = ' '.join(modified_sentences)
            for formal, casual in self.human_patterns['contractions'].items():
                text = text.replace(f" {formal} ", f" {casual} ")

            # Apply a random surface-level variant to most sentences.
            final_sentences = []
            for sent in sent_tokenize(text):
                if random.random() < 0.7:
                    sent = random.choice(self.human_patterns['sentence_variants'])(sent)
                final_sentences.append(sent)

            return ' '.join(final_sentences)

        except Exception as e:
            logger.error(f"Humanization error: {str(e)}")
            return text

    def detect_ai_text(self, text: str) -> float:
        """Estimate how likely the text is AI-generated, averaged over fixed-size chunks."""
        try:
            if not text.strip():
                return 0.0

            chunks = [text[i:i + Config.CHUNK_SIZE] for i in range(0, len(text), Config.CHUNK_SIZE)]
            scores = []

            for chunk in chunks:
                # Very short chunks give unreliable scores, so skip them.
                if len(chunk.strip()) < 50:
                    continue
                result = self.detector(chunk)[0]
                # The detector labels text as machine-generated ("Fake"/"ARTIFICIAL",
                # depending on the checkpoint's config) or human-written ("Real").
                if result['label'].lower() in ('fake', 'artificial'):
                    scores.append(result['score'])
                else:
                    scores.append(1.0 - result['score'])

            return sum(scores) / len(scores) if scores else 0.0

        except Exception as e:
            logger.error(f"Detection error: {str(e)}")
            return 0.0

    def humanize_text(self, text: str, tone: str, translate_to: Optional[str] = None) -> str:
        """Rewrite text in the requested tone, re-check its AI score, and optionally translate."""
        try:
            if not text or len(text) > Config.MAX_TEXT_LENGTH:
                raise ValueError(f"Text must be between 1 and {Config.MAX_TEXT_LENGTH} characters")

            metrics = {'start_time': time.time()}

            original_score = self.detect_ai_text(text)
            logger.info(f"Initial AI score: {original_score:.2f}")

            # Rewrite the text with a randomly chosen prompt for the selected tone.
            prompt = random.choice(self.tone_prompts[tone]).format(text=text)
            generated = self.humanizer(
                prompt,
                # max_length counts tokens; the character count is only a rough ceiling.
                max_length=min(len(text) * 2, 1024),
                do_sample=True,  # temperature/top_p only take effect when sampling is enabled
                temperature=0.9,
                top_p=0.95,
                num_beams=4,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3
            )[0]['generated_text']

            humanized = self._add_human_touches(generated)
            final_score = self.detect_ai_text(humanized)

            # If the rewrite barely moved the detector score, run one more pass.
            if final_score > original_score * 0.8:
                logger.info("Applying additional humanization pass")
                humanized = self._add_human_touches(humanized)

            if translate_to and translate_to != "None":
                try:
                    # Dropdown entries look like "es (Spanish)"; keep only the language code.
                    lang_code = translate_to.split()[0]
                    humanized = self.translator.translate(humanized, lang_code).result
                except Exception as e:
                    logger.error(f"Translation failed: {str(e)}")
                    raise ValueError(f"Translation failed: {str(e)}")

            metrics['processing_time'] = time.time() - metrics['start_time']
            logger.info(f"Processing completed in {metrics['processing_time']:.2f} seconds")

            return humanized

        except Exception as e:
            logger.error(f"Humanization failed: {str(e)}")
            raise


def create_interface():
    """Build the Gradio interface and launch the app."""
    try:
        # Fetch tokenizer data before loading the models so setup failures surface early.
        setup_nltk()
        humanizer = TextHumanizer()

        def process_text(text: str, tone: str, translate_to: str) -> Dict:
            try:
                if not text.strip():
                    return {
                        "data": ["Please enter some text to process"],
                        "success": False,
                        "error": "Empty input"
                    }

                start_time = time.time()
                result = humanizer.humanize_text(text, tone, translate_to)
                processing_time = time.time() - start_time

                return {
                    "data": [result],
                    "success": True,
                    "metrics": {
                        "processing_time": round(processing_time, 2),
                        "characters_processed": len(text),
                        "words_processed": len(text.split())
                    }
                }
            except Exception as e:
                logger.error(f"Text processing failed: {str(e)}")
                return {
                    "data": [],
                    "success": False,
                    "error": str(e)
                }

        iface = gr.Interface(
            fn=process_text,
            inputs=[
                gr.Textbox(
                    label="Input Text",
                    lines=5,
                    placeholder="Enter text to humanize..."
                ),
                gr.Dropdown(
                    choices=list(humanizer.tone_prompts.keys()),
                    label="Writing Style",
                    value="Casual"
                ),
                gr.Dropdown(
                    choices=["None"] + [f"{c} ({n})" for c, n in [
                        ("da", "Danish"), ("no", "Norwegian"),
                        ("sv", "Swedish"), ("es", "Spanish"),
                        ("fr", "French"), ("de", "German")
                    ]],
                    label="Translate to",
                    value="None"
                )
            ],
            outputs=gr.JSON(),
            title="Advanced AI Text Humanizer",
            description="Transform AI-generated text into more natural, human-like writing",
            examples=[
                ["Large language models demonstrate remarkable capabilities in natural language understanding tasks.", "Casual", "None"],
                ["The implementation requires careful consideration of multiple interdependent factors.", "Business", "es (Spanish)"]
            ],
            flagging_mode="never"
        )

        iface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=True
        )

    except Exception as e:
        logger.error(f"Interface creation failed: {str(e)}")
        raise


if __name__ == "__main__":
    create_interface()
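
# Minimal sketch of programmatic use without the Gradio UI (assumes the models
# and NLTK data can be downloaded in the current environment):
#
#   setup_nltk()
#   humanizer = TextHumanizer()
#   print(humanizer.humanize_text(
#       "Large language models demonstrate remarkable capabilities in natural language understanding tasks.",
#       tone="Casual",
#   ))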