"""Gradio app that returns the corpus text most similar to the user's input.

At import time the script downloads several Hugging Face datasets, collects
the values of their ``text`` / ``content`` columns into one corpus, and fits
a TF-IDF model on it. Queries are answered by cosine similarity against that
corpus.
"""

import gradio as gr
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Hub identifiers of the datasets that make up the corpus.
# NOTE(review): identifiers are kept verbatim from the original script;
# confirm each one still resolves on the Hugging Face Hub.
DATASET_NAMES = (
    "aifeifei798/DPO_Pairs-Roleplay-NSFW",
    "Maxx0/sexting-nsfw-adultconten",
    "QuietImpostor/Claude-3-Opus-Claude-3.5-Sonnnet-9k",
    "HuggingFaceTB/everyday-conversations-llama3.1-2k",
    "Chadgpt-fam/sexting_dataset",
)

# Columns to read, in priority order: the first one present in a split wins.
TEXT_COLUMNS = ("text", "content")

# Load datasets
nsfw_datasets = [load_dataset(name) for name in DATASET_NAMES]


def _collect_texts(datasets):
    """Return every non-blank string found in the text columns of *datasets*.

    For each split, only the first column of ``TEXT_COLUMNS`` that the split
    exposes is read; splits exposing neither column are skipped. Values that
    are not ``str`` (or are blank) are dropped, because the TF-IDF vectorizer
    can only tokenize strings.
    """
    texts = []
    for dataset in datasets:
        for split in dataset.keys():
            split_data = dataset[split]
            for column in TEXT_COLUMNS:
                if column in split_data.features:
                    texts.extend(
                        value
                        for value in split_data[column]
                        if isinstance(value, str) and value.strip()
                    )
                    break
    return texts


# Prepare all texts from datasets
all_texts = _collect_texts(nsfw_datasets)
if not all_texts:
    # Fail early with an actionable message rather than letting the
    # vectorizer fail on an empty corpus.
    raise ValueError(
        "No usable text found: none of the loaded datasets exposes a "
        "non-empty 'text' or 'content' string column."
    )

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts)


def find_best_description(input_text):
    """Return the corpus entry with the highest TF-IDF cosine similarity.

    Args:
        input_text: Query string. ``None`` or a blank string yields ``""``,
            since such a query has no terms to match against the corpus.

    Returns:
        The element of ``all_texts`` most similar to ``input_text``, or an
        empty string for a blank query. When several entries tie, the one
        with the lowest index is returned (``np.argmax`` semantics).
    """
    if not input_text or not input_text.strip():
        return ""

    input_vector = vectorizer.transform([input_text])
    # Result has a single row (one query) with one column per corpus entry,
    # so the flattened argmax is the corpus index.
    similarities = cosine_similarity(input_vector, tfidf_matrix)
    most_similar_index = int(np.argmax(similarities))
    return all_texts[most_similar_index]


def generate_text(input_text):
    """Gradio callback: delegate to ``find_best_description``."""
    return find_best_description(input_text)


# Create Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(label="Enter text to describe"),
    outputs="text",
    title="NSFW Text Descriptor",
    description="Enter text to find the best description from NSFW datasets.",
    allow_flagging="never",
)

# Launch the app
if __name__ == "__main__":
    iface.launch()