# nsfwdata / app.py
# Jhakx's picture
# Create app.py
# 3ae75b1 verified
import gradio as gr
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Load datasets
# Hub repository ids of the corpora, listed in the order they are loaded.
_DATASET_REPOS = (
    "aifeifei798/DPO_Pairs-Roleplay-NSFW",
    "Maxx0/sexting-nsfw-adultconten",
    "QuietImpostor/Claude-3-Opus-Claude-3.5-Sonnnet-9k",
    "HuggingFaceTB/everyday-conversations-llama3.1-2k",
    "Chadgpt-fam/sexting_dataset",
)

# One load_dataset() call per repository, preserving the listed order.
nsfw_datasets = [load_dataset(repo_id) for repo_id in _DATASET_REPOS]
# Prepare all texts from datasets
# For every split, rows are taken from the 'text' column when it exists,
# otherwise from the 'content' column; a split exposing neither column
# contributes nothing to the corpus.
all_texts = []
for dataset in nsfw_datasets:
    for split in dataset.keys():
        columns = dataset[split].features
        if 'text' in columns:
            rows = dataset[split]['text']
        elif 'content' in columns:
            rows = dataset[split]['content']
        else:
            continue
        # Keep only non-empty strings: a None or non-string row makes the
        # TF-IDF fit below fail, and a blank row has no terms to match on.
        all_texts.extend(
            row for row in rows if isinstance(row, str) and row.strip()
        )
# Create TF-IDF vectorizer
# Fail early with an actionable message: fitting an empty corpus fails inside
# scikit-learn with an "empty vocabulary" error that does not point at the
# real cause (no dataset split exposed a usable text column).
if not all_texts:
    raise ValueError(
        "No texts were collected from the datasets; "
        "cannot build the TF-IDF index."
    )

# Fit the vocabulary and vectorize the whole corpus once at start-up so that
# each query only needs a transform() plus a similarity lookup.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts)
def find_best_description(input_text):
    """Return the corpus text most similar to *input_text*.

    The query is vectorized with the module-level ``vectorizer`` and compared
    against ``tfidf_matrix`` using cosine similarity.

    Args:
        input_text: Free-form query text.

    Returns:
        The entry of ``all_texts`` with the highest similarity to the query,
        or an empty string when the query is missing/blank or has zero
        similarity to every corpus entry.
    """
    # A missing or blank query has nothing to match on; bail out before
    # touching the vectorizer (which cannot process None).
    if not input_text or not input_text.strip():
        return ""

    input_vector = vectorizer.transform([input_text])
    # cosine_similarity yields a single row here; flatten it to index directly.
    similarities = cosine_similarity(input_vector, tfidf_matrix).ravel()
    most_similar_index = int(np.argmax(similarities))

    # argmax over an all-zero row is 0, which would present the first corpus
    # entry as the "best" match even though nothing matched at all.
    if similarities[most_similar_index] <= 0:
        return ""
    return all_texts[most_similar_index]
def generate_text(input_text):
    """Return the result of ``find_best_description`` for *input_text*.

    This is the function wired into the Gradio interface as ``fn``.
    """
    best_match = find_best_description(input_text)
    return best_match
# Create Gradio interface
# Single text input; the lookup result is rendered as plain text output.
_query_box = gr.Textbox(label="Enter text to describe")

iface = gr.Interface(
    fn=generate_text,
    inputs=_query_box,
    outputs="text",
    title="NSFW Text Descriptor",
    description="Enter text to find the best description from NSFW datasets.",
    allow_flagging="never",
)
def main():
    """Launch the Gradio interface."""
    iface.launch()


# Launch the app
if __name__ == "__main__":
    main()