|
import gradio as gr |
|
from datasets import load_dataset |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import numpy as np |
|
|
|
|
|
# Load each source dataset by name, in the listed order.
nsfw_datasets = [
    load_dataset(dataset_name)
    for dataset_name in (
        "aifeifei798/DPO_Pairs-Roleplay-NSFW",
        "Maxx0/sexting-nsfw-adultconten",
        "QuietImpostor/Claude-3-Opus-Claude-3.5-Sonnnet-9k",
        "HuggingFaceTB/everyday-conversations-llama3.1-2k",
        "Chadgpt-fam/sexting_dataset",
    )
]
|
|
|
|
|
# Flatten every split of every loaded dataset into one flat corpus.
# A split contributes its 'text' column when it has one, otherwise its
# 'content' column; a split with neither column contributes nothing.
all_texts = []

for dataset in nsfw_datasets:
    for split_data in dataset.values():
        columns = split_data.features
        if 'text' in columns:
            column = 'text'
        elif 'content' in columns:
            column = 'content'
        else:
            continue
        # Keep only non-blank strings. The TF-IDF vectorizer fitted on this
        # list expects string documents, so null or non-string cells are
        # dropped here instead of failing later during fitting.
        all_texts.extend(
            value
            for value in split_data[column]
            if isinstance(value, str) and value.strip()
        )
|
|
|
|
|
# Build the TF-IDF index used for similarity lookups: row i of
# tfidf_matrix is the vector for all_texts[i].
if not all_texts:
    # Fail early with the real cause; fitting on an empty corpus would
    # otherwise surface as a less helpful error from the vectorizer.
    raise ValueError(
        "No texts were collected: none of the loaded dataset splits has a "
        "'text' or 'content' column."
    )

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts)
|
|
|
def find_best_description(input_text):
    """Return the corpus entry most similar to ``input_text``.

    The query is transformed with the fitted ``vectorizer`` and compared
    against every row of ``tfidf_matrix`` by cosine similarity.

    Args:
        input_text: Query string. ``None`` or a blank string is treated as
            an empty query.

    Returns:
        The element of ``all_texts`` with the highest similarity score, or
        an empty string when the query is empty or scores zero against
        every corpus entry.
    """
    if not input_text or not input_text.strip():
        return ""

    input_vector = vectorizer.transform([input_text])
    # One query document was passed in, so take the single row of scores.
    scores = cosine_similarity(input_vector, tfidf_matrix)[0]
    best_index = int(np.argmax(scores))

    # When every score is zero, argmax just yields index 0, which would be
    # an arbitrary corpus entry rather than a genuine match.
    if scores[best_index] <= 0:
        return ""
    return all_texts[best_index]
|
|
|
def generate_text(input_text):
    """Return the result of ``find_best_description`` for ``input_text``."""
    best_match = find_best_description(input_text)
    return best_match
|
|
|
|
|
# Gradio front end: one labelled textbox in, plain text out, routed
# through generate_text, with flagging turned off.
iface = gr.Interface(
    title="NSFW Text Descriptor",
    description="Enter text to find the best description from NSFW datasets.",
    fn=generate_text,
    inputs=gr.Textbox(label="Enter text to describe"),
    outputs="text",
    allow_flagging="never",
)


# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|
|