|
|
|
import torch |
|
from model_loader import paraphrase_model, paraphrase_tokenizer |
|
|
|
def paraphrase_comment(comment):
    """
    Paraphrase a toxic comment using the Granite 3.2-2B-Instruct model.

    Parameters
    ----------
    comment : str
        The (potentially toxic) comment to rewrite.

    Returns
    -------
    str
        The paraphrased comment, with the echoed prompt removed and
        surrounding whitespace stripped.
    """
    # Few-shot instruction prompt; the {comment} placeholder is filled via
    # str.format below (the comment text itself is not re-formatted).
    prompt = (
        "You are a content moderator tasked with rewriting toxic comments into neutral and constructive ones while maintaining the original meaning.\n"
        "Guidelines:\n"
        "- Remove explicit hate speech, personal attacks, or offensive language.\n"
        "- Keep the response neutral and professional.\n"
        "- Ensure the rewritten comment retains the original intent but in a constructive tone.\n"
        "Examples:\n"
        "Toxic: \"You're so dumb! You never understand anything!\"\n"
        "Neutral: \"I think there's some misunderstanding. Let's clarify things.\"\n"
        "Toxic: \"This is the worst idea ever. Only an idiot would suggest this.\"\n"
        "Neutral: \"I don't think this idea works well. Maybe we can explore other options.\"\n"
        "Now, rewrite this comment: \"{comment}\""
    )

    prompt = prompt.format(comment=comment)

    inputs = paraphrase_tokenizer(
        prompt, return_tensors="pt", truncation=True, padding=True, max_length=512
    )

    with torch.no_grad():
        outputs = paraphrase_model.generate(
            **inputs,
            # FIX: the original max_length=512 capped prompt + continuation
            # together; since the prompt alone can be up to 512 tokens (the
            # tokenizer truncates there), generation could be starved.
            # max_new_tokens bounds only the generated continuation.
            max_new_tokens=512,
            num_return_sequences=1,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
        )

    # FIX: decoder-only models echo the prompt tokens at the start of the
    # output. The original code decoded the full sequence and then tried
    # `decoded.replace(prompt, "")`, which silently fails whenever
    # detokenization doesn't reproduce the prompt byte-for-byte (whitespace
    # or quote normalization), leaving the whole prompt in the result.
    # Slicing off the prompt by token count is robust to that.
    prompt_token_count = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_token_count:]
    paraphrased_comment = paraphrase_tokenizer.decode(
        generated_tokens, skip_special_tokens=True
    )

    return paraphrased_comment.strip()