# classifier.py
import time

import numpy as np
import torch

from model_loader import classifier_model
from paraphraser import paraphrase_comment
from metrics import (
    compute_semantic_similarity,
    compute_empathy_score,
    compute_bias_score,
    compute_hallucination_score,
)


def softmax(logits):
    """Convert raw classifier logits into probabilities (numerically stable)."""
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()
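
# Illustrative sanity check for softmax (approximate values, documentation only):
#   softmax(np.array([2.0, 0.5])) ≈ array([0.818, 0.182])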

def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # Score the paraphrase for toxicity with the classifier
        inputs = classifier_model.tokenizer(
            paraphrased,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        ).to(classifier_model.device)
        with torch.no_grad():
            logits = classifier_model.model(**inputs).logits.cpu().numpy()[0]
        toxicity = float(softmax(logits)[1])  # Label 1 is assumed to be "toxic"

        # Compute the remaining metrics, defaulting to a neutral 0.5 when a score is unavailable
        empathy = compute_empathy_score(paraphrased) or 0.5
        bias = compute_bias_score(paraphrased) or 0.5
        hallucination = compute_hallucination_score(original, paraphrased) or 0.5

        # Overall reward: weighted combination (adjust weights as needed), clamped to [0.0, 1.0]
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = max(0.0, min(1.0, round(reward, 2)))

        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward,
        }
    except Exception as e:
        print(f"Error computing reward scores: {str(e)}")
        # Neutral fallback scores when any metric or the classifier fails
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5,
        }
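
# Worked example of the reward weighting above (hypothetical metric values):
#   empathy=0.9, toxicity=0.1, bias=0.1, hallucination=0.2
#   reward = 0.4*0.9 - 0.2*0.1 - 0.2*0.1 - 0.2*0.2 = 0.36 - 0.02 - 0.02 - 0.04 = 0.28
#   which already lies inside the [0.0, 1.0] clamp.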

def classify_toxic_comment(comment):
    """
    Classify a comment for toxicity and compute additional metrics.
    Returns a dictionary with classification results and scores.
    """
    try:
        start_time = time.time()
        print("Starting classification...")

        # Tokenize the comment
        inputs = classifier_model.tokenizer(
            comment,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        ).to(classifier_model.device)

        # Classify using the toxicity classifier
        with torch.no_grad():
            outputs = classifier_model.model(**inputs)
        logits = outputs.logits.cpu().numpy()[0]
        probs = softmax(logits)
        toxicity = float(probs[1])  # Label 1 is assumed to be "toxic"
        print(f"Classification took {time.time() - start_time:.2f} seconds")

        # Compute the additional metrics (empathy, bias, hallucination, reward),
        # using the comment as both original and paraphrase since no paraphrase exists yet
        scores = compute_reward_scores(comment, comment)
        scores["toxicity"] = toxicity  # Override toxicity with the classifier result above
        print(f"Total processing time: {time.time() - start_time:.2f} seconds")
        return scores
    except Exception as e:
        print(f"Error during classification: {str(e)}")
        # Worst-case fallback: treat the comment as fully toxic with zero reward
        return {
            "empathy": 0.0,
            "toxicity": 1.0,
            "bias": 1.0,
            "hallucination": 1.0,
            "reward": 0.0,
        }
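
# Minimal usage sketch, assuming model_loader and metrics resolve in this
# environment; the sample comment string below is purely illustrative.
if __name__ == "__main__":
    sample = "Nobody cares about your opinion, just stop posting."
    scores = classify_toxic_comment(sample)
    print("Toxicity:", scores["toxicity"])
    print("Overall reward:", scores["reward"])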