|
import os |
|
import sys |
|
from tqdm import tqdm |
|
import numpy as np |
|
import torch |
|
import matplotlib.pyplot as plt |
|
from transformers import GPT2LMHeadModel, GPT2TokenizerFast |
|
from bert_score import BERTScorer |
|
from bert_score.utils import model2layers |
|
from nltk.tokenize import word_tokenize |
|
from Levenshtein import distance as levenshtein_distance |
|
from sentence_transformers import SentenceTransformer |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from scipy.spatial.distance import cdist |
|
from scipy.optimize import linear_sum_assignment |
|
# Directory containing this file; both paths below are built relative to it.
_MODULE_DIR = os.path.dirname(__file__)

# Append the parent directory to the import path before the import below
# (presumably so the sibling `config` package resolves when this file is run
# directly -- confirm against the repository layout).
sys.path.append(os.path.abspath(os.path.join(_MODULE_DIR, '..')))

from config.config import load_config

# Location of the configuration file, relative to this module.
config_path = os.path.join(_MODULE_DIR, '..', 'config', 'config.yaml')

# The ['PECCAVI_TEXT']['Metrics'] section of the loaded configuration.
# NOTE: this is evaluated at import time.
config = load_config(config_path)['PECCAVI_TEXT']['Metrics']
|
|
|
class SentenceDistortionCalculator:
    """
    Calculate and analyze distortion metrics between an original sentence and
    a list of modified (paraphrased) sentences.

    For every modified sentence three dissimilarity values are computed
    (normalized word-level Levenshtein distance, ``1 - BERTScore F1`` and
    ``1 - MoverScore``-style similarity). Each metric is then min-max
    normalized across all sentences and the three normalized values are
    combined with a root mean square.

    All result dictionaries are keyed ``"Sentence_<n>"`` where ``n`` is the
    1-based position of the sentence in ``paraphrased_sentences``.

    Typical call order: ``calculate_all_metrics()``, ``normalize_metrics()``,
    ``calculate_combined_distortion()``, then ``plot_metrics()`` and/or the
    ``get_*`` accessors.
    """

    def __init__(self, config, original_sentence, paraphrased_sentences):
        """
        Initialize the calculator with the original sentence and a list of modified sentences.

        Args:
            config: Mapping whose ``'Distortion'`` entry is handed to the
                GPT-2 ``from_pretrained`` loaders.
            original_sentence: The reference sentence.
            paraphrased_sentences: Sequence of sentences to compare against
                the reference.
        """
        self.original_sentence = original_sentence
        self.paraphrased_sentences = paraphrased_sentences

        # Raw per-sentence metrics, filled by calculate_all_metrics().
        self.levenshtein_distances = {}
        self.bert_scores = {}
        self.mover_scores = {}

        # Min-max normalized metrics, filled by normalize_metrics().
        self.normalized_levenshtein = {}
        self.normalized_bert_scores = {}
        self.normalized_mover_scores = {}

        # RMS of the normalized metrics, filled by calculate_combined_distortion().
        self.combined_distortions = {}

        # SentenceTransformer instances keyed by model name, so the encoder is
        # loaded once instead of once per scored sentence.
        self._sentence_encoders = {}

        # None of the metric methods in this class read these two attributes;
        # they are kept because they are public instance attributes.
        self.tokenizer = GPT2TokenizerFast.from_pretrained(config['Distortion'])
        self.model = GPT2LMHeadModel.from_pretrained(config['Distortion'])
        self.model.eval()

    def calculate_all_metrics(self):
        """
        Calculate all distortion metrics for each modified sentence.

        Fills ``levenshtein_distances``, ``bert_scores`` and ``mover_scores``.
        """
        for idx, modified_sentence in tqdm(
            enumerate(self.paraphrased_sentences),
            total=len(self.paraphrased_sentences),
            desc="Calculating Metrics",
        ):
            key = f"Sentence_{idx+1}"
            self.levenshtein_distances[key] = self._calculate_levenshtein_distance(modified_sentence)
            self.bert_scores[key] = self._calculate_bert_score(modified_sentence)
            self.mover_scores[key] = self._calculate_mover_score(modified_sentence)

    def normalize_metrics(self):
        """
        Normalize all metrics to be between 0 and 1.

        Each metric is min-max normalized independently across sentences.
        """
        # The single-iteration bar only serves as a status line.
        for _ in tqdm(range(1), desc="Normalizing Metrics"):
            self.normalized_levenshtein = self._normalize_dict(self.levenshtein_distances)
            self.normalized_bert_scores = self._normalize_dict(self.bert_scores)
            self.normalized_mover_scores = self._normalize_dict(self.mover_scores)

    def calculate_combined_distortion(self):
        """
        Calculate the combined distortion using the root mean square of the normalized metrics.

        Fills ``combined_distortions`` with one value per sentence key.
        """
        # The single-iteration bar only serves as a status line.
        for _ in tqdm(range(1), desc="Calculating Combined Distortion"):
            for key in self.normalized_levenshtein:
                squared_sum = (
                    self.normalized_levenshtein[key] ** 2
                    + self.normalized_bert_scores[key] ** 2
                    + self.normalized_mover_scores[key] ** 2
                )
                self.combined_distortions[key] = np.sqrt(squared_sum / 3)

    def plot_metrics(self):
        """
        Plot each normalized metric and the combined distortion in separate graphs.

        One figure is shown per metric; each line gets a random color.
        """
        keys = list(self.normalized_levenshtein)
        indices = np.arange(len(keys))

        metrics = {
            'Levenshtein Distance': [self.normalized_levenshtein[key] for key in keys],
            'BERTScore': [self.normalized_bert_scores[key] for key in keys],
            'MOVERscore': [self.normalized_mover_scores[key] for key in keys],
            'Combined Distortion': [self.combined_distortions[key] for key in keys],
        }

        for metric_name, values in tqdm(metrics.items(), desc="Plotting Metrics"):
            plt.figure(figsize=(12, 6))
            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
            plt.xlabel('Sentence Index')
            plt.ylabel('Normalized Value (0-1)')
            plt.title(f'Normalized {metric_name}')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

    def _calculate_levenshtein_distance(self, modified_sentence):
        """
        Calculate the word-level Levenshtein distance between the original and modified sentence.

        Returns:
            The edit distance between the two token lists divided by the
            length of the longer list, or ``0.0`` when both sentences
            tokenize to nothing.
        """
        original_words = word_tokenize(self.original_sentence)
        modified_words = word_tokenize(modified_sentence)

        longest = max(len(original_words), len(modified_words))
        if longest == 0:
            # Two empty token lists are identical; dividing would raise
            # ZeroDivisionError.
            return 0.0

        return levenshtein_distance(original_words, modified_words) / longest

    def _calculate_bert_score(self, modified_sentence):
        """
        Compute the BERTScore similarity between the original and modified sentence.

        Returns 1 - F1 score to represent dissimilarity.

        Raises:
            ValueError: If ``original_sentence`` is not set or
                ``modified_sentence`` is not a string.
        """
        if not hasattr(self, 'original_sentence'):
            raise ValueError("original_sentence is not set. Please set self.original_sentence before calling this function.")
        if not isinstance(modified_sentence, str):
            raise ValueError("modified_sentence must be a string.")

        # Build the scorer on first use and reuse it for later calls.
        scorer = getattr(self, "cached_bertscorer", None)
        if scorer is None:
            model_type = "microsoft/deberta-xlarge-mnli"
            scorer = BERTScorer(
                model_type=model_type,
                num_layers=model2layers[model_type],
                batch_size=1,
                nthreads=4,
                all_layers=False,
                idf=False,
                device="cuda" if torch.cuda.is_available() else "cpu",
                lang="en",
            )
            self.cached_bertscorer = scorer

        _, _, F1 = scorer.score(
            cands=[modified_sentence],
            refs=[self.original_sentence],
            verbose=False,
            batch_size=1,
        )

        return 1 - F1.item()

    def _get_sentence_encoder(self, model_name):
        """Return the cached SentenceTransformer for *model_name*, loading it on first use."""
        encoders = getattr(self, '_sentence_encoders', None)
        if encoders is None:
            # Instance was created without running __init__.
            encoders = self._sentence_encoders = {}
        if model_name not in encoders:
            encoders[model_name] = SentenceTransformer(model_name)
        return encoders[model_name]

    def _calculate_mover_score(self, modified_sentence, model_name='all-MiniLM-L6-v2'):
        """
        Compute a MoverScore-style dissimilarity using word-level embeddings.

        Whitespace tokens of both sentences are embedded, matched one-to-one
        by minimum total cosine distance, and the similarities of the matched
        pairs are averaged with IDF-based weights.

        Args:
            modified_sentence: Sentence to compare with ``original_sentence``.
            model_name: SentenceTransformer model used to embed the tokens.

        Returns:
            ``1 - weighted similarity`` of the matched token pairs.

        Raises:
            ValueError: If the original sentence is empty or either sentence
                has no whitespace-separated tokens.
        """
        if not self.original_sentence:
            raise ValueError("Original sentence not provided.")

        original_tokens = self.original_sentence.split()
        modified_tokens = modified_sentence.split()
        if not original_tokens or not modified_tokens:
            # Fail early with a clear message instead of inside encode()/cdist().
            raise ValueError("Both sentences must contain at least one whitespace-separated token.")

        # Reuse the encoder across calls; constructing it per sentence reloads
        # the model every time.
        encoder = self._get_sentence_encoder(model_name)
        original_embeddings = encoder.encode(original_tokens, convert_to_numpy=True)
        modified_embeddings = encoder.encode(modified_tokens, convert_to_numpy=True)

        # Pairwise cosine distance between every original and modified token.
        cost_matrix = cdist(original_embeddings, modified_embeddings, metric='cosine')

        # One-to-one token matching with minimum total distance.
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        # IDF values fitted on just these two sentences.
        # NOTE(review): the vectorizer tokenizes the sentences itself, so
        # whitespace tokens that still carry punctuation may be missing from
        # idf_values and fall back to 1.0 -- confirm this is intended.
        vectorizer = TfidfVectorizer()
        vectorizer.fit([self.original_sentence, modified_sentence])
        idf_values = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

        idf_weights_original = np.array([idf_values.get(word.lower(), 1.0) for word in original_tokens])
        idf_weights_modified = np.array([idf_values.get(word.lower(), 1.0) for word in modified_tokens])

        # Each matched pair is weighted by the mean IDF of its two tokens.
        combined_idf_weights = (idf_weights_original[row_ind] + idf_weights_modified[col_ind]) / 2
        pair_similarity = 1 - cost_matrix[row_ind, col_ind]
        weighted_score = np.sum(pair_similarity * combined_idf_weights) / np.sum(combined_idf_weights)

        return 1 - weighted_score

    def _normalize_dict(self, metric_dict):
        """
        Normalize the values in a dictionary to be between 0 and 1.

        Min-max normalization is applied across all values. When every value
        is equal, all normalized values are zero. An empty dictionary yields
        an empty dictionary.

        Returns:
            A new dictionary with the same keys and the normalized values.
        """
        if not metric_dict:
            # min()/max() of an empty array would raise.
            return {}

        values = np.array(list(metric_dict.values()))
        min_val = values.min()
        value_range = values.max() - min_val

        if value_range == 0:
            normalized_values = np.zeros_like(values)
        else:
            normalized_values = (values - min_val) / value_range

        return dict(zip(metric_dict.keys(), normalized_values))

    def get_normalized_metrics(self):
        """
        Get all normalized metrics as a dictionary.

        Returns:
            Mapping of metric display name to its per-sentence dictionary.
        """
        return {
            'Min Edit Distance': self.normalized_levenshtein,
            'BERTScore': self.normalized_bert_scores,
            'Mover Score': self.normalized_mover_scores,
        }

    def get_combined_distortions(self):
        """
        Get the dictionary of combined distortion values.
        """
        return self.combined_distortions
|
|
|
|
|
if __name__ == "__main__":
    # Demo run: score a fixed set of paraphrases of one reference sentence,
    # plot the normalized metrics and print the results.
    config = load_config(config_path)['PECCAVI_TEXT']['Metrics']

    original_sentence = "The quick brown fox jumps over the lazy dog"

    paraphrased_sentences = [
        # Paraphrase group 1
        "The swift auburn fox leaps across a sleepy canine.",
        "A quick auburn fox leaps across a sleepy canine.",
        "A swift ginger fox leaps across a sleepy canine.",
        "A swift auburn fox bounds across a sleepy canine.",
        "A swift auburn fox leaps across a tired canine.",
        "Three swift auburn foxes leap across a sleepy canine.",
        "The vulpine specimen rapidly traverses over a dormant dog.",
        "Like lightning, the russet hunter soars over the drowsy guardian.",
        "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
        "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
        "A swift auburn predator navigates across a lethargic pet.",
        "Subject A (fox) demonstrates velocity over Subject B (dog).",

        # Paraphrase group 2
        "Some agile russet foxes bound over an idle hound.",
        "The nimble russet fox bounds over an idle hound.",
        "The agile brown fox bounds over an idle hound.",
        "The agile russet fox jumps over an idle hound.",
        "The agile russet fox bounds over a lazy hound.",
        "Two agile russet foxes bound over an idle hound.",
        "A dexterous vulpine surpasses a stationary canine.",
        "Quick as thought, the copper warrior sails over the guardian.",
        "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
        "A dexterous V. vulpes exceeds the plane of an inactive canine.",
        "An agile russet hunter maneuvers above a resting hound.",
        "Test subject F-1 achieves displacement superior to subject D-1.",

        # Paraphrase group 3
        "The nimble mahogany vulpine vaults above a drowsy dog.",
        "A swift mahogany vulpine vaults above a drowsy dog.",
        "A nimble reddish vulpine vaults above a drowsy dog.",
        "A nimble mahogany fox vaults above a drowsy dog.",
        "A nimble mahogany vulpine leaps above a drowsy dog.",
        "Four nimble mahogany vulpines vault above a drowsy dog.",
        "An agile specimen of reddish fur surpasses a somnolent canine.",
        "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
        "Tha quick brown beastie jumps o'er the tired pup, aye.",
        "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
        "A nimble rust-colored predator crosses above a drowsy pet.",
        "Observed: Subject Red executes vertical motion over Subject Gray.",

        # Paraphrase group 4
        "A speedy copper-colored fox hops over the lethargic pup.",
        "The quick copper-colored fox hops over the lethargic pup.",
        "The speedy bronze fox hops over the lethargic pup.",
        "The speedy copper-colored fox jumps over the lethargic pup.",
        "The speedy copper-colored fox hops over the tired pup.",
        "Multiple speedy copper-colored foxes hop over the lethargic pup.",
        "A rapid vulpine of bronze hue traverses an inactive young canine.",
        "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
        "Tha fast copper beastie leaps o'er the sleepy wee dog.",
        "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
        "A fleet copper-toned predator moves past a sluggish young dog.",
        "Field note: Adult fox subject exceeds puppy subject vertically.",

        # Paraphrase group 5
        "The rapid tawny fox springs over a sluggish dog.",
        "A quick tawny fox springs over a sluggish dog.",
        "A rapid golden fox springs over a sluggish dog.",
        "A rapid tawny fox jumps over a sluggish dog.",
        "A rapid tawny fox springs over a lazy dog.",
        "Six rapid tawny foxes spring over a sluggish dog.",
        "An expeditious yellowish vulpine surpasses a torpid canine.",
        "Fast as a bullet, the golden hunter vaults over the idle guard.",
        "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
        "One V. vulpes displays rapid transit over one inactive C. familiaris.",
        "A speedy yellow-brown predator bypasses a motionless dog.",
        "Log entry: Vulpine subject achieves swift vertical displacement.",

        # Paraphrase group 6
        "A fleet-footed chestnut fox soars above an indolent canine.",
        "The swift chestnut fox soars above an indolent canine.",
        "The fleet-footed brown fox soars above an indolent canine.",
        "The fleet-footed chestnut fox leaps above an indolent canine.",
        "The fleet-footed chestnut fox soars above a lazy canine.",
        "Several fleet-footed chestnut foxes soar above an indolent canine.",
        "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
        "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
        "Tha quick brown beastie sails o'er the sleepy hound, ken.",
        "Single agile V. vulpes achieves elevation above stationary canine.",
        "A nimble brown predator glides over an unmoving domestic animal.",
        "Research note: Brown subject displays superior vertical mobility.",

        # Paraphrase group 7
        "The fast ginger fox hurdles past a slothful dog.",
        "A quick ginger fox hurdles past a slothful dog.",
        "A fast red fox hurdles past a slothful dog.",
        "A fast ginger fox jumps past a slothful dog.",
        "A fast ginger fox hurdles past a lazy dog.",
        "Five fast ginger foxes hurdle past a slothful dog.",
        "A rapid orange vulpine bypasses a lethargic canine.",
        "Quick as lightning, the flame-colored hunter races past the lazy guard.",
        "Tha swift ginger beastie leaps past the tired doggy, ye see.",
        "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
        "A speedy red-orange predator overtakes a motionless dog.",
        "Data point: Orange subject demonstrates rapid transit past Gray subject.",

        # Paraphrase group 8
        "A spry rusty-colored fox jumps across a dozing hound.",
        "The agile rusty-colored fox jumps across a dozing hound.",
        "The spry reddish fox jumps across a dozing hound.",
        "The spry rusty-colored fox leaps across a dozing hound.",
        "The spry rusty-colored fox jumps across a sleeping hound.",
        "Multiple spry rusty-colored foxes jump across a dozing hound.",
        "An agile rust-toned vulpine traverses a somnolent canine.",
        "Nimble as thought, the copper hunter bounds over the resting guard.",
        "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
        "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
        "A lithe rust-tinted predator moves past a slumbering dog.",
        "Observation: Russet subject exhibits agility over dormant subject.",

        # Paraphrase group 9
        "The quick tan fox leaps over an inactive dog.",
        "A swift tan fox leaps over an inactive dog.",
        "A quick beige fox leaps over an inactive dog.",
        "A quick tan fox jumps over an inactive dog.",
        "A quick tan fox leaps over a motionless dog.",
        "Seven quick tan foxes leap over an inactive dog.",
        "A rapid light-brown vulpine surpasses a stationary canine.",
        "Fast as wind, the sand-colored hunter soars over the still guard.",
        "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
        "One agile fawn V. vulpes traverses one immobile C. familiaris.",
        "A fleet tan-colored predator bypasses an unmoving dog.",
        "Field report: Tan subject demonstrates movement over static subject.",

        # Paraphrase group 10
        "Some brisk auburn vulpines bounce over a listless canine.",
        "The quick auburn vulpine bounces over a listless canine.",
        "The brisk russet vulpine bounces over a listless canine.",
        "The brisk auburn fox bounces over a listless canine.",
        "The brisk auburn vulpine jumps over a listless canine.",
        "Five brisk auburn vulpines bounce over a listless canine.",
        "The expeditious specimen supersedes a quiescent Canis lupus.",
        "Swift as wind, the russet hunter vaults over the idle guardian.",
        "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
        "One V. vulpes achieves displacement over inactive C. familiaris.",
        "A high-velocity auburn predator traverses an immobile animal.",
        "Final observation: Red subject shows mobility over Gray subject.",
    ]

    calculator = SentenceDistortionCalculator(config, original_sentence, paraphrased_sentences)

    # Run the pipeline in order: raw metrics, normalization, combination,
    # then plotting. The outer bar has a single step.
    for _ in tqdm(range(1)):
        calculator.calculate_all_metrics()
        calculator.normalize_metrics()
        calculator.calculate_combined_distortion()
        calculator.plot_metrics()

    print("Normalized Metrics:", calculator.get_normalized_metrics())
    print("Combined Distortion:", calculator.get_combined_distortions())