import json
from typing import Any

import numpy as np
from transformers import AutoTokenizer, pipeline

__all__ = ["ContextAwareWordVectors", "print_results"]


class NumpyFloatValuesEncoder(json.JSONEncoder):
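    """JSON encoder that emits NumPy float32 values as plain floats rounded to three decimals."""
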
    def default(self, obj: Any) -> Any:
        if isinstance(obj, np.float32):
            return round(float(obj), 3)
        return json.JSONEncoder.default(self, obj)


def print_results():
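    """Load samples from sentences.json, run the comparisons, and print the results as JSON."""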
    with open("sentences.json", encoding="utf-8") as fp:
        samples = json.load(fp)
    context_aware_word_vectors = ContextAwareWordVectors(model="bert-base-uncased")
    results = context_aware_word_vectors.run(samples)
    print(json.dumps(results, indent=2, cls=NumpyFloatValuesEncoder))
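
# Expected sentences.json layout (hypothetical example):
# {
#   "word": {
#     "1": "First sentence containing word.",
#     "2": "Second sentence containing word.",
#     "3": "Third sentence containing word."
#   }
# }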


class ContextAwareWordVectors:
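    """Extract contextual token embeddings for a target word and compare them
    across sentences with dot product, Euclidean, and Manhattan metrics.
    """
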
    def __init__(self, model: str, framework: str = "tf") -> None:
        self.framework = framework
        self.model = model
        self.tokenizer = AutoTokenizer.from_pretrained(model)
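        # A feature-extraction pipeline returns the model's last hidden state
        # for every token in the input.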
        self.feature_extractor = pipeline(
            model=model,
            framework=framework,
            tokenizer=self.tokenizer,
            task="feature-extraction",
        )

    def dot_product(self, v1: Any, v2: Any) -> Any:
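        # run() L2-normalizes the vectors, so this dot product is a cosine similarity.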
        return round(np.dot(v1, v2), 3)

    def euclidean_distance(self, v1: Any, v2: Any) -> Any:
        return round(np.linalg.norm(v1 - v2), 3)

    def manhattan_distance(self, v1: Any, v2: Any) -> Any:
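        # ord=1 selects the L1 (Manhattan) norm of the difference vector.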
        return round(np.linalg.norm(v1 - v2, ord=1), 3)

    def run(self, samples: dict[str, dict[str, str]]) -> dict[str, dict[str, Any]]:
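        """For each test word, embed it in each of the three sentences keyed
        "1", "2", and "3" and compute pairwise similarity metrics.
        """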
        test_word_vector: dict[str, Any]
        results: dict[str, dict[str, Any]] = {}

        for test_word, sample in samples.items():
            results[test_word] = {}
            test_word_vector = {}
            for index, sentence in sample.items():
                tokens = self.tokenizer.tokenize(sentence)
                vectors = self.feature_extractor(sentence, return_tensors=True).numpy()
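                # Assumes the test word survives WordPiece tokenization as a
                # single token; a word split into subwords would not be found.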
                test_word_location = tokens.index(test_word)
                # Skip index 0 of the output, which is the '[CLS]' token that
                # tokenizer.tokenize() does not emit.
                test_word_vector[index] = vectors[0, test_word_location + 1, :]
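                # L2-normalize so the metrics below compare direction rather than magnitude.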
                magnitude = np.linalg.norm(test_word_vector[index])
                test_word_vector[index] = test_word_vector[index] / magnitude
            results[test_word]["sentences"] = sample
            results[test_word]["dot_product"] = [
                self.dot_product(test_word_vector["1"], test_word_vector["2"]),
                self.dot_product(test_word_vector["2"], test_word_vector["3"]),
                self.dot_product(test_word_vector["3"], test_word_vector["1"]),
            ]
            results[test_word]["euclidean_distance"] = [
                self.euclidean_distance(test_word_vector["1"], test_word_vector["2"]),
                self.euclidean_distance(test_word_vector["2"], test_word_vector["3"]),
                self.euclidean_distance(test_word_vector["3"], test_word_vector["1"]),
            ]
            results[test_word]["manhattan_distance"] = [
                self.manhattan_distance(test_word_vector["1"], test_word_vector["2"]),
                self.manhattan_distance(test_word_vector["2"], test_word_vector["3"]),
                self.manhattan_distance(test_word_vector["3"], test_word_vector["1"]),
            ]
        return results


if __name__ == "__main__":
    print_results()