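"""Binoculars AI-text detector.

Loads an observer and a performer causal LM, scores text as the ratio of the
performer's perplexity to the observer/performer cross-perplexity, and labels
it against a fixed threshold (accuracy- or low-FPR-tuned).
"""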
from typing import Union

import os
import numpy as np
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

from .utils import assert_tokenizer_consistency
from .metrics import perplexity, entropy

torch.set_grad_enabled(False)

huggingface_config = {
    # Only required for private models on Hugging Face (e.g. LLaMA models)
    "TOKEN": os.environ.get("HF_TOKEN", None)
}

# selected using Falcon-7B and Falcon-7B-Instruct at bfloat16
BINOCULARS_ACCURACY_THRESHOLD = 0.9015310749276843  # optimized for f1-score
BINOCULARS_FPR_THRESHOLD = 0.8536432310785527  # optimized for low-fpr [chosen at 0.01%]

# Spaces typically expose a single GPU, so place both models on the same device.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE_1 = DEVICE
DEVICE_2 = DEVICE


class Binoculars(object):
    def __init__(self,
                 observer_name_or_path: str = "tiiuae/falcon-7b",
                 performer_name_or_path: str = "tiiuae/falcon-7b-instruct",
                 use_bfloat16: bool = True,
                 max_token_observed: int = 512,
                 mode: str = "low-fpr",
                 ) -> None:
        assert_tokenizer_consistency(observer_name_or_path, performer_name_or_path)

        self.change_mode(mode)
        
        # Log memory usage before loading models
        if torch.cuda.is_available():
            print(f"Before loading observer model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
            
        # Load first model
        self.observer_model = AutoModelForCausalLM.from_pretrained(observer_name_or_path,
                                                                  device_map={"": DEVICE_1},
                                                                  trust_remote_code=True,
                                                                  torch_dtype=torch.bfloat16 if use_bfloat16
                                                                  else torch.float32,
                                                                  token=huggingface_config["TOKEN"]
                                                                  )
        # Clear cache between model loads
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print(f"After loading observer model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
            
        # Load second model
        self.performer_model = AutoModelForCausalLM.from_pretrained(performer_name_or_path,
                                                                   device_map={"": DEVICE_2},
                                                                   trust_remote_code=True,
                                                                   torch_dtype=torch.bfloat16 if use_bfloat16
                                                                   else torch.float32,
                                                                   token=huggingface_config["TOKEN"]
                                                                   )
        
        if torch.cuda.is_available():
            print(f"After loading performer model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
            
        self.observer_model.eval()
        self.performer_model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(observer_name_or_path)
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.max_token_observed = max_token_observed

    def change_mode(self, mode: str) -> None:
        if mode == "low-fpr":
            self.threshold = BINOCULARS_FPR_THRESHOLD
        elif mode == "accuracy":
            self.threshold = BINOCULARS_ACCURACY_THRESHOLD
        else:
            raise ValueError(f"Invalid mode: {mode}")
        
    def free_memory(self) -> None:
        # Drop the model references first so their memory can actually be
        # reclaimed, then release CUDA's cached blocks.
        del self.observer_model
        del self.performer_model
        self.observer_model = None
        self.performer_model = None

        if torch.cuda.is_available():
            torch.cuda.synchronize()
            torch.cuda.empty_cache()

    def _tokenize(self, batch: list[str]) -> transformers.BatchEncoding:
        batch_size = len(batch)
        encodings = self.tokenizer(
            batch,
            return_tensors="pt",
            padding="longest" if batch_size > 1 else False,
            truncation=True,
            max_length=self.max_token_observed,
            return_token_type_ids=False).to(self.observer_model.device)
        return encodings

    @torch.inference_mode()
    def _get_logits(self, encodings: transformers.BatchEncoding) -> tuple[torch.Tensor, torch.Tensor]:
        # Ensure we're using the same device for both models
        observer_logits = self.observer_model(**encodings.to(DEVICE_1)).logits
        performer_logits = self.performer_model(**encodings.to(DEVICE_2)).logits
        if DEVICE_1 != "cpu":
            torch.cuda.synchronize()
        return observer_logits, performer_logits

    def compute_score(self, input_text: Union[list[str], str]) -> Union[float, list[float]]:
        batch = [input_text] if isinstance(input_text, str) else input_text
        encodings = self._tokenize(batch)
        observer_logits, performer_logits = self._get_logits(encodings)
        ppl = perplexity(encodings, performer_logits)
        # No need to move tensors again if they're already on the same device
        x_ppl = entropy(observer_logits, performer_logits,
                        encodings, self.tokenizer.pad_token_id)
        binoculars_scores = ppl / x_ppl
        binoculars_scores = binoculars_scores.tolist()
        return binoculars_scores[0] if isinstance(input_text, str) else binoculars_scores

    def predict(self, input_text: Union[list[str], str]) -> Union[list[str], str]:
        binoculars_scores = np.array(self.compute_score(input_text))
        pred = np.where(binoculars_scores < self.threshold,
                        "Most likely AI-generated",
                        "Most likely human-generated"
                        ).tolist()
        return pred
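

# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumes the module is executed via
# `python -m <package>.<module>` so the relative imports above resolve, and
# that the GPU has room for both Falcon-7B checkpoints; the sample string is
# arbitrary.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    detector = Binoculars(mode="low-fpr")
    sample = "The quick brown fox jumps over the lazy dog."
    print("score:", detector.compute_score(sample))  # ppl / cross-ppl ratio
    print("label:", detector.predict(sample))        # thresholded verdict
    detector.free_memory()                           # release GPU memory when done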