File size: 3,414 Bytes
22e1b62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import math
from functools import lru_cache

from transformers import pipeline

from _google_search_engine_testing_share import find_by_relative_search

# TODO: move these paths and model ids to a config file
# Constants follow PEP 8 UPPER_SNAKE_CASE naming.
PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
WORD_FREQUENCY = None

DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"

MODEL_HUMAN_LABEL = {DEFAULT_MODEL: "Human"}

HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"


@lru_cache(maxsize=4)
def _get_classifier(model: str, max_length: int):
    """Build and cache a text-classification pipeline for *model*.

    Loading a transformers pipeline is expensive (model weights are
    re-initialised each time); caching by (model, max_length) means the
    cost is paid once per configuration instead of on every call.
    """
    return pipeline(
        "text-classification",
        model=model,
        tokenizer=model,
        max_length=max_length,
        truncation=True,
        device_map="auto",  # let transformers place the model (GPU if available)
    )


def detect_ai_content(
    input_text: str,
    model: str = DEFAULT_MODEL,
    max_length: int = 512,
) -> tuple:
    """
    Detects if text is human or machine generated.

    Args:
        input_text: Text to classify.
        model: Hugging Face model id. Must have an entry in
            MODEL_HUMAN_LABEL for the HUMAN mapping to apply.
        max_length: Token truncation length for the pipeline.

    Returns:
        tuple: (label, confidence_score)
            where label is HUMAN or MACHINE on success, or UNKNOWN
            (with confidence 0.0) if inference fails for any reason.
    """
    try:
        pipe = _get_classifier(model, max_length)
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
        return label, confidence_score
    except Exception as e:  # best-effort boundary: never crash the caller
        print(f"Error in Roberta model inference: {e}")
        return UNKNOWN, 0.0


def check_human(data, min_ratio=0.7):
    """
    Checks if a sufficient number of input sentences are found within
        source sentences.

    Args:
        data: Iterable of dicts, each carrying a "similarity" float
            (presumably in [0, 1] — TODO confirm against the producer).
        min_ratio: Minimum fraction of sentences that must match for the
            text to count as human-authored.

    Returns:
        bool: True if the condition is met, False otherwise.
    """
    if not data:  # no sentences at all: cannot claim human authorship
        return False
    min_matching = math.ceil(len(data) * min_ratio)

    # A sentence "matches" when it is near-identical to a source sentence.
    count = sum(1 for sentence in data if sentence["similarity"] >= 0.99)

    print(f"\tmatching_sentence_count   : {count}, min_matching: {min_matching}")
    return count >= min_matching


def abstract_detect_generated_text(input_text):
    """
    Abstracts the process of detecting generated text using search
        and a classification model.

    Args:
        input_text: Text to analyse.

    Returns:
        tuple: (
            search_engine_prediction,
            SOTA_prediction,
            SOTA_confidence,
            found_url,
            sentence_pairs,
            )
    """

    is_paraphrase, found_url, data = find_by_relative_search(
        input_text,
        is_support_opposite=False,  # explicit: do not search for opposing claims
    )
    SOTA_prediction, SOTA_confidence = detect_ai_content(input_text)

    if not is_paraphrase:
        # Nothing comparable was found online; the search engine cannot judge.
        search_engine_prediction = UNKNOWN
    else:
        search_engine_prediction = HUMAN if check_human(data) else MACHINE

    # Each row of `data` unpacks as
    # (input_sentence, source_sentence, similarity, per-pair paraphrase flag).
    # The per-pair flag is deliberately named `pair_is_paraphrase` so it no
    # longer shadows the overall `is_paraphrase` search result above.
    sentence_pairs = [
        [
            input_sentence,
            source_sentence,
            PARAPHRASE if pair_is_paraphrase else NON_PARAPHRASE,
        ]
        for input_sentence, source_sentence, _, pair_is_paraphrase in (data or [])
    ]

    return (
        search_engine_prediction,
        SOTA_prediction,
        SOTA_confidence,
        found_url,
        sentence_pairs,
    )