from transformers import pipeline
from _google_search_engine_testing_share import find_by_relative_search
import math

PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
WORD_FREQUENCY = None

DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
"""
data/MAGE/xsum_human.csv = {'HUMAN': 64, 'MACHINE': 36} => correction = 20 => 84%
data/MAGE/xsum_machine_topical_gpt-3.5-trubo.csv = {'HUMAN': 3, 'MACHINE': 97} => correction = 3 => 94%
    original accuracy = (64 + 97) / 200 = 80.5%
    improved accuracy = (84 + 94) / 200 = 89%
    difference = 8.5%

https://huggingface.co/datasets/RealTimeData/bbc_news_alltime = {'HUMAN': 82, 'MACHINE': 18} => corrected 16 => 98%

"""

# Label that each supported detector model emits for human-written text.
MODEL_HUMAN_MATCHING = {DEFAULT_MODEL: "Human"}

HUMAN = "HUMAN"
MACHINE = "MACHINE"

UNKNOWN = "UNKNOWN"
PARAPHASE = "PARAPHASE"
NON_PARAPHASE = "NON_PARAPHASE"


def detect_by_huggingface_model(input_text, model=DEFAULT_MODEL, max_length=512):
    """
    Returns "HUMAN" or "MACHINE" together with a confidence score (float).
    """
    # The pipeline is rebuilt on every call; callers classifying many texts
    # may want to cache it.
    pipe = pipeline(
        "text-classification",
        model=model,
        tokenizer=model,
        max_length=max_length,
        truncation=True,
        device_map="auto",
    )
    result = pipe(input_text)[0]
    confidence_score = result['score']
    if result['label'] == MODEL_HUMAN_MATCHING[model]:
        return HUMAN, confidence_score
    else:
        return MACHINE, confidence_score

def check_human(data, min_ratio=0.7):
    """
    Input:
        - data: items of the form
            + input sentence
            + source sentence
            + similarity
            + True/False: paraphrase or not
    Output:
        is human (True/False)
    """
    total_sentence = len(data)
    min_matching = int(math.ceil(total_sentence * min_ratio))
    count = 0
    for input_sentence, source_sentence, similarity, is_paraphrase in data:
        if input_sentence in source_sentence:
            count += 1
    return count >= min_matching

def abstract_detect_generated_text(input_text):
    """
    Detects the likely source of the text with the help of a search engine.
    Output:
    - prediction by the search engine (HUMAN/MACHINE/UNKNOWN)
    - prediction by the SOTA model (HUMAN/MACHINE)
    - SOTA confidence (float)
    - url of the matched website (None if UNKNOWN)
    - sentence pairs ([] if empty); each item holds
        - input sentence
        - best-matching source sentence from the url
        - matching result between input/source sentence (PARAPHASE/NON_PARAPHASE)
    """
    is_support_opposite = False
    is_paraphrase, found_url, data = find_by_relative_search(input_text, is_support_opposite)
    sentence_pairs = []
    SOTA_prediction, SOTA_confidence = detect_by_huggingface_model(input_text)
    if not is_paraphrase:
        search_engine_prediction = UNKNOWN
    else:
        if check_human(data):
            search_engine_prediction = HUMAN
        else:
            search_engine_prediction = MACHINE
        for input_sentence, source_sentence, similarity, is_pair_paraphrase in data:
            if is_pair_paraphrase:
                check_paraphrase = PARAPHASE
            else:
                check_paraphrase = NON_PARAPHASE
            sentence_pairs.append([input_sentence, source_sentence, check_paraphrase])

    return search_engine_prediction, SOTA_prediction, SOTA_confidence, found_url, sentence_pairs

if __name__ == "__main__":  
    pass
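    # Illustrative usage sketch (an addition, not part of the original script);
    # the sample text below is an assumed placeholder.
    sample_text = (
        "The government announced a new package of measures on Tuesday "
        "aimed at easing pressure on household energy bills."
    )

    # Quick check with the SOTA detector alone.
    label, score = detect_by_huggingface_model(sample_text)
    print("SOTA only:", label, round(score, 3))

    # Full pipeline: search-engine evidence combined with the SOTA detector.
    (search_engine_prediction, SOTA_prediction, SOTA_confidence,
     found_url, sentence_pairs) = abstract_detect_generated_text(sample_text)
    print("Search engine prediction:", search_engine_prediction)
    print("SOTA prediction:", SOTA_prediction, "confidence:", round(SOTA_confidence, 3))
    print("Matched URL:", found_url)
    for input_sentence, source_sentence, pair_label in sentence_pairs:
        print(f"  [{pair_label}] {input_sentence} <-> {source_sentence}")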