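"""Detect AI-generated text.

Combines a search-engine lookup (``find_by_relative_search``) with a
RoBERTa text-classification model and reports both predictions.
"""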
import math
from _google_search_engine_testing_share import find_by_relative_search
from transformers import pipeline

# TODO: move these constants to a config file
PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
WORD_FREQUENCY = None
DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
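# Maps each detector model name to the label that model emits for human-written text.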
MODEL_HUMAN_LABEL = {DEFAULT_MODEL: "Human"}
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"


def detect_ai_content(
    input_text: str,
    model: str = DEFAULT_MODEL,
    max_length: int = 512,
) -> tuple:
    """
    Classify a text as human- or machine-written with a Hugging Face
    text-classification pipeline.

    Returns:
        tuple: (label, confidence_score)
            where label is HUMAN, MACHINE, or UNKNOWN (on error).
    """
    try:
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,
            truncation=True,
            device_map="auto",  # place the model on a GPU automatically when one is available
        )
        result = pipe(input_text)[0]
        confidence_score = result["score"]
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
        return label, confidence_score
    except Exception as e:
        print(f"Error in RoBERTa model inference: {e}")
        return UNKNOWN, 0.0  # fall back to UNKNOWN with zero confidence on failure
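
# Illustrative call (return values below are examples only): detect_ai_content("Some passage")
# might return ("MACHINE", 0.98), or ("UNKNOWN", 0.0) when inference fails.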


def check_human(data, min_ratio=0.7):
    """
    Check whether at least ``min_ratio`` of the input sentences have a
    near-exact match (similarity >= 0.99) among the source sentences.

    Returns:
        bool: True if the condition is met, False otherwise.
    """
    if not data:  # empty data can never satisfy the ratio
        return False
    min_matching = math.ceil(len(data) * min_ratio)
    count = 0
    # for input_sentence, source_sentence, similarity, is_paraphrase in data:
    for sentence in data:
        if sentence["similarity"] >= 0.99:
            count += 1
    print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
    if count >= min_matching:
        return True
    return False
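
# Worked example (sketch, assuming each item is a dict with a "similarity" key,
# as the loop above expects):
#   check_human([{"similarity": 1.0}, {"similarity": 0.5}])  # -> False,
#   since only 1 sentence matches but ceil(2 * 0.7) = 2 matches are required.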


def abstract_detect_generated_text(input_text):
    """
    Run the full detection flow: a search-engine lookup for the source text
    followed by the classification model.

    Returns:
        tuple: (
            search_engine_prediction,
            SOTA_prediction,
            SOTA_confidence,
            found_url,
            sentence_pairs,
        )
    """
    is_paraphrase, found_url, data = find_by_relative_search(
        input_text,
        is_support_opposite=False,  # keyword argument set explicitly for clarity
    )
    SOTA_prediction, SOTA_confidence = detect_ai_content(input_text)

    if not is_paraphrase:
        search_engine_prediction = UNKNOWN
    else:
        search_engine_prediction = HUMAN if check_human(data) else MACHINE

    sentence_pairs = []
    if data:  # skip iteration when the search returned no sentence data
        for input_sentence, source_sentence, _, is_paraphrase in data:
            check_paraphrase = PARAPHRASE if is_paraphrase else NON_PARAPHRASE
            sentence_pairs.append(
                [input_sentence, source_sentence, check_paraphrase],
            )

    return (
        search_engine_prediction,
        SOTA_prediction,
        SOTA_confidence,
        found_url,
        sentence_pairs,
    )
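

# Minimal usage sketch (assumptions: the `_google_search_engine_testing_share`
# helper is importable and network access / model downloads are available).
if __name__ == "__main__":
    sample = "The quick brown fox jumps over the lazy dog."
    (
        search_engine_prediction,
        SOTA_prediction,
        SOTA_confidence,
        found_url,
        sentence_pairs,
    ) = abstract_detect_generated_text(sample)
    print(f"search engine : {search_engine_prediction}")
    print(f"SOTA model    : {SOTA_prediction} ({SOTA_confidence:.2f})")
    print(f"source URL    : {found_url}")
    for pair in sentence_pairs:
        print(pair)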