File size: 2,449 Bytes
0827f9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Settings: API credentials, models, NLTK data downloads, thresholds and labels.
"""
Author: Khanh Phan
Date: 2024-12-04
"""
import os

import nltk
import openai
import torch
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

# Load environment variables
# load_dotenv() runs first so that values it provides are visible to the
# lookups below. A variable that is not set resolves to None.
load_dotenv()

# Search credentials
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.environ.get("SEARCH_ENGINE_ID")

# Azure OpenAI credentials
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION")

# GPT models
# Entity model: either "gpt-4o-mini" or "o1-mini".
GPT_ENTITY_MODEL = "o1-mini"
GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]

# Azure OpenAI client built once at import time from the AZURE_OPENAI_*
# settings.
AZUREOPENAI_CLIENT = openai.AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)

# Download the NLTK resources, in this order:
#   punkt      - sentence tokenization
#   punkt_tab  - tokenization with tab-separated data
#   stopwords  - a list of stop words
for _nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_nltk_resource, quiet=True)
del _nltk_resource  # keep the loop variable out of the module namespace

# Language name for stop words (presumably passed to NLTK's stopwords corpus).
STOPWORDS_LANG = "english"

# Load PARAPHRASE_MODEL
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Instantiate the model once at import time, then move it onto DEVICE.
PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
PARAPHRASE_MODEL.to(DEVICE)

# Model to detect AI-generated text
AI_TEXT_DETECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
# Backward-compatible alias: the original, misspelled name is kept so that
# existing references to it keep working.
AI_TEXT_DECTECTION_MODEL = AI_TEXT_DETECTION_MODEL

# Thresholds
# Paraphrase score cut-offs.
# NOTE(review): presumably similarity scores in [0, 1] - confirm against the
# code that compares with them.
PARAPHRASE_THRESHOLD = 0.8
PARAPHRASE_THRESHOLD_MACHINE = 0.8
PARAPHRASE_THRESHOLD_HUMAN = 0.963

# Length, ratio and size limits.
MIN_SAME_SENTENCE_LEN = 6
MIN_PHRASE_SENTENCE_LEN = 10
MIN_RATIO_PARAPHRASE_NUM = 0.5
MAX_CHAR_SIZE = 30_000

# Number of top URLs per search
TOP_URLS_PER_SEARCH = 3

# Search parameters
GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
TOP_SEARCH_RESULTS = 10
# Backward-compatible alias: the original, misspelled name is kept so that
# existing references to it keep working.
TOP_SEARCH_RESUTLS = TOP_SEARCH_RESULTS
CHUNK_SIZE = 32  # words
NUM_CHUNKS = 3  # number of chunks to search
NUM_FREQUENT_WORDS = 32  # number of top words to return
NUM_KEYWORDS = 5  # number of keywords to return

# Labels
# Keyed by detection-model name; the value is presumably the label string that
# model emits for human-written text (verify against the model's output).
MODEL_HUMAN_LABEL = {
    AI_TEXT_DECTECTION_MODEL: "Human",
}

# Text-source labels.
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"

# Paraphrase labels.
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"

# Entity color
# Meaning of a color factor:
#   factor > 1: lightens the color.
#   factor = 1: leaves the color unchanged.
#   factor < 1: darkens the color.
#   factor = 0: black.
ENTITY_LIGHTEN_COLOR = 2.2
ENTITY_DARKEN_COLOR = 0.7
ENTITY_SATURATION = 0.65  # saturation: the color's intensity (vividness)
ENTITY_BRIGHTNESS = 0.75  # the color's brightness