# Download necessary NLTK data files
"""
Author: Khanh Phan
Date: 2024-12-04
"""
import os

import nltk
import openai
import torch
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
# Load environment variables.
# load_dotenv() runs before any os.getenv() call so that values supplied via a
# .env file are visible; a variable that is not set anywhere resolves to None.
load_dotenv()

# Google Custom Search credentials
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")

# Azure OpenAI credentials
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
# GPT models
# Model name used for entity-related requests; supported choices are listed
# in the trailing comment.
GPT_ENTITY_MODEL = "o1-mini"  # "gpt-4o-mini" or "o1-mini"
# Model names available for paraphrase-related requests.
GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
# Module-level Azure OpenAI client built from the AZURE_OPENAI_* settings.
# It is constructed at import time, so those settings must already be defined.
AZUREOPENAI_CLIENT = openai.AzureOpenAI(
    api_version=AZURE_OPENAI_API_VERSION,
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)
# Download the NLTK resources at import time (quiet=True keeps the
# downloader from printing progress output).
nltk.download("punkt", quiet=True)  # Sentence tokenization
nltk.download("punkt_tab", quiet=True)  # Tokenization with tab-separated data
nltk.download("stopwords", quiet=True)  # A list of stop words

# Language name of the stop-word list to use.
STOPWORDS_LANG = "english"
# Load PARAPHRASE_MODEL
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
PARAPHRASE_MODEL.to(DEVICE)
# Model to detect AI-generated text
# NOTE: the original constant name is misspelled ("DECTECTION"). It is kept
# unchanged so existing imports continue to work; new code should prefer the
# correctly spelled alias below.
AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
AI_TEXT_DETECTION_MODEL = AI_TEXT_DECTECTION_MODEL
# Thresholds
# NOTE(review): the exact meaning and units of these values are not visible
# in this file; the names suggest score cut-offs, minimum lengths and a size
# cap — confirm against the code that consumes them.
PARAPHRASE_THRESHOLD_HUMAN = 0.963
PARAPHRASE_THRESHOLD_MACHINE = 0.8
PARAPHRASE_THRESHOLD = 0.8
MIN_SAME_SENTENCE_LEN = 6
MIN_PHRASE_SENTENCE_LEN = 10
MIN_RATIO_PARAPHRASE_NUM = 0.5
MAX_CHAR_SIZE = 30000
# Number of top URLs per search
TOP_URLS_PER_SEARCH = 3

# Search parameters
GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
# NOTE: the original constant name is misspelled ("RESUTLS"). It is kept
# unchanged for existing callers; new code should use the alias below.
TOP_SEARCH_RESUTLS = 10
TOP_SEARCH_RESULTS = TOP_SEARCH_RESUTLS
CHUNK_SIZE = 32  # words
NUM_CHUNKS = 3  # number of chunks to search
NUM_FREQUENT_WORDS = 32  # number of top words to return
NUM_KEYWORDS = 5  # number of keywords to return
# Labels
# Per-model label string, keyed by detection model name.
# NOTE(review): presumably the label that model reports for human-written
# text — confirm against the code that reads this mapping.
MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
# Text-source labels
HUMAN = "HUMAN"
MACHINE = "MACHINE"
UNKNOWN = "UNKNOWN"

# Paraphrase labels
PARAPHRASE = "PARAPHRASE"
NON_PARAPHRASE = "NON_PARAPHRASE"
# Entity color
#
# Meaning of the lighten/darken factors:
#   factor > 1: Lightens the color.
#   factor = 1: Leaves the color unchanged.
#   factor < 1: Darkens the color.
#   factor = 0: Black.
ENTITY_LIGHTEN_COLOR = 2.2
ENTITY_DARKEN_COLOR = 0.7
ENTITY_SATURATION = 0.65  # Saturation: color's intensity (vividness).
ENTITY_BRIGHTNESS = 0.75  # color's brightness.