File size: 2,954 Bytes
22e1b62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""Shared project configuration: API keys, hyperparameters, and eagerly-loaded models.

Importing this module has heavy side effects: it reads ``.env``, exports the
API keys into ``os.environ``, downloads NLTK "punkt" if absent, loads a
BARTScorer checkpoint onto ``cuda:0``, instantiates the Gemini and OpenAI
chat clients, loads a RoBERTa tokenizer, and loads the evaluation metric.
"""
import os
import configparser

import google.generativeai as genai
import nltk
from datasets import load_metric
from langchain.chat_models import ChatOpenAI
from transformers import AutoTokenizer

from texts.bart_score import BARTScorer


# Constants
# TODO: move to .env
env = configparser.ConfigParser()
env.read(".env")  # An example environment: .sample-env

# Get API key
# NOTE(review): these raise KeyError if `.env` is missing or lacks an
# [API_KEY] section — there is no fallback; verify deployments ship `.env`.
OPENAI_API_KEY: str = env["API_KEY"]["OPENAI_API_KEY"]
GEMINI_API_KEY: str = env["API_KEY"]["GEMINI_API_KEY"]
TOGETHER_API_KEY: str = env["API_KEY"]["TOGETHER_API_KEY"]

# Environment setup
# Keys are re-exported so libraries that read them from the environment
# (e.g. the OpenAI client behind langchain) pick them up implicitly.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY
os.environ["TOGETHER_API_KEY"] = TOGETHER_API_KEY
# WARNING(review): blanking the CA-bundle variables disables TLS certificate
# verification process-wide for requests/curl-based clients — presumably a
# corporate-proxy workaround; confirm this is intentional before shipping.
os.environ["CURL_CA_BUNDLE"] = ""
os.environ["REQUESTS_CA_BUNDLE"] = ""

# File Path
LOG_FILE: str = "data/99_log.txt"
OUTPUT_FILE: str = "data/result.txt"
# Metric identifier passed to `load_metric` below (duplicated by
# OPTIMIZED_METRIC further down — keep the two in sync).
METRIC_NAME: str = "roc_auc"

# Training and Model Parameters
TRAIN_RATIO: float = 0.8   # remaining 0.1 after VAL_RATIO is implicitly test
VAL_RATIO: float = 0.1
NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING: int = 10
PATIENCE: int = 3          # early-stopping patience, in epochs
BATCH_SIZE: int = 64
OPTIMIZED_METRIC: str = "roc_auc"
SEED: int = 0
TEMPERATURE: float = 0.0   # deterministic decoding for both LLM clients below
IS_OUTPUT_NORMALIZATION: bool = False
# NOTE(review): purpose of RATIO is not visible from this module — presumably
# a split or threshold distinct from TRAIN_RATIO; verify against callers.
RATIO: float = 0.9
# Binary classification labels: human-written text vs. machine-generated text.
HUMAN_LABEL: int = 0
MACHINE_LABEL: int = 1
BART: str = "bart"

# Model Options
MULTIMODEL: str = "multimodel"
SINGLE_FROM_MULTIMODEL: str = "single_from_multimodel"

# Downloading the NLTK "punkt" only if it's not already downloaded
nltk.download("punkt", quiet=True)

# API Models
# TODO: consider using an enum
# Sentinel strings used in place of model output when an API call fails.
API_ERROR: str = "API_ERROR"
IGNORE_BY_API_ERROR: str = "IGNORE_BY_API_ERROR"
CHATGPT: str = "ChatGPT"
GEMINI: str = "Gemini"
# LLAMA_2_70_CHAT_TEMP_0 = "LLaMa"

# Initialize BARTScorer
# TODO: consider loading model lazily
# NOTE(review): device is hard-coded to the first GPU — import fails on
# CPU-only hosts; confirm all consumers run with CUDA available.
bart_scorer = BARTScorer(device="cuda:0", checkpoint="facebook/bart-large-cnn")

# Generative AI configuration
OPENAI_MODEL_NAME: str = "gpt-3.5-turbo-0125"
GEMINI_MODEL_NAME: str = "gemini-pro"

# transport="rest" forces REST over the default gRPC transport.
genai.configure(api_key=GEMINI_API_KEY, transport="rest")
GEMINI_MODEL = genai.GenerativeModel(
    GEMINI_MODEL_NAME,
    generation_config={"temperature": TEMPERATURE},
)
OPENAI_MODEL = ChatOpenAI(
    temperature=TEMPERATURE,
    model_name=OPENAI_MODEL_NAME,
)

# Model paths
# Display name -> Hugging Face repo id, served via the Together API endpoint.
MODEL_PATHS: dict[str, str] = {
    "LLaMa": "meta-llama/Llama-2-70b-chat-hf",
    "QWEN": "Qwen/Qwen1.5-72B-Chat",
    "Yi": "NousResearch/Nous-Hermes-2-Yi-34B",
    "Mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "OLMo": "allenai/OLMo-7B-Instruct",
    "Phi": "microsoft/phi-2",
    "OpenChat": "openchat/openchat-3.5-1210",
    "WizardLM": "WizardLM/WizardLM-13B-V1.2",
    "Vicuna": "lmsys/vicuna-13b-v1.5",
}

TOGETHER_PATH: str = "https://api.together.xyz"

# Roberta model configurations
ROBERTA_BASE: str = "roberta-base"
ROBERTA_LARGE: str = "roberta-large"
# Logical name -> Hugging Face checkpoint (identity mapping today, but kept
# as a dict so local checkpoint paths can be substituted without renaming).
ROBERTA_MODEL_PATHS: dict[str, str] = {
    ROBERTA_BASE: "roberta-base",
    ROBERTA_LARGE: "roberta-large",
}
# Per-size learning rates: the larger model gets the smaller rate.
LEARNING_RATES: dict[str, float] = {
    ROBERTA_BASE: 2e-5,
    ROBERTA_LARGE: 8e-6,
}
MODEL_NAME: str = ROBERTA_BASE

# Tokenizer initialization
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL_PATHS[MODEL_NAME])

# Metric loading
# NOTE(review): `datasets.load_metric` is deprecated/removed in newer
# `datasets` releases in favor of the `evaluate` package — confirm the
# pinned `datasets` version still provides it.
metric = load_metric(METRIC_NAME)