|
""" |
|
๋ฒกํฐ ์คํ ์ด, ์๋ฒ ๋ฉ ๋ชจ๋ธ, LLM ๋ฑ ๊ตฌ์ฑ ์์ ์ค์ |
|
ํ๊ฒฝ ๋ณ์ ๋ฐ .env ํ์ผ ํ์ฉ ๊ฐ์ ๋ฒ์ - HuggingFace ํ๊ฒฝ ์ง์ ์ถ๊ฐ |
|
""" |
|
import os |
|
import logging |
|
import sys |
|
import re |
|
import requests |
|
import json |
|
from pathlib import Path |
|
from typing import Dict, Any |
|
from dotenv import load_dotenv |
|
|
|
|
|
logger = logging.getLogger("Config") |
|
|
|
|
|
script_dir = os.path.dirname(os.path.abspath(__file__)) |
|
logger.info(f"์คํฌ๋ฆฝํธ ๋๋ ํ ๋ฆฌ: {script_dir}") |
|
logger.info(f"ํ์ฌ ์์
๋๋ ํ ๋ฆฌ: {os.getcwd()}") |
|
logger.info(f"์ด์ ์ฒด์ : {os.name}") |
|
|
|
|
|
IS_HUGGINGFACE = False |
|
if os.getenv('SPACE_ID') is not None or os.getenv('SYSTEM') == 'spaces': |
|
IS_HUGGINGFACE = True |
|
logger.info("HuggingFace Spaces ํ๊ฒฝ์ด ๊ฐ์ง๋์์ต๋๋ค.") |
|
else: |
|
|
|
|
|
env_paths = [ |
|
".env", |
|
os.path.join(script_dir, ".env"), |
|
os.path.join(script_dir, "config", ".env"), |
|
os.path.join(os.path.dirname(script_dir), ".env"), |
|
] |
|
|
|
|
|
env_loaded = False |
|
for env_path in env_paths: |
|
if os.path.isfile(env_path): |
|
logger.info(f".env ํ์ผ ๋ฐ๊ฒฌ: {env_path}") |
|
env_loaded = load_dotenv(env_path, verbose=True) |
|
if env_loaded: |
|
logger.info(f".env ํ์ผ ๋ก๋ ์ฑ๊ณต: {env_path}") |
|
break |
|
|
|
if not env_loaded: |
|
logger.warning(".env ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค. ๊ธฐ๋ณธ๊ฐ ๋๋ ์์คํ
ํ๊ฒฝ ๋ณ์๋ฅผ ์ฌ์ฉํฉ๋๋ค.") |
|
|
|
logger.info(f"๋ก์ปฌ ํ๊ฒฝ์์ ์คํ ์ค์
๋๋ค. (OS: {'Windows' if os.name == 'nt' else 'Unix/Linux/MacOS'})") |
|
|
|
|
|
IS_WINDOWS = os.name == 'nt' |
|
|
|
|
|
def get_env(key: str, default: Any = None, required: bool = False) -> Any:
    """Fetch a configuration value from the environment.

    On HuggingFace Spaces an ``HF_SECRET_<KEY>`` secret takes precedence
    over the plain variable name; locally only the plain name is used.

    Args:
        key: Environment variable name.
        default: Value returned when the variable is not set.
        required: When True, raise if no value could be resolved.

    Returns:
        The resolved value, or ``default`` when the variable is unset.

    Raises:
        ValueError: When ``required`` is True and no value is available.
    """
    value = None
    if IS_HUGGINGFACE:
        # Spaces secrets are exposed with an HF_SECRET_ prefix; try that first.
        value = os.getenv(f"HF_SECRET_{key.upper()}")
    if value is None:
        value = os.getenv(key, default)

    if required and value is None:
        # Same failure either way; only the remediation hint depends on the
        # environment, so build the message from one shared template.
        hint = (
            "HuggingFace Space์์ ์ํฌ๋ฆฟ์ ์ค์ ํด์ฃผ์ธ์."
            if IS_HUGGINGFACE
            else ".env ํ์ผ์ ์ถ๊ฐํด์ฃผ์ธ์."
        )
        error_msg = f"ํ์ ํ๊ฒฝ ๋ณ์ {key}๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. {hint}"
        logger.error(error_msg)
        raise ValueError(error_msg)

    return value
|
|
|
|
|
def ensure_absolute_path(path_str: str) -> str:
    """Resolve a possibly-relative path to an absolute path string.

    Windows drive-letter paths (e.g. ``C:\\...``) are returned untouched.
    Relative paths are anchored to the script directory when a matching
    file/directory exists there; otherwise they are anchored to the
    current working directory (whether or not that target exists).

    Args:
        path_str: Path string to resolve.

    Returns:
        An absolute path string.
    """
    # Drive-letter paths on Windows are already absolute; pass them through.
    if IS_WINDOWS and re.match(r'^[a-zA-Z]:\\', path_str):
        logger.info(f"Windows ์ ๋ ๊ฒฝ๋ก ๊ฐ์ง: {path_str}")
        return path_str

    candidate = Path(path_str)
    if candidate.is_absolute():
        return str(candidate)

    # Prefer an existing path relative to the script location.
    anchored_to_script = Path(script_dir) / candidate
    if anchored_to_script.exists():
        return str(anchored_to_script)

    # Fall back to the cwd-based path, existing or not.
    anchored_to_cwd = Path.cwd() / candidate
    return str(anchored_to_cwd)
|
|
|
|
|
def normalize_path(path_str: str) -> str:
    """Normalize a path string for the current operating system.

    Collapses redundant separators and up-level references using
    ``os.path.normpath``.

    Args:
        path_str: Path string to normalize.

    Returns:
        The normalized path string.
    """
    normalized = os.path.normpath(path_str)
    return normalized
|
|
|
|
|
# --- Document and cache directories ---
# Raw values come from the environment (or defaults), are normalized for the
# current OS, then resolved to absolute paths.
PDF_DIRECTORY_RAW = get_env("PDF_DIRECTORY", "documents")
PDF_DIRECTORY_RAW = normalize_path(PDF_DIRECTORY_RAW)
PDF_DIRECTORY = ensure_absolute_path(PDF_DIRECTORY_RAW)

CACHE_DIRECTORY_RAW = get_env("CACHE_DIRECTORY", "cached_data")
CACHE_DIRECTORY_RAW = normalize_path(CACHE_DIRECTORY_RAW)
CACHE_DIRECTORY = ensure_absolute_path(CACHE_DIRECTORY_RAW)

# Log both raw and resolved forms to ease path troubleshooting.
logger.info(f"PDF ๋๋ ํ ๋ฆฌ (์๋ณธ): {PDF_DIRECTORY_RAW}")
logger.info(f"PDF ๋๋ ํ ๋ฆฌ (์ ๋): {PDF_DIRECTORY}")
logger.info(f"์บ์ ๋๋ ํ ๋ฆฌ (์๋ณธ): {CACHE_DIRECTORY_RAW}")
logger.info(f"์บ์ ๋๋ ํ ๋ฆฌ (์ ๋): {CACHE_DIRECTORY}")
|
|
|
|
|
# --- Text chunking parameters ---
# CHUNK_SIZE: characters per chunk; CHUNK_OVERLAP: characters shared between
# consecutive chunks.
CHUNK_SIZE = int(get_env("CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(get_env("CHUNK_OVERLAP", "200"))

# --- External service credentials (empty string when unset) ---
OPENAI_API_KEY = get_env("OPENAI_API_KEY", "")
LANGFUSE_PUBLIC_KEY = get_env("LANGFUSE_PUBLIC_KEY", "")
LANGFUSE_SECRET_KEY = get_env("LANGFUSE_SECRET_KEY", "")
LANGFUSE_HOST = get_env("LANGFUSE_HOST", "https://cloud.langfuse.com")

# --- DeepSeek API settings ---
DEEPSEEK_API_KEY = get_env("DEEPSEEK_API_KEY", "")
DEEPSEEK_ENDPOINT = get_env("DEEPSEEK_ENDPOINT", "https://api.deepseek.com/v1/chat/completions")
DEEPSEEK_MODEL = get_env("DEEPSEEK_MODEL", "deepseek-chat")
|
|
|
|
|
if IS_HUGGINGFACE: |
|
logger.info(f"ํ๊น
ํ์ด์ค ํ๊ฒฝ์์ DeepSeek API ํค ์กด์ฌ ์ฌ๋ถ: {bool(DEEPSEEK_API_KEY)}") |
|
|
|
if DEEPSEEK_API_KEY: |
|
masked_key = DEEPSEEK_API_KEY[:4] + "****" + DEEPSEEK_API_KEY[-4:] if len(DEEPSEEK_API_KEY) > 8 else "****" |
|
logger.info(f"DeepSeek API ํค: {masked_key}") |
|
|
|
logger.info(f"DeepSeek ๋ชจ๋ธ: {DEEPSEEK_MODEL}") |
|
logger.info(f"DeepSeek ์๋ํฌ์ธํธ: {DEEPSEEK_ENDPOINT}") |
|
|
|
|
|
# --- Milvus vector store connection ---
MILVUS_HOST = get_env("MILVUS_HOST", "localhost")
MILVUS_PORT = get_env("MILVUS_PORT", "19530")
MILVUS_COLLECTION = get_env("MILVUS_COLLECTION", "pdf_documents")

# --- Embedding / reranker model identifiers ---
EMBEDDING_MODEL = get_env("EMBEDDING_MODEL", "Alibaba-NLP/gte-multilingual-base")
RERANKER_MODEL = get_env("RERANKER_MODEL", "Alibaba-NLP/gte-multilingual-reranker-base")

# --- LLM provider flags ---
# Env values are strings; only a case-insensitive "true" enables a provider.
USE_OPENAI = get_env("USE_OPENAI", "False").lower() == "true"
USE_DEEPSEEK = get_env("USE_DEEPSEEK", "False").lower() == "true"
|
|
|
|
|
# Choose the LLM backend. On HuggingFace Spaces, prefer DeepSeek when an API
# key is present; locally, honor USE_DEEPSEEK / USE_OPENAI and otherwise fall
# back to a local Ollama model. Note: the flags set above may be overridden.
if IS_HUGGINGFACE:
    if DEEPSEEK_API_KEY:
        USE_DEEPSEEK = True
        USE_OPENAI = False
        LLM_MODEL = DEEPSEEK_MODEL
        logger.info("HuggingFace Spaces ํ๊ฒฝ: DeepSeek ๋ชจ๋ธ ์ฌ์ฉ")
    else:
        logger.warning("HuggingFace Spaces ํ๊ฒฝ์์ DeepSeek API ํค๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค.")
        USE_DEEPSEEK = False
        USE_OPENAI = False
        LLM_MODEL = get_env("LLM_MODEL", "gemma3:latest")
        logger.info(f"HuggingFace Spaces ํ๊ฒฝ: DeepSeek API ํค ์์, LLM ๋ชจ๋ธ: {LLM_MODEL}")
else:
    if USE_DEEPSEEK:
        LLM_MODEL = DEEPSEEK_MODEL
        logger.info(f"๋ก์ปฌ ํ๊ฒฝ: DeepSeek ๋ชจ๋ธ ์ฌ์ฉ ({DEEPSEEK_MODEL})")
    elif USE_OPENAI:
        LLM_MODEL = get_env("LLM_MODEL", "gpt-3.5-turbo")
        logger.info(f"๋ก์ปฌ ํ๊ฒฝ: OpenAI ๋ชจ๋ธ ์ฌ์ฉ ({LLM_MODEL})")
    else:
        # Ollama path: also resolve the local Ollama server address.
        LLM_MODEL = get_env("LLM_MODEL", "gemma3:latest")
        OLLAMA_HOST = get_env("OLLAMA_HOST", "http://localhost:11434")
        logger.info(f"๋ก์ปฌ ํ๊ฒฝ: Ollama ๋ชจ๋ธ ์ฌ์ฉ ({LLM_MODEL})")
|
|
|
|
|
# Local-only sanity fallback: if the selected provider has no API key, drop
# back to a local Ollama model instead of failing later at request time.
if not IS_HUGGINGFACE:
    if USE_DEEPSEEK and not DEEPSEEK_API_KEY:
        logger.warning("DeepSeek ๋ชจ๋ธ์ด ์ ํ๋์์ง๋ง API ํค๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค.")
        USE_DEEPSEEK = False
        USE_OPENAI = False
        LLM_MODEL = get_env("LLM_MODEL", "gemma3:latest")
        # Bug fix: define OLLAMA_HOST on this fallback path too; print_config
        # reads it whenever neither OpenAI nor DeepSeek is in use, and the
        # DeepSeek branch of the selection above never set it.
        OLLAMA_HOST = get_env("OLLAMA_HOST", "http://localhost:11434")
        logger.info("DeepSeek API ํค๊ฐ ์์ด Ollama๋ก ํด๋ฐฑํฉ๋๋ค.")
    elif USE_OPENAI and not OPENAI_API_KEY:
        logger.warning("OpenAI ๋ชจ๋ธ์ด ์ ํ๋์์ง๋ง API ํค๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค.")
        logger.warning("OpenAI API ํค๊ฐ ์์ด Ollama๋ก ํด๋ฐฑํฉ๋๋ค.")
        USE_OPENAI = False
        LLM_MODEL = get_env("LLM_MODEL", "gemma3:latest")
        # Bug fix: same as above — the OpenAI branch never set OLLAMA_HOST.
        OLLAMA_HOST = get_env("OLLAMA_HOST", "http://localhost:11434")
|
|
|
|
|
def test_deepseek_connection(): |
|
""" |
|
DeepSeek API ์ฐ๊ฒฐ ํ
์คํธ |
|
|
|
Returns: |
|
ํ
์คํธ ๊ฒฐ๊ณผ ๋์
๋๋ฆฌ (์ฑ๊ณต ์ฌ๋ถ ๋ฐ ๋ฉ์์ง) |
|
""" |
|
if not DEEPSEEK_API_KEY: |
|
logger.warning("DeepSeek API ํค๊ฐ ์ค์ ๋์ง ์์ ํ
์คํธ๋ฅผ ๊ฑด๋๋๋๋ค.") |
|
return { |
|
"success": False, |
|
"message": "API ํค๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค.", |
|
"status_code": None |
|
} |
|
|
|
try: |
|
logger.info(f"DeepSeek API ์ฐ๊ฒฐ ํ
์คํธ ์์: {DEEPSEEK_ENDPOINT}, ๋ชจ๋ธ: {DEEPSEEK_MODEL}") |
|
|
|
|
|
test_prompt = "Hello, please respond with a short greeting." |
|
|
|
|
|
headers = { |
|
"Content-Type": "application/json", |
|
"Authorization": f"Bearer {DEEPSEEK_API_KEY}" |
|
} |
|
|
|
payload = { |
|
"model": DEEPSEEK_MODEL, |
|
"messages": [{"role": "user", "content": test_prompt}], |
|
"temperature": 0.7, |
|
"max_tokens": 50 |
|
} |
|
|
|
|
|
response = requests.post( |
|
DEEPSEEK_ENDPOINT, |
|
headers=headers, |
|
json=payload, |
|
timeout=10 |
|
) |
|
|
|
|
|
if response.status_code == 200: |
|
logger.info("DeepSeek API ์ฐ๊ฒฐ ์ฑ๊ณต") |
|
return { |
|
"success": True, |
|
"message": "API ์ฐ๊ฒฐ ์ฑ๊ณต", |
|
"status_code": response.status_code |
|
} |
|
else: |
|
logger.error(f"DeepSeek API ์ค๋ฅ: ์ํ ์ฝ๋ {response.status_code}") |
|
error_message = "" |
|
try: |
|
error_data = response.json() |
|
error_message = error_data.get("error", {}).get("message", str(error_data)) |
|
except: |
|
error_message = response.text |
|
|
|
return { |
|
"success": False, |
|
"message": f"API ์ค๋ฅ: {error_message}", |
|
"status_code": response.status_code |
|
} |
|
|
|
except requests.exceptions.Timeout: |
|
logger.error("DeepSeek API ์์ฒญ ์๊ฐ ์ด๊ณผ") |
|
return { |
|
"success": False, |
|
"message": "API ์์ฒญ ์๊ฐ ์ด๊ณผ", |
|
"status_code": None |
|
} |
|
except requests.exceptions.ConnectionError: |
|
logger.error("DeepSeek API ์ฐ๊ฒฐ ์คํจ") |
|
return { |
|
"success": False, |
|
"message": "API ์๋ฒ ์ฐ๊ฒฐ ์คํจ", |
|
"status_code": None |
|
} |
|
except Exception as e: |
|
logger.error(f"DeepSeek API ํ
์คํธ ์ค ์์์น ๋ชปํ ์ค๋ฅ: {e}", exc_info=True) |
|
return { |
|
"success": False, |
|
"message": f"์์์น ๋ชปํ ์ค๋ฅ: {str(e)}", |
|
"status_code": None |
|
} |
|
|
|
|
|
# --- Retrieval parameters ---
# TOP_K_RETRIEVAL: candidates fetched from the vector store;
# TOP_K_RERANK: results kept after reranking.
TOP_K_RETRIEVAL = int(get_env("TOP_K_RETRIEVAL", "5"))
TOP_K_RERANK = int(get_env("TOP_K_RERANK", "3"))

# --- Logging configuration ---
LOG_LEVEL = get_env("LOG_LEVEL", "INFO")
LOG_FILE = get_env("LOG_FILE", "autorag.log")
|
|
|
|
|
def print_config():
    """Log a human-readable summary of the active configuration."""
    summary = [
        "===== ํ์ฌ ์ค์  ์ ๋ณด =====",
        f"์คํ ํ๊ฒฝ: {'HuggingFace Spaces' if IS_HUGGINGFACE else '๋ก์ปฌ'}",
        f"๋ฌธ์ ๋๋ ํ ๋ฆฌ: {PDF_DIRECTORY}",
        f"์บ์ ๋๋ ํ ๋ฆฌ: {CACHE_DIRECTORY}",
        f"์ฒญํฌ ํฌ๊ธฐ: {CHUNK_SIZE}, ์ค๋ฒ๋ฉ: {CHUNK_OVERLAP}",
        f"OpenAI ์ฌ์ฉ: {USE_OPENAI}",
        f"DeepSeek ์ฌ์ฉ: {USE_DEEPSEEK}",
        f"LLM ๋ชจ๋ธ: {LLM_MODEL}",
    ]
    # OLLAMA_HOST is only defined (and only relevant) on local runs that use
    # neither OpenAI nor DeepSeek, so reference it lazily here.
    if not USE_OPENAI and not USE_DEEPSEEK and not IS_HUGGINGFACE:
        summary.append(f"Ollama ํธ์คํธ: {OLLAMA_HOST}")
    summary += [
        f"์๋ฒ ๋ฉ ๋ชจ๋ธ: {EMBEDDING_MODEL}",
        f"๋ฆฌ๋ญ์ปค ๋ชจ๋ธ: {RERANKER_MODEL}",
        f"TOP_K ๊ฒ์: {TOP_K_RETRIEVAL}, ๋ฆฌ๋ญํน: {TOP_K_RERANK}",
        "=========================",
    ]
    for line in summary:
        logger.info(line)
|
|
|
|
|
def validate_config() -> Dict[str, Any]: |
|
""" |
|
ํ์ฌ ์ค์ ์ ์ ํจ์ฑ์ ๊ฒ์ฌํ๊ณ ๊ฒฝ๊ณ ๋ ์ค๋ฅ๋ฅผ ๋ก๊ทธ์ ๊ธฐ๋ก |
|
|
|
Returns: |
|
๊ฒ์ฆ ๊ฒฐ๊ณผ (status: ์ํ, warnings: ๊ฒฝ๊ณ ๋ชฉ๋ก) |
|
""" |
|
warnings = [] |
|
|
|
|
|
if not os.path.exists(PDF_DIRECTORY): |
|
warnings.append(f"PDF ๋๋ ํ ๋ฆฌ({PDF_DIRECTORY})๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.") |
|
|
|
|
|
if IS_HUGGINGFACE: |
|
if USE_DEEPSEEK and not DEEPSEEK_API_KEY: |
|
warnings.append("ํ๊น
ํ์ด์ค ํ๊ฒฝ์์ DeepSeek ์ฌ์ฉ์ด ์ค์ ๋์์ง๋ง API ํค๊ฐ ์ ๊ณต๋์ง ์์์ต๋๋ค.") |
|
else: |
|
if USE_OPENAI and not OPENAI_API_KEY: |
|
warnings.append("OpenAI ์ฌ์ฉ์ด ์ค์ ๋์์ง๋ง API ํค๊ฐ ์ ๊ณต๋์ง ์์์ต๋๋ค.") |
|
|
|
if USE_DEEPSEEK and not DEEPSEEK_API_KEY: |
|
warnings.append("DeepSeek ์ฌ์ฉ์ด ์ค์ ๋์์ง๋ง API ํค๊ฐ ์ ๊ณต๋์ง ์์์ต๋๋ค.") |
|
|
|
|
|
if CHUNK_SIZE <= CHUNK_OVERLAP: |
|
warnings.append(f"์ฒญํฌ ํฌ๊ธฐ({CHUNK_SIZE})๊ฐ ์ค๋ฒ๋ฉ({CHUNK_OVERLAP})๋ณด๋ค ์๊ฑฐ๋ ๊ฐ์ต๋๋ค.") |
|
|
|
|
|
if USE_DEEPSEEK and DEEPSEEK_API_KEY: |
|
deepseek_test_result = test_deepseek_connection() |
|
if not deepseek_test_result["success"]: |
|
warnings.append(f"DeepSeek API ์ฐ๊ฒฐ ํ
์คํธ ์คํจ: {deepseek_test_result['message']}") |
|
|
|
|
|
if warnings: |
|
for warning in warnings: |
|
logger.warning(warning) |
|
|
|
return { |
|
"status": "valid" if not warnings else "warnings", |
|
"warnings": warnings |
|
} |
|
|
|
|
|
# Import-time side effects: log the resolved configuration, then validate it.
# Other modules can inspect `config_status` for accumulated warnings.
print_config()
config_status = validate_config()