RAG3 / config.py
jeongsoo's picture
deepseek_done
ac1b0e8
raw
history blame
17.2 kB
"""
๋ฒกํ„ฐ ์Šคํ† ์–ด, ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ, LLM ๋“ฑ ๊ตฌ์„ฑ ์š”์†Œ ์„ค์ •
ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ๋ฐ .env ํŒŒ์ผ ํ™œ์šฉ ๊ฐœ์„  ๋ฒ„์ „ - ํƒ์ƒ‰ ์†๋„ ์ตœ์ ํ™”
"""
import os
import logging
import sys
import re
import requests
import json
from pathlib import Path
from typing import Dict, Any
from dotenv import load_dotenv
# ๋กœ๊น… ์„ค์ •
logger = logging.getLogger("Config")

# Absolute directory containing this module; relative paths are resolved against it later on.
script_dir = os.path.split(os.path.abspath(__file__))[0]

# Startup diagnostics (debugging aid): script directory, working directory, OS family.
logger.info(f"์Šคํฌ๋ฆฝํŠธ ๋””๋ ‰ํ† ๋ฆฌ: {script_dir}")
logger.info(f"ํ˜„์žฌ ์ž‘์—… ๋””๋ ‰ํ† ๋ฆฌ: {os.getcwd()}")
logger.info(f"์šด์˜ ์ฒด์ œ: {os.name}")
# .env ํŒŒ์ผ ๊ฒ€์ƒ‰ ์ตœ์ ํ™”
def fast_env_load():
    """
    Locate a .env file in a short list of candidate locations and load it.

    The current working directory is checked first, then the directory that
    contains this script. Values from the .env file override variables that
    are already present in the process environment (override=True).

    Returns:
        bool: True as soon as one candidate file was loaded successfully,
        False when no candidate could be loaded.
    """
    candidates = (
        ".env",                              # current working directory
        os.path.join(script_dir, ".env"),    # directory of this script
    )

    for candidate in candidates:
        # Skip locations that do not hold a regular file.
        if not os.path.isfile(candidate):
            continue

        logger.info(f".env ํŒŒ์ผ ๋ฐœ๊ฒฌ: {candidate}")
        if load_dotenv(candidate, verbose=False, override=True):
            logger.info(f".env ํŒŒ์ผ ๋กœ๋“œ ์„ฑ๊ณต: {candidate}")
            return True

    # Nothing was loaded: fall back to defaults / system environment variables.
    logger.warning(".env ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ๊ธฐ๋ณธ๊ฐ’ ๋˜๋Š” ์‹œ์Šคํ…œ ํ™˜๊ฒฝ ๋ณ€์ˆ˜๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.")
    return False
# ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ๋กœ๋“œ
env_loaded = fast_env_load()

# Environment detection: the presence of SPACE_ID is treated as running on
# HuggingFace Spaces; os.name == 'nt' identifies Windows.
IS_HUGGINGFACE = 'SPACE_ID' in os.environ
IS_WINDOWS = os.name == 'nt'

if not IS_HUGGINGFACE:
    logger.info(f"๋กœ์ปฌ ํ™˜๊ฒฝ์—์„œ ์‹คํ–‰ ์ค‘์ž…๋‹ˆ๋‹ค. (OS: {'Windows' if IS_WINDOWS else 'Unix/Linux/MacOS'})")
else:
    logger.info("HuggingFace Spaces ํ™˜๊ฒฝ์ด ๊ฐ์ง€๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
# ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜: ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ๊ฐ€์ ธ์˜ค๊ธฐ (๊ธฐ๋ณธ๊ฐ’ ์ œ๊ณต)
def get_env(key: str, default: Any = None, required: bool = False) -> Any:
    """
    Read an environment variable, falling back to a default.

    Args:
        key: Name of the environment variable.
        default: Value returned when the variable is not set.
        required: When True, a missing or empty (falsy) result is an error.

    Returns:
        The variable's value, or ``default`` when the variable is not set.

    Raises:
        ValueError: If ``required`` is True and the resolved value is falsy.
    """
    resolved = os.environ.get(key, default)

    # Happy path: optional variable, or a required one that has a usable value.
    if not required or resolved:
        return resolved

    message = f"ํ•„์ˆ˜ ํ™˜๊ฒฝ ๋ณ€์ˆ˜ {key}๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค."
    logger.error(message)
    raise ValueError(message)
# ๊ฒฝ๋กœ ์ƒ์„ฑ ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜
def ensure_absolute_path(path_str: str) -> str:
    """
    Turn a possibly-relative path into an absolute one.

    Resolution order:
      1. On Windows, a string starting with a drive letter and backslash
         (e.g. ``C:\\``) is returned unchanged.
      2. A path that is already absolute is returned as-is.
      3. A relative path is tried against the script directory first; if that
         location does not exist, the path relative to the current working
         directory is returned (whether or not it exists).

    Args:
        path_str: Path string to convert.

    Returns:
        The absolute path as a string.
    """
    # Windows drive-letter paths are passed through untouched.
    if IS_WINDOWS and re.match(r'^[a-zA-Z]:\\', path_str):
        logger.info(f"Windows ์ ˆ๋Œ€ ๊ฒฝ๋กœ ๊ฐ์ง€: {path_str}")
        return path_str

    candidate = Path(path_str)
    if candidate.is_absolute():
        return str(candidate)

    relative_to_script = Path(script_dir) / candidate
    relative_to_cwd = Path.cwd() / candidate

    # Prefer the script-relative location when it exists; otherwise the
    # cwd-relative path is the answer both when it exists and as the default.
    if relative_to_script.exists():
        return str(relative_to_script)
    return str(relative_to_cwd)
# Windows ๊ฒฝ๋กœ ์ฒ˜๋ฆฌ๋ฅผ ์œ„ํ•œ ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜
def normalize_path(path_str: str) -> str:
    """
    Normalize a path string for the current operating system.

    Thin wrapper around ``os.path.normpath``: collapses redundant separators
    and ``.`` / ``..`` segments. On Windows it also converts forward slashes
    to backslashes; on POSIX systems backslashes are left untouched.

    Args:
        path_str: Path string to normalize.

    Returns:
        The normalized path.
    """
    normalized = os.path.normpath(path_str)
    return normalized
# ๊ธฐ๋ณธ ๋””๋ ‰ํ† ๋ฆฌ ์„ค์ • (์ ˆ๋Œ€ ๊ฒฝ๋กœ๋กœ ๋ณ€ํ™˜)
# Base directories: read from the environment, normalized, then made absolute.
PDF_DIRECTORY_RAW = get_env("PDF_DIRECTORY", "documents")
# Normalize first so doubled separators / mixed slashes are cleaned up.
PDF_DIRECTORY_RAW = normalize_path(PDF_DIRECTORY_RAW)
PDF_DIRECTORY = ensure_absolute_path(PDF_DIRECTORY_RAW)

CACHE_DIRECTORY_RAW = get_env("CACHE_DIRECTORY", "cached_data")
CACHE_DIRECTORY_RAW = normalize_path(CACHE_DIRECTORY_RAW)
CACHE_DIRECTORY = ensure_absolute_path(CACHE_DIRECTORY_RAW)

logger.info(f"PDF ๋””๋ ‰ํ† ๋ฆฌ (์›๋ณธ): {PDF_DIRECTORY_RAW}")
logger.info(f"PDF ๋””๋ ‰ํ† ๋ฆฌ (์ ˆ๋Œ€): {PDF_DIRECTORY}")
logger.info(f"์บ์‹œ ๋””๋ ‰ํ† ๋ฆฌ (์›๋ณธ): {CACHE_DIRECTORY_RAW}")
logger.info(f"์บ์‹œ ๋””๋ ‰ํ† ๋ฆฌ (์ ˆ๋Œ€): {CACHE_DIRECTORY}")

# Chunking settings
CHUNK_SIZE = int(get_env("CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(get_env("CHUNK_OVERLAP", "200"))

# API keys and service endpoints.
# SECURITY: credentials must come from the environment / .env file only.
# Earlier revisions shipped real-looking Langfuse keys as defaults; any key
# that was ever committed should be treated as leaked and rotated.
OPENAI_API_KEY = get_env("OPENAI_API_KEY", "")
LANGFUSE_PUBLIC_KEY = get_env("LANGFUSE_PUBLIC_KEY", "")
LANGFUSE_SECRET_KEY = get_env("LANGFUSE_SECRET_KEY", "")
LANGFUSE_HOST = get_env("LANGFUSE_HOST", "https://cloud.langfuse.com")

# DeepSeek settings
DEEPSEEK_API_KEY = get_env("DEEPSEEK_API_KEY", "")
DEEPSEEK_ENDPOINT = get_env("DEEPSEEK_ENDPOINT", "https://api.deepseek.com/v1/chat/completions")
DEEPSEEK_MODEL = get_env("DEEPSEEK_MODEL", "deepseek-chat")

# Milvus vector DB settings
MILVUS_HOST = get_env("MILVUS_HOST", "localhost")
MILVUS_PORT = get_env("MILVUS_PORT", "19530")
MILVUS_COLLECTION = get_env("MILVUS_COLLECTION", "pdf_documents")

# Embedding / reranker models (multilingual defaults)
EMBEDDING_MODEL = get_env("EMBEDDING_MODEL", "Alibaba-NLP/gte-multilingual-base")
RERANKER_MODEL = get_env("RERANKER_MODEL", "Alibaba-NLP/gte-multilingual-reranker-base")

# LLM backend flags; the final choice is made later depending on the environment.
USE_OPENAI = get_env("USE_OPENAI", "False").lower() == "true"
USE_DEEPSEEK = get_env("USE_DEEPSEEK", "False").lower() == "true"

# Ollama host (default: local instance)
OLLAMA_HOST = get_env("OLLAMA_HOST", "http://localhost:11434")
# DeepSeek API ํ…Œ์ŠคํŠธ ํ•จ์ˆ˜
def test_deepseek_connection():
    """
    Probe the DeepSeek chat-completions endpoint with a tiny request.

    Sends one short prompt to DEEPSEEK_ENDPOINT using DEEPSEEK_API_KEY and
    DEEPSEEK_MODEL, with a 10-second timeout. No request is made when the
    API key is empty.

    Returns:
        dict with three keys:
            "success" (bool): True only when the endpoint answered HTTP 200.
            "message" (str): human-readable outcome or error description.
            "status_code" (int | None): HTTP status when a response was
                received, otherwise None.
        This function does not raise; every failure is reported in the dict.
    """
    if not DEEPSEEK_API_KEY:
        logger.warning("DeepSeek API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•„ ํ…Œ์ŠคํŠธ๋ฅผ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.")
        return {
            "success": False,
            "message": "API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.",
            "status_code": None
        }

    try:
        logger.info(f"DeepSeek API ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ ์‹œ์ž‘: {DEEPSEEK_ENDPOINT}, ๋ชจ๋ธ: {DEEPSEEK_MODEL}")

        # Minimal prompt, just enough to exercise the endpoint.
        test_prompt = "Hello, please respond with a short greeting."

        # Request headers and body.
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
        }
        payload = {
            "model": DEEPSEEK_MODEL,
            "messages": [{"role": "user", "content": test_prompt}],
            "temperature": 0.7,
            "max_tokens": 50
        }

        response = requests.post(
            DEEPSEEK_ENDPOINT,
            headers=headers,
            data=json.dumps(payload),
            timeout=10  # seconds
        )

        if response.status_code == 200:
            logger.info("DeepSeek API ์—ฐ๊ฒฐ ์„ฑ๊ณต")
            return {
                "success": True,
                "message": "API ์—ฐ๊ฒฐ ์„ฑ๊ณต",
                "status_code": response.status_code
            }

        logger.error(f"DeepSeek API ์˜ค๋ฅ˜: ์ƒํƒœ ์ฝ”๋“œ {response.status_code}")
        error_message = ""
        try:
            error_data = response.json()
            error_message = error_data.get("error", {}).get("message", str(error_data))
        except (ValueError, AttributeError):
            # ValueError: body is not valid JSON.
            # AttributeError: JSON parsed but is not the expected
            # {"error": {"message": ...}} mapping shape.
            # Deliberately narrower than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            error_message = response.text
        return {
            "success": False,
            "message": f"API ์˜ค๋ฅ˜: {error_message}",
            "status_code": response.status_code
        }
    except requests.exceptions.Timeout:
        # Must precede ConnectionError: a connect timeout derives from both.
        logger.error("DeepSeek API ์š”์ฒญ ์‹œ๊ฐ„ ์ดˆ๊ณผ")
        return {
            "success": False,
            "message": "API ์š”์ฒญ ์‹œ๊ฐ„ ์ดˆ๊ณผ",
            "status_code": None
        }
    except requests.exceptions.ConnectionError:
        logger.error("DeepSeek API ์—ฐ๊ฒฐ ์‹คํŒจ")
        return {
            "success": False,
            "message": "API ์„œ๋ฒ„ ์—ฐ๊ฒฐ ์‹คํŒจ",
            "status_code": None
        }
    except Exception as e:
        # Last-resort boundary: report instead of crashing module import.
        logger.error(f"DeepSeek API ํ…Œ์ŠคํŠธ ์ค‘ ์˜ˆ์ƒ์น˜ ๋ชปํ•œ ์˜ค๋ฅ˜: {e}", exc_info=True)
        return {
            "success": False,
            "message": f"์˜ˆ์ƒ์น˜ ๋ชปํ•œ ์˜ค๋ฅ˜: {str(e)}",
            "status_code": None
        }
# ---------------------------------------------------------------------------
# LLM backend selection.
# Outcome: USE_DEEPSEEK / USE_OPENAI flags and LLM_MODEL are mutually
# consistent (neither flag set means Ollama).
# ---------------------------------------------------------------------------
if IS_HUGGINGFACE:
    # On HuggingFace Spaces, prefer DeepSeek whenever an API key is present.
    if get_env("DEEPSEEK_API_KEY", ""):
        USE_DEEPSEEK = True
        USE_OPENAI = False
        LLM_MODEL = get_env("DEEPSEEK_MODEL", "deepseek-chat")
        logger.info("HuggingFace Spaces ํ™˜๊ฒฝ ๊ฐ์ง€: DeepSeek ๋ชจ๋ธ ์‚ฌ์šฉ")

        # Verify the DeepSeek endpoint is reachable before committing to it.
        deepseek_test_result = test_deepseek_connection()
        if deepseek_test_result["success"]:
            logger.info("DeepSeek API ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ ์„ฑ๊ณต")
        else:
            logger.warning(f"DeepSeek API ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ ์‹คํŒจ: {deepseek_test_result['message']}")
            logger.info("OpenAI ๋ชจ๋ธ๋กœ ํด๋ฐฑํ•ฉ๋‹ˆ๋‹ค")
            USE_DEEPSEEK = False
            USE_OPENAI = True
            LLM_MODEL = get_env("LLM_MODEL", "gpt-3.5-turbo")
    else:
        # No DeepSeek key: use OpenAI.
        USE_OPENAI = True
        USE_DEEPSEEK = False
        LLM_MODEL = get_env("LLM_MODEL", "gpt-3.5-turbo")
        logger.info("HuggingFace Spaces ํ™˜๊ฒฝ ๊ฐ์ง€: OpenAI ๋ชจ๋ธ ์‚ฌ์šฉ")
else:
    # Local environment: follow the USE_DEEPSEEK / USE_OPENAI flags.
    if USE_DEEPSEEK:
        LLM_MODEL = get_env("DEEPSEEK_MODEL", "deepseek-chat")
        logger.info("DeepSeek ๋ชจ๋ธ ์‚ฌ์šฉ")

        # Verify the DeepSeek endpoint is reachable before committing to it.
        deepseek_test_result = test_deepseek_connection()
        if deepseek_test_result["success"]:
            logger.info("DeepSeek API ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ ์„ฑ๊ณต")
        else:
            logger.warning(f"DeepSeek API ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ ์‹คํŒจ: {deepseek_test_result['message']}")
            if not USE_OPENAI:
                logger.info("Ollama๋กœ ํด๋ฐฑํ•ฉ๋‹ˆ๋‹ค")
                USE_DEEPSEEK = False
                LLM_MODEL = get_env("LLM_MODEL", "gemma3:latest")
            else:
                logger.info("OpenAI ๋ชจ๋ธ๋กœ ํด๋ฐฑํ•ฉ๋‹ˆ๋‹ค")
                USE_DEEPSEEK = False
                # Fix: previously LLM_MODEL kept the DeepSeek model name
                # here, so OpenAI would have been called with a DeepSeek
                # model id. Mirror the HuggingFace fallback above.
                LLM_MODEL = get_env("LLM_MODEL", "gpt-3.5-turbo")
    elif USE_OPENAI:
        LLM_MODEL = get_env("LLM_MODEL", "gpt-3.5-turbo")
        logger.info("OpenAI ๋ชจ๋ธ ์‚ฌ์šฉ")
    else:
        LLM_MODEL = get_env("LLM_MODEL", "gemma3:latest")
        logger.info("Ollama ๋ชจ๋ธ ์‚ฌ์šฉ")

# API key validation: a backend without its key falls back to Ollama.
if USE_DEEPSEEK and not DEEPSEEK_API_KEY:
    logger.warning("DeepSeek ๋ชจ๋ธ์ด ์„ ํƒ๋˜์—ˆ์ง€๋งŒ API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
    USE_DEEPSEEK = False
    USE_OPENAI = False
    LLM_MODEL = get_env("LLM_MODEL", "gemma3:latest")
    logger.info("DeepSeek API ํ‚ค๊ฐ€ ์—†์–ด Ollama๋กœ ํด๋ฐฑํ•ฉ๋‹ˆ๋‹ค.")
elif USE_OPENAI and not OPENAI_API_KEY:
    logger.warning("OpenAI ๋ชจ๋ธ์ด ์„ ํƒ๋˜์—ˆ์ง€๋งŒ API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
    # On HuggingFace the platform may supply its own key, so only warn there.
    if not IS_HUGGINGFACE:
        logger.warning("OpenAI API ํ‚ค๊ฐ€ ์—†์–ด Ollama๋กœ ํด๋ฐฑํ•ฉ๋‹ˆ๋‹ค.")
        USE_OPENAI = False
        LLM_MODEL = get_env("LLM_MODEL", "gemma3:latest")

# Vector search settings
TOP_K_RETRIEVAL = int(get_env("TOP_K_RETRIEVAL", "5"))  # number of vector-search hits
TOP_K_RERANK = int(get_env("TOP_K_RERANK", "3"))  # number of results kept after reranking

# Logging settings
LOG_LEVEL = get_env("LOG_LEVEL", "INFO")
LOG_FILE = get_env("LOG_FILE", "autorag.log")
# ์„ค์ • ์ •๋ณด ์ถœ๋ ฅ (๋””๋ฒ„๊น…์šฉ)
def print_config():
    """Write the effective configuration values to the log at INFO level."""
    report = (
        "===== ํ˜„์žฌ ์„ค์ • ์ •๋ณด =====",
        f"๋ฌธ์„œ ๋””๋ ‰ํ† ๋ฆฌ: {PDF_DIRECTORY}",
        f"์บ์‹œ ๋””๋ ‰ํ† ๋ฆฌ: {CACHE_DIRECTORY}",
        f"์ฒญํฌ ํฌ๊ธฐ: {CHUNK_SIZE}, ์˜ค๋ฒ„๋žฉ: {CHUNK_OVERLAP}",
        f"OpenAI ์‚ฌ์šฉ: {USE_OPENAI}",
        f"DeepSeek ์‚ฌ์šฉ: {USE_DEEPSEEK}",
        f"LLM ๋ชจ๋ธ: {LLM_MODEL}",
        f"Ollama ํ˜ธ์ŠคํŠธ: {OLLAMA_HOST}",
        f"์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ: {EMBEDDING_MODEL}",
        f"๋ฆฌ๋žญ์ปค ๋ชจ๋ธ: {RERANKER_MODEL}",
        f"TOP_K ๊ฒ€์ƒ‰: {TOP_K_RETRIEVAL}, ๋ฆฌ๋žญํ‚น: {TOP_K_RERANK}",
        "=========================",
    )
    # One log record per line, in the order listed above.
    for line in report:
        logger.info(line)
# ์„ค์ • ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ
def validate_config() -> Dict[str, Any]:
    """
    Check the current settings and log every problem found as a warning.

    Checks, in order: the PDF directory exists; an API key is present for
    each enabled backend (OpenAI, DeepSeek); the chunk size is larger than
    the chunk overlap; and, when DeepSeek is enabled with a key, the
    DeepSeek endpoint answers a live connection test.

    Returns:
        dict with "status" ("valid" when nothing was found, otherwise
        "warnings") and "warnings" (list of message strings).
    """
    issues = []

    # Directory check
    if not os.path.exists(PDF_DIRECTORY):
        issues.append(f"PDF ๋””๋ ‰ํ† ๋ฆฌ({PDF_DIRECTORY})๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")

    # API key checks
    if USE_OPENAI and not OPENAI_API_KEY:
        issues.append("OpenAI ์‚ฌ์šฉ์ด ์„ค์ •๋˜์—ˆ์ง€๋งŒ API ํ‚ค๊ฐ€ ์ œ๊ณต๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
    if USE_DEEPSEEK and not DEEPSEEK_API_KEY:
        issues.append("DeepSeek ์‚ฌ์šฉ์ด ์„ค์ •๋˜์—ˆ์ง€๋งŒ API ํ‚ค๊ฐ€ ์ œ๊ณต๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")

    # Chunking sanity check
    if CHUNK_SIZE <= CHUNK_OVERLAP:
        issues.append(f"์ฒญํฌ ํฌ๊ธฐ({CHUNK_SIZE})๊ฐ€ ์˜ค๋ฒ„๋žฉ({CHUNK_OVERLAP})๋ณด๋‹ค ์ž‘๊ฑฐ๋‚˜ ๊ฐ™์Šต๋‹ˆ๋‹ค.")

    # Live DeepSeek connectivity check (only when it is configured)
    if USE_DEEPSEEK and DEEPSEEK_API_KEY:
        probe = test_deepseek_connection()
        if not probe["success"]:
            issues.append(f"DeepSeek API ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ ์‹คํŒจ: {probe['message']}")

    # Report everything that was collected.
    for issue in issues:
        logger.warning(issue)

    status = "warnings" if issues else "valid"
    return {"status": status, "warnings": issues}
# ๋””๋ ‰ํ† ๋ฆฌ ๋ชฉ๋ก ํ•จ์ˆ˜ (๋””๋ฒ„๊น…์šฉ)
def list_directory_contents():
    """
    Log the entries of the current working directory and of PDF_DIRECTORY.

    Debugging aid. Any exception raised while listing either directory is
    caught and logged as an error; nothing is returned.
    """
    # Current working directory
    try:
        logger.info(f"ํ˜„์žฌ ์ž‘์—… ๋””๋ ‰ํ† ๋ฆฌ ๋‚ด์šฉ: {os.listdir(os.getcwd())}")
    except Exception as err:
        logger.error(f"ํ˜„์žฌ ์ž‘์—… ๋””๋ ‰ํ† ๋ฆฌ ๋‚ด์šฉ ํ™•์ธ ์‹คํŒจ: {err}")

    # PDF directory
    try:
        if not os.path.exists(PDF_DIRECTORY):
            logger.warning(f"PDF ๋””๋ ‰ํ† ๋ฆฌ๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Œ: {PDF_DIRECTORY}")
        else:
            entries = os.listdir(PDF_DIRECTORY)
            logger.info(f"PDF ๋””๋ ‰ํ† ๋ฆฌ ๋‚ด์šฉ: {entries}")
            # Keep only names ending in .pdf (case-insensitive).
            pdf_only = list(filter(lambda name: name.lower().endswith('.pdf'), entries))
            logger.info(f"PDF ๋””๋ ‰ํ† ๋ฆฌ ๋‚ด PDF ํŒŒ์ผ: {pdf_only}")
    except Exception as err:
        logger.error(f"PDF ๋””๋ ‰ํ† ๋ฆฌ ๋‚ด์šฉ ํ™•์ธ ์‹คํŒจ: {err}")
# ์ง์ ‘ ์ฃผ์–ด์ง„ ๊ฒฝ๋กœ์—์„œ PDF ์ฐพ๊ธฐ (๋””๋ฒ„๊น…์šฉ)
def find_pdf_files_in_path(path: str) -> list:
    """
    List the PDF files located directly inside a directory.

    Args:
        path: Directory to inspect (not searched recursively).

    Returns:
        File names ending in ``.pdf`` (case-insensitive). An empty list is
        returned when the path is missing, is not a directory, or an error
        occurs while reading it.
    """
    try:
        # isdir() is False for missing paths too, so one check covers both cases.
        if not os.path.isdir(path):
            logger.warning(f"๊ฒฝ๋กœ๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š๊ฑฐ๋‚˜ ๋””๋ ‰ํ† ๋ฆฌ๊ฐ€ ์•„๋‹˜: {path}")
            return []

        found = []
        for entry in os.listdir(path):
            if entry.lower().endswith('.pdf'):
                found.append(entry)
        logger.info(f"๊ฒฝ๋กœ '{path}'์—์„œ {len(found)}๊ฐœ์˜ PDF ํŒŒ์ผ ๋ฐœ๊ฒฌ: {found}")
        return found
    except Exception as err:
        logger.error(f"๊ฒฝ๋กœ '{path}'์—์„œ PDF ํŒŒ์ผ ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜: {err}")
        return []
# ์œˆ๋„์šฐ์ฆˆ ์ฃผ์š” ๊ฒฝ๋กœ์—์„œ PDF ํŒŒ์ผ ๊ฒ€์ƒ‰ (๋””๋ฒ„๊น…์šฉ)
def find_pdfs_in_windows_paths():
    """
    Probe a fixed set of common Windows locations for PDF files.

    Debugging aid: does nothing unless running on Windows. Each location is
    passed to find_pdf_files_in_path, which logs what it finds; the results
    themselves are discarded.
    """
    if not IS_WINDOWS:
        return

    # Empty string when USERPROFILE is unset, making the joined paths relative.
    profile_documents = os.path.join(os.environ.get('USERPROFILE', ''), 'Documents')
    profile_downloads = os.path.join(os.environ.get('USERPROFILE', ''), 'Downloads')

    probe_locations = (
        "C:\\Users\\USER\\RAG3\\documents",
        "C:\\Users\\USER\\Documents",
        profile_documents,
        profile_downloads,
        "documents",
        ".",
    )
    for location in probe_locations:
        find_pdf_files_in_path(location)
# ์„ค์ • ์ •๋ณด ์ถœ๋ ฅ ๋ฐ ๊ฒ€์ฆ (๋ชจ๋“ˆ ์ž„ํฌํŠธ ์‹œ ์‹คํ–‰)
# Import-time diagnostics: dump the configuration, validate it, and list directories.
print_config()
config_status = validate_config()
list_directory_contents()

# find_pdfs_in_windows_paths() returns immediately when not on Windows,
# so it can be called without an outer platform guard.
find_pdfs_in_windows_paths()