"""
Debugging code added - resolves path-related issues.
"""
|
import os
import time
import hashlib
import pickle
import json
import logging
import glob
import sys
from typing import List, Dict, Tuple, Any, Optional
from logging.handlers import RotatingFileHandler
from pathlib import Path

from langchain.schema import Document

from config import (
    PDF_DIRECTORY, CACHE_DIRECTORY, CHUNK_SIZE, CHUNK_OVERLAP,
    LLM_MODEL, LOG_LEVEL, LOG_FILE, print_config, validate_config
)
from optimized_document_processor import OptimizedDocumentProcessor
from vector_store import VectorStore

print("===== Script starting =====")
sys.stdout.flush()

print("Loading config...")
sys.stdout.flush()

print("Config loaded!")
sys.stdout.flush()
|
|
|
|
|
def setup_logging():
    """Configure application logging."""
    log_level = getattr(logging, LOG_LEVEL.upper(), logging.INFO)

    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    formatter = logging.Formatter(log_format)

    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    root_logger.addHandler(console_handler)

    # Rotating file handler; fall back to console-only logging if it cannot be created.
    try:
        file_handler = RotatingFileHandler(
            LOG_FILE,
            maxBytes=10 * 1024 * 1024,
            backupCount=5
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
    except Exception as e:
        root_logger.warning(f"Failed to set up the log file: {e}; using console logging only.")

    return logging.getLogger("AutoRAG")


logger = setup_logging()
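
# Illustrative only: with the log format above, console and file records look like
#   2024-01-01 12:00:00,000 - AutoRAG - INFO - AutoRAGChatApp initialization started
# (timestamp - logger name - level - message); actual values depend on the run.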
|
|
|
|
|
current_dir = os.getcwd()
logger.info(f"Current working directory: {current_dir}")

abs_pdf_dir = os.path.abspath(PDF_DIRECTORY)
logger.info(f"Configured PDF directory: {PDF_DIRECTORY}")
logger.info(f"PDF directory as an absolute path: {abs_pdf_dir}")

if os.path.exists(abs_pdf_dir):
    logger.info(f"PDF directory exists: {abs_pdf_dir}")

    pdf_files = glob.glob(os.path.join(abs_pdf_dir, "*.pdf"))
    logger.info(f"PDF files in the directory: {pdf_files}")
else:
    logger.error(f"PDF directory does not exist: {abs_pdf_dir}")

    parent_dir = os.path.dirname(abs_pdf_dir)
    logger.info(f"Parent directory: {parent_dir}")
    if os.path.exists(parent_dir):
        dir_contents = os.listdir(parent_dir)
        logger.info(f"Parent directory contents: {dir_contents}")

logger.info("Validating application configuration...")
config_status = validate_config()
if config_status["status"] != "valid":
    for warning in config_status["warnings"]:
        logger.warning(f"Configuration warning: {warning}")
|
|
|
|
|
|
|
|
|
|
|
try:
    from rag_chain import RAGChain
    RAG_CHAIN_AVAILABLE = True
    print("RAG chain module loaded successfully!")
except ImportError as e:
    logger.warning(f"Could not load the RAG chain module: {e}")
    RAG_CHAIN_AVAILABLE = False
except Exception as e:
    logger.warning(f"Unexpected error while loading the RAG chain module: {e}")
    RAG_CHAIN_AVAILABLE = False

try:
    from fallback_rag_chain import FallbackRAGChain
    FALLBACK_AVAILABLE = True
    print("Fallback RAG chain module loaded successfully!")
except ImportError as e:
    logger.warning(f"Could not load the fallback RAG chain module: {e}")
    FALLBACK_AVAILABLE = False

try:
    from offline_fallback_rag import OfflineFallbackRAG
    OFFLINE_FALLBACK_AVAILABLE = True
    print("Offline fallback RAG module loaded successfully!")
except ImportError as e:
    logger.warning(f"Could not load the offline fallback RAG module: {e}")
    OFFLINE_FALLBACK_AVAILABLE = False
|
|
|
|
|
class DocumentProcessingError(Exception):
    """Exception raised during document processing."""
    pass


class VectorStoreError(Exception):
    """Exception raised during vector store operations."""
    pass


class RAGInitializationError(Exception):
    """Exception raised during RAG chain initialization."""
    pass


class ConfigurationError(Exception):
    """Exception raised for configuration-related errors."""
    pass
|
|
|
|
|
class AutoRAGChatApp:
    """
    RAG chatbot that automatically processes the PDF files in the documents folder.
    """

    def __init__(self):
        """
        Initialize the RAG chatbot application.
        """
        try:
            logger.info("AutoRAGChatApp initialization started")

            # Resolve all document and cache paths to absolute paths up front.
            self.pdf_directory = os.path.abspath(PDF_DIRECTORY)
            self.cache_directory = os.path.abspath(CACHE_DIRECTORY)
            self.index_file = os.path.join(self.cache_directory, "file_index.json")
            self.chunks_dir = os.path.join(self.cache_directory, "chunks")
            self.vector_index_dir = os.path.join(self.cache_directory, "vector_index")

            logger.info(f"Configured PDF directory (absolute path): {self.pdf_directory}")

            self._verify_pdf_directory()
            self._ensure_directories_exist()

            logger.info(f"PDF document directory: '{self.pdf_directory}'")
            logger.info(f"Cache directory: '{self.cache_directory}'")

            try:
                self.document_processor = OptimizedDocumentProcessor(
                    chunk_size=CHUNK_SIZE,
                    chunk_overlap=CHUNK_OVERLAP
                )
            except Exception as e:
                logger.error(f"Document processor initialization failed: {e}")
                raise DocumentProcessingError(f"Document processor initialization failed: {str(e)}")

            try:
                self.vector_store = VectorStore(use_milvus=False)
            except Exception as e:
                logger.error(f"Vector store initialization failed: {e}")
                raise VectorStoreError(f"Vector store initialization failed: {str(e)}")

            self.file_index = self._load_file_index()

            self.documents = []
            self.processed_files = []
            self.is_initialized = False

            logger.info("Starting automatic document loading and processing...")
            self.auto_process_documents()

            logger.info("AutoRAGChatApp initialization complete")

        except Exception as e:
            logger.critical(f"Critical error during application initialization: {e}", exc_info=True)

            # Keep the application usable in a degraded state.
            self.pdf_directory = os.path.abspath(PDF_DIRECTORY)
            self.documents = []
            self.processed_files = []
            self.is_initialized = False
            self.file_index = {}
|
|
|
    def _ensure_directories_exist(self) -> None:
        """
        Ensure the required directories exist, creating them if necessary.
        """
        directories = [
            self.pdf_directory,
            self.cache_directory,
            self.chunks_dir,
            self.vector_index_dir
        ]

        for directory in directories:
            try:
                os.makedirs(directory, exist_ok=True)
            except Exception as e:
                logger.error(f"Failed to create directory '{directory}': {e}")
                raise OSError(f"Failed to create directory '{directory}': {str(e)}")
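
    # Expected on-disk layout (actual roots come from PDF_DIRECTORY and CACHE_DIRECTORY in config.py):
    #   <PDF_DIRECTORY>/                       source PDF files
    #   <CACHE_DIRECTORY>/file_index.json      per-file processing metadata
    #   <CACHE_DIRECTORY>/chunks/<md5>.pkl     pickled Document chunks per source file
    #   <CACHE_DIRECTORY>/vector_index/        persisted vector index files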
|
|
|
    def _process_pdf_file(self, file_path: str) -> List[Document]:
        """
        Process a PDF file, falling back to PyPDFLoader if docling fails.

        Args:
            file_path: Path of the PDF file to process

        Returns:
            List of processed document chunks
        """
        if not os.path.exists(file_path):
            logger.error(f"File does not exist: {file_path}")
            raise FileNotFoundError(f"File does not exist: {file_path}")

        try:
            logger.info(f"Attempting to process with docling: {file_path}")

            try:
                # Guard docling with a 60-second alarm; SIGALRM is unavailable on Windows.
                import signal

                def timeout_handler(signum, frame):
                    raise TimeoutError("docling processing timed out (60 seconds)")

                try:
                    signal.signal(signal.SIGALRM, timeout_handler)
                    signal.alarm(60)
                except (AttributeError, ValueError) as se:
                    logger.warning(f"Could not set the signal handler (likely a Windows environment): {se}")

                chunks = self.document_processor.process_pdf(file_path, use_docling=True)

                try:
                    signal.alarm(0)
                except (AttributeError, ValueError):
                    pass

                return chunks

            except TimeoutError as te:
                logger.warning(f"docling processing timed out: {te}")
                logger.info("Falling back to PyPDFLoader.")

                try:
                    return self.document_processor.process_pdf(file_path, use_docling=False)
                except Exception as inner_e:
                    logger.error(f"PyPDFLoader processing error: {inner_e}", exc_info=True)
                    raise DocumentProcessingError(f"PDF loading failed (PyPDFLoader): {str(inner_e)}")

            except Exception as e:
                error_str = str(e)
                if "Invalid code point" in error_str or "RuntimeError" in error_str:
                    logger.warning(f"docling processing error (code point issue): {error_str}")
                else:
                    logger.warning(f"docling processing error: {error_str}")
                logger.info("Falling back to PyPDFLoader.")

                try:
                    return self.document_processor.process_pdf(file_path, use_docling=False)
                except Exception as inner_e:
                    logger.error(f"PyPDFLoader processing error: {inner_e}", exc_info=True)
                    raise DocumentProcessingError(f"PDF loading failed (PyPDFLoader): {str(inner_e)}")

        except DocumentProcessingError:
            # Let explicit processing errors propagate to the caller.
            raise
        except Exception as e:
            logger.error(f"Serious error during PDF processing: {e}", exc_info=True)
            logger.warning(f"Returning an empty chunk list because '{file_path}' failed to process")
            return []
|
|
|
    def _load_file_index(self) -> Dict[str, Dict[str, Any]]:
        """
        Load the file index.

        Returns:
            Mapping of file path -> metadata
        """
        if os.path.exists(self.index_file):
            try:
                with open(self.index_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse the index file as JSON: {e}")
                logger.warning("The index file is corrupted; a new index will be created.")
                return {}
            except Exception as e:
                logger.error(f"Failed to load the index file: {e}")
                return {}
        return {}

    def _save_file_index(self) -> None:
        """
        Save the file index.
        """
        try:
            with open(self.index_file, 'w', encoding='utf-8') as f:
                json.dump(self.file_index, f, ensure_ascii=False, indent=2)
            logger.debug("File index saved")
        except Exception as e:
            logger.error(f"Failed to save the file index: {e}")
            raise IOError(f"Failed to save the file index: {str(e)}")
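
    # Illustrative file_index.json entry (keyed by source path; the fields mirror what
    # _save_chunks() writes; the values below are made up):
    #   "documents/report.pdf": {
    #       "hash": "<md5 of the file contents>",
    #       "chunks_path": "<CACHE_DIRECTORY>/chunks/<md5>.pkl",
    #       "last_processed": 1710000000.0,
    #       "chunks_count": 42,
    #       "file_size": 123456,
    #       "file_name": "report.pdf"
    #   }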
|
|
|
    def _calculate_file_hash(self, file_path: str) -> str:
        """
        Compute a file hash.

        Args:
            file_path: File path

        Returns:
            MD5 hash value
        """
        if not os.path.exists(file_path):
            logger.error(f"Hash computation failed - file does not exist: {file_path}")
            raise FileNotFoundError(f"File does not exist: {file_path}")

        try:
            hasher = hashlib.md5()
            with open(file_path, 'rb') as f:
                buf = f.read(65536)
                while len(buf) > 0:
                    hasher.update(buf)
                    buf = f.read(65536)
            return hasher.hexdigest()
        except Exception as e:
            logger.error(f"Error while computing the file hash: {e}")
            raise IOError(f"File hash computation failed: {str(e)}")
|
|
|
    def _is_file_processed(self, file_path: str) -> bool:
        """
        Check whether a file has already been processed and has not changed since.

        Args:
            file_path: File path

        Returns:
            Whether the file can be served from the cache
        """
        if not os.path.exists(file_path):
            logger.warning(f"File does not exist: {file_path}")
            return False

        if file_path not in self.file_index:
            return False

        try:
            current_hash = self._calculate_file_hash(file_path)

            if self.file_index[file_path]['hash'] != current_hash:
                logger.info(f"File change detected: {file_path}")
                return False

            chunks_path = self.file_index[file_path]['chunks_path']
            if not os.path.exists(chunks_path):
                logger.warning(f"Chunk file does not exist: {chunks_path}")
                return False

            return True
        except Exception as e:
            logger.error(f"Error while checking the file's processing status: {e}")
            return False
|
|
|
    def _get_chunks_path(self, file_hash: str) -> str:
        """
        Build the chunk file path.

        Args:
            file_hash: File hash value

        Returns:
            Chunk file path
        """
        return os.path.join(self.chunks_dir, f"{file_hash}.pkl")

    def _save_chunks(self, file_path: str, chunks: List[Document]) -> None:
        """
        Save chunk data.

        Args:
            file_path: Original file path
            chunks: List of document chunks
        """
        try:
            file_hash = self._calculate_file_hash(file_path)
            chunks_path = self._get_chunks_path(file_hash)

            with open(chunks_path, 'wb') as f:
                pickle.dump(chunks, f)

            # Record processing metadata so unchanged files can be skipped next time.
            self.file_index[file_path] = {
                'hash': file_hash,
                'chunks_path': chunks_path,
                'last_processed': time.time(),
                'chunks_count': len(chunks),
                'file_size': os.path.getsize(file_path),
                'file_name': os.path.basename(file_path)
            }

            self._save_file_index()

            logger.info(f"Chunks saved: {file_path} ({len(chunks)} chunks)")
        except Exception as e:
            logger.error(f"Failed to save chunks: {e}", exc_info=True)
            raise IOError(f"Failed to save chunks: {str(e)}")
|
|
|
    def _load_chunks(self, file_path: str) -> List[Document]:
        """
        Load saved chunk data.

        Args:
            file_path: File path

        Returns:
            List of document chunks
        """
        if file_path not in self.file_index:
            logger.error(f"File is not present in the index: {file_path}")
            raise KeyError(f"File is not present in the index: {file_path}")

        chunks_path = self.file_index[file_path]['chunks_path']

        if not os.path.exists(chunks_path):
            logger.error(f"Chunk file does not exist: {chunks_path}")
            raise FileNotFoundError(f"Chunk file does not exist: {chunks_path}")

        try:
            with open(chunks_path, 'rb') as f:
                chunks = pickle.load(f)

            logger.info(f"Chunks loaded: {file_path} ({len(chunks)} chunks)")
            return chunks
        except pickle.UnpicklingError as e:
            logger.error(f"Failed to deserialize the chunk file: {e}")
            raise IOError(f"Chunk file is corrupted: {str(e)}")
        except Exception as e:
            logger.error(f"Failed to load chunks: {e}", exc_info=True)
            raise IOError(f"Failed to load chunks: {str(e)}")
|
|
|
    def _verify_pdf_directory(self):
        """Validate the PDF directory and check that PDF files exist."""
        try:
            if not os.path.exists(self.pdf_directory):
                try:
                    logger.warning(f"PDF directory does not exist; creating it: {self.pdf_directory}")
                    os.makedirs(self.pdf_directory, exist_ok=True)
                except Exception as e:
                    logger.error(f"Failed to create the PDF directory: {e}")
                    raise

            if not os.path.isdir(self.pdf_directory):
                logger.error(f"PDF path is not a directory: {self.pdf_directory}")
                raise ConfigurationError(f"PDF path is not a directory: {self.pdf_directory}")

            pdf_files = [f for f in os.listdir(self.pdf_directory) if f.lower().endswith('.pdf')]

            if pdf_files:
                logger.info(f"Found {len(pdf_files)} PDF files in the PDF directory: {pdf_files}")
            else:
                # No PDFs here; look in a few likely alternative locations before giving up.
                alternative_paths = [
                    "./documents",
                    "../documents",
                    "documents",
                    os.path.join(os.getcwd(), "documents")
                ]

                found_pdfs = False
                for alt_path in alternative_paths:
                    if os.path.exists(alt_path) and os.path.isdir(alt_path):
                        alt_pdf_files = [f for f in os.listdir(alt_path) if f.lower().endswith('.pdf')]
                        if alt_pdf_files:
                            logger.warning(f"Found PDF files in the alternative path '{alt_path}'. Using this path instead.")
                            self.pdf_directory = os.path.abspath(alt_path)
                            found_pdfs = True
                            break

                if not found_pdfs:
                    logger.warning(f"There are no PDF files in the PDF directory: {self.pdf_directory}")
                    logger.info("Please add PDF files to the directory.")

        except Exception as e:
            logger.error(f"Error while validating the PDF directory: {e}", exc_info=True)
            raise
|
|
|
    def auto_process_documents(self) -> str:
        """
        Automatically process the PDF files in the documents folder.

        Returns:
            Processing result message
        """
        try:
            start_time = time.time()

            # 1. Discover PDF files.
            try:
                pdf_files = []

                logger.info(f"PDF file search path: {self.pdf_directory}")

                if os.path.exists(self.pdf_directory) and os.path.isdir(self.pdf_directory):
                    dir_contents = os.listdir(self.pdf_directory)
                    logger.info(f"Directory contents: {dir_contents}")

                    for filename in os.listdir(self.pdf_directory):
                        if filename.lower().endswith('.pdf'):
                            file_path = os.path.join(self.pdf_directory, filename)
                            if os.path.isfile(file_path):
                                pdf_files.append(file_path)
                                logger.info(f"Found PDF file: {file_path}")

                logger.info(f"All PDF files found: {pdf_files}")

            except FileNotFoundError:
                logger.error(f"Could not find the PDF directory: {self.pdf_directory}")
                return f"Could not find the '{self.pdf_directory}' directory. Please check that it exists."
            except PermissionError:
                logger.error(f"No permission to access the PDF directory: {self.pdf_directory}")
                return f"Cannot access the '{self.pdf_directory}' directory. Please check its permissions."

            if not pdf_files:
                logger.warning(f"There are no PDF files in the '{self.pdf_directory}' folder.")
                return f"There are no PDF files in the '{self.pdf_directory}' folder."

            logger.info(f"Found {len(pdf_files)} PDF files")

            # 2. Process each file, reusing cached chunks where possible.
            new_files = []
            updated_files = []
            cached_files = []
            failed_files = []
            all_chunks = []

            for file_path in pdf_files:
                try:
                    if self._is_file_processed(file_path):
                        # Cached and unchanged: load the stored chunks.
                        try:
                            chunks = self._load_chunks(file_path)
                            all_chunks.extend(chunks)
                            cached_files.append(file_path)
                            self.processed_files.append(os.path.basename(file_path))
                        except Exception as e:
                            logger.error(f"Failed to load cached chunks: {e}")

                            logger.info(f"Reprocessing file because the cache failed: {file_path}")
                            chunks = self._process_pdf_file(file_path)
                            if chunks:
                                self._save_chunks(file_path, chunks)
                                all_chunks.extend(chunks)
                                updated_files.append(file_path)
                                self.processed_files.append(os.path.basename(file_path))
                            else:
                                failed_files.append(file_path)
                    else:
                        # New or changed file: process it from scratch.
                        logger.info(f"Processing: {file_path}")

                        try:
                            chunks = self._process_pdf_file(file_path)

                            if chunks:
                                self._save_chunks(file_path, chunks)

                                all_chunks.extend(chunks)
                                if file_path in self.file_index:
                                    updated_files.append(file_path)
                                else:
                                    new_files.append(file_path)

                                self.processed_files.append(os.path.basename(file_path))
                            else:
                                logger.warning(f"Failed to process '{file_path}': no chunks were extracted")
                                failed_files.append(file_path)
                        except Exception as e:
                            logger.error(f"Error while processing '{file_path}': {e}", exc_info=True)
                            failed_files.append(file_path)
                except Exception as e:
                    logger.error(f"Error in the processing loop for '{file_path}': {e}", exc_info=True)
                    failed_files.append(file_path)

            self.documents = all_chunks

            processing_time = time.time() - start_time
            logger.info(f"Document processing complete: {len(all_chunks)} chunks in {processing_time:.2f} seconds")

            # 3. Build or update the vector index.
            try:
                self._process_vector_index(new_files, updated_files)
            except Exception as e:
                logger.error(f"Vector index processing failed: {e}", exc_info=True)
                return f"Documents were processed, but vector index creation failed: {str(e)}"

            # 4. Initialize a RAG chain, falling back through the alternatives.
            if RAG_CHAIN_AVAILABLE:
                try:
                    logger.info("Attempting initialization with RAGChain.")
                    self.rag_chain = RAGChain(self.vector_store)
                    self.is_initialized = True
                    logger.info("RAG chain initialized successfully")
                except Exception as e:
                    logger.error(f"RAG chain initialization failed: {e}", exc_info=True)

                    try:
                        logger.info("Falling back to FallbackRAGChain...")
                        from fallback_rag_chain import FallbackRAGChain
                        self.rag_chain = FallbackRAGChain(self.vector_store)
                        self.is_initialized = True
                        logger.info("Fallback RAG chain initialized successfully")
                    except Exception as fallback_e:
                        logger.error(f"Fallback RAG chain initialization failed: {fallback_e}", exc_info=True)

                        try:
                            logger.info("Falling back to SimpleRAGChain...")
                            from simple_rag_chain import SimpleRAGChain

                            # Load DeepSeek API settings from config.py, or from environment variables.
                            try:
                                from config import DEEPSEEK_API_KEY, DEEPSEEK_MODEL, DEEPSEEK_ENDPOINT
                                logger.info(f"Loaded DeepSeek API settings from the config file: model={DEEPSEEK_MODEL}")
                            except ImportError:
                                DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
                                DEEPSEEK_MODEL = os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
                                DEEPSEEK_ENDPOINT = os.environ.get("DEEPSEEK_ENDPOINT",
                                                                   "https://api.deepseek.com/v1/chat/completions")
                                logger.info(f"Loaded DeepSeek API settings from environment variables: model={DEEPSEEK_MODEL}")

                            self.rag_chain = SimpleRAGChain(self.vector_store)
                            self.is_initialized = True
                            logger.info("SimpleRAGChain initialized successfully")
                        except Exception as simple_e:
                            logger.error(f"All RAG chain initializations failed: {simple_e}", exc_info=True)
                            return f"Documents and the vector index were processed, but RAG chain initialization failed: {str(simple_e)}"
            else:
                # The default RAG chain module is unavailable; try the alternatives directly.
                try:
                    logger.info("The default RAG chain is unavailable; trying alternative versions...")

                    try:
                        from fallback_rag_chain import FallbackRAGChain
                        self.rag_chain = FallbackRAGChain(self.vector_store)
                        self.is_initialized = True
                        logger.info("Fallback RAG chain initialized successfully")
                    except Exception as fallback_e:
                        logger.error(f"Fallback RAG chain initialization failed: {fallback_e}", exc_info=True)

                        try:
                            from simple_rag_chain import SimpleRAGChain
                            self.rag_chain = SimpleRAGChain(self.vector_store)
                            self.is_initialized = True
                            logger.info("SimpleRAGChain initialized successfully")
                        except Exception as simple_e:
                            logger.error(f"All RAG chain initializations failed: {simple_e}", exc_info=True)
                            return "Documents and the vector index were processed, but RAG chain initialization failed"
                except Exception as e:
                    logger.error(f"RAG chain initialization failed: {e}", exc_info=True)
                    return f"Documents and the vector index were processed, but RAG chain initialization failed: {str(e)}"

            # 5. Summarize the result.
            result_message = f"""Document processing complete!
- Files processed: {len(pdf_files)}
- Cached files: {len(cached_files)}
- New files: {len(new_files)}
- Updated files: {len(updated_files)}
- Failed files: {len(failed_files)}
- Total chunks: {len(all_chunks)}
- Processing time: {processing_time:.2f} seconds
You can now ask questions!"""

            return result_message

        except Exception as e:
            error_message = f"Error during document processing: {str(e)}"
            logger.error(error_message, exc_info=True)
            return error_message
|
|
|
    def _process_vector_index(self, new_files: List[str], updated_files: List[str]) -> None:
        """
        Process the vector index.

        Args:
            new_files: List of newly added files
            updated_files: List of updated files
        """
        # Reuse a persisted index if one exists; otherwise build it from scratch.
        if os.path.exists(self.vector_index_dir) and any(os.listdir(self.vector_index_dir)):
            try:
                logger.info("Loading the saved vector index...")
                self.vector_store.load_local(self.vector_index_dir)

                if self.vector_store.vector_store is not None:
                    # Only re-embed when something actually changed.
                    if new_files or updated_files:
                        logger.info("Updating the vector index...")
                        self.vector_store.add_documents(self.documents)

                    logger.info("Vector index loaded")
                else:
                    logger.warning("The vector index was loaded but is not valid; creating a new one.")
                    self.vector_store.create_or_load(self.documents)

            except Exception as e:
                logger.error(f"Failed to load the vector index; creating a new one: {e}", exc_info=True)
                self.vector_store.create_or_load(self.documents)
        else:
            logger.info("Creating a new vector index...")
            self.vector_store.create_or_load(self.documents)

        # Persist the index so the next run can skip re-embedding.
        if self.vector_store and self.vector_store.vector_store is not None:
            try:
                logger.info(f"Saving the vector index: {self.vector_index_dir}")
                self.vector_store.save_local(self.vector_index_dir)
                logger.info(f"Vector index saved: {self.vector_index_dir}")
            except Exception as e:
                logger.error(f"Failed to save the vector index: {e}", exc_info=True)
                raise VectorStoreError(f"Failed to save the vector index: {str(e)}")
        else:
            logger.warning("The vector index has not been initialized, so it will not be saved.")
|
|
|
    def reset_cache(self) -> str:
        """
        Reset the cache.

        Returns:
            Result message
        """
        try:
            # Delete cached chunk files.
            try:
                for filename in os.listdir(self.chunks_dir):
                    file_path = os.path.join(self.chunks_dir, filename)
                    if os.path.isfile(file_path):
                        os.remove(file_path)
                logger.info("Chunk cache files deleted")
            except Exception as e:
                logger.error(f"Error while deleting chunk files: {e}")
                return f"Error while deleting chunk files: {str(e)}"

            # Reset the file index.
            self.file_index = {}
            try:
                self._save_file_index()
                logger.info("File index reset")
            except Exception as e:
                logger.error(f"Error while resetting the index file: {e}")
                return f"Error while resetting the index file: {str(e)}"

            # Delete persisted vector index files.
            try:
                for filename in os.listdir(self.vector_index_dir):
                    file_path = os.path.join(self.vector_index_dir, filename)
                    if os.path.isfile(file_path):
                        os.remove(file_path)
                logger.info("Vector index files deleted")
            except Exception as e:
                logger.error(f"Error while deleting vector index files: {e}")
                return f"Error while deleting vector index files: {str(e)}"

            self.documents = []
            self.processed_files = []
            self.is_initialized = False

            logger.info("Cache reset complete")
            return "The cache has been reset. All documents will be reprocessed on the next run."
        except Exception as e:
            error_msg = f"Error while resetting the cache: {str(e)}"
            logger.error(error_msg, exc_info=True)
            return error_msg
|
|
|
    def process_query(self, query: str, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
        """
        Process a user query.

        Args:
            query: User question
            chat_history: Chat history

        Returns:
            An empty input string and the updated chat history
        """
        if not query or not query.strip():
            response = "The question is empty. Please enter a question."
            chat_history.append((query, response))
            return "", chat_history

        if not self.is_initialized:
            response = "Document loading has not been initialized. Attempting automatic loading."
            chat_history.append((query, response))

            try:
                init_result = self.auto_process_documents()
                if not self.is_initialized:
                    response = f"Could not load documents. Check that there are PDF files in the 'documents' folder. Initialization result: {init_result}"
                    chat_history.append((query, response))
                    return "", chat_history
            except Exception as e:
                response = f"Error while loading documents: {str(e)}"
                logger.error(f"Automatic loading failed: {e}", exc_info=True)
                chat_history.append((query, response))
                return "", chat_history

        try:
            start_time = time.time()
            logger.info(f"Query processing started: {query}")

            if not hasattr(self, 'rag_chain') or self.rag_chain is None:
                raise RAGInitializationError("The RAG chain has not been initialized")

            # First try the configured RAG chain; on failure, fall back to a direct API call
            # and finally to returning raw search results.
            try:
                response = self.rag_chain.run(query)
                logger.info("Response generated with the default RAG chain")
            except Exception as rag_error:
                logger.error(f"Default RAG chain execution failed: {rag_error}; trying alternatives")

                try:
                    try:
                        from config import DEEPSEEK_API_KEY, DEEPSEEK_MODEL, DEEPSEEK_ENDPOINT
                    except ImportError:
                        DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
                        DEEPSEEK_MODEL = os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
                        DEEPSEEK_ENDPOINT = os.environ.get("DEEPSEEK_ENDPOINT",
                                                           "https://api.deepseek.com/v1/chat/completions")

                    def direct_api_call(query, context, api_key, model_name, endpoint, max_retries=3, timeout=60):
                        """Call the DeepSeek chat completions API directly, with retries."""
                        import requests
                        import time

                        # Keep the context within a manageable size.
                        if len(context) > 6000:
                            context = context[:2500] + "\n...(truncated)...\n" + context[-2500:]

                        prompt = f"""
Please answer the question accurately based on the following information.

Question: {query}

Reference information:
{context}

If the reference information contains the answer, base your answer on it.
If it does not, you may use general knowledge, but start with "The provided documents do not contain this information, but in general...".
Keep the answer accurate and concise, and explain it with evidence from the reference information where possible.
Also mention the sources of the reference information.
"""

                        headers = {
                            "Content-Type": "application/json",
                            "Authorization": f"Bearer {api_key}"
                        }

                        payload = {
                            "model": model_name,
                            "messages": [{"role": "user", "content": prompt}],
                            "temperature": 0.3,
                            "max_tokens": 1000
                        }

                        # Retry with exponential backoff, backing off harder on HTTP 429.
                        retry_delay = 1.0
                        for attempt in range(max_retries):
                            try:
                                logger.info(f"Attempting a direct DeepSeek API call ({attempt + 1}/{max_retries})...")
                                response = requests.post(
                                    endpoint,
                                    headers=headers,
                                    json=payload,
                                    timeout=timeout
                                )

                                if response.status_code == 200:
                                    result = response.json()
                                    content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
                                    logger.info("Direct DeepSeek API call succeeded")
                                    return content
                                else:
                                    logger.warning(f"API error: status code {response.status_code}")

                                    if response.status_code == 429:
                                        retry_delay = min(retry_delay * 3, 15)
                                    else:
                                        retry_delay = min(retry_delay * 2, 10)

                                    if attempt < max_retries - 1:
                                        logger.info(f"Retrying in {retry_delay} seconds...")
                                        time.sleep(retry_delay)
                            except Exception as e:
                                logger.error(f"API call error: {e}")
                                if attempt < max_retries - 1:
                                    logger.info(f"Retrying in {retry_delay} seconds...")
                                    time.sleep(retry_delay)
                                    retry_delay = min(retry_delay * 2, 10)

                        raise Exception("Maximum number of retries exceeded")

                    if self.vector_store and hasattr(self.vector_store, "similarity_search"):
                        logger.info("Performing vector search...")
                        docs = self.vector_store.similarity_search(query, k=5)

                        # Assemble the retrieved chunks into a context block with source info.
                        context_parts = []
                        for i, doc in enumerate(docs, 1):
                            source = doc.metadata.get("source", "unknown source")
                            page = doc.metadata.get("page", "")
                            source_info = f"{source}"
                            if page:
                                source_info += f" (page: {page})"
                            context_parts.append(f"[Reference {i}] - Source: {source_info}\n{doc.page_content}\n")
                        context = "\n".join(context_parts)

                        logger.info("Attempting a direct DeepSeek API call...")
                        response = direct_api_call(
                            query,
                            context,
                            DEEPSEEK_API_KEY,
                            DEEPSEEK_MODEL,
                            DEEPSEEK_ENDPOINT,
                            max_retries=3,
                            timeout=120
                        )
                        logger.info("Direct DeepSeek API call succeeded")
                    else:
                        raise Exception("The vector store has not been initialized")

                except Exception as direct_api_error:
                    logger.error(f"Direct DeepSeek API call failed: {direct_api_error}; returning search results")

                    # Last resort: answer from predefined responses or show the raw search results.
                    try:
                        if self.vector_store and hasattr(self.vector_store, "similarity_search"):
                            docs = self.vector_store.similarity_search(query, k=5)

                            context_parts = []
                            for i, doc in enumerate(docs, 1):
                                source = doc.metadata.get("source", "unknown source")
                                page = doc.metadata.get("page", "")
                                source_info = f"{source}"
                                if page:
                                    source_info += f" (page: {page})"
                                context_parts.append(f"[Reference {i}] - Source: {source_info}\n{doc.page_content}\n")
                            context = "\n".join(context_parts)

                            predefined_answers = {
                                "capital of south korea": "The capital of South Korea is Seoul.",
                                "capital": "The capital of South Korea is Seoul.",
                                "who are you": "I am a RAG-based question answering system. I search documents and find relevant information for you.",
                                "hello": "Hello! How can I help you?",
                                "help": "I search documents to answer your questions. What would you like to know?"
                            }

                            for key, answer in predefined_answers.items():
                                if key in query.lower():
                                    response = answer
                                    logger.info(f"Provided a predefined response: {key}")
                                    break
                            else:
                                response = f"""
There is a problem connecting to the API server, so only search results are shown.

Question: {query}

Retrieved related documents:
{context}

[Note] Because of the API connection issue, an automatic summary is not available. Please try again or ask a different question.
"""
                                logger.info("Showing search results only")
                        else:
                            response = "Both the API connection and vector search failed. Please contact the system administrator."
                    except Exception as fallback_error:
                        logger.error(f"Failed to generate the final fallback response: {fallback_error}")

                        if "Connection error" in str(rag_error) or "timeout" in str(rag_error).lower():
                            response = f"""
There is a problem connecting to the API server. Please try again later.

Question: {query}

[Note] The connection to the DeepSeek API server is currently unavailable, so a response to this question cannot be provided.
"""
                        else:
                            response = f"An error occurred while processing the query: {str(rag_error)}"

            end_time = time.time()
            query_time = end_time - start_time
            logger.info(f"Query processed in {query_time:.2f} seconds")

            chat_history.append((query, response))
            return "", chat_history
        except RAGInitializationError as e:
            error_msg = f"RAG system initialization error: {str(e)}. Check that there are PDF files in the 'documents' folder and try restarting."
            logger.error(f"RAG initialization error while processing the query: {e}", exc_info=True)
            chat_history.append((query, error_msg))
            return "", chat_history
        except (VectorStoreError, DocumentProcessingError) as e:
            error_msg = f"Document processing system error: {str(e)}. Check that the document format is correct."
            logger.error(f"Document/vector store error while processing the query: {e}", exc_info=True)
            chat_history.append((query, error_msg))
            return "", chat_history
        except Exception as e:
            error_msg = f"Error while processing the query: {str(e)}"
            logger.error(f"Unexpected error while processing the query: {e}", exc_info=True)
            chat_history.append((query, error_msg))
            return "", chat_history
|
|
|
|
    def _get_status_message(self) -> str:
        """
        Build the current processing status message.

        Returns:
            Status message
        """
        if not self.processed_files:
            return "No documents have been processed. Click the 'Reload documents' button."

        # Optionally check DeepSeek API health for the status display.
        from config import USE_DEEPSEEK, DEEPSEEK_API_KEY, DEEPSEEK_MODEL

        model_info = ""
        if USE_DEEPSEEK and DEEPSEEK_API_KEY:
            try:
                try:
                    from deepseek_utils import test_deepseek_api

                    from config import DEEPSEEK_ENDPOINT

                    test_result = test_deepseek_api(DEEPSEEK_API_KEY, DEEPSEEK_ENDPOINT, DEEPSEEK_MODEL)

                    if test_result["success"]:
                        model_info = f"\nDeepSeek API status: OK ({DEEPSEEK_MODEL})"
                    else:
                        model_info = f"\nDeepSeek API status: error - {test_result['message']}"

                except ImportError:
                    # deepseek_utils is unavailable; make a minimal test request directly.
                    import requests
                    import json

                    from config import DEEPSEEK_ENDPOINT

                    test_prompt = "Hello, please respond with a short greeting."

                    headers = {
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
                    }

                    payload = {
                        "model": DEEPSEEK_MODEL,
                        "messages": [{"role": "user", "content": test_prompt}],
                        "temperature": 0.7,
                        "max_tokens": 50
                    }

                    try:
                        response = requests.post(
                            DEEPSEEK_ENDPOINT,
                            headers=headers,
                            data=json.dumps(payload),
                            timeout=5
                        )

                        if response.status_code == 200:
                            model_info = f"\nDeepSeek API status: OK ({DEEPSEEK_MODEL})"
                        else:
                            model_info = f"\nDeepSeek API status: error (status code: {response.status_code}, {response.text[:100]})"
                    except Exception as e:
                        model_info = f"\nDeepSeek API status: connection failed ({str(e)[:100]})"
            except Exception as e:
                model_info = f"\nDeepSeek API status check failed: {str(e)[:100]}"

        return f"Processed documents ({len(self.processed_files)}): {', '.join(self.processed_files)}{model_info}"
|
|
|
    def _get_cache_info(self) -> str:
        """
        Build the cache detail message.

        Returns:
            Cache info message
        """
        if not self.file_index:
            return "There are no cached files."

        file_info = ""
        for file_path, info in self.file_index.items():
            file_name = info.get('file_name', os.path.basename(file_path))
            chunks_count = info.get('chunks_count', 0)
            file_size = info.get('file_size', 0)
            last_processed = info.get('last_processed', 0)

            # Human-readable file size.
            if file_size < 1024:
                size_str = f"{file_size} bytes"
            elif file_size < 1024 * 1024:
                size_str = f"{file_size / 1024:.1f} KB"
            else:
                size_str = f"{file_size / (1024 * 1024):.1f} MB"

            # Human-readable timestamp of the last processing run.
            if last_processed:
                from datetime import datetime
                last_time = datetime.fromtimestamp(last_processed).strftime('%Y-%m-%d %H:%M:%S')
            else:
                last_time = "unknown"

            file_info += f"- {file_name}: {chunks_count} chunks, {size_str}, last processed: {last_time}\n"

        return file_info
|
|
|
    def launch_app(self) -> None:
        """
        Launch the Gradio app.
        """
        try:
            import gradio as gr
        except ImportError:
            logger.error("Could not find the Gradio library. Install it with 'pip install gradio'.")
            print("Could not find the Gradio library. Install it with 'pip install gradio'.")
            return

        try:
            with gr.Blocks(title="PDF Document RAG Chatbot") as app:
                gr.Markdown("# PDF Document RAG Chatbot")

                from config import LLM_MODEL, USE_OPENAI, USE_DEEPSEEK

                model_type = "DeepSeek" if USE_DEEPSEEK else "OpenAI" if USE_OPENAI else "Ollama"
                gr.Markdown(f"* LLM model in use: **{model_type} - {LLM_MODEL}**")

                # Escape backslashes so Windows paths render correctly in Markdown.
                actual_pdf_dir = self.pdf_directory.replace('\\', '\\\\') if os.name == 'nt' else self.pdf_directory
                gr.Markdown(f"* PDF document folder: **{actual_pdf_dir}**")
                with gr.Row():
                    with gr.Column(scale=1):
                        status_box = gr.Textbox(
                            label="Document processing status",
                            value=self._get_status_message(),
                            lines=5,
                            interactive=False
                        )

                        refresh_button = gr.Button("Reload documents", variant="primary")
                        reset_button = gr.Button("Reset cache", variant="stop")

                        from config import USE_DEEPSEEK
                        if USE_DEEPSEEK:
                            deepseek_button = gr.Button("Check DeepSeek API status", variant="secondary")

                        status_info = gr.Markdown(
                            value=f"System status: {'initialized' if self.is_initialized else 'not initialized'}"
                        )

                        with gr.Accordion("Cache details", open=False):
                            cache_info = gr.Textbox(
                                label="Cached file info",
                                value=self._get_cache_info(),
                                lines=5,
                                interactive=False
                            )

                    with gr.Column(scale=2):
                        chatbot = gr.Chatbot(
                            label="Conversation",
                            bubble_full_width=False,
                            height=500,
                            show_copy_button=True
                        )

                        with gr.Row():
                            query_box = gr.Textbox(
                                label="Question",
                                placeholder="Ask a question about the processed documents...",
                                lines=2,
                                scale=4
                            )
                            submit_btn = gr.Button("Send", variant="primary", scale=1)

                        clear_chat_button = gr.Button("Clear conversation")

                # Refresh the status widgets after documents are (re)processed.
                def update_ui_after_refresh(result):
                    return (
                        result,
                        self._get_status_message(),
                        f"System status: {'initialized' if self.is_initialized else 'not initialized'}",
                        self._get_cache_info()
                    )

                refresh_button.click(
                    fn=lambda: update_ui_after_refresh(self.auto_process_documents()),
                    inputs=[],
                    outputs=[status_box, status_box, status_info, cache_info]
                )

                def reset_and_process():
                    reset_result = self.reset_cache()
                    process_result = self.auto_process_documents()
                    return update_ui_after_refresh(f"{reset_result}\n\n{process_result}")

                reset_button.click(
                    fn=reset_and_process,
                    inputs=[],
                    outputs=[status_box, status_box, status_info, cache_info]
                )

                if USE_DEEPSEEK:
                    def check_deepseek_api():
                        try:
                            from config import DEEPSEEK_API_KEY, DEEPSEEK_ENDPOINT, DEEPSEEK_MODEL

                            try:
                                from deepseek_utils import test_deepseek_api
                                test_result = test_deepseek_api(DEEPSEEK_API_KEY, DEEPSEEK_ENDPOINT, DEEPSEEK_MODEL)

                                if test_result["success"]:
                                    return f"DeepSeek API status: OK\nModel: {DEEPSEEK_MODEL}\nResponse: {test_result.get('response', '(no response content)')}"
                                else:
                                    return f"DeepSeek API error: {test_result['message']}\nStatus code: {test_result.get('status_code', 'N/A')}"
                            except ImportError:
                                # deepseek_utils is unavailable; make a minimal test request directly.
                                import requests
                                import json

                                test_prompt = "Hello, please respond with a short greeting."

                                headers = {
                                    "Content-Type": "application/json",
                                    "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
                                }

                                payload = {
                                    "model": DEEPSEEK_MODEL,
                                    "messages": [{"role": "user", "content": test_prompt}],
                                    "temperature": 0.7,
                                    "max_tokens": 50
                                }

                                response = requests.post(
                                    DEEPSEEK_ENDPOINT,
                                    headers=headers,
                                    data=json.dumps(payload),
                                    timeout=10
                                )

                                if response.status_code == 200:
                                    response_data = response.json()
                                    message_content = response_data.get("choices", [{}])[0].get("message", {}).get(
                                        "content", "")
                                    return f"DeepSeek API status: OK\nModel: {DEEPSEEK_MODEL}\nResponse: {message_content[:200]}..."
                                else:
                                    return f"DeepSeek API error\nStatus code: {response.status_code}\nResponse: {response.text[:200]}..."

                        except Exception as e:
                            return f"Error while testing the DeepSeek API: {str(e)}"

                    deepseek_button.click(
                        fn=check_deepseek_api,
                        inputs=[],
                        outputs=[status_box]
                    )

                submit_btn.click(
                    fn=self.process_query,
                    inputs=[query_box, chatbot],
                    outputs=[query_box, chatbot]
                )

                query_box.submit(
                    fn=self.process_query,
                    inputs=[query_box, chatbot],
                    outputs=[query_box, chatbot]
                )

                clear_chat_button.click(
                    fn=lambda: [],
                    outputs=[chatbot]
                )

            app.launch(share=False)
        except Exception as e:
            logger.error(f"Error while launching the Gradio app: {e}", exc_info=True)
            print(f"Error while launching the Gradio app: {e}")
|
|
|
|
|
if __name__ == "__main__":
    app = AutoRAGChatApp()
    app.launch_app()
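
# Typical usage (assumed entry point; substitute this module's actual file name):
#   python app.py
# On startup the app processes the PDFs under PDF_DIRECTORY, builds or reloads the
# cached vector index under CACHE_DIRECTORY, and then serves the Gradio UI.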