RAGOndevice

Running on Zero

App Files Files Community

openfree commited on Mar 5

Commit

55caecd

verified ·

1 Parent(s): e6c14df

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -214

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ os.environ["TORCH_DYNAMO_DISABLE"] = "1"
 # 2) Triton의 cudagraphs 최적화 비활성화
 os.environ["TRITON_DISABLE_CUDAGRAPHS"] = "1"
-# 3) 경고 무시 설정 (skipping cudagraphs 관련)
 import warnings
 warnings.filterwarnings("ignore", message="skipping cudagraphs due to mutated inputs")
 warnings.filterwarnings("ignore", message="Not enough SMs to use max_autotune_gemm mode")
@@ -15,26 +15,22 @@ import torch
 # TensorFloat32 연산 활성화 (성능 최적화)
 torch.set_float32_matmul_precision('high')
-# TorchInductor cudagraphs 비활성화
 import torch._inductor
 torch._inductor.config.triton.cudagraphs = False
-# Dynamo suppress_errors 옵션 (오류 시 eager로 fallback)
 import torch._dynamo
 torch._dynamo.config.suppress_errors = True
 import gradio as gr
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
-import random
 from datasets import load_dataset
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 import pandas as pd
-from typing import List, Tuple
 import json
 from datetime import datetime
 import pyarrow.parquet as pq
@@ -44,8 +40,8 @@ import platform
 import subprocess
 import pytesseract
 from pdf2image import convert_from_path
-import queue  # queue.Empty 예외 처리를 위해
-import time   # 스트리밍 타이밍을 위해
 # -------------------- PDF to Markdown 변환 관련 import --------------------
 try:
@@ -70,7 +66,6 @@ current_file_context = None
 # 환경 변수 설정
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 MODEL_ID = "CohereForAI/c4ai-command-r7b-12-2024"
-MODELS = os.environ.get("MODELS")
 MODEL_NAME = MODEL_ID.split("/")[-1]
 model = None  # 전역에서 관리
@@ -80,9 +75,9 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 wiki_dataset = load_dataset("lcw99/wikipedia-korean-20240501-1million-qna")
 print("Wikipedia dataset loaded:", wiki_dataset)
-# (2) TF-IDF 벡터라이저 초기화 및 학습
 print("TF-IDF 벡터화 시작...")
-questions = wiki_dataset['train']['question'][:10000]  # 처음 10000개만 사용
 vectorizer = TfidfVectorizer(max_features=1000)
 question_vectors = vectorizer.fit_transform(questions)
 print("TF-IDF 벡터화 완료")
@@ -143,16 +138,12 @@ class ChatHistory:
             print(f"히스토리 로드 실패: {e}")
             self.history = []
-# 전역 ChatHistory 인스턴스
 chat_history = ChatHistory()
 # ------------------------- 위키 문서 검색 (TF-IDF) -------------------------
 def find_relevant_context(query, top_k=3):
-    # 쿼리 벡터화
     query_vector = vectorizer.transform([query])
-    # 코사인 유사도
     similarities = (query_vector * question_vectors.T).toarray()[0]
-    # 유사도 높은 질문 인덱스
     top_indices = np.argsort(similarities)[-top_k:][::-1]
     relevant_contexts = []
@@ -165,15 +156,11 @@ def find_relevant_context(query, top_k=3):
             })
     return relevant_contexts
-# 파일 업로드 시 표시할 초기 메시지
 def init_msg():
     return "파일을 분석하고 있습니다..."
 # -------------------- PDF 파일을 Markdown으로 변환하는 유틸 함수들 --------------------
 def extract_text_from_pdf(reader: PdfReader) -> str:
-    """
-    PyPDF를 사용해 모든 페이지 텍스트를 추출.
-    """
     full_text = ""
     for idx, page in enumerate(reader.pages):
         text = page.extract_text() or ""
@@ -182,16 +169,11 @@ def extract_text_from_pdf(reader: PdfReader) -> str:
     return full_text.strip()
 def convert_pdf_to_markdown(pdf_file: str):
-    """
-    PDF 파일에서 텍스트를 추출하고,
-    이미지가 많고 텍스트가 적으면 OCR 시도
-    """
     try:
         reader = PdfReader(pdf_file)
     except Exception as e:
         return f"PDF 파일을 읽는 중 오류 발생: {e}", None, None
-    # 메타데이터 추출
     raw_meta = reader.metadata
     metadata = {
         "author": raw_meta.author if raw_meta else None,
@@ -201,16 +183,13 @@ def convert_pdf_to_markdown(pdf_file: str):
         "title": raw_meta.title if raw_meta else None,
     }
-    # 텍스트 추출
     full_text = extract_text_from_pdf(reader)
-    # 이미지-텍스트 비율 판단 후 OCR 시도
     image_count = sum(len(page.images) for page in reader.pages)
     if image_count > 0 and len(full_text) < 1000:
         try:
             out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
             ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
-            # OCR된 PDF 다시 읽기
             reader_ocr = PdfReader(out_pdf_file)
             full_text = extract_text_from_pdf(reader_ocr)
         except Exception as e:
@@ -220,7 +199,6 @@ def convert_pdf_to_markdown(pdf_file: str):
 # ------------------------- 파일 분석 함수 -------------------------
 def analyze_file_content(content, file_type):
-    """간단한 구조 분석/요약."""
     if file_type in ['parquet', 'csv']:
         try:
             lines = content.split('\n')
@@ -246,16 +224,16 @@ def analyze_file_content(content, file_type):
     return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, approximately {words} words"
 def read_uploaded_file(file):
-    """
-    업로드된 파일 처리 -> 내용/타입
-    """
     if file is None:
         return "", ""
     try:
         file_ext = os.path.splitext(file.name)[1].lower()
-        # Parquet
         if file_ext == '.parquet':
             try:
                 table = pq.read_table(file.name)
@@ -291,8 +269,7 @@ def read_uploaded_file(file):
             except Exception as e:
                 return f"Error reading Parquet file: {str(e)}", "error"
-        # PDF
-        if file_ext == '.pdf':
             try:
                 markdown_text, metadata, processed_pdf_path = convert_pdf_to_markdown(file.name)
                 if metadata is None:
@@ -302,14 +279,13 @@ def read_uploaded_file(file):
                 content += "## Metadata\n"
                 for k, v in metadata.items():
                     content += f"**{k.capitalize()}**: {v}\n\n"
                 content += "## Extracted Text\n\n"
                 content += markdown_text
                 return content, "pdf"
             except Exception as e:
                 return f"Error reading PDF file: {str(e)}", "error"
-        # CSV
         elif file_ext == '.csv':
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
@@ -342,7 +318,6 @@ def read_uploaded_file(file):
                 f"Unable to read file with supported encodings ({', '.join(encodings)})"
             )
-        # 텍스트 파일
         else:
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
@@ -358,7 +333,7 @@ def read_uploaded_file(file):
                         for keyword in ['def ', 'class ', 'import ', 'function']
                     )
-                    analysis = f"\n📝 File Analysis:\n"
                     if is_code:
                         functions = sum('def ' in line for line in lines)
                         classes = sum('class ' in line for line in lines)
@@ -374,7 +349,6 @@ def read_uploaded_file(file):
                     else:
                         words = len(content.split())
                         chars = len(content)
                         analysis += f"- File Type: Text\n"
                         analysis += f"- Total Lines: {total_lines:,}\n"
                         analysis += f"- Non-empty Lines: {non_empty_lines:,}\n"
@@ -395,162 +369,10 @@ def read_uploaded_file(file):
 # ------------------------- CSS -------------------------
 CSS = """
-/* 3D 스타일 CSS */
-:root {
-    --primary-color: #2196f3;
-    --secondary-color: #1976d2;
-    --background-color: #f0f2f5;
-    --card-background: #ffffff;
-    --text-color: #333333;
-    --shadow-color: rgba(0, 0, 0, 0.1);
-}
-body {
-    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-    min-height: 100vh;
-    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
-}
-.container {
-    transform-style: preserve-3d;
-    perspective: 1000px;
-}
-.chatbot {
-    background: var(--card-background);
-    border-radius: 20px;
-    box-shadow:
-        0 10px 20px var(--shadow-color),
-        0 6px 6px var(--shadow-color);
-    transform: translateZ(0);
-    transition: transform 0.3s ease;
-    backdrop-filter: blur(10px);
-}
-.chatbot:hover {
-    transform: translateZ(10px);
-}
-/* 메시지 입력 영역 */
-.input-area {
-    background: var(--card-background);
-    border-radius: 15px;
-    padding: 15px;
-    margin-top: 20px;
-    box-shadow:
-        0 5px 15px var(--shadow-color),
-        0 3px 3px var(--shadow-color);
-    transform: translateZ(0);
-    transition: all 0.3s ease;
-    display: flex;
-    align-items: center;
-    gap: 10px;
-}
-.input-area:hover {
-    transform: translateZ(5px);
-}
-/* 버튼 스타일 */
-.custom-button {
-    background: linear-gradient(145deg, var(--primary-color), var(--secondary-color));
-    color: white;
-    border: none;
-    border-radius: 10px;
-    padding: 10px 20px;
-    font-weight: 600;
-    cursor: pointer;
-    transform: translateZ(0);
-    transition: all 0.3s ease;
-    box-shadow:
-        0 4px 6px var(--shadow-color),
-        0 1px 3px var(--shadow-color);
-}
-.custom-button:hover {
-    transform: translateZ(5px) translateY(-2px);
-    box-shadow:
-        0 7px 14px var(--shadow-color),
-        0 3px 6px var(--shadow-color);
-}
-/* 파일 업로드 버튼 */
-.file-upload-icon {
-    background: linear-gradient(145deg, #64b5f6, #42a5f5);
-    color: white;
-    border-radius: 8px;
-    font-size: 2em;
-    cursor: pointer;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    height: 70px;
-    width: 70px;
-    transition: all 0.3s ease;
-    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
-}
-.file-upload-icon:hover {
-    transform: translateY(-2px);
-    box-shadow: 0 4px 8px rgba(0,0,0,0.2);
-}
-/* 파일 업로드 버튼 내부 요소 스타일링 */
-.file-upload-icon > .wrap {
-    display: flex !important;
-    align-items: center;
-    justify-content: center;
-    width: 100%;
-    height: 100%;
-}
-.file-upload-icon > .wrap > p {
-    display: none !important;
-}
-.file-upload-icon > .wrap::before {
-    content: "📁";
-    font-size: 2em;
-    display: block;
-}
-/* 메시지 스타일 */
-.message {
-    background: var(--card-background);
-    border-radius: 15px;
-    padding: 15px;
-    margin: 10px 0;
-    box-shadow:
-        0 4px 6px var(--shadow-color),
-        0 1px 3px var(--shadow-color);
-    transform: translateZ(0);
-    transition: all 0.3s ease;
-}
-.message:hover {
-    transform: translateZ(5px);
-}
-.chat-container {
-    height: 600px !important;
-    margin-bottom: 10px;
-}
-.input-container {
-    height: 70px !important;
-    display: flex;
-    align-items: center;
-    gap: 10px;
-    margin-top: 5px;
-}
-.input-textbox {
-    height: 70px !important;
-    border-radius: 8px !important;
-    font-size: 1.1em !important;
-    padding: 10px 15px !important;
-    display: flex !important;
-    align-items: flex-start !important;
-}
-.input-textbox textarea {
-    padding-top: 5px !important;
-}
-.send-button {
-    height: 70px !important;
-    min-width: 70px !important;
-    font-size: 1.1em !important;
-}
-/* 설정 패널 기본 스타일 */
-.settings-panel {
-    padding: 20px;
-    margin-top: 20px;
-}
 """
 def clear_cuda_memory():
-    """CUDA 캐시 정리."""
     if hasattr(torch.cuda, 'empty_cache'):
         with torch.cuda.device('cuda'):
             torch.cuda.empty_cache()
@@ -566,13 +388,14 @@ def load_model():
             device_map="auto",
             low_cpu_mem_usage=True,
         )
         return loaded_model
     except Exception as e:
         print(f"모델 로드 오류: {str(e)}")
         raise
 def build_prompt(conversation: list) -> str:
-    """대화 내역을 단순 텍스트 프롬프트로 변환."""
     prompt = ""
     for msg in conversation:
         if msg["role"] == "user":
@@ -597,14 +420,13 @@ def stream_chat(
     global model, current_file_context
     try:
-        # 모델 미로드시 로딩
         if model is None:
             model = load_model()
         print(f'[User input] message: {message}')
         print(f'[History] {history}')
-        # (1) 파일 업로드 처리
         file_context = ""
         if uploaded_file and message == "파일을 분석하고 있습니다...":
             current_file_context = None
@@ -624,7 +446,7 @@ def stream_chat(
         elif current_file_context:
             file_context = current_file_context
-        # (2) TF-IDF 기반 관련 문서 탐색
         wiki_context = ""
         try:
             relevant_contexts = find_relevant_context(message)
@@ -639,7 +461,7 @@ def stream_chat(
         except Exception as e:
             print(f"[컨텍스트 검색 오류] {str(e)}")
-        # (3) 대화 이력 구성
         max_history_length = 10
         if len(history) > max_history_length:
             history = history[-max_history_length:]
@@ -651,7 +473,7 @@ def stream_chat(
                 {"role": "assistant", "content": answer}
             ])
-        # (4) 최종 메시지 결정
         final_message = message
         if file_context:
             final_message = file_context + "\n현재 질문: " + message
@@ -662,13 +484,13 @@ def stream_chat(
         conversation.append({"role": "user", "content": final_message})
-        # (5) 토큰화 및 프롬프트 구축
         input_ids_str = build_prompt(conversation)
         max_context = 8192
         tokenized_input = tokenizer(input_ids_str, return_tensors="pt")
         input_length = tokenized_input["input_ids"].shape[1]
-        # (6) 컨텍스트가 너무 길면 앞부분 토큰 자르기
         if input_length > max_context - max_new_tokens:
             print(f"[경고] 입력이 너무 깁니다: {input_length} 토큰 -> 잘라냄.")
             min_generation = min(256, max_new_tokens)
@@ -683,18 +505,18 @@ def stream_chat(
         print(f"[토큰 길이] {input_length}")
         inputs = tokenized_input.to("cuda")
-        # 남은 토큰 수로 max_new_tokens 조정
         remaining = max_context - input_length
         if remaining < max_new_tokens:
             print(f"[max_new_tokens 조정] {max_new_tokens} -> {remaining}")
             max_new_tokens = remaining
-        # 스트리머 설정
         streamer = TextIteratorStreamer(
             tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
         )
-        # (7) 생성 파라미터
         generate_kwargs = dict(
             **inputs,
             streamer=streamer,
@@ -704,18 +526,18 @@ def stream_chat(
             max_new_tokens=max_new_tokens,
             do_sample=True,
             temperature=temperature,
-            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
             eos_token_id=tokenizer.eos_token_id,
-            use_cache=True
         )
         clear_cuda_memory()
-        # (8) 별도 스레드에서 생성
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
-        # (9) 스트리밍 응답
         buffer = ""
         partial_message = ""
         last_yield_time = time.time()
@@ -725,23 +547,23 @@ def stream_chat(
                 buffer += new_text
                 partial_message += new_text
-                # 일정 시간 또는 버퍼 길이 기준으로 yield
                 current_time = time.time()
                 if (current_time - last_yield_time > 0.1) or (len(partial_message) > 20):
                     yield "", history + [[message, buffer]]
                     partial_message = ""
                     last_yield_time = current_time
-            # 마지막 완성된 응답
             if buffer:
                 yield "", history + [[message, buffer]]
-            # 대화 내용 저장
             chat_history.add_conversation(message, buffer)
         except Exception as e:
             print(f"[스트리밍 중 오류] {str(e)}")
-            if not buffer:  # buffer가 비어있다면 오류메시지 대화창 표시
                 buffer = f"응답 생성 중 오류 발생: {str(e)}"
             yield "", history + [[message, buffer]]
@@ -835,7 +657,7 @@ def create_demo():
                         label="Repetition Penalty 🔄"
                     )
-        # 예시
         gr.Examples(
             examples=[
                 ["Please analyze this code and suggest improvements:\ndef fibonacci(n):\n    if n <= 1: return n\n    return fibonacci(n-1) + fibonacci(n-2)"],
@@ -852,7 +674,7 @@ def create_demo():
             current_file_context = None
             return [], None, "Start a new conversation..."
-        # 메시지 전송
         msg.submit(
             stream_chat,
             inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],
@@ -889,7 +711,6 @@ def create_demo():
         return demo
-# 메인 실행
 if __name__ == "__main__":
     demo = create_demo()
     demo.launch()

 # 2) Triton의 cudagraphs 최적화 비활성화
 os.environ["TRITON_DISABLE_CUDAGRAPHS"] = "1"
+# (옵션) 경고 무시 설정
 import warnings
 warnings.filterwarnings("ignore", message="skipping cudagraphs due to mutated inputs")
 warnings.filterwarnings("ignore", message="Not enough SMs to use max_autotune_gemm mode")
 # TensorFloat32 연산 활성화 (성능 최적화)
 torch.set_float32_matmul_precision('high')
 import torch._inductor
 torch._inductor.config.triton.cudagraphs = False
 import torch._dynamo
+# suppress_errors (오류 시 eager로 fallback)
 torch._dynamo.config.suppress_errors = True
 import gradio as gr
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 from datasets import load_dataset
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 import pandas as pd
 import json
 from datetime import datetime
 import pyarrow.parquet as pq
 import subprocess
 import pytesseract
 from pdf2image import convert_from_path
+import queue
+import time
 # -------------------- PDF to Markdown 변환 관련 import --------------------
 try:
 # 환경 변수 설정
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 MODEL_ID = "CohereForAI/c4ai-command-r7b-12-2024"
 MODEL_NAME = MODEL_ID.split("/")[-1]
 model = None  # 전역에서 관리
 wiki_dataset = load_dataset("lcw99/wikipedia-korean-20240501-1million-qna")
 print("Wikipedia dataset loaded:", wiki_dataset)
+# (2) TF-IDF 벡터라이저 초기화 및 학습 (일부만 사용)
 print("TF-IDF 벡터화 시작...")
+questions = wiki_dataset['train']['question'][:10000]
 vectorizer = TfidfVectorizer(max_features=1000)
 question_vectors = vectorizer.fit_transform(questions)
 print("TF-IDF 벡터화 완료")
             print(f"히스토리 로드 실패: {e}")
             self.history = []
 chat_history = ChatHistory()
 # ------------------------- 위키 문서 검색 (TF-IDF) -------------------------
 def find_relevant_context(query, top_k=3):
     query_vector = vectorizer.transform([query])
     similarities = (query_vector * question_vectors.T).toarray()[0]
     top_indices = np.argsort(similarities)[-top_k:][::-1]
     relevant_contexts = []
             })
     return relevant_contexts
 def init_msg():
     return "파일을 분석하고 있습니다..."
 # -------------------- PDF 파일을 Markdown으로 변환하는 유틸 함수들 --------------------
 def extract_text_from_pdf(reader: PdfReader) -> str:
     full_text = ""
     for idx, page in enumerate(reader.pages):
         text = page.extract_text() or ""
     return full_text.strip()
 def convert_pdf_to_markdown(pdf_file: str):
     try:
         reader = PdfReader(pdf_file)
     except Exception as e:
         return f"PDF 파일을 읽는 중 오류 발생: {e}", None, None
     raw_meta = reader.metadata
     metadata = {
         "author": raw_meta.author if raw_meta else None,
         "title": raw_meta.title if raw_meta else None,
     }
     full_text = extract_text_from_pdf(reader)
     image_count = sum(len(page.images) for page in reader.pages)
     if image_count > 0 and len(full_text) < 1000:
         try:
             out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
             ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
             reader_ocr = PdfReader(out_pdf_file)
             full_text = extract_text_from_pdf(reader_ocr)
         except Exception as e:
 # ------------------------- 파일 분석 함수 -------------------------
 def analyze_file_content(content, file_type):
     if file_type in ['parquet', 'csv']:
         try:
             lines = content.split('\n')
     return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, approximately {words} words"
 def read_uploaded_file(file):
     if file is None:
         return "", ""
+    import pyarrow.parquet as pq
+    import pandas as pd
+    from tabulate import tabulate
     try:
         file_ext = os.path.splitext(file.name)[1].lower()
         if file_ext == '.parquet':
             try:
                 table = pq.read_table(file.name)
             except Exception as e:
                 return f"Error reading Parquet file: {str(e)}", "error"
+        elif file_ext == '.pdf':
             try:
                 markdown_text, metadata, processed_pdf_path = convert_pdf_to_markdown(file.name)
                 if metadata is None:
                 content += "## Metadata\n"
                 for k, v in metadata.items():
                     content += f"**{k.capitalize()}**: {v}\n\n"
                 content += "## Extracted Text\n\n"
                 content += markdown_text
                 return content, "pdf"
             except Exception as e:
                 return f"Error reading PDF file: {str(e)}", "error"
         elif file_ext == '.csv':
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                 f"Unable to read file with supported encodings ({', '.join(encodings)})"
             )
         else:
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                         for keyword in ['def ', 'class ', 'import ', 'function']
                     )
+                    analysis = "\n📝 File Analysis:\n"
                     if is_code:
                         functions = sum('def ' in line for line in lines)
                         classes = sum('class ' in line for line in lines)
                     else:
                         words = len(content.split())
                         chars = len(content)
                         analysis += f"- File Type: Text\n"
                         analysis += f"- Total Lines: {total_lines:,}\n"
                         analysis += f"- Non-empty Lines: {non_empty_lines:,}\n"
 # ------------------------- CSS -------------------------
 CSS = """
+/* (생략: 동일) */
 """
 def clear_cuda_memory():
     if hasattr(torch.cuda, 'empty_cache'):
         with torch.cuda.device('cuda'):
             torch.cuda.empty_cache()
             device_map="auto",
             low_cpu_mem_usage=True,
         )
+        # (중요) 모델 기본 config에서도 캐시 사용 꺼둘 수 있음
+        loaded_model.config.use_cache = False
         return loaded_model
     except Exception as e:
         print(f"모델 로드 오류: {str(e)}")
         raise
 def build_prompt(conversation: list) -> str:
     prompt = ""
     for msg in conversation:
         if msg["role"] == "user":
     global model, current_file_context
     try:
         if model is None:
             model = load_model()
         print(f'[User input] message: {message}')
         print(f'[History] {history}')
+        # 1) 파일 업로드 처리
         file_context = ""
         if uploaded_file and message == "파일을 분석하고 있습니다...":
             current_file_context = None
         elif current_file_context:
             file_context = current_file_context
+        # 2) 위키 컨텍스트
         wiki_context = ""
         try:
             relevant_contexts = find_relevant_context(message)
         except Exception as e:
             print(f"[컨텍스트 검색 오류] {str(e)}")
+        # 3) 대화 이력 축소
         max_history_length = 10
         if len(history) > max_history_length:
             history = history[-max_history_length:]
                 {"role": "assistant", "content": answer}
             ])
+        # 4) 최종 메시지
         final_message = message
         if file_context:
             final_message = file_context + "\n현재 질문: " + message
         conversation.append({"role": "user", "content": final_message})
+        # 5) 토큰화
         input_ids_str = build_prompt(conversation)
         max_context = 8192
         tokenized_input = tokenizer(input_ids_str, return_tensors="pt")
         input_length = tokenized_input["input_ids"].shape[1]
+        # 6) 컨텍스트 초과 시 자르기
         if input_length > max_context - max_new_tokens:
             print(f"[경고] 입력이 너무 깁니다: {input_length} 토큰 -> 잘라냄.")
             min_generation = min(256, max_new_tokens)
         print(f"[토큰 길이] {input_length}")
         inputs = tokenized_input.to("cuda")
+        # 7) 남은 토큰 수로 max_new_tokens 보정
         remaining = max_context - input_length
         if remaining < max_new_tokens:
             print(f"[max_new_tokens 조정] {max_new_tokens} -> {remaining}")
             max_new_tokens = remaining
+        # 8) TextIteratorStreamer 설정
         streamer = TextIteratorStreamer(
             tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
         )
+        # ★ use_cache=False 설정 (중요) ★
         generate_kwargs = dict(
             **inputs,
             streamer=streamer,
             max_new_tokens=max_new_tokens,
             do_sample=True,
             temperature=temperature,
+            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
             eos_token_id=tokenizer.eos_token_id,
+            use_cache=False,   # ← 여기가 핵심!
         )
         clear_cuda_memory()
+        # 9) 별도 스레드로 모델 호출
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
+        # 10) 스트리밍
         buffer = ""
         partial_message = ""
         last_yield_time = time.time()
                 buffer += new_text
                 partial_message += new_text
+                # 타이밍 or 일정 길이마다 UI 업데이트
                 current_time = time.time()
                 if (current_time - last_yield_time > 0.1) or (len(partial_message) > 20):
                     yield "", history + [[message, buffer]]
                     partial_message = ""
                     last_yield_time = current_time
+            # 마지막 출력
             if buffer:
                 yield "", history + [[message, buffer]]
+            # 대화 히스토리 저장
             chat_history.add_conversation(message, buffer)
         except Exception as e:
             print(f"[스트리밍 중 오류] {str(e)}")
+            if not buffer:
                 buffer = f"응답 생성 중 오류 발생: {str(e)}"
             yield "", history + [[message, buffer]]
                         label="Repetition Penalty 🔄"
                     )
+        # 예시 입력
         gr.Examples(
             examples=[
                 ["Please analyze this code and suggest improvements:\ndef fibonacci(n):\n    if n <= 1: return n\n    return fibonacci(n-1) + fibonacci(n-2)"],
             current_file_context = None
             return [], None, "Start a new conversation..."
+        # 메시지 전송(Submit)
         msg.submit(
             stream_chat,
             inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],
         return demo
 if __name__ == "__main__":
     demo = create_demo()
     demo.launch()