openfree committed · verified
Commit 58d9d19 · 1 Parent(s): 1be852d

Update app.py

Files changed (1)
  1. app.py +19 -37
app.py CHANGED
@@ -29,6 +29,7 @@ import platform
  import subprocess
  import pytesseract
  from pdf2image import convert_from_path
+ import queue  # added: needed to handle the queue.Empty exception
 
  # -------------------- Added: PDF-to-Markdown conversion imports --------------------
  try:
@@ -47,9 +48,7 @@ except ModuleNotFoundError as e:
  )
  # ---------------------------------------------------------------------------
 
- # --------------------
  # 1) Use the Dynamo suppress_errors option (fall back to eager on errors)
- # --------------------
  torch._dynamo.config.suppress_errors = True
 
  # global variables
@@ -562,7 +561,6 @@ def _truncate_tokens_for_context(input_ids_str: str, desired_input_length: int)
  """
  tokens = input_ids_str.split()
  if len(tokens) > desired_input_length:
- # drop the oldest part and keep only the last desired_input_length tokens
  tokens = tokens[-desired_input_length:]
  return " ".join(tokens)
 
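Note: the helper edited above truncates by whitespace-split tokens, which is only an approximation of real tokenizer tokens. For reference, a self-contained sketch of the function, reassembled from the signature in the hunk header and the body shown above (docstring omitted):

def _truncate_tokens_for_context(input_ids_str, desired_input_length):
    # Keep only the last `desired_input_length` whitespace-separated tokens.
    tokens = input_ids_str.split()
    if len(tokens) > desired_input_length:
        tokens = tokens[-desired_input_length:]
    return " ".join(tokens)

# Example: _truncate_tokens_for_context("a b c d e", 3) == "c d e"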
@@ -579,7 +577,6 @@ def build_prompt(conversation: list) -> str:
  prompt += "User: " + msg["content"] + "\n"
  elif msg["role"] == "assistant":
  prompt += "Assistant: " + msg["content"] + "\n"
- # append so an assistant reply is expected at the end
  prompt += "Assistant: "
  return prompt
 
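Only the tail of build_prompt is visible in this hunk. A sketch of the whole function, assuming it starts from an empty string and handles only user/assistant roles (any system-role handling in the real file is not shown in the diff):

def build_prompt(conversation: list) -> str:
    # Assumed sketch: flatten the chat history into a plain "User:/Assistant:" transcript.
    prompt = ""
    for msg in conversation:
        if msg["role"] == "user":
            prompt += "User: " + msg["content"] + "\n"
        elif msg["role"] == "assistant":
            prompt += "Assistant: " + msg["content"] + "\n"
    # End with an open assistant turn so the model continues with its reply.
    prompt += "Assistant: "
    return prompt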
@@ -607,7 +604,6 @@ def stream_chat(
  # handle file uploads
  file_context = ""
  if uploaded_file and message == "Analyzing the file...":
- # reset the existing in-memory context when a new file is uploaded
  current_file_context = None
  try:
  content, file_type = read_uploaded_file(uploaded_file)
@@ -617,25 +613,21 @@ def stream_chat(
  f"\n\n📄 File analysis result:\n{file_analysis}"
  f"\n\nFile content:\n```\n{content}\n```"
  )
- current_file_context = file_context # store the file context
+ current_file_context = file_context
  message = "Please analyze the uploaded file."
  except Exception as e:
  print(f"File analysis error: {str(e)}")
  file_context = f"\n\n❌ An error occurred while analyzing the file: {str(e)}"
  elif current_file_context:
- # use the previously uploaded file context if one exists
  file_context = current_file_context
 
- # monitor memory usage
  if torch.cuda.is_available():
  print(f"CUDA memory usage: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
- # trim the conversation history if it is too long
  max_history_length = 10
  if len(history) > max_history_length:
  history = history[-max_history_length:]
 
- # look up wiki context
  try:
  relevant_contexts = find_relevant_context(message)
  wiki_context = "\n\nRelated Wikipedia information:\n"
@@ -649,7 +641,6 @@ def stream_chat(
  print(f"Context search error: {str(e)}")
  wiki_context = ""
 
- # build the conversation history
  conversation = []
  for prompt, answer in history:
  conversation.extend([
@@ -657,47 +648,38 @@ def stream_chat(
  {"role": "assistant", "content": answer}
  ])
 
- # build the final prompt
  final_message = file_context + wiki_context + "\nCurrent question: " + message
  conversation.append({"role": "user", "content": final_message})
 
- # use build_prompt (instead of the previous tokenizer.apply_chat_template)
  input_ids_str = build_prompt(conversation)
- # first truncate to at most 6000 tokens (arbitrary value, adjustable as needed)
+ # first truncate to at most 6000 tokens
  input_ids_str = _truncate_tokens_for_context(input_ids_str, 6000)
 
  inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
-
- # given the 8192-token context limit, shrink max_new_tokens when little room remains
  max_context = 8192
  input_length = inputs["input_ids"].shape[1]
  remaining = max_context - input_length
 
- # to guarantee at least about 128 generated tokens,
- # truncate the input further whenever remaining is below 128
  min_generation = 128
+ # if the remaining token budget is smaller than min_generation, truncate the input further
  if remaining < min_generation:
- # cut more to secure enough output tokens
- must_cut = min_generation - remaining # how many more tokens to cut
- new_desired_input_length = max(1, input_length - must_cut)
- print(f"[Warning] Input too long; removing {must_cut} more tokens, input_length={input_length} -> {new_desired_input_length}")
- # rebuild the string and re-tokenize
+ new_desired_input_length = max_context - min_generation
+ if new_desired_input_length < 1:
+ new_desired_input_length = 1
+ print(f"[Warning] Input too long; readjusting input_length={input_length} -> {new_desired_input_length}")
  input_ids_str = _truncate_tokens_for_context(input_ids_str, new_desired_input_length)
  inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
  input_length = inputs["input_ids"].shape[1]
  remaining = max_context - input_length
 
- # finally, ensure (input + max_new_tokens) <= 8192
+ # make sure max_new_tokens cannot go negative
+ if remaining < 1:
+ remaining = 1
  if remaining < max_new_tokens:
  print(f"[Warning] Input has many tokens; adjusting max_new_tokens={max_new_tokens} -> {remaining}.")
  max_new_tokens = remaining
 
- if max_new_tokens < 1:
- # if it is still below 1, generate just one token
- max_new_tokens = 1
-
- if torch.cuda.is_available():
- print(f"CUDA memory after creating input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
+ print(f"CUDA memory after creating input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
  streamer = TextIteratorStreamer(
  tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True
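The rewritten budgeting logic above can be read as one small helper. A sketch under the assumptions that `tokenizer` is the already-loaded Hugging Face tokenizer, CUDA is available, and `_truncate_tokens_for_context` is the whitespace-based helper edited earlier; the name `fit_context_budget` is made up for illustration, not taken from app.py:

def fit_context_budget(input_ids_str, max_new_tokens, max_context=8192, min_generation=128):
    # Tokenize once and see how much of the context window is left for generation.
    inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
    remaining = max_context - inputs["input_ids"].shape[1]

    # Too little room left: cut the prompt down so at least min_generation tokens fit.
    if remaining < min_generation:
        target = max(1, max_context - min_generation)
        input_ids_str = _truncate_tokens_for_context(input_ids_str, target)
        inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
        remaining = max_context - inputs["input_ids"].shape[1]

    # Clamp so max_new_tokens stays positive and never exceeds the remaining budget.
    remaining = max(1, remaining)
    return inputs, min(max_new_tokens, remaining)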
@@ -712,25 +694,26 @@ def stream_chat(
  max_new_tokens=max_new_tokens,
  do_sample=True,
  temperature=temperature,
- eos_token_id=255001, # fix: use an integer instead of a list
+ eos_token_id=255001,
  )
 
- # clear memory before starting generation
  clear_cuda_memory()
 
  thread = Thread(target=model.generate, kwargs=generate_kwargs)
  thread.start()
 
  buffer = ""
- for new_text in streamer:
- buffer += new_text
+ try:
+ for new_text in streamer:
+ buffer += new_text
+ yield "", history + [[message, buffer]]
+ except queue.Empty:
+ print("Streamer timed out. Returning the final response.")
  yield "", history + [[message, buffer]]
 
- # clear memory after generation completes
  clear_cuda_memory()
 
  except Exception as e:
- # print the full exception details to aid debugging
  import traceback
  error_details = traceback.format_exc()
  error_message = f"An error occurred: {str(e)}\n{error_details}"
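The try/except added in this hunk exists because TextIteratorStreamer raises queue.Empty when no token arrives within its timeout. A self-contained sketch of that consumption pattern, assuming `model` and `tokenizer` are already-loaded transformers objects and `inputs` comes from `tokenizer(..., return_tensors="pt")`; the function name `stream_generation` is illustrative only:

import queue
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generation(model, tokenizer, inputs, max_new_tokens):
    # Stream tokens from a background model.generate call.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    Thread(target=model.generate,
           kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)).start()

    buffer = ""
    try:
        # Iterating the streamer raises queue.Empty if the timeout elapses between tokens.
        for new_text in streamer:
            buffer += new_text
            yield buffer
    except queue.Empty:
        # Generation stalled past the timeout; return whatever was produced so far.
        yield buffer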
@@ -829,7 +812,6 @@ def create_demo():
  current_file_context = None
  return [], None, "Start a new conversation..."
 
- # wire up events
  msg.submit(
  stream_chat,
  inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],
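The outputs of this msg.submit call are not visible in the hunk. Since stream_chat yields ("", updated_history) pairs, a plausible wiring is sketched below; `outputs=[msg, chatbot]` is an assumption, not taken from the diff:

msg.submit(
    stream_chat,
    inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],
    outputs=[msg, chatbot],  # assumed: clear the textbox and update the chat history
)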
 