openfree committed · verified
Commit 58d9d19 · 1 Parent(s): 1be852d

Update app.py

Files changed (1)
  1. app.py +19 -37
app.py CHANGED
@@ -29,6 +29,7 @@ import platform
  import subprocess
  import pytesseract
  from pdf2image import convert_from_path
+ import queue  # added: needed to handle the queue.Empty exception
 
  # -------------------- Added: PDF-to-Markdown conversion imports --------------------
  try:
@@ -47,9 +48,7 @@ except ModuleNotFoundError as e:
  )
  # ---------------------------------------------------------------------------
 
- # --------------------
  # 1) Use the Dynamo suppress_errors option (fall back to eager on errors)
- # --------------------
  torch._dynamo.config.suppress_errors = True
 
  # global variables
@@ -562,7 +561,6 @@ def _truncate_tokens_for_context(input_ids_str: str, desired_input_length: int)
  """
  tokens = input_ids_str.split()
  if len(tokens) > desired_input_length:
- # drop the oldest part and keep only the last desired_input_length tokens
  tokens = tokens[-desired_input_length:]
  return " ".join(tokens)
 
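Note: the helper edited above truncates by whitespace-split tokens, which is only an approximation of real tokenizer tokens. For reference, a self-contained sketch of the function, reassembled from the signature in the hunk header and the body shown above (docstring omitted):

def _truncate_tokens_for_context(input_ids_str, desired_input_length):
    # Keep only the last `desired_input_length` whitespace-separated tokens.
    tokens = input_ids_str.split()
    if len(tokens) > desired_input_length:
        tokens = tokens[-desired_input_length:]
    return " ".join(tokens)

# Example: _truncate_tokens_for_context("a b c d e", 3) == "c d e"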
@@ -579,7 +577,6 @@ def build_prompt(conversation: list) -> str:
  prompt += "User: " + msg["content"] + "\n"
  elif msg["role"] == "assistant":
  prompt += "Assistant: " + msg["content"] + "\n"
- # append so an assistant reply is expected at the end
  prompt += "Assistant: "
  return prompt
 
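Only the tail of build_prompt is visible in this hunk. A sketch of the whole function, assuming it starts from an empty string and handles only user/assistant roles (any system-role handling in the real file is not shown in the diff):

def build_prompt(conversation: list) -> str:
    # Assumed sketch: flatten the chat history into a plain "User:/Assistant:" transcript.
    prompt = ""
    for msg in conversation:
        if msg["role"] == "user":
            prompt += "User: " + msg["content"] + "\n"
        elif msg["role"] == "assistant":
            prompt += "Assistant: " + msg["content"] + "\n"
    # End with an open assistant turn so the model continues with its reply.
    prompt += "Assistant: "
    return prompt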
@@ -607,7 +604,6 @@ def stream_chat(
  # handle file uploads
  file_context = ""
  if uploaded_file and message == "Analyzing the file...":
- # reset the existing in-memory context when a new file is uploaded
  current_file_context = None
  try:
  content, file_type = read_uploaded_file(uploaded_file)
@@ -617,25 +613,21 @@ def stream_chat(
  f"\n\n📄 File analysis result:\n{file_analysis}"
  f"\n\nFile content:\n```\n{content}\n```"
  )
- current_file_context = file_context # store the file context
+ current_file_context = file_context
  message = "Please analyze the uploaded file."
  except Exception as e:
  print(f"File analysis error: {str(e)}")
  file_context = f"\n\n❌ An error occurred while analyzing the file: {str(e)}"
  elif current_file_context:
- # use the previously uploaded file context if one exists
  file_context = current_file_context
 
- # monitor memory usage
  if torch.cuda.is_available():
  print(f"CUDA memory usage: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
- # trim the conversation history if it is too long
  max_history_length = 10
  if len(history) > max_history_length:
  history = history[-max_history_length:]
 
- # look up wiki context
  try:
  relevant_contexts = find_relevant_context(message)
  wiki_context = "\n\nRelated Wikipedia information:\n"
@@ -649,7 +641,6 @@ def stream_chat(
  print(f"Context search error: {str(e)}")
  wiki_context = ""
 
- # build the conversation history
  conversation = []
  for prompt, answer in history:
  conversation.extend([
@@ -657,47 +648,38 @@ def stream_chat(
  {"role": "assistant", "content": answer}
  ])
 
- # build the final prompt
  final_message = file_context + wiki_context + "\nCurrent question: " + message
  conversation.append({"role": "user", "content": final_message})
 
- # use build_prompt (instead of the previous tokenizer.apply_chat_template)
  input_ids_str = build_prompt(conversation)
- # first truncate to at most 6000 tokens (arbitrary value, adjustable as needed)
+ # first truncate to at most 6000 tokens
  input_ids_str = _truncate_tokens_for_context(input_ids_str, 6000)
 
  inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
-
- # given the 8192-token context limit, shrink max_new_tokens when little room remains
  max_context = 8192
  input_length = inputs["input_ids"].shape[1]
  remaining = max_context - input_length
 
- # to guarantee at least about 128 generated tokens,
- # truncate the input further whenever remaining is below 128
  min_generation = 128
+ # if the remaining token budget is smaller than min_generation, truncate the input further
  if remaining < min_generation:
- # cut more to secure enough output tokens
- must_cut = min_generation - remaining # how many more tokens to cut
- new_desired_input_length = max(1, input_length - must_cut)
- print(f"[Warning] Input too long; removing {must_cut} more tokens, input_length={input_length} -> {new_desired_input_length}")
- # rebuild the string and re-tokenize
+ new_desired_input_length = max_context - min_generation
+ if new_desired_input_length < 1:
+ new_desired_input_length = 1
+ print(f"[Warning] Input too long; readjusting input_length={input_length} -> {new_desired_input_length}")
  input_ids_str = _truncate_tokens_for_context(input_ids_str, new_desired_input_length)
  inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
  input_length = inputs["input_ids"].shape[1]
  remaining = max_context - input_length
 
- # finally, ensure (input + max_new_tokens) <= 8192
+ # make sure max_new_tokens cannot go negative
+ if remaining < 1:
+ remaining = 1
  if remaining < max_new_tokens:
  print(f"[Warning] Input has many tokens; adjusting max_new_tokens={max_new_tokens} -> {remaining}.")
  max_new_tokens = remaining
 
- if max_new_tokens < 1:
- # if it is still below 1, generate just one token
- max_new_tokens = 1
-
- if torch.cuda.is_available():
- print(f"CUDA memory after creating input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
+ print(f"CUDA memory after creating input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
  streamer = TextIteratorStreamer(
  tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True
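The rewritten budgeting logic above can be read as one small helper. A sketch under the assumptions that `tokenizer` is the already-loaded Hugging Face tokenizer, CUDA is available, and `_truncate_tokens_for_context` is the whitespace-based helper edited earlier; the name `fit_context_budget` is made up for illustration, not taken from app.py:

def fit_context_budget(input_ids_str, max_new_tokens, max_context=8192, min_generation=128):
    # Tokenize once and see how much of the context window is left for generation.
    inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
    remaining = max_context - inputs["input_ids"].shape[1]

    # Too little room left: cut the prompt down so at least min_generation tokens fit.
    if remaining < min_generation:
        target = max(1, max_context - min_generation)
        input_ids_str = _truncate_tokens_for_context(input_ids_str, target)
        inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
        remaining = max_context - inputs["input_ids"].shape[1]

    # Clamp so max_new_tokens stays positive and never exceeds the remaining budget.
    remaining = max(1, remaining)
    return inputs, min(max_new_tokens, remaining)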
@@ -712,25 +694,26 @@ def stream_chat(
  max_new_tokens=max_new_tokens,
  do_sample=True,
  temperature=temperature,
- eos_token_id=255001, # fix: use an integer instead of a list
+ eos_token_id=255001,
  )
 
- # clear memory before starting generation
  clear_cuda_memory()
 
  thread = Thread(target=model.generate, kwargs=generate_kwargs)
  thread.start()
 
  buffer = ""
- for new_text in streamer:
- buffer += new_text
+ try:
+ for new_text in streamer:
+ buffer += new_text
+ yield "", history + [[message, buffer]]
+ except queue.Empty:
+ print("Streamer timed out. Returning the final response.")
  yield "", history + [[message, buffer]]
 
- # clear memory after generation completes
  clear_cuda_memory()
 
  except Exception as e:
- # print the full exception details to aid debugging
  import traceback
  error_details = traceback.format_exc()
  error_message = f"An error occurred: {str(e)}\n{error_details}"
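The try/except added in this hunk exists because TextIteratorStreamer raises queue.Empty when no token arrives within its timeout. A self-contained sketch of that consumption pattern, assuming `model` and `tokenizer` are already-loaded transformers objects and `inputs` comes from `tokenizer(..., return_tensors="pt")`; the function name `stream_generation` is illustrative only:

import queue
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generation(model, tokenizer, inputs, max_new_tokens):
    # Stream tokens from a background model.generate call.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    Thread(target=model.generate,
           kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)).start()

    buffer = ""
    try:
        # Iterating the streamer raises queue.Empty if the timeout elapses between tokens.
        for new_text in streamer:
            buffer += new_text
            yield buffer
    except queue.Empty:
        # Generation stalled past the timeout; return whatever was produced so far.
        yield buffer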
@@ -829,7 +812,6 @@ def create_demo():
  current_file_context = None
  return [], None, "Start a new conversation..."
 
- # wire up events
  msg.submit(
  stream_chat,
  inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],
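The outputs of this msg.submit call are not visible in the hunk. Since stream_chat yields ("", updated_history) pairs, a plausible wiring is sketched below; `outputs=[msg, chatbot]` is an assumption, not taken from the diff:

msg.submit(
    stream_chat,
    inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],
    outputs=[msg, chatbot],  # assumed: clear the textbox and update the chat history
)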
 