Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -29,6 +29,7 @@ import platform
 import subprocess
 import pytesseract
 from pdf2image import convert_from_path
+import queue  # added: needed to handle the queue.Empty exception
 
 # -------------------- Added: imports for PDF-to-Markdown conversion --------------------
 try:
@@ -47,9 +48,7 @@ except ModuleNotFoundError as e:
     )
 # ---------------------------------------------------------------------------
 
-# --------------------
 # 1) Use the Dynamo suppress_errors option (fall back to eager on errors)
-# --------------------
 torch._dynamo.config.suppress_errors = True
 
 # Global variables
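Note: torch._dynamo.config.suppress_errors = True tells torch.compile to log compilation failures and fall back to eager execution for the offending region instead of raising. A minimal standalone sketch of that behavior (the function and tensor below are illustrative, not taken from app.py):

import torch

torch._dynamo.config.suppress_errors = True  # on a Dynamo failure: warn and run eagerly

def f(x):
    return torch.sin(x) + torch.cos(x)

compiled_f = torch.compile(f)
x = torch.randn(8)
# If Dynamo cannot trace part of f, that part runs eagerly and the call still succeeds.
print(compiled_f(x).shape)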
@@ -562,7 +561,6 @@ def _truncate_tokens_for_context(input_ids_str: str, desired_input_length: int)
     """
     tokens = input_ids_str.split()
     if len(tokens) > desired_input_length:
-        # Drop the oldest part and keep only the last desired_input_length tokens
         tokens = tokens[-desired_input_length:]
     return " ".join(tokens)
 
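Note: this helper truncates on whitespace-separated words, which is only a rough proxy for the tokenizer's actual token count. A small usage sketch of the keep-the-tail behavior (standalone copy of the function for illustration):

def _truncate_tokens_for_context(input_ids_str: str, desired_input_length: int) -> str:
    # Keep only the last desired_input_length whitespace-separated pieces.
    tokens = input_ids_str.split()
    if len(tokens) > desired_input_length:
        tokens = tokens[-desired_input_length:]
    return " ".join(tokens)

print(_truncate_tokens_for_context("a b c d e", 3))  # -> "c d e" (the oldest text is dropped first)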
@@ -579,7 +577,6 @@ def build_prompt(conversation: list) -> str:
             prompt += "User: " + msg["content"] + "\n"
         elif msg["role"] == "assistant":
             prompt += "Assistant: " + msg["content"] + "\n"
-    # Append a trailing "Assistant: " so the model responds as the assistant
     prompt += "Assistant: "
     return prompt
 
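Note: build_prompt flattens the conversation into a plain "User:"/"Assistant:" transcript instead of calling tokenizer.apply_chat_template. A minimal sketch of the resulting format, using a hypothetical helper name and ignoring any roles not shown in this hunk:

def build_prompt_sketch(conversation: list) -> str:
    prompt = ""
    for msg in conversation:
        if msg["role"] == "user":
            prompt += "User: " + msg["content"] + "\n"
        elif msg["role"] == "assistant":
            prompt += "Assistant: " + msg["content"] + "\n"
    # The trailing "Assistant: " cues the model to continue as the assistant.
    prompt += "Assistant: "
    return prompt

print(build_prompt_sketch([
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi there!"},
    {"role": "user", "content": "Summarize the uploaded file."},
]))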
@@ -607,7 +604,6 @@ def stream_chat(
         # Handle file upload
         file_context = ""
         if uploaded_file and message == "Analyzing the file...":
-            # Reset the existing in-memory context when a new file is uploaded
             current_file_context = None
             try:
                 content, file_type = read_uploaded_file(uploaded_file)
@@ -617,25 +613,21 @@ def stream_chat(
                     f"\n\n📄 File analysis result:\n{file_analysis}"
                     f"\n\nFile contents:\n```\n{content}\n```"
                 )
-                current_file_context = file_context
+                current_file_context = file_context
                 message = "Please analyze the uploaded file."
             except Exception as e:
                 print(f"File analysis error: {str(e)}")
                 file_context = f"\n\n⚠️ An error occurred while analyzing the file: {str(e)}"
         elif current_file_context:
-            # Reuse the file context from a previous upload if one exists
             file_context = current_file_context
 
-        # Monitor memory usage
         if torch.cuda.is_available():
             print(f"CUDA memory usage: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
-        # Trim the conversation history if it gets too long
         max_history_length = 10
         if len(history) > max_history_length:
             history = history[-max_history_length:]
 
-        # Look up related wiki context
         try:
             relevant_contexts = find_relevant_context(message)
             wiki_context = "\n\nRelated Wikipedia information:\n"
@@ -649,7 +641,6 @@ def stream_chat(
             print(f"Context search error: {str(e)}")
             wiki_context = ""
 
-        # Build the conversation history
         conversation = []
         for prompt, answer in history:
             conversation.extend([
@@ -657,47 +648,38 @@ def stream_chat(
             {"role": "assistant", "content": answer}
         ])
 
-        # Build the final prompt
         final_message = file_context + wiki_context + "\nCurrent question: " + message
         conversation.append({"role": "user", "content": final_message})
 
-        # Use build_prompt (instead of the previous tokenizer.apply_chat_template)
         input_ids_str = build_prompt(conversation)
-        # First cut the prompt to within 6000 tokens
+        # First trim the prompt to within 6000 tokens
         input_ids_str = _truncate_tokens_for_context(input_ids_str, 6000)
 
         inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
-
-        # Given the 8192 max context, reduce max_new_tokens when little room is left
         max_context = 8192
         input_length = inputs["input_ids"].shape[1]
         remaining = max_context - input_length
 
-        # To keep room for generating at least ~128 tokens,
-        # trim the input further whenever remaining is below 128.
         min_generation = 128
+        # If the remaining token budget is smaller than min_generation, trim the input further.
         if remaining < min_generation:
-
-
-
-            print(f"[Warning] The input is too long
-            # Rebuild the string and run the tokenizer again
+            new_desired_input_length = max_context - min_generation
+            if new_desired_input_length < 1:
+                new_desired_input_length = 1
+            print(f"[Warning] Input too long: re-trimming input_length={input_length} -> {new_desired_input_length}")
             input_ids_str = _truncate_tokens_for_context(input_ids_str, new_desired_input_length)
             inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
             input_length = inputs["input_ids"].shape[1]
             remaining = max_context - input_length
 
-        #
+        # Guard so that max_new_tokens cannot become negative
+        if remaining < 1:
+            remaining = 1
         if remaining < max_new_tokens:
             print(f"[Warning] Too many input tokens; adjusting max_new_tokens={max_new_tokens} -> {remaining}.")
             max_new_tokens = remaining
 
-
-        # If it is still below 1, generate just a single token
-            max_new_tokens = 1
-
-        if torch.cuda.is_available():
-            print(f"CUDA memory after building the input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
+        print(f"CUDA memory after building the input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
         streamer = TextIteratorStreamer(
             tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True
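Note: the new logic keeps input_length + max_new_tokens within max_context = 8192 by first re-trimming the input so that at least min_generation = 128 output tokens remain possible, then clamping max_new_tokens to whatever budget is left. A minimal sketch of just that arithmetic (the function name and return value are illustrative, not part of app.py):

def fit_generation_budget(input_length: int, max_new_tokens: int,
                          max_context: int = 8192, min_generation: int = 128):
    # Returns (desired_input_length, adjusted_max_new_tokens).
    desired_input_length = input_length
    remaining = max_context - input_length
    if remaining < min_generation:
        # Trim the input so at least min_generation tokens can still be generated.
        desired_input_length = max(max_context - min_generation, 1)
        remaining = max_context - desired_input_length
    remaining = max(remaining, 1)  # never let the budget drop below one token
    return desired_input_length, min(max_new_tokens, remaining)

print(fit_generation_budget(input_length=8150, max_new_tokens=1000))  # -> (8064, 128)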
@@ -712,25 +694,26 @@ def stream_chat(
             max_new_tokens=max_new_tokens,
             do_sample=True,
             temperature=temperature,
-            eos_token_id=255001,
+            eos_token_id=255001,
         )
 
-        # Free memory before starting generation
         clear_cuda_memory()
 
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
 
         buffer = ""
-        for new_text in streamer:
-            buffer += new_text
+        try:
+            for new_text in streamer:
+                buffer += new_text
+                yield "", history + [[message, buffer]]
+        except queue.Empty:
+            print("Streamer timed out. Returning the final response.")
             yield "", history + [[message, buffer]]
 
-        # Free memory after generation completes
         clear_cuda_memory()
 
     except Exception as e:
-        # On exception, print the full traceback to aid debugging
        import traceback
        error_details = traceback.format_exc()
        error_message = f"An error occurred: {str(e)}\n{error_details}"
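Note: TextIteratorStreamer feeds decoded text through an internal queue, and iterating over it raises queue.Empty once no new chunk arrives within the configured timeout (10 s above); the added try/except lets the Gradio generator return the partial buffer instead of crashing. A self-contained sketch of that pattern, using a small placeholder model id rather than the one in this Space:

import queue
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "sshleifer/tiny-gpt2"  # placeholder model for illustration
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("User: Hello\nAssistant: ", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, timeout=10.0,
                                skip_prompt=True, skip_special_tokens=True)

thread = Thread(target=model.generate,
                kwargs=dict(**inputs, max_new_tokens=32, streamer=streamer))
thread.start()

buffer = ""
try:
    for new_text in streamer:  # each step blocks for at most `timeout` seconds
        buffer += new_text
except queue.Empty:
    # Generation stalled past the timeout; keep whatever was streamed so far.
    print("Streamer timed out; returning partial output.")
print(buffer)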
@@ -829,7 +812,6 @@ def create_demo():
         current_file_context = None
         return [], None, "Start a new conversation..."
 
-    # Wire up the events
     msg.submit(
         stream_chat,
         inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],