Spaces: Running on Zero
Update app.py
Browse files
app.py CHANGED
@@ -706,12 +706,13 @@ def stream_chat(
 
     print(f"CUDA memory after creating input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
-
+
+    try:
+        # Use a longer timeout when initializing the streamer
     streamer = TextIteratorStreamer(
-        tokenizer, timeout=
+        tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
     )
-
-    # Set generation parameters
+
     generate_kwargs = dict(
         **inputs,
         streamer=streamer,
@@ -721,9 +722,13 @@ def stream_chat(
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
-
+        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        use_cache=True
     )
 
+
+
     # Clean up memory
     clear_cuda_memory()
 
@@ -731,35 +736,26 @@ def stream_chat(
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
     thread.start()
 
-    #
+    # Harden exception handling while streaming
     buffer = ""
-    partial_message = ""
-    last_yield_time = time.time()
-
     try:
         for new_text in streamer:
-
-
-
-            # Update the result at fixed intervals or whenever enough text has accumulated
-            current_time = time.time()
-            if current_time - last_yield_time > 0.1 or len(partial_message) > 20:
+            try:
+                buffer += new_text
                 yield "", history + [[message, buffer]]
-
-
-
-
-
-
-
-        # Save to the conversation history
-        chat_history.add_conversation(message, buffer)
-
-    except Exception as e:
-        print(f"Error during streaming: {str(e)}")
-        if not buffer:  # Show an error message if the buffer is empty
-            buffer = f"An error occurred while generating the response: {str(e)}"
+            except Exception as inner_e:
+                print(f"Error while processing an individual token: {str(inner_e)}")
+                continue
+    except Exception as stream_e:
+        print(f"Streaming error: {str(stream_e)}")
+        if not buffer:
+            buffer = "An error occurred while generating the response."
         yield "", history + [[message, buffer]]
+    except Exception as outer_e:
+        print(f"Error in the overall generation process: {str(outer_e)}")
+        yield "", history + [[message, "Sorry, a response could not be generated."]]
+
+    ]
 
     # Wait for the thread to finish if it is still running
     if thread.is_alive():
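
For reference, below is a minimal, self-contained sketch of the threaded streaming pattern the updated code relies on: a TextIteratorStreamer with a long timeout and skip_prompt/skip_special_tokens, a pad_token_id fallback to eos_token_id, and model.generate running in a worker thread while the main loop consumes tokens. The model name "gpt2", the hard-coded prompt, and the print() call standing in for the Gradio yield are illustrative placeholders, not part of the Space's actual code.

# Sketch only: the real app streams into a Gradio chat UI and uses its own model.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "gpt2"  # placeholder model for illustration
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Hello, how are you?", return_tensors="pt")  # placeholder prompt

# A longer timeout keeps a slow first token from raising queue.Empty;
# skip_prompt/skip_special_tokens keep the echoed prompt and special
# tokens out of the streamed text.
streamer = TextIteratorStreamer(
    tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
)

generate_kwargs = dict(
    **inputs,
    streamer=streamer,
    max_new_tokens=64,
    do_sample=True,
    temperature=0.7,
    # Fall back to eos_token_id when the tokenizer defines no pad token.
    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

# generate() blocks, so it runs in a worker thread while the main
# thread drains the streamer as tokens arrive.
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

buffer = ""
try:
    for new_text in streamer:
        buffer += new_text
        print(buffer)  # in the Space, this is where the chat history is yielded
except Exception as e:
    print(f"Streaming error: {e}")
finally:
    thread.join()

The join() in the finally block plays the same role as the `if thread.is_alive():` check at the end of the diff: it makes sure the background generation thread is cleaned up once streaming stops.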