Gemma-3-R1984-4B

Running on Zero

App Files Files Community

seawolf2357 committed on Mar 16

Commit

5718b5c

verified ·

1 Parent(s): 7a2b5d0

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -102

app.py CHANGED Viewed

@@ -13,15 +13,16 @@ import torch
 from loguru import logger
 from PIL import Image
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 import pandas as pd
 import PyPDF2
-##################################################
-# 기본 설정
-##################################################
-MAX_CONTENT_CHARS = 8000  # 텍스트로 전달 시 최대 글자 수
-model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
 processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
 model = Gemma3ForConditionalGeneration.from_pretrained(
     model_id,
@@ -29,17 +30,20 @@ model = Gemma3ForConditionalGeneration.from_pretrained(
     torch_dtype=torch.bfloat16,
     attn_implementation="eager"
 )
 MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
 ##################################################
-# 1) CSV, TXT, PDF 분석 함수 (빈 파일 대비)
 ##################################################
 def analyze_csv_file(path: str) -> str:
     try:
         df = pd.read_csv(path)
-        df_str = df.to_string().strip()
-        if not df_str:
-            df_str = "(CSV is empty)"
         if len(df_str) > MAX_CONTENT_CHARS:
             df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
         return f"**[CSV File: {os.path.basename(path)}]**\n\n{df_str}"
@@ -48,11 +52,12 @@ def analyze_csv_file(path: str) -> str:
 def analyze_txt_file(path: str) -> str:
     try:
         with open(path, "r", encoding="utf-8") as f:
-            text = f.read().strip()
-        if not text:
-            text = "(TXT is empty)"
         if len(text) > MAX_CONTENT_CHARS:
             text = text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
         return f"**[TXT File: {os.path.basename(path)}]**\n\n{text}"
@@ -61,26 +66,30 @@ def analyze_txt_file(path: str) -> str:
 def pdf_to_markdown(pdf_path: str) -> str:
     try:
         with open(pdf_path, "rb") as f:
             reader = PyPDF2.PdfReader(f)
-            chunks = []
             for page_num, page in enumerate(reader.pages, start=1):
-                ptext = (page.extract_text() or "").strip()
-                if ptext:
-                    chunks.append(f"## Page {page_num}\n\n{ptext}\n")
-        full_text = "\n".join(chunks).strip()
-        if not full_text:
-            full_text = "(PDF is empty)"
-        if len(full_text) > MAX_CONTENT_CHARS:
-            full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
-        return f"**[PDF File: {os.path.basename(pdf_path)}]**\n\n{full_text}"
     except Exception as e:
         return f"Failed to read PDF ({os.path.basename(pdf_path)}): {str(e)}"
 ##################################################
-# 2) 이미지/비디오 업로드 제한
 ##################################################
 def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
     image_count = 0
@@ -97,11 +106,9 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
     image_count = 0
     video_count = 0
     for item in history:
-        # assistant 또는 content가 str이면 제외
         if item["role"] != "user" or isinstance(item["content"], str):
             continue
-        file_path = item["content"][0]
-        if file_path.endswith(".mp4"):
             video_count += 1
         else:
             image_count += 1
@@ -110,10 +117,17 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
 def validate_media_constraints(message: dict, history: list[dict]) -> bool:
     """
-    이미지/비디오 개수 제한
     """
     media_files = []
     for f in message["files"]:
         if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
             media_files.append(f)
@@ -122,11 +136,9 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
     image_count = history_image_count + new_image_count
     video_count = history_video_count + new_video_count
-    # 비디오 1개 초과 불가
     if video_count > 1:
         gr.Warning("Only one video is supported.")
         return False
-    # 비디오+이미지 혼합 불가
     if video_count == 1:
         if image_count > 0:
             gr.Warning("Mixing images and videos is not allowed.")
@@ -134,11 +146,9 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
         if "<image>" in message["text"]:
             gr.Warning("Using <image> tags with video files is not supported.")
             return False
-    # 이미지 개수 제한
     if video_count == 0 and image_count > MAX_NUM_IMAGES:
         gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
-    # <image> 태그 수와 이미지 파일 수 일치
     if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
         gr.Warning("The number of <image> tags in the text does not match the number of images.")
         return False
@@ -147,15 +157,16 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
 ##################################################
-# 3) 비디오 처리
 ##################################################
 def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
     vidcap = cv2.VideoCapture(video_path)
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    frame_interval = int(fps / 3)
     frames = []
     for i in range(0, total_frames, frame_interval):
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
@@ -164,6 +175,7 @@ def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
@@ -171,16 +183,18 @@ def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
 def process_video(video_path: str) -> list[dict]:
     content = []
     frames = downsample_video(video_path)
-    for pil_image, timestamp in frames:
         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
             pil_image.save(temp_file.name)
             content.append({"type": "text", "text": f"Frame {timestamp}:"})
             content.append({"type": "image", "url": temp_file.name})
     return content
 ##################################################
-# 4) interleaved <image> 처리
 ##################################################
 def process_interleaved_images(message: dict) -> list[dict]:
     parts = re.split(r"(<image>)", message["text"])
@@ -193,57 +207,55 @@ def process_interleaved_images(message: dict) -> list[dict]:
         elif part.strip():
             content.append({"type": "text", "text": part.strip()})
         else:
             if isinstance(part, str) and part != "<image>":
                 content.append({"type": "text", "text": part})
     return content
 ##################################################
-# 5) CSV/PDF/TXT = 텍스트 / 이미지,비디오 = 실제 경로
 ##################################################
 def process_new_user_message(message: dict) -> list[dict]:
-    user_text = (message["text"] or "").strip() or "(No text)"
     if not message["files"]:
-        return [{"type": "text", "text": user_text}]
     video_files = [f for f in message["files"] if f.endswith(".mp4")]
     image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
     csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
     txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
     pdf_files = [f for f in message["files"] if f.lower().endswith(".pdf")]
-    content_list = [{"type": "text", "text": user_text}]
-    # CSV
     for csv_path in csv_files:
         csv_analysis = analyze_csv_file(csv_path)
-        if not csv_analysis.strip():
-            csv_analysis = "(No CSV content?)"
         content_list.append({"type": "text", "text": csv_analysis})
-    # TXT
     for txt_path in txt_files:
         txt_analysis = analyze_txt_file(txt_path)
-        if not txt_analysis.strip():
-            txt_analysis = "(No TXT content?)"
         content_list.append({"type": "text", "text": txt_analysis})
-    # PDF
     for pdf_path in pdf_files:
-        pdf_md = pdf_to_markdown(pdf_path)
-        if not pdf_md.strip():
-            pdf_md = "(No PDF content?)"
-        content_list.append({"type": "text", "text": pdf_md})
     if video_files:
-        # 하나만 처리
         content_list += process_video(video_files[0])
         return content_list
-    if "<image>" in user_text:
         return process_interleaved_images(message)
     else:
-        # 일반 이미지
         for img_path in image_files:
             content_list.append({"type": "image", "url": img_path})
@@ -251,16 +263,18 @@ def process_new_user_message(message: dict) -> list[dict]:
 ##################################################
-# 6) 히스토리 -> LLM 메시지 변환 (비이미지 경로는 무시)
 ##################################################
 def process_history(history: list[dict]) -> list[dict]:
     messages = []
-    current_user_content = []
     for item in history:
         if item["role"] == "assistant":
             if current_user_content:
                 messages.append({"role": "user", "content": current_user_content})
                 current_user_content = []
             messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
         else:
             # user
@@ -268,18 +282,13 @@ def process_history(history: list[dict]) -> list[dict]:
             if isinstance(content, str):
                 current_user_content.append({"type": "text", "text": content})
             else:
-                # [파일경로]
-                fpath = content[0]
-                # 이미지나 mp4만 유지, 나머지는 제외
-                if re.search(r"\.(png|jpg|jpeg|gif|webp)$", fpath, re.IGNORECASE) or fpath.endswith(".mp4"):
-                    current_user_content.append({"type": "image", "url": fpath})
-                else:
-                    pass
     return messages
 ##################################################
-# 7) 메인 추론 (빈 토큰 방어)
 ##################################################
 @spaces.GPU(duration=120)
 def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
@@ -291,54 +300,36 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tok
     if system_prompt:
         messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
     messages.extend(process_history(history))
-    user_content = process_new_user_message(message)
-    messages.append({"role": "user", "content": user_content})
-    # 1) tokenize=False 후 토큰 길이 체크
-    raw_text = processor.tokenizer.apply_chat_template(
         messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-    token_ids = processor.tokenizer.encode(raw_text, add_special_tokens=False)
-    if len(token_ids) == 0:
-        # 빈 입력 → 임의 문구 추가
-        raw_text += " (No content?)"
-        token_ids = processor.tokenizer.encode(raw_text, add_special_tokens=False)
-    # 2) 실제 tokenizer
-    inputs = processor.tokenizer(
-        raw_text,
         return_tensors="pt",
-        padding=True
-    )
-    inputs = {k: v.to(model.device, dtype=torch.bfloat16) for k, v in inputs.items()}
-    # 3) 스트리밍 생성
-    streamer = TextIteratorStreamer(processor.tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
-    gen_kwargs = {
-        "inputs": inputs["input_ids"],
-        "attention_mask": inputs.get("attention_mask"),
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": 0.3,
-        "top_p": 0.95,
-    }
-    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}
     t = Thread(target=model.generate, kwargs=gen_kwargs)
     t.start()
     output = ""
-    for chunk in streamer:
-        output += chunk
         yield output
 ##################################################
-# 8) 예시
 ##################################################
 examples = [
@@ -470,13 +461,15 @@ examples = [
 ]
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
     chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
     textbox=gr.MultimodalTextbox(
         file_types=[
-            ".png", ".jpg", ".jpeg", ".gif", ".webp",
             ".mp4", ".csv", ".txt", ".pdf"
         ],
         file_count="multiple",
@@ -486,12 +479,15 @@ demo = gr.ChatInterface(
     additional_inputs=[
         gr.Textbox(
             label="System Prompt",
-            value="You are a deeply thoughtful AI. Consider problems thoroughly and derive correct solutions through systematic reasoning. Please answer in korean."
         ),
         gr.Slider(label="Max New Tokens", minimum=100, maximum=8000, step=50, value=2000),
     ],
     stop_btn=False,
-    title="Gemma 3 27B IT",
     examples=examples,
     run_examples_on_click=False,
     cache_examples=False,

 from loguru import logger
 from PIL import Image
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
+# CSV/TXT 분석
 import pandas as pd
+# PDF 텍스트 추출
 import PyPDF2
+MAX_CONTENT_CHARS = 8000  # 너무 큰 파일을 막기 위해 최대 표시 8000자
+model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
 processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
 model = Gemma3ForConditionalGeneration.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16,
     attn_implementation="eager"
 )
 MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
 ##################################################
+# CSV, TXT, PDF 분석 함수
 ##################################################
 def analyze_csv_file(path: str) -> str:
+    """
+    CSV 파일을 전체 문자열로 변환. 너무 길 경우 일부만 표시.
+    """
     try:
         df = pd.read_csv(path)
+        df_str = df.to_string()
         if len(df_str) > MAX_CONTENT_CHARS:
             df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
         return f"**[CSV File: {os.path.basename(path)}]**\n\n{df_str}"
 def analyze_txt_file(path: str) -> str:
+    """
+    TXT 파일 전문 읽기. 너무 길면 일부만 표시.
+    """
     try:
         with open(path, "r", encoding="utf-8") as f:
+            text = f.read()
         if len(text) > MAX_CONTENT_CHARS:
             text = text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
         return f"**[TXT File: {os.path.basename(path)}]**\n\n{text}"
 def pdf_to_markdown(pdf_path: str) -> str:
+    """
+    PDF → Markdown. 페이지별로 간단히 텍스트 추출.
+    """
+    text_chunks = []
     try:
         with open(pdf_path, "rb") as f:
             reader = PyPDF2.PdfReader(f)
             for page_num, page in enumerate(reader.pages, start=1):
+                page_text = page.extract_text() or ""
+                page_text = page_text.strip()
+                if page_text:
+                    text_chunks.append(f"## Page {page_num}\n\n{page_text}\n")
     except Exception as e:
         return f"Failed to read PDF ({os.path.basename(pdf_path)}): {str(e)}"
+    full_text = "\n".join(text_chunks)
+    if len(full_text) > MAX_CONTENT_CHARS:
+        full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
+    return f"**[PDF File: {os.path.basename(pdf_path)}]**\n\n{full_text}"
 ##################################################
+# 이미지/비디오 업로드 제한 검사
 ##################################################
 def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
     image_count = 0
     image_count = 0
     video_count = 0
     for item in history:
         if item["role"] != "user" or isinstance(item["content"], str):
             continue
+        if item["content"][0].endswith(".mp4"):
             video_count += 1
         else:
             image_count += 1
 def validate_media_constraints(message: dict, history: list[dict]) -> bool:
     """
+    - 비디오 1개 초과 불가
+    - 비디오와 이미지 혼합 불가
+    - 이미지 개수 MAX_NUM_IMAGES 초과 불가
+    - <image> 태그가 있으면 태그 수와 실제 이미지 수 일치
+    - CSV, TXT, PDF 등은 여기서 제한하지 않음
     """
     media_files = []
     for f in message["files"]:
+        # 이미지: png/jpg/jpeg/gif/webp
+        # 비디오: mp4
+        # cf) PDF, CSV, TXT 등은 제외
         if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
             media_files.append(f)
     image_count = history_image_count + new_image_count
     video_count = history_video_count + new_video_count
     if video_count > 1:
         gr.Warning("Only one video is supported.")
         return False
     if video_count == 1:
         if image_count > 0:
             gr.Warning("Mixing images and videos is not allowed.")
         if "<image>" in message["text"]:
             gr.Warning("Using <image> tags with video files is not supported.")
             return False
     if video_count == 0 and image_count > MAX_NUM_IMAGES:
         gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
     if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
         gr.Warning("The number of <image> tags in the text does not match the number of images.")
         return False
 ##################################################
+# 비디오 처리
 ##################################################
 def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
     vidcap = cv2.VideoCapture(video_path)
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    frame_interval = int(fps / 3)
     frames = []
     for i in range(0, total_frames, frame_interval):
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
 def process_video(video_path: str) -> list[dict]:
     content = []
     frames = downsample_video(video_path)
+    for frame in frames:
+        pil_image, timestamp = frame
         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
             pil_image.save(temp_file.name)
             content.append({"type": "text", "text": f"Frame {timestamp}:"})
             content.append({"type": "image", "url": temp_file.name})
+    logger.debug(f"{content=}")
     return content
 ##################################################
+# interleaved <image> 처리
 ##################################################
 def process_interleaved_images(message: dict) -> list[dict]:
     parts = re.split(r"(<image>)", message["text"])
         elif part.strip():
             content.append({"type": "text", "text": part.strip()})
         else:
+            # 공백이거나 \n 같은 경우
             if isinstance(part, str) and part != "<image>":
                 content.append({"type": "text", "text": part})
     return content
 ##################################################
+# PDF + CSV + TXT + 이미지/비디오
 ##################################################
 def process_new_user_message(message: dict) -> list[dict]:
     if not message["files"]:
+        return [{"type": "text", "text": message["text"]}]
+    # 1) 파일 분류
     video_files = [f for f in message["files"] if f.endswith(".mp4")]
     image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
     csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
     txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
     pdf_files = [f for f in message["files"] if f.lower().endswith(".pdf")]
+    # 2) 사용자 원본 text 추가
+    content_list = [{"type": "text", "text": message["text"]}]
+    # 3) CSV
     for csv_path in csv_files:
         csv_analysis = analyze_csv_file(csv_path)
         content_list.append({"type": "text", "text": csv_analysis})
+    # 4) TXT
     for txt_path in txt_files:
         txt_analysis = analyze_txt_file(txt_path)
         content_list.append({"type": "text", "text": txt_analysis})
+    # 5) PDF
     for pdf_path in pdf_files:
+        pdf_markdown = pdf_to_markdown(pdf_path)
+        content_list.append({"type": "text", "text": pdf_markdown})
+    # 6) 비디오 (한 개만 허용)
     if video_files:
         content_list += process_video(video_files[0])
         return content_list
+    # 7) 이미지 처리
+    if "<image>" in message["text"]:
+        # interleaved
         return process_interleaved_images(message)
     else:
+        # 일반 여러 장
         for img_path in image_files:
             content_list.append({"type": "image", "url": img_path})
 ##################################################
+# history -> LLM 메시지 변환
 ##################################################
 def process_history(history: list[dict]) -> list[dict]:
     messages = []
+    current_user_content: list[dict] = []
     for item in history:
         if item["role"] == "assistant":
+            # user_content가 쌓여있다면 user 메시지로 저장
             if current_user_content:
                 messages.append({"role": "user", "content": current_user_content})
                 current_user_content = []
+            # 그 뒤 item은 assistant
             messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
         else:
             # user
             if isinstance(content, str):
                 current_user_content.append({"type": "text", "text": content})
             else:
+                # 이미지나 기타
+                current_user_content.append({"type": "image", "url": content[0]})
     return messages
 ##################################################
+# 메인 추론 함수
 ##################################################
 @spaces.GPU(duration=120)
 def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
     if system_prompt:
         messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
     messages.extend(process_history(history))
+    messages.append({"role": "user", "content": process_new_user_message(message)})
+    inputs = processor.apply_chat_template(
         messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
         return_tensors="pt",
+    ).to(device=model.device, dtype=torch.bfloat16)
+    streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
+    gen_kwargs = dict(
+        inputs,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+    )
     t = Thread(target=model.generate, kwargs=gen_kwargs)
     t.start()
     output = ""
+    for new_text in streamer:
+        output += new_text
         yield output
 ##################################################
+# 예시들 (기존)
+##################################################
+##################################################
+# 예시들 (한글화 버전)
 ##################################################
 examples = [
 ]
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
     chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
+    # .webp, .png, .jpg, .jpeg, .gif, .mp4, .csv, .txt, .pdf 모두 허용
     textbox=gr.MultimodalTextbox(
         file_types=[
+            ".webp", ".png", ".jpg", ".jpeg", ".gif",
             ".mp4", ".csv", ".txt", ".pdf"
         ],
         file_count="multiple",
     additional_inputs=[
         gr.Textbox(
             label="System Prompt",
+            value=(
+                "You are a deeply thoughtful AI. Consider problems thoroughly and derive "
+                "correct solutions through systematic reasoning. Please answer in korean."
+            )
         ),
         gr.Slider(label="Max New Tokens", minimum=100, maximum=8000, step=50, value=2000),
     ],
     stop_btn=False,
+    title="Vidraft-Gemma-3-27B",
     examples=examples,
     run_examples_on_click=False,
     cache_examples=False,