seawolf2357 committed on
Commit
5718b5c
·
verified ·
1 Parent(s): 7a2b5d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -102
app.py CHANGED
@@ -13,15 +13,16 @@ import torch
13
  from loguru import logger
14
  from PIL import Image
15
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 
 
16
  import pandas as pd
 
 
17
  import PyPDF2
18
 
19
- ##################################################
20
- # 기본 설정
21
- ##################################################
22
- MAX_CONTENT_CHARS = 8000 # 텍스트로 전달 시 최대 글자 수
23
- model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
24
 
 
25
  processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
26
  model = Gemma3ForConditionalGeneration.from_pretrained(
27
  model_id,
@@ -29,17 +30,20 @@ model = Gemma3ForConditionalGeneration.from_pretrained(
29
  torch_dtype=torch.bfloat16,
30
  attn_implementation="eager"
31
  )
 
32
  MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
33
 
 
34
  ##################################################
35
- # 1) CSV, TXT, PDF 분석 함수 (빈 파일 대비)
36
  ##################################################
37
  def analyze_csv_file(path: str) -> str:
 
 
 
38
  try:
39
  df = pd.read_csv(path)
40
- df_str = df.to_string().strip()
41
- if not df_str:
42
- df_str = "(CSV is empty)"
43
  if len(df_str) > MAX_CONTENT_CHARS:
44
  df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
45
  return f"**[CSV File: {os.path.basename(path)}]**\n\n{df_str}"
@@ -48,11 +52,12 @@ def analyze_csv_file(path: str) -> str:
48
 
49
 
50
  def analyze_txt_file(path: str) -> str:
 
 
 
51
  try:
52
  with open(path, "r", encoding="utf-8") as f:
53
- text = f.read().strip()
54
- if not text:
55
- text = "(TXT is empty)"
56
  if len(text) > MAX_CONTENT_CHARS:
57
  text = text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
58
  return f"**[TXT File: {os.path.basename(path)}]**\n\n{text}"
@@ -61,26 +66,30 @@ def analyze_txt_file(path: str) -> str:
61
 
62
 
63
  def pdf_to_markdown(pdf_path: str) -> str:
 
 
 
 
64
  try:
65
  with open(pdf_path, "rb") as f:
66
  reader = PyPDF2.PdfReader(f)
67
- chunks = []
68
  for page_num, page in enumerate(reader.pages, start=1):
69
- ptext = (page.extract_text() or "").strip()
70
- if ptext:
71
- chunks.append(f"## Page {page_num}\n\n{ptext}\n")
72
- full_text = "\n".join(chunks).strip()
73
- if not full_text:
74
- full_text = "(PDF is empty)"
75
- if len(full_text) > MAX_CONTENT_CHARS:
76
- full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
77
- return f"**[PDF File: {os.path.basename(pdf_path)}]**\n\n{full_text}"
78
  except Exception as e:
79
  return f"Failed to read PDF ({os.path.basename(pdf_path)}): {str(e)}"
80
 
 
 
 
 
 
 
81
 
82
  ##################################################
83
- # 2) 이미지/비디오 업로드 제한
84
  ##################################################
85
  def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
86
  image_count = 0
@@ -97,11 +106,9 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
97
  image_count = 0
98
  video_count = 0
99
  for item in history:
100
- # assistant ๋˜๋Š” content๊ฐ€ str์ด๋ฉด ์ œ์™ธ
101
  if item["role"] != "user" or isinstance(item["content"], str):
102
  continue
103
- file_path = item["content"][0]
104
- if file_path.endswith(".mp4"):
105
  video_count += 1
106
  else:
107
  image_count += 1
@@ -110,10 +117,17 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
110
 
111
  def validate_media_constraints(message: dict, history: list[dict]) -> bool:
112
  """
113
- 이미지/비디오 개수 제한
 
 
 
 
114
  """
115
  media_files = []
116
  for f in message["files"]:
 
 
 
117
  if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
118
  media_files.append(f)
119
 
@@ -122,11 +136,9 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
122
  image_count = history_image_count + new_image_count
123
  video_count = history_video_count + new_video_count
124
 
125
- # 비디오 1개 초과 불가
126
  if video_count > 1:
127
  gr.Warning("Only one video is supported.")
128
  return False
129
- # 비디오+이미지 혼합 불가
130
  if video_count == 1:
131
  if image_count > 0:
132
  gr.Warning("Mixing images and videos is not allowed.")
@@ -134,11 +146,9 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
134
  if "<image>" in message["text"]:
135
  gr.Warning("Using <image> tags with video files is not supported.")
136
  return False
137
- # 이미지 개수 제한
138
  if video_count == 0 and image_count > MAX_NUM_IMAGES:
139
  gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
140
  return False
141
- # <image> 태그 수와 이미지 파일 수 일치
142
  if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
143
  gr.Warning("The number of <image> tags in the text does not match the number of images.")
144
  return False
@@ -147,15 +157,16 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
147
 
148
 
149
  ##################################################
150
- # 3) 비디오 처리
151
  ##################################################
152
  def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
153
  vidcap = cv2.VideoCapture(video_path)
154
  fps = vidcap.get(cv2.CAP_PROP_FPS)
155
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
156
- frame_interval = int(fps / 3)
157
 
 
158
  frames = []
 
159
  for i in range(0, total_frames, frame_interval):
160
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
161
  success, image = vidcap.read()
@@ -164,6 +175,7 @@ def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
164
  pil_image = Image.fromarray(image)
165
  timestamp = round(i / fps, 2)
166
  frames.append((pil_image, timestamp))
 
167
  vidcap.release()
168
  return frames
169
 
@@ -171,16 +183,18 @@ def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
171
  def process_video(video_path: str) -> list[dict]:
172
  content = []
173
  frames = downsample_video(video_path)
174
- for pil_image, timestamp in frames:
 
175
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
176
  pil_image.save(temp_file.name)
177
  content.append({"type": "text", "text": f"Frame {timestamp}:"})
178
  content.append({"type": "image", "url": temp_file.name})
 
179
  return content
180
 
181
 
182
  ##################################################
183
- # 4) interleaved <image> 처리
184
  ##################################################
185
  def process_interleaved_images(message: dict) -> list[dict]:
186
  parts = re.split(r"(<image>)", message["text"])
@@ -193,57 +207,55 @@ def process_interleaved_images(message: dict) -> list[dict]:
193
  elif part.strip():
194
  content.append({"type": "text", "text": part.strip()})
195
  else:
 
196
  if isinstance(part, str) and part != "<image>":
197
  content.append({"type": "text", "text": part})
198
  return content
199
 
200
 
201
  ##################################################
202
- # 5) CSV/PDF/TXT = 텍스트 / 이미지,비디오 = 실제 경로
203
  ##################################################
204
  def process_new_user_message(message: dict) -> list[dict]:
205
- user_text = (message["text"] or "").strip() or "(No text)"
206
  if not message["files"]:
207
- return [{"type": "text", "text": user_text}]
208
 
 
209
  video_files = [f for f in message["files"] if f.endswith(".mp4")]
210
  image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
211
  csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
212
  txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
213
  pdf_files = [f for f in message["files"] if f.lower().endswith(".pdf")]
214
 
215
- content_list = [{"type": "text", "text": user_text}]
 
216
 
217
- # CSV
218
  for csv_path in csv_files:
219
  csv_analysis = analyze_csv_file(csv_path)
220
- if not csv_analysis.strip():
221
- csv_analysis = "(No CSV content?)"
222
  content_list.append({"type": "text", "text": csv_analysis})
223
 
224
- # TXT
225
  for txt_path in txt_files:
226
  txt_analysis = analyze_txt_file(txt_path)
227
- if not txt_analysis.strip():
228
- txt_analysis = "(No TXT content?)"
229
  content_list.append({"type": "text", "text": txt_analysis})
230
 
231
- # PDF
232
  for pdf_path in pdf_files:
233
- pdf_md = pdf_to_markdown(pdf_path)
234
- if not pdf_md.strip():
235
- pdf_md = "(No PDF content?)"
236
- content_list.append({"type": "text", "text": pdf_md})
237
 
 
238
  if video_files:
239
- # ํ•˜๋‚˜๋งŒ ์ฒ˜๋ฆฌ
240
  content_list += process_video(video_files[0])
241
  return content_list
242
 
243
- if "<image>" in user_text:
 
 
244
  return process_interleaved_images(message)
245
  else:
246
- # 일반 이미지
247
  for img_path in image_files:
248
  content_list.append({"type": "image", "url": img_path})
249
 
@@ -251,16 +263,18 @@ def process_new_user_message(message: dict) -> list[dict]:
251
 
252
 
253
  ##################################################
254
- # 6) 히스토리 -> LLM 메시지 변환 (비이미지 경로는 무시)
255
  ##################################################
256
  def process_history(history: list[dict]) -> list[dict]:
257
  messages = []
258
- current_user_content = []
259
  for item in history:
260
  if item["role"] == "assistant":
 
261
  if current_user_content:
262
  messages.append({"role": "user", "content": current_user_content})
263
  current_user_content = []
 
264
  messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
265
  else:
266
  # user
@@ -268,18 +282,13 @@ def process_history(history: list[dict]) -> list[dict]:
268
  if isinstance(content, str):
269
  current_user_content.append({"type": "text", "text": content})
270
  else:
271
- # [파일경로]
272
- fpath = content[0]
273
- # 이미지나 mp4만 유지, 나머지는 제외
274
- if re.search(r"\.(png|jpg|jpeg|gif|webp)$", fpath, re.IGNORECASE) or fpath.endswith(".mp4"):
275
- current_user_content.append({"type": "image", "url": fpath})
276
- else:
277
- pass
278
  return messages
279
 
280
 
281
  ##################################################
282
- # 7) 메인 추론 (빈 토큰 방어)
283
  ##################################################
284
  @spaces.GPU(duration=120)
285
  def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
@@ -291,54 +300,36 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tok
291
  if system_prompt:
292
  messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
293
  messages.extend(process_history(history))
 
294
 
295
- user_content = process_new_user_message(message)
296
- messages.append({"role": "user", "content": user_content})
297
-
298
- # 1) tokenize=False 후 토큰 길이 체크
299
- raw_text = processor.tokenizer.apply_chat_template(
300
  messages,
301
- tokenize=False,
302
- add_generation_prompt=True
303
- )
304
- token_ids = processor.tokenizer.encode(raw_text, add_special_tokens=False)
305
- if len(token_ids) == 0:
306
- # 빈 입력 → 임의 문구 추가
307
- raw_text += " (No content?)"
308
- token_ids = processor.tokenizer.encode(raw_text, add_special_tokens=False)
309
-
310
- # 2) 실제 tokenizer
311
- inputs = processor.tokenizer(
312
- raw_text,
313
  return_tensors="pt",
314
- padding=True
315
- )
316
- inputs = {k: v.to(model.device, dtype=torch.bfloat16) for k, v in inputs.items()}
317
-
318
- # 3) 스트리밍 생성
319
- streamer = TextIteratorStreamer(processor.tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
320
- gen_kwargs = {
321
- "inputs": inputs["input_ids"],
322
- "attention_mask": inputs.get("attention_mask"),
323
- "streamer": streamer,
324
- "max_new_tokens": max_new_tokens,
325
- "do_sample": True,
326
- "temperature": 0.3,
327
- "top_p": 0.95,
328
- }
329
- gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}
330
 
 
 
 
 
 
 
331
  t = Thread(target=model.generate, kwargs=gen_kwargs)
332
  t.start()
333
 
334
  output = ""
335
- for chunk in streamer:
336
- output += chunk
337
  yield output
338
 
339
 
340
  ##################################################
341
- # 8) 예시
 
 
 
342
  ##################################################
343
  examples = [
344
 
@@ -470,13 +461,15 @@ examples = [
470
  ]
471
 
472
 
 
473
  demo = gr.ChatInterface(
474
  fn=run,
475
  type="messages",
476
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
 
477
  textbox=gr.MultimodalTextbox(
478
  file_types=[
479
- ".png", ".jpg", ".jpeg", ".gif", ".webp",
480
  ".mp4", ".csv", ".txt", ".pdf"
481
  ],
482
  file_count="multiple",
@@ -486,12 +479,15 @@ demo = gr.ChatInterface(
486
  additional_inputs=[
487
  gr.Textbox(
488
  label="System Prompt",
489
- value="You are a deeply thoughtful AI. Consider problems thoroughly and derive correct solutions through systematic reasoning. Please answer in korean."
 
 
 
490
  ),
491
  gr.Slider(label="Max New Tokens", minimum=100, maximum=8000, step=50, value=2000),
492
  ],
493
  stop_btn=False,
494
- title="Gemma 3 27B IT",
495
  examples=examples,
496
  run_examples_on_click=False,
497
  cache_examples=False,
 
13
  from loguru import logger
14
  from PIL import Image
15
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
16
+
17
+ # CSV/TXT 분석
18
  import pandas as pd
19
+
20
+ # PDF 텍스트 추출
21
  import PyPDF2
22
 
23
+ MAX_CONTENT_CHARS = 8000 # 너무 큰 파일을 막기 위해 최대 표시 8000자
 
 
 
 
24
 
25
+ model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
26
  processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
27
  model = Gemma3ForConditionalGeneration.from_pretrained(
28
  model_id,
 
30
  torch_dtype=torch.bfloat16,
31
  attn_implementation="eager"
32
  )
33
+
34
  MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
35
 
36
+
37
  ##################################################
38
+ # CSV, TXT, PDF 분석 함수
39
  ##################################################
40
  def analyze_csv_file(path: str) -> str:
41
+ """
42
+ CSV 파일을 전체 문자열로 변환. 너무 길 경우 일부만 표시.
43
+ """
44
  try:
45
  df = pd.read_csv(path)
46
+ df_str = df.to_string()
 
 
47
  if len(df_str) > MAX_CONTENT_CHARS:
48
  df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
49
  return f"**[CSV File: {os.path.basename(path)}]**\n\n{df_str}"
 
52
 
53
 
54
  def analyze_txt_file(path: str) -> str:
55
+ """
56
+ TXT 파일 전문 읽기. 너무 길면 일부만 표시.
57
+ """
58
  try:
59
  with open(path, "r", encoding="utf-8") as f:
60
+ text = f.read()
 
 
61
  if len(text) > MAX_CONTENT_CHARS:
62
  text = text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
63
  return f"**[TXT File: {os.path.basename(path)}]**\n\n{text}"
 
66
 
67
 
68
  def pdf_to_markdown(pdf_path: str) -> str:
69
+ """
70
+ PDF → Markdown. 페이지별로 간단히 텍스트 추출.
71
+ """
72
+ text_chunks = []
73
  try:
74
  with open(pdf_path, "rb") as f:
75
  reader = PyPDF2.PdfReader(f)
 
76
  for page_num, page in enumerate(reader.pages, start=1):
77
+ page_text = page.extract_text() or ""
78
+ page_text = page_text.strip()
79
+ if page_text:
80
+ text_chunks.append(f"## Page {page_num}\n\n{page_text}\n")
 
 
 
 
 
81
  except Exception as e:
82
  return f"Failed to read PDF ({os.path.basename(pdf_path)}): {str(e)}"
83
 
84
+ full_text = "\n".join(text_chunks)
85
+ if len(full_text) > MAX_CONTENT_CHARS:
86
+ full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
87
+
88
+ return f"**[PDF File: {os.path.basename(pdf_path)}]**\n\n{full_text}"
89
+
90
 
91
  ##################################################
92
+ # 이미지/비디오 업로드 제한 검사
93
  ##################################################
94
  def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
95
  image_count = 0
 
106
  image_count = 0
107
  video_count = 0
108
  for item in history:
 
109
  if item["role"] != "user" or isinstance(item["content"], str):
110
  continue
111
+ if item["content"][0].endswith(".mp4"):
 
112
  video_count += 1
113
  else:
114
  image_count += 1
 
117
 
118
  def validate_media_constraints(message: dict, history: list[dict]) -> bool:
119
  """
120
+ - 비디오 1개 초과 불가
121
+ - 비디오와 이미지 혼합 불가
122
+ - 이미지 개수 MAX_NUM_IMAGES 초과 불가
123
+ - <image> 태그가 있으면 태그 수와 실제 이미지 수 일치
124
+ - CSV, TXT, PDF 등은 여기서 제한하지 않음
125
  """
126
  media_files = []
127
  for f in message["files"]:
128
+ # 이미지: png/jpg/jpeg/gif/webp
129
+ # 비디오: mp4
130
+ # cf) PDF, CSV, TXT 등은 제외
131
  if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
132
  media_files.append(f)
133
 
 
136
  image_count = history_image_count + new_image_count
137
  video_count = history_video_count + new_video_count
138
 
 
139
  if video_count > 1:
140
  gr.Warning("Only one video is supported.")
141
  return False
 
142
  if video_count == 1:
143
  if image_count > 0:
144
  gr.Warning("Mixing images and videos is not allowed.")
 
146
  if "<image>" in message["text"]:
147
  gr.Warning("Using <image> tags with video files is not supported.")
148
  return False
 
149
  if video_count == 0 and image_count > MAX_NUM_IMAGES:
150
  gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
151
  return False
 
152
  if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
153
  gr.Warning("The number of <image> tags in the text does not match the number of images.")
154
  return False
 
157
 
158
 
159
  ##################################################
160
+ # 비디오 처리
161
  ##################################################
162
  def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
163
  vidcap = cv2.VideoCapture(video_path)
164
  fps = vidcap.get(cv2.CAP_PROP_FPS)
165
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
 
166
 
167
+ frame_interval = int(fps / 3)
168
  frames = []
169
+
170
  for i in range(0, total_frames, frame_interval):
171
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
172
  success, image = vidcap.read()
 
175
  pil_image = Image.fromarray(image)
176
  timestamp = round(i / fps, 2)
177
  frames.append((pil_image, timestamp))
178
+
179
  vidcap.release()
180
  return frames
181
 
 
183
  def process_video(video_path: str) -> list[dict]:
184
  content = []
185
  frames = downsample_video(video_path)
186
+ for frame in frames:
187
+ pil_image, timestamp = frame
188
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
189
  pil_image.save(temp_file.name)
190
  content.append({"type": "text", "text": f"Frame {timestamp}:"})
191
  content.append({"type": "image", "url": temp_file.name})
192
+ logger.debug(f"{content=}")
193
  return content
194
 
195
 
196
  ##################################################
197
+ # interleaved <image> 처리
198
  ##################################################
199
  def process_interleaved_images(message: dict) -> list[dict]:
200
  parts = re.split(r"(<image>)", message["text"])
 
207
  elif part.strip():
208
  content.append({"type": "text", "text": part.strip()})
209
  else:
210
+ # 공백이거나 \n 같은 경우
211
  if isinstance(part, str) and part != "<image>":
212
  content.append({"type": "text", "text": part})
213
  return content
214
 
215
 
216
  ##################################################
217
+ # PDF + CSV + TXT + 이미지/비디오
218
  ##################################################
219
  def process_new_user_message(message: dict) -> list[dict]:
 
220
  if not message["files"]:
221
+ return [{"type": "text", "text": message["text"]}]
222
 
223
+ # 1) 파일 분류
224
  video_files = [f for f in message["files"] if f.endswith(".mp4")]
225
  image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
226
  csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
227
  txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
228
  pdf_files = [f for f in message["files"] if f.lower().endswith(".pdf")]
229
 
230
+ # 2) 사용자 원본 text 추가
231
+ content_list = [{"type": "text", "text": message["text"]}]
232
 
233
+ # 3) CSV
234
  for csv_path in csv_files:
235
  csv_analysis = analyze_csv_file(csv_path)
 
 
236
  content_list.append({"type": "text", "text": csv_analysis})
237
 
238
+ # 4) TXT
239
  for txt_path in txt_files:
240
  txt_analysis = analyze_txt_file(txt_path)
 
 
241
  content_list.append({"type": "text", "text": txt_analysis})
242
 
243
+ # 5) PDF
244
  for pdf_path in pdf_files:
245
+ pdf_markdown = pdf_to_markdown(pdf_path)
246
+ content_list.append({"type": "text", "text": pdf_markdown})
 
 
247
 
248
+ # 6) 비디오 (한 개만 허용)
249
  if video_files:
 
250
  content_list += process_video(video_files[0])
251
  return content_list
252
 
253
+ # 7) 이미지 처리
254
+ if "<image>" in message["text"]:
255
+ # interleaved
256
  return process_interleaved_images(message)
257
  else:
258
+ # 일반 여러 장
259
  for img_path in image_files:
260
  content_list.append({"type": "image", "url": img_path})
261
 
 
263
 
264
 
265
  ##################################################
266
+ # history -> LLM 메시지 변환
267
  ##################################################
268
  def process_history(history: list[dict]) -> list[dict]:
269
  messages = []
270
+ current_user_content: list[dict] = []
271
  for item in history:
272
  if item["role"] == "assistant":
273
+ # user_content๊ฐ€ ์Œ“์—ฌ์žˆ๋‹ค๋ฉด user ๋ฉ”์‹œ์ง€๋กœ ์ €์žฅ
274
  if current_user_content:
275
  messages.append({"role": "user", "content": current_user_content})
276
  current_user_content = []
277
+ # ๊ทธ ๋’ค item์€ assistant
278
  messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
279
  else:
280
  # user
 
282
  if isinstance(content, str):
283
  current_user_content.append({"type": "text", "text": content})
284
  else:
285
+ # 이미지나 기타
286
+ current_user_content.append({"type": "image", "url": content[0]})
 
 
 
 
 
287
  return messages
288
 
289
 
290
  ##################################################
291
+ # 메인 추론 함수
292
  ##################################################
293
  @spaces.GPU(duration=120)
294
  def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
 
300
  if system_prompt:
301
  messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
302
  messages.extend(process_history(history))
303
+ messages.append({"role": "user", "content": process_new_user_message(message)})
304
 
305
+ inputs = processor.apply_chat_template(
 
 
 
 
306
  messages,
307
+ add_generation_prompt=True,
308
+ tokenize=True,
309
+ return_dict=True,
 
 
 
 
 
 
 
 
 
310
  return_tensors="pt",
311
+ ).to(device=model.device, dtype=torch.bfloat16)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
+ streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
314
+ gen_kwargs = dict(
315
+ inputs,
316
+ streamer=streamer,
317
+ max_new_tokens=max_new_tokens,
318
+ )
319
  t = Thread(target=model.generate, kwargs=gen_kwargs)
320
  t.start()
321
 
322
  output = ""
323
+ for new_text in streamer:
324
+ output += new_text
325
  yield output
326
 
327
 
328
  ##################################################
329
+ # 예시들 (기존)
330
+ ##################################################
331
+ ##################################################
332
+ # 예시들 (한글화 버전)
333
  ##################################################
334
  examples = [
335
 
 
461
  ]
462
 
463
 
464
+
465
  demo = gr.ChatInterface(
466
  fn=run,
467
  type="messages",
468
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
469
+ # .webp, .png, .jpg, .jpeg, .gif, .mp4, .csv, .txt, .pdf 모두 허용
470
  textbox=gr.MultimodalTextbox(
471
  file_types=[
472
+ ".webp", ".png", ".jpg", ".jpeg", ".gif",
473
  ".mp4", ".csv", ".txt", ".pdf"
474
  ],
475
  file_count="multiple",
 
479
  additional_inputs=[
480
  gr.Textbox(
481
  label="System Prompt",
482
+ value=(
483
+ "You are a deeply thoughtful AI. Consider problems thoroughly and derive "
484
+ "correct solutions through systematic reasoning. Please answer in korean."
485
+ )
486
  ),
487
  gr.Slider(label="Max New Tokens", minimum=100, maximum=8000, step=50, value=2000),
488
  ],
489
  stop_btn=False,
490
+ title="Vidraft-Gemma-3-27B",
491
  examples=examples,
492
  run_examples_on_click=False,
493
  cache_examples=False,