seawolf2357 committed on
Commit
75b15f6
·
verified ·
1 Parent(s): 9fce284

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -63
app.py CHANGED
@@ -14,8 +14,13 @@ from loguru import logger
14
  from PIL import Image
15
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
16
 
17
- # [PDF] PyPDF2 ์ถ”๊ฐ€
18
- import PyPDF2
 
 
 
 
 
19
 
20
  model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
21
  processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
@@ -26,6 +31,51 @@ model = Gemma3ForConditionalGeneration.from_pretrained(
26
  MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
30
  image_count = 0
31
  video_count = 0
@@ -52,15 +102,20 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
52
 
53
  def validate_media_constraints(message: dict, history: list[dict]) -> bool:
54
  """
55
- ์ด๋ฏธ์ง€/๋น„๋””์˜ค ๊ฐœ์ˆ˜์™€ ํ˜ผํ•ฉ ์—ฌ๋ถ€ ๋“ฑ์„ ๊ฒ€์‚ฌํ•˜๋Š” ํ•จ์ˆ˜.
56
- PDF๋Š” ๊ฒ€์‚ฌ ๋กœ์ง์—์„œ ์ œ์™ธํ•˜์—ฌ ์—…๋กœ๋“œ๋งŒ ํ—ˆ์šฉ.
 
 
 
57
  """
58
- # [PDF] PDF ํŒŒ์ผ ์ œ์™ธ ์ฒ˜๋ฆฌ
59
- pdf_files = [f for f in message["files"] if f.endswith(".pdf")]
60
- non_pdf_files = [f for f in message["files"] if not f.endswith(".pdf")]
61
-
62
- # ๊ธฐ์กด ๋กœ์ง์€ non_pdf_files(= ์ด๋ฏธ์ง€/๋น„๋””์˜ค)์— ๋Œ€ํ•ด์„œ๋งŒ ์ฒดํฌ
63
- new_image_count, new_video_count = count_files_in_new_message(non_pdf_files)
 
 
64
  history_image_count, history_video_count = count_files_in_history(history)
65
  image_count = history_image_count + new_image_count
66
  video_count = history_video_count + new_video_count
@@ -75,25 +130,19 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
75
  if "<image>" in message["text"]:
76
  gr.Warning("Using <image> tags with video files is not supported.")
77
  return False
78
- # TODO: Add frame count validation for videos similar to image count limits # noqa: FIX002, TD002, TD003
79
-
80
  if video_count == 0 and image_count > MAX_NUM_IMAGES:
81
  gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
82
  return False
83
-
84
- # [PDF] PDF ๊ฐฏ์ˆ˜ ์ œํ•œ(ํ•„์š”ํ•˜๋‹ค๋ฉด)๋„ ์ถ”๊ฐ€ ๊ฐ€๋Šฅ
85
- # ์ผ๋‹จ ์ œํ•œ์€ ๋‘์ง€ ์•Š๊ณ  ๋ฐ”๋กœ True ๋ฐ˜ํ™˜
86
-
87
- # <image> ํƒœ๊ทธ๊ฐ€ ์žˆ์„ ๊ฒฝ์šฐ, ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜์™€ ๋งค์นญ ๊ฒ€์‚ฌ
88
- if "<image>" in message["text"]:
89
- # new_image_count๋Š” pdf ์ œ์™ธ๋œ ์ด๋ฏธ์ง€ ์ˆ˜
90
- if message["text"].count("<image>") != new_image_count:
91
- gr.Warning("The number of <image> tags in the text does not match the number of images.")
92
- return False
93
 
94
  return True
95
 
96
 
 
 
 
97
  def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
98
  vidcap = cv2.VideoCapture(video_path)
99
  fps = vidcap.get(cv2.CAP_PROP_FPS)
@@ -128,6 +177,9 @@ def process_video(video_path: str) -> list[dict]:
128
  return content
129
 
130
 
 
 
 
131
  def process_interleaved_images(message: dict) -> list[dict]:
132
  logger.debug(f"{message['files']=}")
133
  parts = re.split(r"(<image>)", message["text"])
@@ -149,52 +201,40 @@ def process_interleaved_images(message: dict) -> list[dict]:
149
  return content
150
 
151
 
152
- # [PDF] PDF -> Markdown ๋ณ€ํ™˜ ํ•จ์ˆ˜ ์ถ”๊ฐ€
153
- def pdf_to_markdown(pdf_path: str) -> str:
154
- """
155
- Extract the text of a PDF file page by page and return it as simple Markdown, with one "## Page N" section per page that yields non-empty text.
156
- """
157
- text_chunks = []
158
- with open(pdf_path, "rb") as f:
159
- reader = PyPDF2.PdfReader(f)
160
- for page_num, page in enumerate(reader.pages, start=1):
161
- page_text = page.extract_text()
162
- page_text = page_text.strip() if page_text else ""
163
- if page_text:
164
- # Combine a simple per-page header and the page body into Markdown
165
- text_chunks.append(f"## Page {page_num}\n\n{page_text}\n")
166
- return "\n".join(text_chunks)
167
-
168
-
169
  def process_new_user_message(message: dict) -> list[dict]:
170
  """
171
- ์ƒˆ user message์—์„œ text, ํŒŒ์ผ(์ด๋ฏธ์ง€/๋น„๋””์˜ค/PDF)์„ ์ฒ˜๋ฆฌ.
 
 
 
172
  """
173
  if not message["files"]:
174
  return [{"type": "text", "text": message["text"]}]
175
 
176
- # [PDF] PDF ํŒŒ์ผ ๋ชฉ๋ก
177
- pdf_files = [f for f in message["files"] if f.endswith(".pdf")]
178
- # ์ด๋ฏธ์ง€ยท๋น„๋””์˜ค ๋ชฉ๋ก
179
- other_files = [f for f in message["files"] if not f.endswith(".pdf")]
 
180
 
181
- # ์ผ๋‹จ ์‚ฌ์šฉ์ž์˜ text๋ฅผ ๊ฐ€์žฅ ๋จผ์ € ๋„ฃ๋Š”๋‹ค
182
  content_list = [{"type": "text", "text": message["text"]}]
183
 
184
- # PDF ๋ณ€ํ™˜ ํ›„ ์ถ”๊ฐ€
185
- for pdf_path in pdf_files:
186
- pdf_markdown = pdf_to_markdown(pdf_path)
187
- if pdf_markdown.strip():
188
- content_list.append({"type": "text", "text": pdf_markdown})
189
- else:
190
- content_list.append({"type": "text", "text": "(PDF์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ)"})
191
 
 
 
 
 
192
 
193
- # ์˜์ƒ์ด ์žˆ๋Š”์ง€ ํ™•์ธ
194
- video_files = [f for f in other_files if f.endswith(".mp4")]
195
  if video_files:
196
- # ๋น„๋””์˜ค๋Š” ํ•œ ๊ฐœ๋งŒ ์ฒ˜๋ฆฌํ•œ๋‹ค๋Š” ์ „์ œ (validate_media_constraints์—์„œ ์ด๋ฏธ ๊ฒ€์‚ฌ)
197
- # ์—ฌ๋Ÿฌ ๊ฐœ์ผ ๊ฒฝ์šฐ ์ฒซ ๋ฒˆ์งธ ๊ฒƒ๋งŒ ์ฒ˜๋ฆฌํ•˜๊ฑฐ๋‚˜, ๊ฒฝ๊ณ  ์ฒ˜๋ฆฌ
198
  content_list += process_video(video_files[0])
199
  return content_list
200
 
@@ -203,13 +243,16 @@ def process_new_user_message(message: dict) -> list[dict]:
203
  return process_interleaved_images(message)
204
 
205
  # ์ผ๋ฐ˜ ์ด๋ฏธ์ง€(์—ฌ๋Ÿฌ ์žฅ)
206
- image_files = [f for f in other_files if not f.endswith(".mp4")]
207
  if image_files:
208
- content_list += [{"type": "image", "url": path} for path in image_files]
 
209
 
210
  return content_list
211
 
212
 
 
 
 
213
  def process_history(history: list[dict]) -> list[dict]:
214
  messages = []
215
  current_user_content: list[dict] = []
@@ -228,6 +271,9 @@ def process_history(history: list[dict]) -> list[dict]:
228
  return messages
229
 
230
 
 
 
 
231
  @spaces.GPU(duration=120)
232
  def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
233
  if not validate_media_constraints(message, history):
@@ -263,6 +309,9 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tok
263
  yield output
264
 
265
 
 
 
 
266
  examples = [
267
  [
268
  {
@@ -386,21 +435,35 @@ examples = [
386
  ]
387
 
388
 
389
-
390
- # [PDF] .pdf ํ—ˆ์šฉ
 
391
  demo = gr.ChatInterface(
392
  fn=run,
393
  type="messages",
394
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
 
395
  textbox=gr.MultimodalTextbox(
396
- file_types=["image", ".mp4", ".pdf"], # [PDF] ํ—ˆ์šฉ
397
  file_count="multiple",
398
  autofocus=True
399
  ),
400
  multimodal=True,
401
  additional_inputs=[
402
- gr.Textbox(label="System Prompt", value="ou are a deeply thoughtful AI. Consider problems thoroughly and derive correct solutions through systematic reasoning. Please answer in korean."),
403
- gr.Slider(label="Max New Tokens", minimum=100, maximum=8000, step=50, value=2000),
 
 
 
 
 
 
 
 
 
 
 
 
404
  ],
405
  stop_btn=False,
406
  title="Gemma 3 27B IT",
 
14
  from PIL import Image
15
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
16
 
17
+ # [For CSV/TXT analysis]
18
+ import pandas as pd
19
+
20
+ ##################################################
21
+ # Constant used to pass the full file text to the model while cutting it off when it is too large
22
+ ##################################################
23
+ MAX_CONTENT_CHARS = 8000 # e.g. content longer than 8000 characters is truncated
24
 
25
  model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
26
  processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
 
31
  MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
32
 
33
 
34
+ ##################################################
35
+ # CSV/TXT full-text processing functions
36
+ ##################################################
37
def analyze_csv_file(path: str, max_chars: "int | None" = None) -> str:
    """Render a CSV file as plain text for inclusion in the model prompt.

    The file is parsed with pandas and formatted with ``DataFrame.to_string()``.
    Table text longer than the character budget is cut and suffixed with a
    ``...(truncated)...`` marker.

    Args:
        path: Location of the CSV file on disk.
        max_chars: Maximum number of characters of table text to keep.
            ``None`` (the default) uses the module-level ``MAX_CONTENT_CHARS``;
            negative values are treated as 0.

    Returns:
        A header naming the file followed by the (possibly truncated) table
        text, or a ``Failed to read CSV (...)`` message when the file cannot
        be read or parsed.
    """
    limit = max(0, MAX_CONTENT_CHARS if max_chars is None else max_chars)
    name = os.path.basename(path)
    try:
        # Each rendered row occupies at least two characters (index + newline),
        # so rows beyond `limit` can never reach the kept prefix. Capping the
        # rows read keeps memory and time bounded for very large uploads while
        # leaving the truncation decision unchanged. Column widths and dtypes
        # are derived from the rows actually read.
        df = pd.read_csv(path, nrows=max(limit, 1))
        df_str = df.to_string()
    except Exception as e:  # noqa: BLE001 - best-effort: report the failure to the model instead of crashing the chat
        return f"Failed to read CSV ({name}): {e}"

    if len(df_str) > limit:
        df_str = df_str[:limit] + "\n...(truncated)..."

    return f"**[CSV File: {name}]**\n\n{df_str}"
54
+
55
+
56
def analyze_txt_file(path: str, max_chars: "int | None" = None) -> str:
    """Read a text file and format it for inclusion in the model prompt.

    Only as much of the file as is needed is read from disk. Text longer than
    the character budget is cut and suffixed with a ``...(truncated)...``
    marker.

    Args:
        path: Location of the text file on disk (expected to be UTF-8).
        max_chars: Maximum number of characters of file text to keep.
            ``None`` (the default) uses the module-level ``MAX_CONTENT_CHARS``;
            negative values are treated as 0.

    Returns:
        A header naming the file followed by the (possibly truncated) text,
        or a ``Failed to read TXT (...)`` message when the file cannot be
        opened or decoded.
    """
    limit = max(0, MAX_CONTENT_CHARS if max_chars is None else max_chars)
    name = os.path.basename(path)
    try:
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise show up as a stray "\ufeff" at the start of the text.
        with open(path, "r", encoding="utf-8-sig") as f:
            # One character past the budget is enough to know whether the
            # file has to be truncated, without loading all of it.
            text = f.read(limit + 1)
    except Exception as e:  # noqa: BLE001 - best-effort: report the failure to the model instead of crashing the chat
        return f"Failed to read TXT ({name}): {e}"

    if len(text) > limit:
        text = text[:limit] + "\n...(truncated)..."

    return f"**[TXT File: {name}]**\n\n{text}"
74
+
75
+
76
+ ##################################################
77
+ # Existing media file validation logic (images/videos)
78
+ ##################################################
79
  def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
80
  image_count = 0
81
  video_count = 0
 
102
 
103
  def validate_media_constraints(message: dict, history: list[dict]) -> bool:
104
  """
105
+ - ๋น„๋””์˜ค 1๊ฐœ ์ดˆ๊ณผ ๋ถˆ๊ฐ€
106
+ - ๋น„๋””์˜ค/์ด๋ฏธ์ง€ ํ˜ผํ•ฉ ๋ถˆ๊ฐ€
107
+ - ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜ MAX_NUM_IMAGES ์ดˆ๊ณผ ๋ถˆ๊ฐ€
108
+ - <image> ํƒœ๊ทธ๊ฐ€ ์žˆ์œผ๋ฉด ํƒœ๊ทธ ์ˆ˜์™€ ์ด๋ฏธ์ง€ ์ˆ˜ ์ผ์น˜
109
+ CSV, TXT, PDF ๋“ฑ์€ ์—ฌ๊ธฐ์„œ ์ œํ•œํ•˜์ง€ ์•Š์Œ.
110
  """
111
+ media_files = []
112
+ for f in message["files"]:
113
+ # mp4๋‚˜ ๋Œ€ํ‘œ ์ด๋ฏธ์ง€ ํ™•์žฅ์ž๋งŒ ๊ฒ€์‚ฌ
114
+ # (ํŒŒ์ผ๋ช…์— .png / .jpg / .gif / .webp ๋“ฑ ์žˆ์„ ๋•Œ)
115
+ if f.endswith(".mp4") or re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE):
116
+ media_files.append(f)
117
+
118
+ new_image_count, new_video_count = count_files_in_new_message(media_files)
119
  history_image_count, history_video_count = count_files_in_history(history)
120
  image_count = history_image_count + new_image_count
121
  video_count = history_video_count + new_video_count
 
130
  if "<image>" in message["text"]:
131
  gr.Warning("Using <image> tags with video files is not supported.")
132
  return False
 
 
133
  if video_count == 0 and image_count > MAX_NUM_IMAGES:
134
  gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
135
  return False
136
+ if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
137
+ gr.Warning("The number of <image> tags in the text does not match the number of images.")
138
+ return False
 
 
 
 
 
 
 
139
 
140
  return True
141
 
142
 
143
+ ##################################################
144
+ # Video processing
145
+ ##################################################
146
  def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
147
  vidcap = cv2.VideoCapture(video_path)
148
  fps = vidcap.get(cv2.CAP_PROP_FPS)
 
177
  return content
178
 
179
 
180
+ ##################################################
181
+ # Interleaved <image> tag handling
182
+ ##################################################
183
  def process_interleaved_images(message: dict) -> list[dict]:
184
  logger.debug(f"{message['files']=}")
185
  parts = re.split(r"(<image>)", message["text"])
 
201
  return content
202
 
203
 
204
+ ##################################################
205
+ # Pass the full text of CSV and TXT files to the LLM as well
206
+ ##################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  def process_new_user_message(message: dict) -> list[dict]:
208
  """
209
+ - mp4 -> ๋น„๋””์˜ค ์ฒ˜๋ฆฌ
210
+ - ์ด๋ฏธ์ง€ -> interleaved or multiple
211
+ - CSV -> ์ „์ฒด df.to_string() (๋„ˆ๋ฌด ๊ธธ๋ฉด ์ž˜๋ผ๋ƒ„)
212
+ - TXT -> ์ „์ฒด text (๋„ˆ๋ฌด ๊ธธ๋ฉด ์ž˜๋ผ๋ƒ„)
213
  """
214
  if not message["files"]:
215
  return [{"type": "text", "text": message["text"]}]
216
 
217
+ # ํ™•์žฅ์ž๋ณ„ ๋ถ„๋ฅ˜
218
+ video_files = [f for f in message["files"] if f.endswith(".mp4")]
219
+ image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
220
+ csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
221
+ txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
222
 
223
+ # ์‚ฌ์šฉ์ž ํ…์ŠคํŠธ
224
  content_list = [{"type": "text", "text": message["text"]}]
225
 
226
+ # CSV ์ „๋ฌธ
227
+ for csv_path in csv_files:
228
+ csv_analysis = analyze_csv_file(csv_path)
229
+ content_list.append({"type": "text", "text": csv_analysis})
 
 
 
230
 
231
+ # TXT ์ „๋ฌธ
232
+ for txt_path in txt_files:
233
+ txt_analysis = analyze_txt_file(txt_path)
234
+ content_list.append({"type": "text", "text": txt_analysis})
235
 
236
+ # ๋น„๋””์˜ค
 
237
  if video_files:
 
 
238
  content_list += process_video(video_files[0])
239
  return content_list
240
 
 
243
  return process_interleaved_images(message)
244
 
245
  # ์ผ๋ฐ˜ ์ด๋ฏธ์ง€(์—ฌ๋Ÿฌ ์žฅ)
 
246
  if image_files:
247
+ for img_path in image_files:
248
+ content_list.append({"type": "image", "url": img_path})
249
 
250
  return content_list
251
 
252
 
253
+ ##################################################
254
+ # history -> LLM message conversion
255
+ ##################################################
256
  def process_history(history: list[dict]) -> list[dict]:
257
  messages = []
258
  current_user_content: list[dict] = []
 
271
  return messages
272
 
273
 
274
+ ##################################################
275
+ # ๋ฉ”์ธ ์ถ”๋ก  ํ•จ์ˆ˜
276
+ ##################################################
277
  @spaces.GPU(duration=120)
278
  def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
279
  if not validate_media_constraints(message, history):
 
309
  yield output
310
 
311
 
312
+ ##################################################
313
+ # Example list (existing)
314
+ ##################################################
315
  examples = [
316
  [
317
  {
 
435
  ]
436
 
437
 
438
+ ##################################################
439
+ # Gradio ChatInterface
440
+ ##################################################
441
  demo = gr.ChatInterface(
442
  fn=run,
443
  type="messages",
444
  chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
445
+ # ์—ฌ๊ธฐ์„œ WEBP๋ฅผ ํฌํ•จํ•œ ๋ชจ๋“  ์ด๋ฏธ์ง€, mp4, csv, txt, pdf ํ—ˆ์šฉ
446
  textbox=gr.MultimodalTextbox(
447
+ file_types=["image/*", ".mp4", ".csv", ".txt", ".pdf"],
448
  file_count="multiple",
449
  autofocus=True
450
  ),
451
  multimodal=True,
452
  additional_inputs=[
453
+ gr.Textbox(
454
+ label="System Prompt",
455
+ value=(
456
+ "You are a deeply thoughtful AI. Consider problems thoroughly and derive "
457
+ "correct solutions through systematic reasoning. Please answer in korean."
458
+ )
459
+ ),
460
+ gr.Slider(
461
+ label="Max New Tokens",
462
+ minimum=100,
463
+ maximum=8000,
464
+ step=50,
465
+ value=2000
466
+ ),
467
  ],
468
  stop_btn=False,
469
  title="Gemma 3 27B IT",