seawolf2357 committed on
Commit c3d078f · verified · 1 Parent(s): 63a1096

Update app-backup.py

Files changed (1):
  1. app-backup.py +85 -19
app-backup.py CHANGED
@@ -14,6 +14,9 @@ from loguru import logger
 from PIL import Image
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 
+# [PDF] Add PyPDF2
+import PyPDF2
+
 model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
 processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
 model = Gemma3ForConditionalGeneration.from_pretrained(
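Note: PyPDF2 is in maintenance mode and its development has continued under the name pypdf, which keeps the same PdfReader / extract_text() API, so the new import could be swapped with no other changes. A minimal sketch, assuming pypdf is installed and a hypothetical example.pdf:

```python
# Sketch only: equivalent extraction with pypdf, the maintained successor
# to PyPDF2 (same PdfReader / extract_text() API).
from pypdf import PdfReader  # pip install pypdf

reader = PdfReader("example.pdf")  # hypothetical file
for page in reader.pages:
    print(page.extract_text() or "")
```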
@@ -48,10 +51,20 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
 
 
 def validate_media_constraints(message: dict, history: list[dict]) -> bool:
-    new_image_count, new_video_count = count_files_in_new_message(message["files"])
+    """
+    Check image/video counts, mixing rules, and so on.
+    PDFs are excluded from these checks and are simply allowed through.
+    """
+    # [PDF] Separate out PDF files
+    pdf_files = [f for f in message["files"] if f.endswith(".pdf")]
+    non_pdf_files = [f for f in message["files"] if not f.endswith(".pdf")]
+
+    # The existing checks run only on non_pdf_files (images/videos)
+    new_image_count, new_video_count = count_files_in_new_message(non_pdf_files)
     history_image_count, history_video_count = count_files_in_history(history)
     image_count = history_image_count + new_image_count
     video_count = history_video_count + new_video_count
+
     if video_count > 1:
         gr.Warning("Only one video is supported.")
         return False
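The effect of the partition above is that PDFs never count toward the image/video limits. A minimal, self-contained sketch of the filtering (file names are hypothetical):

```python
# Illustrative only: how the filter partitions a new message's files.
# The dict mirrors the {"text": ..., "files": [...]} shape used above.
message = {"text": "hello", "files": ["report.pdf", "cat.png", "clip.mp4"]}

pdf_files = [f for f in message["files"] if f.endswith(".pdf")]
non_pdf_files = [f for f in message["files"] if not f.endswith(".pdf")]

assert pdf_files == ["report.pdf"]
assert non_pdf_files == ["cat.png", "clip.mp4"]
```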
@@ -63,12 +76,21 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
         gr.Warning("Using <image> tags with video files is not supported.")
         return False
     # TODO: Add frame count validation for videos similar to image count limits # noqa: FIX002, TD002, TD003
+
     if video_count == 0 and image_count > MAX_NUM_IMAGES:
         gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
-    if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
-        gr.Warning("The number of <image> tags in the text does not match the number of images.")
-        return False
+
+    # [PDF] A limit on the number of PDFs could be added here if needed;
+    # for now no limit is imposed.
+
+    # If <image> tags are present, check that they match the image count
+    if "<image>" in message["text"]:
+        # new_image_count is the image count with PDFs excluded
+        if message["text"].count("<image>") != new_image_count:
+            gr.Warning("The number of <image> tags in the text does not match the number of images.")
+            return False
+
     return True
 
 
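Note: the tag check now compares against new_image_count computed from non_pdf_files. Assuming count_files_in_new_message previously counted any non-.mp4 file as an image (its body is not shown in this diff), an upload mixing a PDF with <image> tags would have tripped the mismatch warning before this commit. A sketch:

```python
# Illustrative only: the <image> tag count is now matched against the
# PDF-excluded file list (file names hypothetical).
files = ["notes.pdf", "photo.png"]
text = "<image> describe this photo"
non_pdf = [f for f in files if not f.endswith(".pdf")]
assert text.count("<image>") == len(non_pdf)  # 1 == 1 after the change
```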
@@ -127,20 +149,65 @@ def process_interleaved_images(message: dict) -> list[dict]:
     return content
 
 
+# [PDF] New PDF -> Markdown conversion function
+def pdf_to_markdown(pdf_path: str) -> str:
+    """
+    Extract the text of a PDF file and return it as simple Markdown.
+    """
+    text_chunks = []
+    with open(pdf_path, "rb") as f:
+        reader = PyPDF2.PdfReader(f)
+        for page_num, page in enumerate(reader.pages, start=1):
+            page_text = page.extract_text()
+            page_text = page_text.strip() if page_text else ""
+            if page_text:
+                # Combine a simple per-page header with the body as Markdown
+                text_chunks.append(f"## Page {page_num}\n\n{page_text}\n")
+    return "\n".join(text_chunks)
+
+
 def process_new_user_message(message: dict) -> list[dict]:
+    """
+    Process the text and files (images/videos/PDFs) of a new user message.
+    """
     if not message["files"]:
         return [{"type": "text", "text": message["text"]}]
 
-    if message["files"][0].endswith(".mp4"):
-        return [{"type": "text", "text": message["text"]}, *process_video(message["files"][0])]
+    # [PDF] List of PDF files
+    pdf_files = [f for f in message["files"] if f.endswith(".pdf")]
+    # Images and videos
+    other_files = [f for f in message["files"] if not f.endswith(".pdf")]
+
+    # Put the user's text first
+    content_list = [{"type": "text", "text": message["text"]}]
+
+    # Convert each PDF and append the result
+    for pdf_path in pdf_files:
+        pdf_markdown = pdf_to_markdown(pdf_path)
+        if pdf_markdown.strip():
+            content_list.append({"type": "text", "text": pdf_markdown})
+        else:
+            content_list.append({"type": "text", "text": "(Failed to extract text from the PDF)"})
 
+    # Check whether a video is present
+    video_files = [f for f in other_files if f.endswith(".mp4")]
+    if video_files:
+        # Assumes at most one video (already enforced in validate_media_constraints);
+        # if several slip through, only the first is processed
+        content_list += process_video(video_files[0])
+        return content_list
+
+    # Interleaved images
     if "<image>" in message["text"]:
         return process_interleaved_images(message)
 
-    return [
-        {"type": "text", "text": message["text"]},
-        *[{"type": "image", "url": path} for path in message["files"]],
-    ]
+    # Plain images (possibly several)
+    image_files = [f for f in other_files if not f.endswith(".mp4")]
+    if image_files:
+        content_list += [{"type": "image", "url": path} for path in image_files]
+
+    return content_list
 
 
 def process_history(history: list[dict]) -> list[dict]:
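For a mixed upload, the new code path emits the user's text first, then the PDF text, then the images. A sketch of the shape process_new_user_message would return (paths and extracted text are hypothetical):

```python
# Illustrative only: expected content list for a message carrying one PDF
# and one image.
expected = [
    {"type": "text", "text": "Summarize the report, then describe the photo."},
    {"type": "text", "text": "## Page 1\n\n...text extracted by pdf_to_markdown...\n"},
    {"type": "image", "url": "photo.png"},
]
```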
@@ -318,26 +385,25 @@ examples = [
     ],
 ]
 
-DESCRIPTION = """\
-<img src='https://huggingface.co/spaces/huggingface-projects/gemma-3-12b-it/resolve/main/assets/logo.png' id='logo' />
-
-This is a demo of Gemma 3 27B it, a vision language model with outstanding performance on a wide range of tasks.
-You can upload images, interleaved images and videos. Note that video input only supports single-turn conversation and mp4 input.
-"""
+
 
+# [PDF] Allow .pdf uploads
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
     chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
-    textbox=gr.MultimodalTextbox(file_types=["image", ".mp4"], file_count="multiple", autofocus=True),
+    textbox=gr.MultimodalTextbox(
+        file_types=["image", ".mp4", ".pdf"],  # [PDF] allowed
+        file_count="multiple",
+        autofocus=True,
+    ),
     multimodal=True,
     additional_inputs=[
-        gr.Textbox(label="System Prompt", value="You are a helpful assistant."),
-        gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
+        gr.Textbox(label="System Prompt", value="You are a deeply thoughtful AI. Consider problems thoroughly and derive correct solutions through systematic reasoning. Please answer in Korean."),
+        gr.Slider(label="Max New Tokens", minimum=100, maximum=8000, step=50, value=2000),
     ],
     stop_btn=False,
     title="Gemma 3 27B IT",
-    description=DESCRIPTION,
     examples=examples,
     run_examples_on_click=False,
     cache_examples=False,
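Note: gr.ChatInterface passes each additional_inputs value to fn positionally after the message and history, so run (defined elsewhere in app-backup.py, not shown in this diff) is expected to take the system prompt and token budget as its third and fourth parameters. A sketch with a hypothetical stand-in body:

```python
# Sketch only: how ChatInterface wires the two additional_inputs into fn.
def run(message: dict, history: list, system_prompt: str, max_new_tokens: int):
    yield f"(would generate up to {max_new_tokens} tokens with prompt {system_prompt!r})"
```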