openfree committed
Commit 55cddee · verified · 1 Parent(s): 9cd25a0

Update app.py

Files changed (1)
app.py +129 -53
app.py CHANGED
@@ -4,10 +4,15 @@ import gradio as gr
 from transformers import pipeline
 from huggingface_hub import InferenceClient
 import os
+import numpy as np
+from pydub import AudioSegment
+import tempfile
+import math
 
 MODEL_NAME = "openai/whisper-large-v3-turbo"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
+CHUNK_LENGTH = 10 * 60  # split into 10-minute chunks
 
 device = 0 if torch.cuda.is_available() else "cpu"
 
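Two notes on the added imports: `numpy` is imported but never referenced in the new code, and `pydub` decodes non-WAV formats through ffmpeg, so the Space must have ffmpeg available (on Hugging Face Spaces that usually means a `packages.txt` entry; this repo's setup is not visible in the diff, so treat that as an assumption). `pydub` also measures and slices audio in milliseconds, which is why the chunking code multiplies `CHUNK_LENGTH` by 1000. A minimal sketch:

```python
# Sketch only: assumes ffmpeg is installed and "sample.mp3" exists
# (the file name is hypothetical, not part of this commit).
from pydub import AudioSegment

CHUNK_LENGTH = 10 * 60  # seconds, as defined in the commit
audio = AudioSegment.from_file("sample.mp3")
first_chunk = audio[:CHUNK_LENGTH * 1000]  # slicing is millisecond-based
print(len(first_chunk) / 1000, "seconds")  # len() is in milliseconds too
```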
@@ -25,83 +30,140 @@ hf_client = InferenceClient(
     token=os.getenv("HF_TOKEN")
 )
 
-@spaces.GPU
-def transcribe_summarize(audio_input, task):
-    if audio_input is None:
-        raise gr.Error("No audio file was submitted!")
-
-    # Convert speech to text
+def split_audio(audio_path, chunk_length=CHUNK_LENGTH):
+    """Split an audio file into chunks"""
+    audio = AudioSegment.from_file(audio_path)
+    duration = len(audio) / 1000  # convert to seconds
+    chunks = []
+
+    # Calculate the number of chunks
+    num_chunks = math.ceil(duration / chunk_length)
+
+    for i in range(num_chunks):
+        start_time = i * chunk_length * 1000  # milliseconds
+        end_time = min((i + 1) * chunk_length * 1000, len(audio))
+
+        chunk = audio[start_time:end_time]
+
+        # Save the chunk to a temporary file
+        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+            chunk.export(temp_file.name, format='wav')
+            chunks.append(temp_file.name)
+
+    return chunks, num_chunks
+
+def process_chunk(chunk_path, task):
+    """Process a single chunk"""
     result = pipe(
-        audio_input,
+        chunk_path,
         batch_size=BATCH_SIZE,
         generate_kwargs={"task": task},
         return_timestamps=True
     )
-    transcribed_text = result["text"]
 
-    # Summarize the text (modified section)
-    try:
-        # Build the summarization prompt
-        prompt = f"""Please briefly summarize the text below:
-
-Text: {transcribed_text}
-
-Summary:"""
-
-        # Call the API
-        response = hf_client.text_generation(
-            model="CohereForAI/c4ai-command-r-plus-08-2024",
-            prompt=prompt,
-            max_new_tokens=150,
-            temperature=0.3,
-            top_p=0.9,
-            repetition_penalty=1.2,
-            stop_sequences=["\n", "Text:", "Summary:"]
-        )
-
-        # Handle the API response (modified section)
-        if isinstance(response, str):
-            summary_text = response
-        else:
-            summary_text = response.generated_text if hasattr(response, 'generated_text') else str(response)
-
-        # Strip the prompt portion
-        if "Summary:" in summary_text:
-            summary_text = summary_text.split("Summary:")[1].strip()
-
-        if not summary_text:
-            summary_text = "Could not generate a summary."
-
+    # Delete the temporary file
+    os.unlink(chunk_path)
+
+    return result["text"]
+
+def update_progress(progress):
+    """Update the progress message"""
+    return f"Processing... {progress}% complete"
+
+@spaces.GPU
+def transcribe_summarize(audio_input, task, progress=gr.Progress()):
+    if audio_input is None:
+        raise gr.Error("No audio file was submitted!")
+
+    try:
+        # Split the audio file
+        chunks, num_chunks = split_audio(audio_input)
+        progress(0, desc="Audio file split complete")
+
+        # Process each chunk
+        transcribed_texts = []
+        for i, chunk in enumerate(chunks):
+            chunk_text = process_chunk(chunk, task)
+            transcribed_texts.append(chunk_text)
+            progress((i + 1) / num_chunks, desc=f"Processing chunk {i+1}/{num_chunks}")
+
+        # Combine the full text
+        transcribed_text = " ".join(transcribed_texts)
+        progress(0.9, desc="Transcription complete")
+
+        # Summarize the text
+        try:
+            # Summarization prompt for long input
+            prompt = f"""Please briefly summarize the main points of the following long text:
+Text: {transcribed_text[:3000]}...  # if the text is too long, only the beginning is summarized
+Summary:"""
+
+            response = hf_client.text_generation(
+                model="CohereForAI/c4ai-command-r-plus-08-2024",
+                prompt=prompt,
+                max_new_tokens=250,
+                temperature=0.3,
+                top_p=0.9,
+                repetition_penalty=1.2,
+                stop_sequences=["\n", "Text:", "Summary:"]
+            )
+
+            summary_text = str(response)
+            if "Summary:" in summary_text:
+                summary_text = summary_text.split("Summary:")[1].strip()
+
+        except Exception as e:
+            print(f"Error while generating summary: {str(e)}")
+            summary_text = "Could not generate a summary. The text is too long or an error occurred during processing."
+
+        progress(1.0, desc="Processing complete")
+        return [transcribed_text, summary_text]
 
     except Exception as e:
-        print(f"Error while generating summary: {str(e)}")  # debug log
-        summary_text = "Could not generate a summary. Please try again later."
-
-    print(f"Transcribed text: {transcribed_text}")  # debug log
-    print(f"Generated summary: {summary_text}")  # debug log
-
-    return [transcribed_text, summary_text]
+        error_msg = f"An error occurred during speech processing: {str(e)}"
+        return ["", error_msg]
 
 # CSS styles
 css = """
 footer { visibility: hidden; }
+.progress-bar { height: 15px; border-radius: 5px; }
+.container { max-width: 1200px; margin: auto; padding: 20px; }
 """
 
 # File upload interface
 file_transcribe = gr.Interface(
     fn=transcribe_summarize,
     inputs=[
-        gr.Audio(sources="upload", type="filepath", label="Audio file"),
+        gr.Audio(
+            sources="upload",
+            type="filepath",
+            label="Audio file"
+        ),
         gr.Radio(
             choices=["transcribe", "translate"],
             label="Task",
             value="transcribe"
-        ),
+        )
     ],
     outputs=[
-        gr.Textbox(label="Transcribed text", lines=5),
-        gr.Textbox(label="Summary", lines=3)
+        gr.Textbox(
+            label="Transcribed text",
+            lines=10,
+            max_lines=30,
+            placeholder="The transcribed speech will appear here..."
+        ),
+        gr.Textbox(
+            label="Summary",
+            lines=5,
+            placeholder="The text summary will appear here..."
+        )
     ],
-    title="Dictation AI: convert speech to text and summarize",
+    title="Dictation AI: long-form speech transcription and summarization",
+    description="""
+    Long audio files (one hour or more) can also be processed.
+    Processing time increases in proportion to file length.
+    Progress is displayed during conversion.
+    """,
     flagging_mode="never"
 )
 
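One caveat in the committed `process_chunk`: `os.unlink` runs only after a successful transcription, so a chunk that fails mid-pipeline leaves its temporary WAV file behind (the outer `except` in `transcribe_summarize` returns without cleaning up the remaining chunks). A hardening sketch, not part of this commit, reusing the `pipe` and `BATCH_SIZE` defined in app.py:

```python
import os

def process_chunk(chunk_path, task):
    """Transcribe one chunk and always remove its temp file afterwards."""
    try:
        result = pipe(
            chunk_path,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]
    finally:
        os.unlink(chunk_path)  # runs on success and on failure
```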
@@ -109,24 +171,34 @@ file_transcribe = gr.Interface(
 mic_transcribe = gr.Interface(
     fn=transcribe_summarize,
     inputs=[
-        gr.Audio(sources="microphone", type="filepath"),
+        gr.Audio(
+            sources="microphone",
+            type="filepath"
+        ),
         gr.Radio(
             choices=["transcribe", "translate"],
             label="Task",
             value="transcribe"
-        ),
+        )
     ],
     outputs=[
-        gr.Textbox(label="Transcribed text", lines=5),
-        gr.Textbox(label="Summary", lines=3)
+        gr.Textbox(
+            label="Transcribed text",
+            lines=10,
+            max_lines=30
+        ),
+        gr.Textbox(
+            label="Summary",
+            lines=5
+        )
     ],
-    title="Dictation AI: convert speech to text and summarize",
+    title="Dictation AI: record and transcribe speech",
    flagging_mode="never",
     css=css
 )
 
 # Main application
-demo = gr.Blocks(theme="Nymbo/Nymbo_Theme",css=css)
+demo = gr.Blocks(theme="gradio/soft", css=css)
 with demo:
     gr.TabbedInterface(
         [file_transcribe, mic_transcribe],
@@ -134,4 +206,8 @@ with demo:
     )
 
 # Run the application
-demo.queue().launch(ssr_mode=False)
+demo.queue(concurrency_count=1).launch(
+    share=False,
+    debug=True,
+    ssr_mode=False
+)
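Note that `queue(concurrency_count=1)` is the Gradio 3.x spelling: `concurrency_count` was removed in Gradio 4.0, while `ssr_mode` is a newer `launch()` flag, so the two are unlikely to be accepted by the same Gradio release. If this Space runs on a recent Gradio (an assumption; the pinned version is not visible here), the rough equivalent would be:

```python
# Assumes Gradio 4.x+, where default_concurrency_limit replaces concurrency_count.
demo.queue(default_concurrency_limit=1).launch(
    share=False,
    debug=True,
    ssr_mode=False
)
```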