Hammad712 committed
Commit a9589dc (verified)
1 Parent(s): 899a99b

Update main.py

Files changed (1):
  1. main.py +92 -51
main.py CHANGED
@@ -142,62 +142,103 @@ async def startup_event():
 @app.get("/")
 async def root():
     return {
-        "message": "Welcome to the Audio Similarity API!",
-        "usage": {
-            "endpoints": {
-                "gemini": {
-                    "path": "/compare-audio",
-                    "description": "POST two audio files (user recitation and professional qarri) for similarity analysis using Gemini."
-                },
-                "dtw": {
-                    "path": "/compare-dtw",
-                    "description": "POST two audio files (user recitation and professional qarri) for similarity analysis using deep embeddings and DTW."
-                }
-            }
-        }
-    }
+        "message": "Welcome to the Audio Similarity API!"
+
+
+# Load GROQ API key from environment variable
+API_KEY = os.getenv("GROQ_API_KEY")
+if not API_KEY:
+    raise RuntimeError("GROQ_API_KEY environment variable not set")
+client = Groq(api_key=API_KEY)
+
+
+def transcribe_audio(file_tuple: tuple) -> str:
+    """
+    Transcribes speech from an audio file using the GROQ Whisper model.
+    Args:
+        file_tuple (tuple): (filename, file_bytes)
+    Returns:
+        str: The transcription text or error message.
+    """
+    try:
+        transcription = client.audio.transcriptions.create(
+            file=file_tuple,
+            model="whisper-large-v3",
+            response_format="text"
+        )
+        return transcription
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Transcription error: {e}")
+
+
+def levenshtein_similarity(text1: str, text2: str) -> float:
+    """
+    Calculate normalized Levenshtein similarity between two texts.
+    Returns a score between 0 and 1.
+    """
+    distance = Levenshtein.distance(text1, text2)
+    max_len = max(len(text1), len(text2))
+    return 1 - distance / max_len if max_len > 0 else 1.0
+

-@app.post("/compare-audio")
+def find_differences(text_original: str, text_user: str) -> str:
+    """
+    Identify differences between original and user transcriptions using GROQ chat.
+    """
+    messages = [
+        {"role": "system", "content":
+            "You are a helpful assistant that finds mistakes between two texts. "
+            "Provide only the mistakes, no extra explanation."},
+        {"role": "user", "content": (
+            f"Original transcription: '{text_original}'\n"
+            f"User transcription: '{text_user}'\n"
+            "Explain the differences between these texts."
+        )}
+    ]
+    try:
+        completion = client.chat.completions.create(
+            model="mistral-saba-24b",
+            messages=messages,
+            temperature=1,
+            max_tokens=1024,
+            top_p=1,
+            stream=False
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error generating explanation: {e}")
+
+
+@app.post("/compare")
 async def compare_audio(
-    audio1: UploadFile = File(...),
-    audio2: UploadFile = File(...)
+    original_audio: UploadFile = File(...),
+    user_audio: UploadFile = File(...)
 ):
     """
-    Compare two audio files using the Gemini approach.
-    The first audio is the user's recitation and the second is the professional qarri recitation.
+    Endpoint to upload two audio files, transcribe, compare, and return similarity and differences.
     """
-    # Read the uploaded audio files.
-    audio1_bytes = await audio1.read()
-    audio2_bytes = await audio2.read()
-
-    # Create a refined prompt that clearly identifies the audio sources.
-    prompt = (
-        """Please analyze and compare the two provided audio clips.
-        The first audio is the user's recitation, and the second audio is the professional qarri recitation.
-        Evaluate their similarity on a scale from 0 to 1, where:
-        - 1 indicates the user's recitation contains no mistakes compared to the professional version,
-        - 0 indicates there are significant mistakes.
-        Provide your response with:
-        1. A numerical similarity score on the first line.
-        2. A single sentence that indicates whether the user's recitation is similar, moderately similar, or dissimilar to the professional qarri."""
-    )
-
-    # Generate the content using the Gemini model with the two audio inputs.
-    response = client.models.generate_content(
-        model='gemini-2.0-flash',
-        contents=[
-            prompt,
-            types.Part.from_bytes(
-                data=audio1_bytes,
-                mime_type=audio1.content_type,
-            ),
-            types.Part.from_bytes(
-                data=audio2_bytes,
-                mime_type=audio2.content_type,
-            )
-        ]
-    )
-    return {"result": response.text}
+    # Read uploaded files
+    original_bytes = await original_audio.read()
+    user_bytes = await user_audio.read()
+
+    # Transcribe
+    transcription_original = transcribe_audio((original_audio.filename, original_bytes))
+    transcription_user = transcribe_audio((user_audio.filename, user_bytes))
+
+    # Compute similarity
+    similarity_score = levenshtein_similarity(transcription_original, transcription_user)
+
+    # Find differences
+    explanation = find_differences(transcription_original, transcription_user)
+
+    # Build response
+    result = {
+        "original_transcription": transcription_original,
+        "user_transcription": transcription_user,
+        "levenshtein_similarity": round(similarity_score, 2),
+        "explanation_of_differences": explanation
+    }
+    return JSONResponse(content=result)

 @app.post("/compare-dtw")
 async def compare_dtw(
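
The new levenshtein_similarity helper scores the two transcripts as 1 - distance / max_len, so identical transcripts score 1.0 and completely different ones approach 0.0. A minimal sketch of that calculation, using the Levenshtein package the diff calls into (the example strings are illustrative only, not from the commit):

import Levenshtein

# Illustrative strings; the endpoint would pass the two Whisper transcripts here.
text_original = "bismillah ir rahman ir rahim"
text_user = "bismilah ir rahman ir rahim"

distance = Levenshtein.distance(text_original, text_user)   # 1 edit (one missing 'l')
max_len = max(len(text_original), len(text_user))           # 28
similarity = 1 - distance / max_len if max_len > 0 else 1.0
print(round(similarity, 2))                                  # 0.96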
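
For reference, a hedged sketch of how a client might exercise the renamed /compare endpoint after this commit is deployed; the base URL, file names, and MIME type below are assumptions, not part of the diff:

import requests

# Hypothetical host/port and file names; adjust to the actual deployment.
url = "http://localhost:8000/compare"

with open("original.mp3", "rb") as f_orig, open("user.mp3", "rb") as f_user:
    response = requests.post(
        url,
        files={
            "original_audio": ("original.mp3", f_orig, "audio/mpeg"),
            "user_audio": ("user.mp3", f_user, "audio/mpeg"),
        },
    )

# Expected JSON keys (per the diff): original_transcription, user_transcription,
# levenshtein_similarity, explanation_of_differences
print(response.json())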