devcom33 committed on
Commit
5f0a430
·
1 Parent(s): 6be235b

host my awalit

Browse files
Files changed (8) hide show
  1. .gitignore +1 -0
  2. Dockerfile +11 -0
  3. app.py +81 -0
  4. config.py +18 -0
  5. models.py +59 -0
  6. requirements.txt +9 -0
  7. services.py +45 -0
  8. utils.py +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the transcription/summarization API.
FROM python:3.10

WORKDIR /app

# Install dependencies first, from requirements.txt alone, so this layer is
# reused from the build cache when only application source changes.
COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source last; edits here do not invalidate the
# dependency layer above.
COPY . .

EXPOSE 7860

# Serve the FastAPI instance `app` from app.py on all interfaces.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sys
3
+ from fastapi import FastAPI, UploadFile, File, HTTPException
4
+ from pydantic import BaseModel
5
+ import config
6
+ from models import load_whisper, load_summarizer, load_spacy
7
+ from services import process_transcription, process_summary
8
+
9
+
10
# Attach a stdout handler to the root logger. Without any logging
# configuration the INFO records emitted below (and by the model loaders)
# are discarded, because the root logger defaults to WARNING.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Transcription and Summarization API",
    description="API using Faster-Whisper, spaCy, and Hugging Face Transformers",
    version="1.0.0"
)

# Models are loaded once, at import time, and shared by every request.
logger.info("Application starting up - loading models...")
whisper_model = load_whisper(config)
summarizer_pipeline = load_summarizer(config)
nlp_spacy = load_spacy(config)
logger.info("Model loading complete.")

# A missing model does not abort startup: the endpoints check these globals
# and answer 503 (or skip spaCy preprocessing) when one is unavailable.
if not whisper_model:
    logger.critical("Whisper model failed to load. Transcription endpoint will be unavailable.")
if not summarizer_pipeline:
    logger.critical("Summarizer pipeline failed to load. Summarization endpoint will be unavailable.")
if not nlp_spacy:
    logger.warning("SpaCy model failed to load. Summarization will proceed without spaCy preprocessing.")
30
+
31
+
32
class TranscriptInput(BaseModel):
    """Request body carrying the transcript text to be summarized."""

    # Raw transcript text.
    transcript: str
34
+
35
+
36
@app.get("/health")
def health():
    """Report service status plus one flag per model telling whether it is loaded."""
    model_flags = {
        "whisper_loaded": whisper_model is not None,
        "summarizer_loaded": summarizer_pipeline is not None,
        "spacy_loaded": nlp_spacy is not None,
    }
    return {"status": "ok", **model_flags}
43
+
44
+
45
@app.post("/transcribe")
async def transcription(audio_file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    Returns:
        ``{"transcript": <text>}`` on success.

    Raises:
        HTTPException: 503 when the Whisper model is not loaded, 400 when the
            upload is empty or processing raises ``ValueError``, 500 for any
            other failure.
    """
    # Local import keeps this block self-contained (stdlib, Python 3.9+).
    import asyncio

    if whisper_model is None:
        raise HTTPException(status_code=503, detail="Transcription service unavailable.")

    try:
        content = await audio_file.read()
        if not content:
            # Handled by the ValueError branch below -> HTTP 400.
            raise ValueError("Uploaded audio file is empty.")

        # Transcription is CPU-bound; run it in a worker thread so the event
        # loop keeps serving other requests in the meantime.
        transcript, info = await asyncio.to_thread(
            process_transcription, content, whisper_model
        )
        logger.info(f"Transcription successful. Language: {info.language}")
        return {"transcript": transcript}
    except ValueError as ve:
        logger.error(f"Value error during transcription processing: {ve}")
        raise HTTPException(status_code=400, detail=str(ve)) from ve
    except Exception as e:
        logger.error(f"Unhandled error during transcription: {e}", exc_info=True)
        raise HTTPException(
            status_code=500, detail="Internal server error during transcription."
        ) from e
61
+
62
+
63
@app.post("/summarize")
def summarize(input: TranscriptInput):
    """Summarize the transcript supplied in the request body.

    Returns:
        ``{"summary": <text>}`` on success.

    Raises:
        HTTPException: 503 when the summarizer is not loaded, 400 when the
            transcript is empty/whitespace-only or processing raises
            ``ValueError``, 500 for any other failure.
    """
    if summarizer_pipeline is None:
        raise HTTPException(status_code=503, detail="Summarization service unavailable.")
    # strip() so that a whitespace-only transcript is rejected as empty too.
    if not input.transcript.strip():
        raise HTTPException(status_code=400, detail="Transcript cannot be empty.")

    try:
        summary = process_summary(input.transcript, summarizer_pipeline, nlp_spacy, config)
        return {"summary": summary}
    except ValueError as ve:
        logger.error(f"Value error during summary processing: {ve}")
        raise HTTPException(status_code=400, detail=str(ve)) from ve
    except Exception as e:
        logger.error(f"Unhandled error during summarization: {e}", exc_info=True)
        raise HTTPException(
            status_code=500, detail="Internal server error during summarization."
        ) from e
config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Static configuration for the transcription and summarization service."""
import os

import psutil
#from dotenv import load_dotenv


#load_dotenv()

# Faster-Whisper settings.
WHISPER_MODEL_NAME = "tiny"
WHISPER_DEVICE = "cpu"
WHISPER_COMPUTE_TYPE = "int8"

# Token read from the HUGGINGFACE_API_KEY environment variable.
PYANNOTE_AUTH_TOKEN = os.getenv("HUGGINGFACE_API_KEY")

# Summarization settings.
SUMMARIZER_MODEL = "facebook/bart-large-cnn"
SUMMARIZER_MAX_LENGTH = 150
SUMMARIZER_MIN_LENGTH = 50

SPACY_MODEL = "en_core_web_sm"

# psutil.cpu_count(logical=False) returns None when the physical core count
# cannot be determined; max(1, None) would then raise TypeError at import.
# Fall back to the logical count, and finally to a single thread.
CPU_THREADS = max(1, psutil.cpu_count(logical=False) or os.cpu_count() or 1)

# Fail fast at import time when the token is missing.
if not PYANNOTE_AUTH_TOKEN:
    raise ValueError("HUGGINGFACE_API_KEY not set in environment variables")
models.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from faster_whisper import WhisperModel
3
+ import spacy
4
+ from transformers import pipeline
5
+ import os
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def load_whisper(config):
    """Construct the Faster-Whisper model described by ``config``.

    Reads ``WHISPER_MODEL_NAME``, ``WHISPER_DEVICE``, ``WHISPER_COMPUTE_TYPE``
    and ``CPU_THREADS`` from ``config``.

    Returns:
        The ``WhisperModel`` instance, or ``None`` if anything raised while
        building it (the error is logged with its traceback).
    """
    logger.info("Loading Whisper model...")
    loaded = None
    try:
        options = {
            "device": config.WHISPER_DEVICE,
            "compute_type": config.WHISPER_COMPUTE_TYPE,
            "cpu_threads": config.CPU_THREADS,
        }
        whisper = WhisperModel(config.WHISPER_MODEL_NAME, **options)
        logger.info(f"Whisper model '{config.WHISPER_MODEL_NAME}' loaded on {config.WHISPER_DEVICE}.")
        loaded = whisper
    except Exception as exc:
        logger.error(f"Failed to load Whisper model: {exc}", exc_info=True)
    return loaded
24
+
25
def load_summarizer(config):
    """Create the summarization pipeline for ``config.SUMMARIZER_MODEL``.

    Returns:
        The pipeline object, or ``None`` if creating it raised (the error is
        logged with its traceback).
    """
    logger.info("Loading Summarization pipeline...")
    loaded = None
    try:
        summarization = pipeline("summarization", model=config.SUMMARIZER_MODEL)
        logger.info("Summarization pipeline loaded.")
        loaded = summarization
    except Exception as exc:
        logger.error(f"Failed to load Summarization pipeline: {exc}", exc_info=True)
    return loaded
34
+
35
def load_spacy(config):
    """Load the spaCy model named by ``config.SPACY_MODEL``.

    If the model is not installed (``spacy.load`` raises ``OSError``), one
    download attempt is made before loading again.

    Args:
        config: Object exposing ``SPACY_MODEL``; ``"en_core_web_sm"`` is used
            when the attribute is absent.

    Returns:
        The loaded spaCy pipeline, or ``None`` if loading (and the download
        fallback) failed.
    """
    # Honour the configured model name instead of a hard-coded one.
    model_name = getattr(config, "SPACY_MODEL", "en_core_web_sm")
    logger.info("Loading spaCy model...")

    try:
        nlp = spacy.load(model_name)
        logger.info(f"spaCy model '{model_name}' loaded.")
        return nlp
    except OSError:
        logger.warning(f"spaCy model '{model_name}' not found. Trying to download...")
    except Exception as e:
        logger.error(f"Failed to load spaCy model: {e}", exc_info=True)
        return None

    try:
        spacy.cli.download(model_name)
        nlp = spacy.load(model_name)
        logger.info(f"spaCy model '{model_name}' downloaded and loaded.")
        return nlp
    except (Exception, SystemExit) as download_e:
        # SystemExit is caught as well: the CLI download helper may exit the
        # process on failure, and this loader is expected to return None
        # rather than take the application down.
        logger.error(
            f"Failed to download or load spaCy model '{model_name}': {download_e}",
            exc_info=True,
        )
        return None
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
fastapi
uvicorn
# Needed by FastAPI to parse multipart/form-data uploads (UploadFile / File).
python-multipart
transformers
torch
faster_whisper
spacy
pydub
psutil
# "logging" removed: it is part of the Python standard library. The PyPI
# package with that name is an obsolete Python 2 backport that fails to
# install on Python 3.
services.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import tempfile
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
def process_transcription(audio_content: bytes, whisper_model):
    """Transcribe raw audio bytes with the given Whisper model.

    The bytes are written to a temporary file, which is passed by path to
    ``whisper_model.transcribe`` and removed afterwards, whether or not
    transcription succeeded.

    Args:
        audio_content: Raw bytes of the audio file.
        whisper_model: Object exposing ``transcribe(path, beam_size=...)``
            that returns ``(segments, info)``.

    Returns:
        Tuple ``(transcript, info)``: the stripped segment texts joined by
        single spaces, and the ``info`` object returned by the model.

    Raises:
        ValueError: If the model is missing or ``audio_content`` is empty.
    """
    if not whisper_model:
        raise ValueError("Whisper model not loaded.")
    if not audio_content:
        # Reject early instead of handing an empty file to the decoder.
        raise ValueError("Audio content is empty.")

    temp_file_path = None
    try:
        # delete=False: the file must outlive the `with` block so the model
        # can open it by path; it is removed in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(audio_content)

        segments, info = whisper_model.transcribe(temp_file_path, beam_size=5)
        # Consume the segments here, while the temporary file still exists.
        transcript = " ".join(seg.text.strip() for seg in segments)
        return transcript, info
    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
23
+
24
def process_summary(text: str, summarizer_pipeline, nlp_spacy, config):
    """Summarize ``text`` with the given summarization pipeline.

    When ``nlp_spacy`` is provided, the text is first split into sentences
    and re-joined with single spaces; if that step fails, the error is logged
    and the original text is summarized instead.

    Args:
        text: Transcript to summarize.
        summarizer_pipeline: Callable taking the text plus generation keyword
            arguments and returning ``[{"summary_text": ...}]``.
        nlp_spacy: Optional callable returning a document with ``.sents``.
        config: Object exposing ``SUMMARIZER_MAX_LENGTH`` and
            ``SUMMARIZER_MIN_LENGTH``.

    Returns:
        The summary string.

    Raises:
        ValueError: If the pipeline is missing or ``text`` is empty or
            whitespace-only.
    """
    if not summarizer_pipeline:
        raise ValueError("Summarizer model not loaded.")
    if not text or not text.strip():
        raise ValueError("Transcript cannot be empty.")

    processed_text = text
    if nlp_spacy:
        try:
            doc = nlp_spacy(text)
            # Keep the raw text if sentence splitting yields nothing.
            processed_text = " ".join(sent.text.strip() for sent in doc.sents) or text
        except Exception as e:
            # Best effort: preprocessing is optional, so continue with raw text.
            logger.error(f"SpaCy processing failed: {e}", exc_info=True)

    summary_output = summarizer_pipeline(
        processed_text,
        max_length=config.SUMMARIZER_MAX_LENGTH,
        min_length=config.SUMMARIZER_MIN_LENGTH,
        do_sample=False,
        # Long transcripts can exceed the model's maximum input length;
        # truncate rather than fail.
        truncation=True,
    )

    return summary_output[0]["summary_text"]
utils.py ADDED
File without changes