notrey committed
Commit 999d54c · 1 Parent(s): a6197da

updating prj

Files changed (2):
  1. app.py +41 -127
  2. requirements.txt +3 -6
app.py CHANGED
@@ -1,133 +1,47 @@
- import streamlit as st
- from PIL import Image
  import numpy as np
- import tempfile
- import soundfile as sf
- import torch
- import easyocr
- import omegaconf
-
- # Inject custom CSS
- st.markdown(
  """
- <style>
- /* Customize the background and text colors */
- .reportview-container {
-     background-color: #FFFFFF;
- }
- .main .block-container {
-     background-color: #FFFFFF;
-     color: #008000;
- }
- /* Customize buttons, headers, etc. */
- .stButton>button {
-     background-color: #008000;
-     color: #FFFFFF;
- }
- </style>
- """,
- unsafe_allow_html=True
  )
-
-
- # ---------------------------
- # Caching the OCR reader for performance
- # ---------------------------
- @st.cache_resource(show_spinner=False)
- def load_ocr_reader(languages):
-     # EasyOCR expects language codes like "en", "es", "ch_sim", "ar"
-     return easyocr.Reader(languages, gpu=False)
-
- # ---------------------------
- # Caching TTS model loading (Silero TTS)
- # ---------------------------
- @st.cache_resource(show_spinner=False)
- def load_tts_model(language):
-     # Map our language codes to Silero model speakers.
-     # Note: Silero officially supports 'en' (and some community models for other languages).
-     # For demonstration, if a language isn't available, we fallback to English.
-     lang_speaker_map = {
-         'en': 'v3_en',
-         'es': 'v3_es',  # if available; otherwise, you might need to train or use an English model
-         'ch': 'v3_en',  # fallback to English for now (or replace with an experimental Chinese model)
-         'ar': 'v3_en'   # fallback to English (or an experimental Arabic model if available)
-     }
-     speaker = lang_speaker_map.get(language, 'v3_en')
-     device = torch.device('cpu')
-     # Load the Silero TTS model from torch.hub.
-     # This command will download the model the first time you run it.
-     model, example_text, sample_rate, speakers = torch.hub.load(
-         repo_or_dir='snakers4/silero-models',
-         model='silero_tts',
-         language=language,
-         speaker=speaker
-     )
-     return model, sample_rate, speaker
-
- def synthesize_speech(text, language):
-     model, sample_rate, speaker = load_tts_model(language)
-     # Synthesize speech; the output is a NumPy array with the audio waveform.
-     audio = model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
-     return audio, sample_rate
-
- def save_audio(audio, sample_rate):
-     # Save audio to a temporary file and return its path.
-     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as f:
-         sf.write(f.name, audio, sample_rate)
-         return f.name
-
- def extract_text_from_image(image_array, languages):
-     reader = load_ocr_reader(languages)
-     results = reader.readtext(image_array)
-     # Concatenate detected text parts.
-     extracted_text = " ".join([res[1] for res in results])
-     return extracted_text
-
- # ---------------------------
- # Mapping for EasyOCR language codes (EasyOCR uses 'ch_sim' for Chinese simplified)
- # ---------------------------
- ocr_language_map = {
-     'en': 'en',
-     'es': 'es',
-     'ch': 'ch_sim',
-     'ar': 'ar'
- }
-
- # ---------------------------
- # Streamlit App UI
- # ---------------------------
- st.title("Image-to-Audio Description App")
- st.write("Upload an image or enter text to generate audio descriptions.")
-
- # Select language for both OCR and TTS
- language = st.selectbox("Select language", options=['en', 'es', 'ch', 'ar'], index=0)
-
- # Choose input method
- input_method = st.radio("Input method", options=["Upload Image", "Enter Text"])
-
- text = ""
-
- if input_method == "Upload Image":
-     uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
-     if uploaded_file is not None:
-         image = Image.open(uploaded_file)
-         st.image(image, caption='Uploaded Image', use_column_width=True)
-         # Convert PIL image to numpy array for EasyOCR
-         image_array = np.array(image)
-         with st.spinner("Extracting text from image..."):
-             # EasyOCR expects language codes; here we wrap our choice.
-             ocr_lang = [ocr_language_map.get(language, 'en')]
-             text = extract_text_from_image(image_array, ocr_lang)
-         st.write("**Extracted Text:**")
-         st.write(text)
- else:
-     text = st.text_area("Enter text to synthesize", "Type your description here...")
-
- if text and st.button("Generate Speech"):
-     with st.spinner("Synthesizing speech..."):
-         audio, sr = synthesize_speech(text, language)
-         audio_file = save_audio(audio, sr)
-     st.success("Audio generated!")
-     st.audio(audio_file)
 
+ import cv2
+ import gradio as gr
  import numpy as np
+ from fer import FER
+
+ # Initialize the pre-trained detector once so you don't reinitialize on every function call.
+ detector = FER(mtcnn=True)  # Optionally, you can set mtcnn to False to use a faster (but less accurate) cascade.
+
+ def emotion_recognition(image):
  """
+     Process the input image, detect emotions on faces,
+     and annotate the image with bounding boxes and emotion labels.
+
+     Parameters:
+         image (numpy.ndarray): Input image (RGB).
+
+     Returns:
+         numpy.ndarray: Annotated image with emotion labels.
+     """
+     # fer works with RGB images, which is what Gradio provides by default.
+     results = detector.detect_emotions(image)
+     annotated_image = image.copy()
+
+     # Loop through each detected face
+     for face in results:
+         (x, y, w, h) = face["box"]
+         # Get the dominant emotion for the detected face
+         dominant_emotion = max(face["emotions"].items(), key=lambda item: item[1])[0]
+         # Draw bounding box around face
+         cv2.rectangle(annotated_image, (x, y), (x + w, y + h), (0, 255, 0), 2)
+         # Put the emotion label above the bounding box
+         cv2.putText(annotated_image, dominant_emotion, (x, y - 10),
+                     cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)
+     return annotated_image
+
+ # Create a Gradio Interface
+ interface = gr.Interface(
+     fn=emotion_recognition,
+     inputs=gr.Image(type="numpy", label="Input Image"),
+     outputs=gr.Image(type="numpy", label="Annotated Image"),
+     title="Facial Emotion Recognition",
+     description="Upload an image and let the app detect and annotate facial emotions."
  )
+
+ # Run the app locally
+ if __name__ == "__main__":
+     interface.launch()
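A quick way to sanity-check the detector outside Gradio (a minimal sketch, not part of this commit; "sample.jpg" is a hypothetical local test image):

import cv2
from fer import FER

detector = FER(mtcnn=True)

# fer expects RGB, but cv2.imread returns BGR, so convert first.
bgr = cv2.imread("sample.jpg")  # hypothetical local test image
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

# detect_emotions returns one dict per face, e.g.
# [{"box": [x, y, w, h], "emotions": {"angry": 0.01, ..., "happy": 0.92}}]
for face in detector.detect_emotions(rgb):
    x, y, w, h = face["box"]
    label, score = max(face["emotions"].items(), key=lambda item: item[1])
    print(f"Face at ({x}, {y}, {w}, {h}): {label} ({score:.2f})")
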
requirements.txt CHANGED
@@ -1,7 +1,4 @@
- streamlit
- easyocr
- torch
- soundfile
- Pillow
  numpy
- omegaconf

+ gradio
+ opencv-python
+ fer
  numpy