medtty committed
Commit 067b419 · 1 Parent(s): 528ce19
Files changed (4):
  1. .gitignore +2 -1
  2. app.py +287 -90
  3. examples/.DS_Store +0 -0
  4. requirements.txt +3 -1
.gitignore CHANGED
@@ -1 +1,2 @@
-training.py
+trainig.py
+*.bak
app.py CHANGED
@@ -3,14 +3,24 @@ import tensorflow as tf
 import numpy as np
 import json
 from PIL import Image
-from fastapi import FastAPI, UploadFile, File
+from fastapi import FastAPI, UploadFile, File, WebSocket, Request, Response
+from fastapi.responses import StreamingResponse
 import uvicorn
-import cv2 # Import OpenCV
-import mediapipe as mp # Import MediaPipe
+import cv2
+import mediapipe as mp
+import io
+import base64
+import asyncio
+import time
+from typing import List, Dict, Any
+from pydantic import BaseModel
 
 # Initialize MediaPipe Hands
 mp_hands = mp.solutions.hands
-hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)
+# For static images, we use static_image_mode=True
+hands_static = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)
+# For video streams, we use static_image_mode=False for better performance
+hands_video = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5)
 mp_drawing = mp.solutions.drawing_utils
 
 # Create both Gradio and FastAPI apps
@@ -28,129 +38,208 @@ with open('model/class_indices.json') as f:
 
 index_to_class = {int(k): v for k, v in class_indices.items()}
 
+# Model and processing parameters
+MODEL_INPUT_SIZE = (224, 224)
+DETECTION_FREQUENCY = 5  # Process every Nth frame for performance
+CONFIDENCE_THRESHOLD = 0.5  # Minimum confidence to report a gesture
+
+# Data models for API
+class GestureResponse(BaseModel):
+    class_name: str
+    confidence: float
+    timestamp: float
+    all_predictions: Dict[str, float] = None
+
+class StreamRequest(BaseModel):
+    stream_id: str = None
+    width: int = 640
+    height: int = 480
+    fps: int = 15
+
+# Cache to store most recent detection results
+detection_cache = {}
+
 # Preprocess function now expects a PIL Image (already cropped)
 def preprocess_image(image):
     # Ensure image is RGB before resizing and converting
     if image.mode != 'RGB':
         image = image.convert('RGB')
-    image = image.resize((224, 224))
+    image = image.resize(MODEL_INPUT_SIZE)
     image_array = np.array(image) / 255.0
-    # The input tensor is expected to be float32
     return np.expand_dims(image_array, axis=0).astype(np.float32)
 
-# Modified predict function to include hand detection and cropping
-def predict(image_pil):
+def detect_and_crop_hand(image_rgb):
+    """Detect hand in the image and return cropped hand region if found"""
+    h, w = image_rgb.shape[:2]
+    results = hands_static.process(image_rgb)
+
+    if not results.multi_hand_landmarks:
+        return None, "No hand detected"
+
+    # Get the first hand detected
+    hand_landmarks = results.multi_hand_landmarks[0]
+
+    # Calculate bounding box from landmarks
+    x_min, y_min = w, h
+    x_max, y_max = 0, 0
+    for landmark in hand_landmarks.landmark:
+        x, y = int(landmark.x * w), int(landmark.y * h)
+        if x < x_min: x_min = x
+        if y < y_min: y_min = y
+        if x > x_max: x_max = x
+        if y > y_max: y_max = y
+
+    # Add padding to the bounding box
+    padding = 30
+    x_min = max(0, x_min - padding)
+    y_min = max(0, y_min - padding)
+    x_max = min(w, x_max + padding)
+    y_max = min(h, y_max + padding)
+
+    # Check for valid dimensions
+    if x_min >= x_max or y_min >= y_max:
+        return None, "Invalid bounding box"
+
+    # Crop the hand region
+    cropped_image = image_rgb[y_min:y_max, x_min:x_max]
+
+    if cropped_image.size == 0:
+        return None, "Empty cropped image"
+
+    return cropped_image, None
+
+def process_frame_for_gesture(frame):
+    """Process a single frame for hand gesture recognition"""
     try:
-        print(f"Original image mode: {image_pil.mode}, size: {image_pil.size}")
+        # Convert to RGB for MediaPipe
+        if frame.shape[2] == 4:  # RGBA
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
+        elif frame.shape[2] == 3 and frame.dtype == np.uint8:
+            # Assuming BGR from OpenCV
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+        # Detect and crop hand
+        cropped_hand, error = detect_and_crop_hand(frame)
+        if error:
+            return {"error": error}
+
+        # Convert cropped NumPy array to PIL Image
+        cropped_pil = Image.fromarray(cropped_hand)
+
+        # Preprocess and predict
+        processed_image = preprocess_image(cropped_pil)
+        interpreter.set_tensor(input_details[0]['index'], processed_image)
+        interpreter.invoke()
+        output_data = interpreter.get_tensor(output_details[0]['index'])
+        prediction = output_data[0]
+
+        # Get the prediction result
+        predicted_class_idx = int(np.argmax(prediction))
+        confidence = float(prediction[predicted_class_idx])
+        predicted_class = index_to_class.get(predicted_class_idx, f"unknown_{predicted_class_idx}")
+
+        # Return prediction info
+        return {
+            "class": predicted_class,
+            "confidence": confidence,
+            "timestamp": time.time(),
+            "all_predictions": {
+                index_to_class.get(i, f"class_{i}"): float(prediction[i])
+                for i in range(len(prediction))
+            }
+        }
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return {"error": str(e)}
 
-        # Convert PIL image to OpenCV format (NumPy array)
+def predict(image_pil):
+    """Original prediction function for Gradio interface"""
+    try:
+        # Convert PIL image to OpenCV format
         image_cv = np.array(image_pil)
-        # Convert RGB (from PIL) to BGR (for OpenCV display if needed) then back to RGB for MediaPipe
+
+        # Process the image with MediaPipe Hands
         image_rgb = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)
         image_rgb = cv2.cvtColor(image_rgb, cv2.COLOR_BGR2RGB)
-
-        # Process the image with MediaPipe Hands
-        results = hands.process(image_rgb)
-
-        if not results.multi_hand_landmarks:
-            print("No hand detected in the image.")
-            return {"error": "No hand detected"}
-
-        # Assuming only one hand is detected (max_num_hands=1)
-        hand_landmarks = results.multi_hand_landmarks[0]
-
-        # Calculate bounding box from landmarks
-        h, w, _ = image_rgb.shape
-        x_min, y_min = w, h
-        x_max, y_max = 0, 0
-        for landmark in hand_landmarks.landmark:
-            x, y = int(landmark.x * w), int(landmark.y * h)
-            if x < x_min: x_min = x
-            if y < y_min: y_min = y
-            if x > x_max: x_max = x
-            if y > y_max: y_max = y
-
-        # Add some padding to the bounding box
-        padding = 30
-        x_min = max(0, x_min - padding)
-        y_min = max(0, y_min - padding)
-        x_max = min(w, x_max + padding)
-        y_max = min(h, y_max + padding)
-
-        # Ensure the box has valid dimensions
-        if x_min >= x_max or y_min >= y_max:
-            print("Invalid bounding box calculated.")
-            return {"error": "Could not calculate valid hand bounding box"}
-
-        # Crop the original RGB image using the bounding box
-        cropped_image_np = image_rgb[y_min:y_max, x_min:x_max]
-
-        # Check if cropping resulted in an empty image
-        if cropped_image_np.size == 0:
-            print("Cropping resulted in an empty image.")
-            return {"error": "Cropping failed, possibly invalid bounding box"}
-
-        # Convert cropped NumPy array back to PIL Image
-        cropped_image_pil = Image.fromarray(cropped_image_np)
-        print(f"Cropped image size: {cropped_image_pil.size}")
-
-        # Preprocess the cropped image
-        processed_image = preprocess_image(cropped_image_pil)
-        print(f"Processed image shape: {processed_image.shape}, dtype: {processed_image.dtype}")
-
-        # --- Inference ---
+
+        # Detect hand and get cropped image
+        cropped_hand, error = detect_and_crop_hand(image_rgb)
+        if error:
+            return {"error": error}
+
+        # Convert cropped NumPy array to PIL Image
+        cropped_pil = Image.fromarray(cropped_hand)
+
+        # Preprocess and predict
+        processed_image = preprocess_image(cropped_pil)
         interpreter.set_tensor(input_details[0]['index'], processed_image)
         interpreter.invoke()
         output_data = interpreter.get_tensor(output_details[0]['index'])
         prediction = output_data[0]
-        # --- End Inference ---
-
-        print(f"Raw prediction output: {prediction}")
-
+
+        # Get the prediction result
        predicted_class_idx = int(np.argmax(prediction))
        confidence = float(prediction[predicted_class_idx])
-        # Use the correct class mapping loaded earlier
         predicted_class = index_to_class.get(predicted_class_idx, f"unknown_{predicted_class_idx}")
-
-        print(f"Predicted class index: {predicted_class_idx}, Confidence: {confidence}, Class: {predicted_class}")
-
+
         return {
             "class": predicted_class,
             "confidence": confidence,
             "all_predictions": {
-                # Use the correct class mapping here too
                 index_to_class.get(i, f"class_{i}"): float(prediction[i])
                 for i in range(len(prediction))
             }
         }
     except Exception as e:
-        print(f"Error during prediction: {e}")
-        # Also print traceback for detailed debugging
         import traceback
         traceback.print_exc()
         return {"error": str(e)}
 
-# Gradio Interface
+# Define the Gradio interface
 with gradio_app:
     gr.Markdown("# Hand Gesture Recognition")
-    with gr.Row():
-        input_image = gr.Image(type="pil", label="Upload Image")
-        output_json = gr.JSON(label="Prediction Results")
-    submit = gr.Button("Predict")
-    submit.click(
-        fn=predict,
-        inputs=input_image,
-        outputs=output_json
-    )
-    gr.Examples(
-        examples=[["examples/two_up.jpg"], ["examples/call.jpg"], ["examples/stop.jpg"]],
-        inputs=input_image
-    )
+    with gr.Tabs():
+        with gr.TabItem("Image Upload"):
+            with gr.Row():
+                input_image = gr.Image(type="pil", label="Upload Image")
+                output_json = gr.JSON(label="Prediction Results")
+            submit = gr.Button("Predict")
+            submit.click(
+                fn=predict,
+                inputs=input_image,
+                outputs=output_json
+            )
+            gr.Examples(
+                examples=[["examples/two_up.jpg"], ["examples/stop.jpg"]],
+                inputs=input_image
+            )
+
+        with gr.TabItem("Live Demo"):
+            gr.Markdown("""
+            ## Live Demo
+            Try the live demo using your webcam!
+            - Please allow camera access when prompted
+            - Hold your hand gesture in front of the camera
+            """)
+            camera_input = gr.Image(source="webcam", streaming=True, label="Camera Input")
+            live_output = gr.JSON(label="Live Detection Results")
+
+            def process_camera_input(img):
+                if img is None:
+                    return {"message": "No image received"}
+                return predict(img)
+
+            camera_input.change(
+                fn=process_camera_input,
+                inputs=camera_input,
+                outputs=live_output
+            )
 
 # Mount Gradio app to FastAPI
 fastapi_app = gr.mount_gradio_app(fastapi_app, gradio_app, path="/")
 
-# API endpoint
+# API endpoint for single image prediction
 @fastapi_app.post("/api/predict")
 async def api_predict(file: UploadFile = File(...)):
     try:
@@ -164,13 +253,121 @@ async def api_predict(file: UploadFile = File(...)):
         # Convert BGR (OpenCV default) to RGB for PIL
         img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
         image_pil = Image.fromarray(img_rgb)
-        return predict(image_pil) # Call the modified predict function
+        return predict(image_pil)
     except Exception as e:
-        print(f"Error processing uploaded file: {e}")
         import traceback
         traceback.print_exc()
         return {"error": f"Failed to process image: {e}"}
 
+# WebSocket endpoint for video stream processing
+@fastapi_app.websocket("/api/stream")
+async def websocket_endpoint(websocket: WebSocket):
+    await websocket.accept()
+
+    try:
+        # Get stream configuration
+        config_data = await websocket.receive_text()
+        config = json.loads(config_data)
+        stream_id = config.get("stream_id", f"stream_{int(time.time())}")
+
+        frame_count = 0
+        last_detection_time = time.time()
+        processing_interval = 1.0 / DETECTION_FREQUENCY  # Process every N frames
+
+        while True:
+            # Receive frame data
+            data = await websocket.receive_bytes()
+
+            # Decode the image
+            nparr = np.frombuffer(data, np.uint8)
+            frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+
+            if frame is None:
+                await websocket.send_json({"error": "Invalid frame data"})
+                continue
+
+            frame_count += 1
+            current_time = time.time()
+
+            # Process every N frames for performance
+            if frame_count % DETECTION_FREQUENCY == 0 or (current_time - last_detection_time) >= processing_interval:
+                # Process the frame for gesture recognition
+                result = process_frame_for_gesture(frame)
+
+                if "error" not in result:
+                    # Cache the result
+                    detection_cache[stream_id] = result
+                    last_detection_time = current_time
+                # Send results back to client
+                await websocket.send_json(result)
+
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        print(f"WebSocket error: {e}")
+    finally:
+        print(f"WebSocket connection closed")
+
+# REST API endpoints for mobile integration
+@fastapi_app.post("/api/video/frame")
+async def process_video_frame(request: Request):
+    """Process a single video frame sent from Android app"""
+    try:
+        # Get the raw bytes from the request
+        content = await request.body()
+
+        # Get stream ID from header if available
+        stream_id = request.headers.get("X-Stream-ID", f"stream_{int(time.time())}")
+
+        # Decode the image
+        nparr = np.frombuffer(content, np.uint8)
+        frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+
+        if frame is None:
+            return {"error": "Could not decode image data"}
+
+        # Process the frame
+        result = process_frame_for_gesture(frame)
+
+        if "error" not in result:
+            # Cache the result for this stream
+            detection_cache[stream_id] = result
+            # Return the result
+            return result
+        else:
+            return result
+
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return {"error": f"Failed to process frame: {e}"}
+
+@fastapi_app.get("/api/gestures")
+def get_available_gestures():
+    """Return all available gesture classes the model can recognize"""
+    return {"gestures": list(index_to_class.values())}
+
+@fastapi_app.get("/health")
+def health_check():
+    """Simple health check endpoint"""
+    return {"status": "healthy", "timestamp": time.time()}
+
+# Documentation for Android integration
+@fastapi_app.get("/")
+async def root():
+    return {
+        "app": "Hand Gesture Recognition API",
+        "usage": {
+            "image_prediction": "POST /api/predict with image file",
+            "video_streaming": "WebSocket /api/stream or POST frames to /api/video/frame",
+            "available_gestures": "GET /api/gestures"
+        },
+        "android_integration": {
+            "single_image": "Send image as multipart/form-data to /api/predict",
+            "video_stream": "Send individual frames to /api/video/frame with X-Stream-ID header",
+            "websocket": "Connect to /api/stream for bidirectional communication"
+        }
+    }
 
 if __name__ == "__main__":
     # Modified for Hugging Face Spaces environment
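
As a quick sanity check of the HTTP endpoints this commit adds, here is a minimal client sketch. It is not part of the commit: the base URL, stream ID, and `sample.jpg` path are placeholders, and the `requests` package is assumed on the client side (it is not in this repo's requirements.txt); the endpoint paths and the `X-Stream-ID` header come from app.py above.

```python
# Hypothetical client sketch for the endpoints added in app.py above.
# Assumptions: the Space is reachable at BASE_URL and sample.jpg exists locally.
import requests

BASE_URL = "http://localhost:7860"  # placeholder; replace with the deployed Space URL

# Single-image prediction: multipart upload to POST /api/predict
with open("sample.jpg", "rb") as f:
    resp = requests.post(f"{BASE_URL}/api/predict",
                         files={"file": ("sample.jpg", f, "image/jpeg")})
print(resp.json())  # {"class": ..., "confidence": ..., "all_predictions": {...}}

# Frame-by-frame mode: raw JPEG bytes to POST /api/video/frame with a stream ID header
with open("sample.jpg", "rb") as f:
    frame_bytes = f.read()
resp = requests.post(f"{BASE_URL}/api/video/frame",
                     data=frame_bytes,
                     headers={"X-Stream-ID": "demo-stream-1",
                              "Content-Type": "application/octet-stream"})
print(resp.json())

# List the gesture classes the model can return
print(requests.get(f"{BASE_URL}/api/gestures").json())
```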
examples/.DS_Store ADDED
Binary file (6.15 kB).
 
requirements.txt CHANGED
@@ -6,4 +6,6 @@ numpy>=1.22.0
 pillow>=9.0.0
 python-multipart>=0.0.6
 mediapipe>=0.10.0
-opencv-python-headless>=4.5.0
+opencv-python-headless>=4.5.0
+websockets
+pydantic
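
The new `websockets` and `pydantic` entries are server-side dependencies for the streaming endpoint and the response models above. For illustration only, the sketch below follows the /api/stream protocol implemented in app.py (one JSON configuration message, then raw JPEG frames as binary messages, with JSON predictions returned for the frames the server actually processes); the ws:// URL, stream ID, and frame source are assumptions, and the same `websockets` package is assumed to be installed on the client.

```python
# Hypothetical client for the /api/stream WebSocket endpoint defined in app.py.
import asyncio
import json

import websockets  # the same package this commit adds to requirements.txt


async def stream_frames(frames, url="ws://localhost:7860/api/stream"):
    async with websockets.connect(url) as ws:
        # First message: stream configuration (stream_id is optional on the server side)
        await ws.send(json.dumps({"stream_id": "demo-stream-1", "fps": 15}))
        for jpeg_bytes in frames:
            await ws.send(jpeg_bytes)  # binary frame
            try:
                reply = await asyncio.wait_for(ws.recv(), timeout=1.0)
                print(json.loads(reply))  # prediction or error JSON
            except asyncio.TimeoutError:
                # The server only replies for frames it processes, so a missing
                # reply is not necessarily an error.
                pass


# Usage sketch: send the same test frame a few times.
# with open("sample.jpg", "rb") as f:
#     jpeg = f.read()
# asyncio.run(stream_frames([jpeg] * 10))
```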