Using cvzone HandDetector with a webcam in Django

311 views Asked by At

I am working on a Django project that involves using a webcam to play a rock-paper-scissors game against a computer. I have implemented the HandDetector algorithm to detect a hand in the webcam video feed, and a Keras classifier to classify the hand gesture. However, I am running into an issue with getting the HandDetector to work properly with the webcam video.

I have tried using a single still image from the webcam, but the HandDetector does not work well with just one image. I think the solution is to use a video of the hand, but I am not sure how to implement this in my Django project.

Here is the code for my jQuery script that captures the video from the webcam and sends it to a Django view:

<!-- Two-column layout: webcam preview on the left, Play button on the right. -->
<div class="container">
  <div class="row">
    <div class="col-md-6">
      <!-- Preview element; the script below looks it up by id "webcam" and assigns the camera stream to it. -->
      <video id="webcam" width="640" height="480" autoplay></video>
    </div>
    <div class="col-md-6">
      <!-- Clicking the button calls the global play() function defined in the script below. -->
      <!-- NOTE(review): play() writes to #countdown and #result, which are not part of this snippet; confirm they exist elsewhere in the template. -->
      <button id="play-button" class="btn btn-primary" onclick="play()">Play</button>
    </div>
  </div>
</div>

<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>

<script>
// Start the webcam preview as soon as the DOM is ready.
$(function() {
    startWebcam();
});

// Shared state: the active camera stream and the <video> element showing it.
var stream;
var video = document.getElementById('webcam');

/**
 * Ask the browser for camera access (video only) and show the live feed in
 * the preview element. The resulting stream is stored in the global `stream`;
 * failures are logged to the console.
 */
function startWebcam() {
    var constraints = { video: true, audio: false };

    // Remember the stream and start playing it in the preview element.
    function attachStream(localMediaStream) {
        stream = localMediaStream;
        video.srcObject = localMediaStream;
        video.play();
    }

    // Report any failure to obtain or attach the stream.
    function reportError(err) {
        console.log("An error occurred: " + err);
    }

    navigator.mediaDevices.getUserMedia(constraints)
        .then(attachStream)
        .catch(reportError);
}

/**
 * Record the webcam while a countdown runs, then POST the recording to
 * /process-video/ and show the server's answer in #result.
 *
 * The camera tracks are deliberately left running: a MediaRecorder cannot
 * capture anything from tracks that have already been stopped, and keeping
 * them alive lets the player start another round without reloading the page.
 */
function play() {
    // getUserMedia may not have resolved yet, or the user denied access.
    if (!stream || !stream.active) {
        console.log("An error occurred: webcam stream is not available");
        return;
    }

    var chunks = [];
    var recorder = new MediaRecorder(stream);
    var playButton = $('#play-button');

    // Attach the handlers before starting so that no event can be missed.
    recorder.ondataavailable = function(e) {
        chunks.push(e.data);
    };

    recorder.onstop = function() {
        // Recording is over; allow the next round.
        playButton.prop('disabled', false);

        // Concatenate all the chunks
        var blob = new Blob(chunks, { type: 'video/webm' });
        chunks = [];

        // Send the video to the server
        var formData = new FormData();
        formData.append("video_data", blob);
        formData.append("csrfmiddlewaretoken", '{{ csrf_token }}');

        $.ajax({
            type: 'POST',
            url: '/process-video/',
            data: formData,
            // Let the browser build the multipart body and its boundary header.
            processData: false,
            contentType: false,
            success: function(response) {
                console.log(response);
                $('#result').html(response);
            },
            error: function(xhr, status, error) {
                console.log(error);
            }
        });
    };

    recorder.start();

    // Block further clicks while recording so only one recorder/countdown runs.
    playButton.prop('disabled', true);

    // Show 3, 2, 1, 0 at one-second intervals, then stop the recording.
    var count = 3;
    var countdown = setInterval(function() {
        $('#countdown').html(count);
        count--;
        if (count === -1) {
            clearInterval(countdown);
            recorder.stop();
        }
    }, 1000);
}
</script>

Here are the Django views I use:

def process_video(request):
    """Classify the hand gesture in an uploaded webcam recording.

    Expects a POST request carrying the recording in the ``video_data`` file
    field. Frames are read one at a time and passed to
    ``process_image_data``; the first frame in which a hand is found decides
    the response.

    Returns:
        HttpResponse: the prediction text, or an ``Error: ...`` message when
        the request is not a POST, no file was uploaded, or no frame
        contained a hand.
    """
    # Function-scope imports keep this view self-contained.
    import os
    import tempfile

    if request.method != 'POST':
        return HttpResponse("Error: Invalid request method")

    # Get the video data from the request
    video_data = request.FILES.get('video_data')
    if video_data is None:
        return HttpResponse("Error: No video data received")

    # cv2.VideoCapture opens a file path or device index, not a Django
    # UploadedFile object, so the upload is spooled to a temporary file first.
    tmp = tempfile.NamedTemporaryFile(suffix='.webm', delete=False)
    cap = None
    try:
        with tmp:
            for chunk in video_data.chunks():
                tmp.write(chunk)

        # Use OpenCV to read the video and extract the frames
        cap = cv2.VideoCapture(tmp.name)
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # Process the frame and check for a hand
            prediction = process_image_data(frame)
            if prediction != "Error: No hand detected in the image":
                return HttpResponse(prediction)
    finally:
        # Always free the capture and the temporary file, even on errors
        # or on the early return above.
        if cap is not None:
            cap.release()
        os.remove(tmp.name)

    return HttpResponse("Error: No hand detected in the video. Please try again.")


def _get_hand_classifier():
    """Return a shared HandClassifier, creating it on first use.

    Constructing HandClassifier loads the Keras model from disk, which is far
    too expensive to repeat for every frame.
    """
    # NOTE(review): the instance is shared by every caller in this process;
    # confirm that concurrent predict() calls are acceptable for the deployment.
    classifier = getattr(_get_hand_classifier, "_instance", None)
    if classifier is None:
        classifier = HandClassifier()
        _get_hand_classifier._instance = classifier
    return classifier


def process_image_data(image_data):
    """Detect a hand in one frame and classify its gesture.

    Args:
        image_data: the frame handed to ``HandDetector.findHands`` and
            ``ImageProcessor.process`` (presumably a BGR image array as
            produced by OpenCV; verify against the caller).

    Returns:
        The label returned by ``HandClassifier.classify``, or the string
        ``"Error: No hand detected in the image"`` when no hand is found.
    """
    # Find the hand in the image
    detector = HandDetector(maxHands=1)
    hands = detector.findHands(image_data, draw=False)
    if not hands:
        return "Error: No hand detected in the image"

    hand = hands[0]
    _, _, w, h = hand['bbox']
    processor = ImageProcessor(detector, 300, 20)
    img_white, img_resize = processor.process(image_data, hand)

    # Calculate the aspect ratio of the hand's bounding box
    aspect_ratio = h / w
    if aspect_ratio > 1:
        # Taller than wide: center the resized image vertically.
        h_gap = math.ceil((300 - img_resize.shape[0]) / 2)
        img_white[h_gap:h_gap + img_resize.shape[0], :] = img_resize
    else:
        # Wider than tall (or square): center the resized image horizontally.
        w_gap = math.ceil((300 - img_resize.shape[1]) / 2)
        img_white[:, w_gap:w_gap + img_resize.shape[1]] = img_resize

    # Classify the image and return the prediction
    return _get_hand_classifier().classify(img_white)

The HandClassifier class

import cv2
import numpy as np
from PIL import Image, ImageOps
from keras.models import load_model
import os


class HandClassifier:
    """Wrap a Keras model and its label list to classify hand images."""

    def __init__(self):
        """Load the model and the labels from the ``../keras`` directory."""
        # NOTE(review): os.path.abspath resolves these relative paths against
        # the current working directory, not this file's location; confirm
        # the process is always started from the expected directory.
        model_path = os.path.abspath('../keras/keras_model.h5')
        labels_path = os.path.abspath('../keras/labels.txt')

        # Load the model and labels
        self.model = load_model(model_path, compile=False)
        with open(labels_path, 'r') as f:
            self.labels = f.read().splitlines()

    def classify(self, image):
        """Return the label of the most likely class for ``image``.

        Args:
            image: array accepted by ``PIL.Image.fromarray``.

        Returns:
            The entry of ``self.labels`` at the index of the highest model
            output.
        """
        # convert() returns a new image, so its result must be kept; this
        # guarantees three channels for the (1, 224, 224, 3) model input.
        image = Image.fromarray(image).convert('RGB')

        # Resize to 224x224 with the same strategy as in TM2: scale to at
        # least 224x224, then crop from the center.
        size = (224, 224)
        image = ImageOps.fit(image, size, Image.Resampling.LANCZOS)

        # Scale the pixel values and add the leading batch axis.
        image_array = np.asarray(image)
        normalized_image_array = (image_array.astype(np.float32) / 127.0) - 1
        data = np.expand_dims(normalized_image_array, axis=0)

        # Run the inference once and pick the most likely class.
        prediction = self.model.predict(data)
        index = np.argmax(prediction[0])
        return self.labels[index]

And the ImageProcessor class

import cv2
import numpy as np


class ImageProcessor:
    """Crop a detected hand out of a frame and prepare it for classification."""

    def __init__(self, detector, img_size, offset):
        """Store the configuration.

        Args:
            detector: hand detector instance; stored but not used by
                ``process``.
            img_size: side length, in pixels, of the square output images.
            offset: margin, in pixels, added around the hand's bounding box.
        """
        self.detector = detector
        self.img_size = img_size
        self.offset = offset

    def process(self, img, hand):
        """Return a white canvas and the preprocessed hand crop.

        Args:
            img: image array indexed as ``img[row, col]`` (converted with
                ``cv2.COLOR_BGR2GRAY``, so presumably a BGR frame).
            hand: mapping whose ``'bbox'`` entry is ``(x, y, w, h)``.

        Returns:
            tuple: ``(img_white, img_resize)`` where ``img_white`` is an
            ``img_size x img_size x 3`` uint8 array filled with 255 and
            ``img_resize`` is the grayscale, normalized, blurred crop resized
            to ``img_size x img_size`` with a trailing channel axis of 1.
        """
        # Crop the image based on the bounding box of the detected hand.
        # The start indices are clamped at 0: a negative start would wrap
        # around to the opposite edge and give an empty or wrong crop when
        # the hand is close to the top or left border.
        x, y, w, h = hand['bbox']
        top = max(y - self.offset, 0)
        left = max(x - self.offset, 0)
        img_crop = img[top:y + h + self.offset, left:x + w + self.offset]

        # Convert the image to grayscale
        img_crop = cv2.cvtColor(img_crop, cv2.COLOR_BGR2GRAY)

        # Stretch the values to the full 0-255 range at 8 bits per pixel
        img_hog = cv2.normalize(img_crop, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)

        # Apply median filtering to the image to remove noise
        img_hog = cv2.medianBlur(img_hog, ksize=5)

        # Resize the image to img_size x img_size
        img_crop = cv2.resize(img_hog, (self.img_size, self.img_size))

        # Add a third dimension so the crop can be assigned into the
        # three-channel white image
        img_resize = np.expand_dims(img_crop, axis=2)

        # Create an empty white image of size img_size x img_size
        img_white = np.ones((self.img_size, self.img_size, 3), np.uint8) * 255

        return img_white, img_resize

Any help or tips on the best and most performant way to do this would be greatly appreciated.

I first tried sending a single image taken from the webcam to views.py, but the problem was that the HandDetector never detected any hands. When I run my code in a test class I wrote, it works and classifies the hand live, but that uses a live webcam feed in a separate OpenCV window. This is the testing class:

import cv2
from cvzone.HandTrackingModule import HandDetector
import math

from src.rps.hand_recognition.HandClassifier import HandClassifier
from src.rps.hand_recognition.ImageProcessor import ImageProcessor

# Stand-alone test: classify the hand seen by the default webcam and show the
# annotated feed in OpenCV windows. Press "q" in a window to quit.
cap = cv2.VideoCapture(0)
detector = HandDetector(maxHands=1)
classifier = HandClassifier()

offset = 20
img_size = 300

labels = ["paper", "rock", "scissors"]

# The processor only holds configuration, so one instance serves every frame.
processor = ImageProcessor(detector, img_size, offset)

try:
    while True:
        success, img = cap.read()
        if not success:
            # No frame could be read (camera unplugged or unavailable).
            break

        hands = detector.findHands(img, draw=False)

        if hands:
            hand = hands[0]
            x, y, w, h = hand['bbox']
            img_white, img_resize = processor.process(img, hand)
            # Calculate the aspect ratio of the hand's bounding box
            aspect_ratio = h / w
            if aspect_ratio > 1:
                # Taller than wide: center the resized image vertically.
                h_gap = math.ceil((img_size - img_resize.shape[0]) / 2)
                img_white[h_gap:h_gap + img_resize.shape[0], :] = img_resize
            else:
                # Wider than tall (or square): center it horizontally.
                w_gap = math.ceil((img_size - img_resize.shape[1]) / 2)
                img_white[:, w_gap:w_gap + img_resize.shape[1]] = img_resize

            # TODO: also take the prediction probability into account
            prediction = classifier.classify(img_white)

            # Label background, label text, and a frame around the hand.
            cv2.rectangle(img, (x - offset, y - offset - 50),
                          (x - offset + 90, y - offset - 50 + 50), (255, 0, 255), cv2.FILLED)
            cv2.putText(img, prediction, (x, y - 26), cv2.FONT_HERSHEY_COMPLEX, 1.7, (255, 255, 255), 2)
            cv2.rectangle(img, (x - offset, y - offset),
                          (x + w + offset, y + h + offset), (255, 0, 255), 4)

            cv2.imshow("ImageWhite", img_white)

        cv2.imshow("Image", img)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Free the camera and close the preview windows on any exit path.
    cap.release()
    cv2.destroyAllWindows()

0

There are 0 answers