I am working on a Django project that uses a webcam to play a rock-paper-scissors game against the computer. I use cvzone's HandDetector to detect a hand in the webcam video feed and a Keras classifier to classify the hand gesture. However, I am running into an issue getting the HandDetector to work properly with the webcam video.
I have tried using a single still image from the webcam, but the HandDetector does not work well with just one image. I think the solution is to send a short video of the hand instead, but I am not sure how to implement this in my Django project.
Here are the HTML and the jQuery script that capture the video from the webcam and send it to a Django view:
<div class="container">
  <div class="row">
    <div class="col-md-6">
      <video id="webcam" width="640" height="480" autoplay></video>
    </div>
    <div class="col-md-6">
      <button id="play-button" class="btn btn-primary" onclick="play()">Play</button>
      <!-- Targets the script writes the countdown and the server's prediction into -->
      <div id="countdown"></div>
      <div id="result"></div>
    </div>
  </div>
</div>
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<script>
  $(document).ready(function() { startWebcam(); });

  var video = document.getElementById('webcam');
  var stream;

  function startWebcam() {
    navigator.mediaDevices.getUserMedia({ video: true, audio: false })
      .then(function(localMediaStream) {
        stream = localMediaStream;
        video.srcObject = localMediaStream;
        video.play();
      })
      .catch(function(err) {
        console.log("An error occurred: " + err);
      });
  }

  function play() {
    // The stream must still be live while the MediaRecorder runs, so the
    // tracks are only stopped once recording has finished (in onstop below)
    var chunks = [];
    var recorder = new MediaRecorder(stream);

    recorder.ondataavailable = function(e) {
      chunks.push(e.data);
    };

    recorder.onstop = function() {
      // Recording is done, so the webcam tracks can be stopped now
      stream.getTracks().forEach(function(track) { track.stop(); });

      // Concatenate all the chunks into a single webm blob
      var blob = new Blob(chunks, { type: 'video/webm' });
      chunks = [];

      // Send the video to the server
      var formData = new FormData();
      formData.append("video_data", blob);
      formData.append("csrfmiddlewaretoken", '{{ csrf_token }}');
      $.ajax({
        type: 'POST',
        url: '/process-video/',
        data: formData,
        processData: false,
        contentType: false,
        success: function(response) {
          console.log(response);
          $('#result').html(response);
        },
        error: function(xhr, status, error) {
          console.log(error);
        }
      });
    };

    // Start recording, then count down from 3 seconds before stopping
    recorder.start();
    var count = 3;
    var countdown = setInterval(function() {
      $('#countdown').html(count);
      count--;
      if (count === -1) {
        clearInterval(countdown);
        recorder.stop();
      }
    }, 1000);
  }
</script>
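For reference, the AJAX POST above targets /process-video/, which assumes a route along these lines in urls.py (a sketch; my exact module layout may differ):

# urls.py -- assumed wiring for the /process-video/ endpoint
from django.urls import path

from . import views  # assumed module layout

urlpatterns = [
    path('process-video/', views.process_video, name='process-video'),
]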
Here are the Django views I use:
import math
import os
import tempfile

import cv2
from cvzone.HandTrackingModule import HandDetector
from django.http import HttpResponse

from src.rps.hand_recognition.HandClassifier import HandClassifier
from src.rps.hand_recognition.ImageProcessor import ImageProcessor


def process_video(request):
    if request.method != 'POST':
        return HttpResponse("Error: Invalid request method")
    # Get the video data from the request
    video_data = request.FILES.get('video_data')
    if video_data is None:
        return HttpResponse("Error: No video data received")
    # cv2.VideoCapture expects a filename, not an UploadedFile, so write
    # the upload to a temporary file first
    with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as tmp:
        for chunk in video_data.chunks():
            tmp.write(chunk)
        tmp_path = tmp.name
    try:
        # Use OpenCV to read the video and extract the frames
        cap = cv2.VideoCapture(tmp_path)
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # Process the frame and check for a hand
            prediction = process_image_data(frame)
            if prediction != "Error: No hand detected in the image":
                cap.release()
                return HttpResponse(prediction)
        cap.release()
    finally:
        os.remove(tmp_path)
    return HttpResponse("Error: No hand detected in the video. Please try again.")
def process_image_data(image_data):
    # Initialize the hand detector, image processor, and classifier.
    # Note: doing this per frame reloads the Keras model on every call;
    # see the sketch after this function for a one-time initialization.
    detector = HandDetector(maxHands=1)
    processor = ImageProcessor(detector, 300, 20)
    classifier = HandClassifier()
    # Find the hand in the image
    hands = detector.findHands(image_data, draw=False)
    # If a hand is detected in the image
    if hands:
        hand = hands[0]
        x, y, w, h = hand['bbox']
        img_white, img_resize = processor.process(image_data, hand)
        # Calculate the aspect ratio of the hand's bounding box
        aspect_ratio = h / w
        # Center the resized image on the white canvas: gap at the top and
        # bottom when the hand is taller than it is wide, gap at the left
        # and right otherwise
        if aspect_ratio > 1:
            h_gap = math.ceil((300 - img_resize.shape[0]) / 2)
            img_white[h_gap:h_gap + img_resize.shape[0], :] = img_resize
        else:
            w_gap = math.ceil((300 - img_resize.shape[1]) / 2)
            img_white[:, w_gap:w_gap + img_resize.shape[1]] = img_resize
        # Classify the image and return the prediction
        return classifier.classify(img_white)
    return "Error: No hand detected in the image"
The HandClassifier class:
import os

import numpy as np
from keras.models import load_model
from PIL import Image, ImageOps


class HandClassifier:
    def __init__(self):
        # Get the absolute paths to the model and labels files.
        # Note: these resolve against the current working directory, which
        # under Django is usually the project root, not this file's folder.
        model_path = os.path.abspath('../keras/keras_model.h5')
        labels_path = os.path.abspath('../keras/labels.txt')
        # Load the model and labels
        self.model = load_model(model_path, compile=False)
        with open(labels_path, 'r') as f:
            self.labels = f.read().splitlines()

    def classify(self, image):
        # Resize the image to 224x224 with the same strategy as in TM2:
        # resize so it is at least 224x224, then crop from the center
        image = Image.fromarray(image)
        image = image.convert('RGB')  # convert() returns a new image
        size = (224, 224)
        image = ImageOps.fit(image, size, Image.Resampling.LANCZOS)
        # Turn the image into a NumPy array
        image_array = np.asarray(image)
        # Normalize the image the same way the Teachable Machine snippet does
        normalized_image_array = (image_array.astype(np.float32) / 127.5) - 1
        # Add the image to the data array
        data = np.ndarray(shape=(1, 224, 224, 3), dtype=np.float32)
        data[0] = normalized_image_array
        # Run the inference once and pick the most likely class
        prediction = self.model.predict(data)
        index = np.argmax(prediction[0])
        class_name = self.labels[index]
        confidence_score = prediction[0][index]  # available for thresholding
        return class_name
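classify() already computes confidence_score but never uses it; one idea I have for the "improve with probability" note in my test class below is to skip frames the model is unsure about. A rough sketch of such a helper (the 0.8 threshold is just a guess I would tune):

import numpy as np

def pick_confident_label(prediction, labels, threshold=0.8):
    # `prediction` is the (1, num_classes) array from model.predict();
    # return None when the top class is below the threshold so the
    # caller can skip this frame and try the next one
    index = int(np.argmax(prediction[0]))
    if float(prediction[0][index]) < threshold:
        return None
    return labels[index]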
And the ImageProcessor class:
import cv2
import numpy as np


class ImageProcessor:
    def __init__(self, detector, img_size, offset):
        self.detector = detector
        self.img_size = img_size
        self.offset = offset

    def process(self, img, hand):
        # Crop the image based on the bounding box of the detected hand,
        # clamping to the frame edges so the slice indices never go negative
        x, y, w, h = hand['bbox']
        x1 = max(x - self.offset, 0)
        y1 = max(y - self.offset, 0)
        img_crop = img[y1:y + h + self.offset, x1:x + w + self.offset]
        # Convert the crop to grayscale
        img_crop = cv2.cvtColor(img_crop, cv2.COLOR_BGR2GRAY)
        # Rescale pixel values into the full 0-255 range at 8 bits per pixel
        img_hog = cv2.normalize(img_crop, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)
        # Apply median filtering to the image to remove noise
        img_hog = cv2.medianBlur(img_hog, ksize=5)
        # Resize the crop to img_size x img_size. Because this always yields
        # a square, the centering gaps computed by the callers come out as 0.
        img_crop = cv2.resize(img_hog, (self.img_size, self.img_size))
        # Add a channel dimension so the array broadcasts into the canvas
        img_resize = np.expand_dims(img_crop, axis=2)
        # Create an all-white 3-channel canvas of size img_size x img_size
        img_white = np.ones((self.img_size, self.img_size, 3), np.uint8) * 255
        return img_white, img_resize
Any tips on the most performant way to do this would be greatly appreciated.
I first tried sending a single image taken from the webcam to views.py, but the HandDetector never detected any hands. When I try my code in a test class I wrote, it works and classifies the hand live, but that runs against a live webcam feed in a separate OpenCV window. This is the testing class, by the way:
import math

import cv2
from cvzone.HandTrackingModule import HandDetector

from src.rps.hand_recognition.HandClassifier import HandClassifier
from src.rps.hand_recognition.ImageProcessor import ImageProcessor

cap = cv2.VideoCapture(0)
detector = HandDetector(maxHands=1)
classifier = HandClassifier()
offset = 20
img_size = 300
# Build the processor once, outside the loop
processor = ImageProcessor(detector, img_size, offset)

while True:
    success, img = cap.read()
    if not success:
        continue
    hands = detector.findHands(img, draw=False)
    if hands:
        hand = hands[0]
        x, y, w, h = hand['bbox']
        img_white, img_resize = processor.process(img, hand)
        # Calculate the aspect ratio of the hand's bounding box
        aspect_ratio = h / w
        # Center the resized image on the white canvas: gap at the top and
        # bottom for tall hands, gap at the left and right otherwise
        if aspect_ratio > 1:
            h_gap = math.ceil((img_size - img_resize.shape[0]) / 2)
            img_white[h_gap:h_gap + img_resize.shape[0], :] = img_resize
        else:
            w_gap = math.ceil((img_size - img_resize.shape[1]) / 2)
            img_white[:, w_gap:w_gap + img_resize.shape[1]] = img_resize
        # improve with probability!!!!
        prediction = classifier.classify(img_white)
        cv2.rectangle(img, (x - offset, y - offset - 50),
                      (x - offset + 90, y - offset - 50 + 50), (255, 0, 255), cv2.FILLED)
        cv2.putText(img, prediction, (x, y - 26), cv2.FONT_HERSHEY_COMPLEX, 1.7, (255, 255, 255), 2)
        cv2.rectangle(img, (x - offset, y - offset),
                      (x + w + offset, y + h + offset), (255, 0, 255), 4)
        cv2.imshow("ImageWhite", img_white)
    cv2.imshow("Image", img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()