I'm doing a simple project that basically consists of using images and labels (the labels contain the class and the coordinates of the bounding boxes) to train a CNN model with PyTorch, and then displaying the bounding boxes around the detected face in real time. I know there are other architectures that are better and even faster for this type of task, but I wanted to do it this way for didactic purposes. The problem I'm facing is in the last part, the real-time face detection: for some reason it does not display the bounding box, even though it does detect my face (poorly, of course, since I'm running it locally, among other reasons).
This is my code:
# 1- Create custom dataset
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, image_folder, label_folder, transform=None):
        self.image_folder = image_folder
        self.label_folder = label_folder
        self.transform = transform
        self.images = sorted(os.listdir(image_folder))
        self.labels = sorted(file for file in os.listdir(label_folder) if file.endswith('.json'))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_folder, self.images[idx])
        # Get the corresponding label file in the label folder
        label_file = self.labels[idx]
        label_path = os.path.join(self.label_folder, label_file)
        image = Image.open(img_path).convert("RGB")
        # Open and read the content of the label file
        with open(label_path, 'r') as f:
            # Load JSON content from the label file
            label_data = json.load(f)
        # Convert label data to a tensor: label_data contains 'class' and 'bbox',
        # so the tensor is [class, x1, y1, x2, y2]
        label_tensor = torch.tensor([label_data['class']] + label_data['bbox'], dtype=torch.float32)
        # Apply transformations
        if self.transform:
            image = self.transform(image)
        return image, label_tensor

# Define the transformation
transform = transforms.Compose([
    transforms.Resize((120, 120)),
    transforms.ToTensor()
])

train_dataset = CustomDataset(image_folder='aug_data/train/images', label_folder='aug_data/train/labels', transform=transform)
val_dataset = CustomDataset(image_folder='aug_data/val/images', label_folder='aug_data/val/labels', transform=transform)
# 2- Create CNN
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.batch_norm1 = nn.BatchNorm2d(16)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.batch_norm2 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.batch_norm3 = nn.BatchNorm2d(64)
        self.pool3 = nn.MaxPool2d(2, 2)
        self.conv4 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.batch_norm4 = nn.BatchNorm2d(128)
        self.pool4 = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 7 * 7, 512)
        self.fc2 = nn.Linear(512, 5)  # 5 outputs: 1 class score + 4 bounding box coordinates

    def forward(self, x):
        x = self.pool1(self.batch_norm1(nn.functional.relu(self.conv1(x))))
        x = self.pool2(self.batch_norm2(nn.functional.relu(self.conv2(x))))
        x = self.pool3(self.batch_norm3(nn.functional.relu(self.conv3(x))))
        x = self.pool4(self.batch_norm4(nn.functional.relu(self.conv4(x))))
        x = x.view(-1, 128 * 7 * 7)  # 120x120 input -> 7x7 after four 2x2 poolings
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = SimpleCNN()
# 3- Train the model
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

def train(model, train_loader, val_loader, num_epochs=5, learning_rate=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        model.train()
        epoch_train_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()
        # Validation
        model.eval()
        epoch_val_loss = 0.0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                val_loss = criterion(outputs, labels)
                epoch_val_loss += val_loss.item()
        # Average loss over all batches in an epoch
        epoch_train_loss /= len(train_loader)
        epoch_val_loss /= len(val_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss}, Validation Loss: {epoch_val_loss}')
        # Save losses for plotting
        train_losses.append(epoch_train_loss)
        val_losses.append(epoch_val_loss)
    return train_losses, val_losses

train_losses, val_losses = train(model, train_loader, val_loader, num_epochs=5, learning_rate=0.001)

model_path = 'Models/model_CNN.pth'
torch.save(model.state_dict(), model_path)
# 4- Plot the loss
def plot_loss(train_losses, val_losses):
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

plot_loss(train_losses, val_losses)
# 5- Detect face in real time
# Load the trained model
model = SimpleCNN()
model.load_state_dict(torch.load('Models/model_CNN.pth'))
model.eval()

def detect_face(frame, model):
    # Preprocess the frame
    img = cv2.resize(frame, (120, 120))
    img = Image.fromarray(img)  # Convert numpy array to PIL Image
    transform = transforms.Compose([
        transforms.ToTensor(),
        # Add any other necessary transformations
    ])
    img = transform(img)
    img = img.unsqueeze(0)  # Add batch dimension
    # Run the model
    with torch.no_grad():
        output = model(img)
    # Post-process the output (assuming bounding box coordinates and class)
    # Extract class and bounding box coordinates from the output tensor
    pred_class = int(output[0, 0])
    bbox = list(map(int, output[0, 1:]))
    # Draw bounding box only if the predicted class is positive
    if pred_class == 1 and output[0, 0] > 0.5:
        print(f"Class: {pred_class}, Bbox: {bbox}")
        bbox = list(map(int, output[0, 1:]))  # Assuming bounding box coordinates are normalized
        bbox = [int(coord * frame.shape[1]) for coord in bbox]  # Scale coordinates
        cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
    return frame

# Open the webcam
cap = cv2.VideoCapture(0)
while True:
    ret, frame = cap.read()
    if not ret:
        break
    # Detect face
    frame = detect_face(frame, model)
    # Display the resulting frame
    cv2.imshow('Face Detection', frame)
    if cv2.waitKey(1) & 0xFF == 27:  # Press 'Esc' to exit
        break

# Release the capture
cap.release()
cv2.destroyAllWindows()
This is what's inside one of the JSON files and shows what a bbox looks like.
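Each label file looks roughly like this, with a 0/1 'class' flag and a 'bbox' holding normalized [x1, y1, x2, y2] corner coordinates, matching what the dataset and detection code read (the values here are illustrative):

{
    "class": 1,
    "bbox": [0.3825, 0.2817, 0.6163, 0.5437]
}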
After making some slight changes, the bboxes are now being predicted correctly by the model, but the class is returning 0, which leads to the bounding boxes not being displayed...
You can see here the bboxes being returned correctly but the class coming back as 0.
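If it helps pinpoint the issue: since the class head is regressed against 0/1 targets with MSELoss, its raw output is a float like 0.9, and int(output[0, 0]) truncates anything below 1.0 down to 0, so the pred_class == 1 check can fail even on a confident prediction. Here is a minimal sketch of the post-processing I'm considering instead; the 0.5 threshold (midway between the 0/1 targets) and the separate width/height scaling are assumptions on my part, not something the current training setup guarantees. It would replace the post-processing block inside detect_face, using the torch and cv2 already imported above:

def decode_and_draw(frame, output):
    # output has shape (1, 5): [class_score, x1, y1, x2, y2]
    score = output[0, 0].item()
    # Threshold the raw class score at 0.5 instead of casting to int,
    # so a score like 0.9 counts as a detection rather than class 0.
    if score > 0.5:
        h, w = frame.shape[:2]
        x1, y1, x2, y2 = output[0, 1:].tolist()
        # Scale normalized coords by the frame width for x and the frame
        # height for y, casting to int only after scaling so the fractional
        # values aren't truncated to 0.
        cv2.rectangle(frame,
                      (int(x1 * w), int(y1 * h)),
                      (int(x2 * w), int(y2 * h)),
                      (0, 255, 0), 2)
    return frame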