I am making a 3-axis gimbal for my final-year project. The gimbal has an object-tracking feature, and I have made a GUI where I can see the video feed. YOLO detects everything in the video, but I want to let the user decide which object to focus on. How should I go about this?
Below I have provided the code for object detection.
from ultralytics import YOLO
import cv2
import cvzone
import math
# Open capture device 0 and request a 1280x720 frame size. The named
# property ids are used instead of the raw numbers 3 (frame width) and
# 4 (frame height).
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Load the YOLOv8 weights; the path is relative to the working directory.
model = YOLO('../YOLO Weights/yolov8n.pt')
# Display label for each detector class id: classNames[i] is the text drawn
# for a box whose predicted class index is i, so the order must not change.
# NOTE(review): this looks like the 80-class COCO label set - confirm it
# matches the class order of the loaded weights.
classNames = [
    # 0-8
    "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train",
    "truck", "boat",
    # 9-13
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    # 14-23
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear",
    "zebra", "giraffe",
    # 24-28
    "backpack", "umbrella", "handbag", "tie", "suitcase",
    # 29-38
    "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket",
    # 39-45
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
    # 46-55
    "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
    "pizza", "donut", "cake",
    # 56-61
    "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet",
    # 62-67
    "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
    # 68-72
    "microwave", "oven", "toaster", "sink", "refrigerator",
    # 73-79
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
    "toothbrush",
]
# Main loop: grab a frame, run the detector on it, draw a corner rectangle
# and a "<label> <confidence>" tag for every detected box, then show the
# annotated frame. Press "q" in the window to quit.
try:
    while True:
        success, img = cap.read()
        if not success:
            # No frame was returned (camera unplugged, busy, or stream
            # ended). Stop instead of handing an empty image to the model.
            break

        # stream=True makes the model return results lazily.
        results = model(img, stream=True)
        for r in results:
            for box in r.boxes:
                # Corner coordinates of the box, converted to ints so they
                # can be used as pixel positions for drawing.
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
                w, h = x2 - x1, y2 - y1
                cvzone.cornerRect(img, (x1, y1, w, h))

                # Confidence rounded up to two decimal places for display.
                conf = math.ceil(box.conf[0] * 100) / 100
                name = classNames[int(box.cls[0])]

                # Clamp the label position so it stays inside the frame
                # when the box touches the top or left edge.
                cvzone.putTextRect(
                    img, f'{name} {conf}', (max(0, x1), max(35, y1)),
                    scale=2, thickness=2,
                    colorT=(255, 255, 255), colorR=(54, 250, 74))

        cv2.imshow("Image", img)
        # waitKey also services the GUI event loop; mask to the low byte
        # before comparing so the check is portable across platforms.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always hand the camera back and close the window, including on
    # Ctrl+C or an exception raised inside the loop.
    cap.release()
    cv2.destroyAllWindows()