I am calibrating the camera and met some issues when I tried to get the extrinsics matrix. Here is the setting. I used a webcam as the camera and the camera is fixed/static (the location deosn't change). Then, I used the checkerboard to calibrate the camera and I got the intrinsics of the camera. There are some known points in the world coordinate (I will attach an image to illustrate the x-y-z axis in the world coordinate). I got the pixel coordinates for each known point from the image. Therefore, ideally, I can get the extrinsics with cv2.solvePnP.
Problem: However, the output is totally insane. I have no idea why this happened. Could you guys give me some hints?
I really appreciate.
Known_Points This is my code.
import cv2
import numpy as np
import glob
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001)
CHECKBOARD = (10, 7)
unit = 49 #in milimeters
p_3d = []
p_2d = []
obj_3d = np.zeros((1, CHECKBOARD[0] * CHECKBOARD[1], 3), np.float32)
obj_3d[0,:,:2] = np.mgrid[0:CHECKBOARD[0], 0:CHECKBOARD[1]].T.reshape(-1, 2) * unit
prev_img_shape = None
file_path = "C:/Users/18639/Desktop/SVD_GITHUB/SVD_Human_Detection/Test_videos/1019/"
images = glob.glob(file_path + "*.jpg")
idx = 0
for filename in images:
    img = cv2.imread(filename)
    #img_org = cv2.imread(filename)
    
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret, corners = cv2.findChessboardCorners(gray, CHECKBOARD, cv2.CALIB_CB_ADAPTIVE_THRESH + cv2.CALIB_CB_FAST_CHECK + cv2.CALIB_CB_NORMALIZE_IMAGE)
    if ret == True:
        if idx == 0:
            
            img_show = cv2.imread(filename)
            
            
        p_3d.append(obj_3d)
        corners_2d = cv2.cornerSubPix(gray, corners, (5,5), (-1, -1), criteria)
        p_2d.append(corners_2d)
        #print(corners_2d[0][0])
        #x = int(corners_2d[0][0][0])
        #y = int(corners_2d[0][0][1])
        #cv2.circle(img_org, (x,y), 3,(255,0,0), 3)
        img = cv2.drawChessboardCorners(img, CHECKBOARD, corners_2d, ret)
        #cv2.imshow("img", img_org)
        idx += 1
        
h, w = img.shape[:2]
p_3d = np.array(p_3d)
p_3d = p_3d.reshape((11,70,3)).astype(np.float32)
p_2d = np.array(p_2d)
p_2d = p_2d.reshape((11,70,2)).astype(np.float32)
ret, intr, dist, r_vecs, t_vecs = cv2.calibrateCamera(p_3d, p_2d, gray.shape[::-1], None, None)
# Finish getting intrinsics
###############################################################################################
# Strating getting extrinsics
feet_2_meter = 0.3048 * 1000
inch_2_meter = 0.0254 * 1000
z = 5 * feet_2_meter + 6 * inch_2_meter
x = 18 * feet_2_meter + 8 * inch_2_meter
y = 13 * feet_2_meter + 6 * inch_2_meter
world_cord = [(0,0,z/2), (0,0,z), (0,0,0), (x/2,0,z), (0,y,z), (0,y,z/2)]
world_cord = np.array(world_cord).astype(np.float32)
#world_cord = np.reshape(world_cord, (5,3,1))
img_cord = [(565,668), (556,495), (574,858), (441,585), (1454,461), (1444,611)]
#img_cord = [(668,565), (495,565), (858,574), (585,441), (461,1454), (611,1444)]
img_cord = np.array(img_cord).astype(np.float32)
retval, rvec, tvec = cv2.solvePnP(world_cord, img_cord, intr, dist)
rmatrix = cv2.Rodrigues(rvec)[0]
extrinsics = np.hstack((rmatrix,tvec))
Trans = intr @ extrinsics
test_points =[[0,0, 0,1], [x/3 ,0, z,1], [0, y/3, 0 ,1]]
test_points = np.array(test_points).astype(np.float32)
file_path = "C:/Users/18639/Desktop/SVD_GITHUB/SVD_Human_Detection/Test_videos/1019/cam/WIN_20231019_05_38_54_Pro.jpg"
img = cv2.imread(file_path)
inf = Trans @ test_points.T
for i in range(3):
    inf[:, i] /= inf[2, i]
inf = inf.astype(np.int64)
for i in range(3):
    cv2.circle(img, (inf[0,i], inf[1,i]), 3,(255,0,0), 3)
#cv2.circle(img, (100,100), 3, (255,0,0), 3)
cv2.imshow("f", img)
if cv2.waitKey(0) & 0xFF == ord('q'):
    pass
cv2.destroyAllWindows()
known points world coordinates in milimeters:
array([[   0. ,    0. ,  838.2],
       [   0. ,    0. , 1676.4],
       [   0. ,    0. ,    0. ],
       [2844.8,    0. , 1676.4],
       [   0. , 4114.8, 1676.4],
       [   0. , 4114.8,  838.2]], dtype=float32)
known points pixel coordinates:
array([[ 565.,  668.],
       [ 556.,  495.],
       [ 574.,  858.],
       [ 441.,  585.],
       [1454.,  461.],
       [1444.,  611.]], dtype=float32)
Intrinsics:
array([[2.13815906e+03, 0.00000000e+00, 9.78214954e+02],
       [0.00000000e+00, 2.15540979e+03, 5.50989831e+02],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])
Distortion:
array([[-2.89790743e-01,  3.57319693e+01,  3.37708994e-03,
        -1.61864652e-02, -7.60907655e+02]])
Extrinsics ([R|T]):
array([[ 8.20908600e-01, -5.65397521e-01,  8.02166691e-02,
        -3.85454283e-08],
       [-5.06195448e-02,  6.78718901e-02,  9.96409087e-01,
        -1.81841390e-08],
       [-5.68811684e-01, -8.22021320e-01,  2.70964531e-02,
        -2.52760720e-08]])
Weird things happend here. T vector is in e-8 scale.
And the inferred pixel-coordinates from the known points:
array([[ 7308, 79811],
       [ 7308, 79811],
       [ 4238,  2101],
       [-2379, -1540],
       [ 2382,  -708],
       [ 2416,  -163]], dtype=int64)
Ground_Truth:
array([[ 565.,  668.],
       [ 556.,  495.],
       [ 574.,  858.],
       [ 441.,  585.],
       [1454.,  461.],
       [1444.,  611.]], dtype=float32)
It's totally wrong!
I might make some mistakes somewhere but I have no idea what's wrong with it.
Thank you guys