I wrote my own implementation of SIFT (Scale-Invariant Feature Transform) and then I rotate an image and match points with the original image. However, the output results don't seem to align correctly, and I'm not sure where it went wrong. I suspect that there might be an issue with how the SIFT function is defined.
I would appreciate any assistance you can provide.
import cv2
import numpy as np
from cv2 import KeyPoint
def find_local_maxima(src, ksize):
(h, w) = src.shape
pad_img = np.zeros((h + ksize, w + ksize))
pad_img[ksize // 2:h + ksize // 2, ksize // 2:w + ksize // 2] = src
dst = np.zeros((h, w))
for row in range(h):
for col in range(w):
max_val = np.max(pad_img[row: row + ksize, col:col + ksize])
if max_val == 0:
continue
if src[row, col] == max_val:
dst[row, col] = src[row, col]
return dst
def my_padding(src, filter):
(h, w) = src.shape
if isinstance(filter, tuple):
(h_pad, w_pad) = filter
else:
(h_pad, w_pad) = filter.shape
h_pad = h_pad // 2
w_pad = w_pad // 2
padding_img = np.zeros((h + h_pad * 2, w + w_pad * 2))
padding_img[h_pad:h + h_pad, w_pad:w + w_pad] = src
# repetition padding
# up
padding_img[:h_pad, w_pad:w_pad + w] = src[0, :]
# down
padding_img[h_pad + h:, w_pad:w_pad + w] = src[h - 1, :]
# left
padding_img[:, :w_pad] = padding_img[:, w_pad:w_pad + 1]
# right
padding_img[:, w_pad + w:] = padding_img[:, w_pad + w - 1:w_pad + w]
return padding_img
def my_filtering(src, filter):
(h, w) = src.shape
(f_h, f_w) = filter.shape
# print('<filter>')
# print(filter)
pad_img = my_padding(src, filter)
dst = np.zeros((h, w))
for row in range(h):
for col in range(w):
dst[row, col] = np.sum(pad_img[row:row + f_h, col:col + f_w] * filter)
return dst
def get_my_sobel():
sobel_x = np.dot(np.array([[1], [2], [1]]), np.array([[-1, 0, 1]]))
sobel_y = np.dot(np.array([[-1], [0], [1]]), np.array([[1, 2, 1]]))
return sobel_x, sobel_y
def calc_derivatives(src):
# calculate Ix, Iy
sobel_x, sobel_y = get_my_sobel()
Ix = my_filtering(src, sobel_x)
Iy = my_filtering(src, sobel_y)
return Ix, Iy
def SIFT(src):
gray = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY).astype(np.float32)
print("get keypoint")
dst = cv2.cornerHarris(gray, 3, 3, 0.04)
dst[dst < 0.01 * dst.max()] = 0
dst = find_local_maxima(dst, 21)
dst = dst / dst.max()
y, x = np.nonzero(dst)
keypoints = []
for i in range(len(x)):
# x, y, size, angle, response, octave, class_id
pt_x = int(x[i]) #point x
pt_y = int(y[i]) #point y
size = None
key_angle = -1.
response = dst[y[i], x[i]]
octave = 0
class_id = -1
keypoints.append(KeyPoint(pt_x, pt_y, size, key_angle, response, octave, class_id))
print('get Ix and Iy...')
Ix, Iy = calc_derivatives(gray)
print('calculate angle and magnitude')
# magnitude / orientation
magnitude = np.sqrt((Ix ** 2) + (Iy ** 2))
angle = np.arctan2(Iy, Ix) # radian
angle = np.rad2deg(angle) # radian
angle = (angle + 360) % 360
# keypoint
print('calculate orientation assignment')
for i in range(len(keypoints)):
x, y = keypoints[i].pt
orient_hist = np.zeros(36, )
for row in range(-8, 8):
for col in range(-8, 8):
p_y = int(y + row)
p_x = int(x + col)
if p_y < 0 or p_y > src.shape[0] - 1 or p_x < 0 or p_x > src.shape[1] - 1:
continue
gaussian_weight = np.exp((-1 / 16) * (row ** 2 + col ** 2))
orient_hist[int(angle[p_y, p_x] // 10)] += magnitude[p_y, p_x] * gaussian_weight
max_val = np.max(orient_hist)
keypoints[i].angle = float((np.argmax(orient_hist) * 10) % 360)
for j in range(36):
if orient_hist[j] > max_val * 0.8 and j != np.argmax(orient_hist):
new_keypoint = KeyPoint(pt_x, pt_y, size, (j * 10) % 360, response, octave, class_id)
keypoints.append(new_keypoint)
print('calculate descriptor')
descriptors = np.zeros((len(keypoints), 128)) # 8 orientation * 4 * 4 = 128 dimensions
for i in range(len(keypoints)):
x, y = keypoints[i].pt
theta = np.deg2rad(keypoints[i].angle)
cos_angle = np.cos(theta)
sin_angle = np.sin(theta)
for row in range(-8, 8):
for col in range(-8, 8):
row_rot = np.round((cos_angle * col) + (sin_angle * row))
col_rot = np.round((cos_angle * col) - (sin_angle * row))
p_y = int(y + row_rot)
p_x = int(x + col_rot)
if p_y < 0 or p_y > (src.shape[0] - 1) \
or p_x < 0 or p_x > (src.shape[1] - 1):
continue
gaussian_weight = np.exp((-1 / 16) * (row_rot ** 2 + col_rot ** 2))
weight = magnitude[p_y,p_x] * gaussian_weight
bin_num = int(angle[p_y, p_x] // 45)
cell_num_row = int((row + 8) // 4)
cell_num_col = int((col + 8) // 4)
descriptors[i][32 * cell_num_row + 8 * cell_num_col:32 * cell_num_row + 8 * (cell_num_col + 1)][
bin_num] += weight
return keypoints, descriptors
def main():
src = cv2.imread("zebra.png")
src_rotation = cv2.rotate(src, cv2.ROTATE_90_CLOCKWISE)
kp1, des1 = SIFT(src)
kp2, des2 = SIFT(src_rotation)
## Matching
bf = cv2.BFMatcher_create(cv2.NORM_HAMMING, crossCheck=True)
des1 = des1.astype(np.uint8)
des2 = des2.astype(np.uint8)
matches = bf.match(des1, des2)
matches = sorted(matches, key = lambda x:x.distance)
result = cv2.drawMatches(src, kp1, src_rotation, kp2, matches[:20], outImg=None, flags=2)
cv2.imshow('sift', result)
cv2.waitKey()
cv2.destroyAllWindows()
if __name__ == '__main__':
main()