I'm working on a small personal project to automate data acquisition from different measurement devices (without a digital output), such as multimeters or luminescence meters. An example of the input image is attached. I've read some tutorials and forum posts on the topic, namely this post from Stack Overflow and this tutorial.
There are 2 main issues:
Finding the display ROI
OCR
I skipped the first point for now, because the simple Canny >> findContours >> approxPolyDP pipeline did not work in all cases. In some cases the largest element was not the display, but the luminescence meter itself or other things in the image.
I was mostly dealing with the segmentation of the digits from the display and their recognition. The segmentation is solved by pre-processing the image itself using adaptive thresholding and removing some small connected components from the image. After the digits are segmented, I've tried to apply the same methods that are used in the tutorial mentioned above. I added some additional pre-processing of individual digits in case that they are not recognised the first time (rotation by +-5 degrees).
At this point it is getting quite complicated to tune individual parameters for the pre-processing and segmentation methods, which would work on multiple devices at the same time.
I'm asking for advice, whether to push this project till the end using only computer vision techniques, or should I make some kind of hybrid machine learning/ CV solution (for example train a CNN for the recognition of individual digits or a YOLO for finding the display in the image, etc.)?
P.S.: I've already tried Tesseract, but it failed to recognize individual digits, and after reading its documentation - which explains that this network is meant for recognizing blocks of text or words rather than individual digits somewhere in an image - I gave up on it. Please note that the relative position of the camera and the measuring device (or the individual device's display) is not guaranteed; I'll add some additional photos.
And my code:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import os
## MAKE THE PROCESS MORE 'VERBOSE'
#global switch: when True, intermediate results are displayed with matplotlib
show_images=True
#checking which of the 7 display segments are active in digit_roi
def search4segments(digit_roi):
    """Return a 7-element on/off list for the segments of a binary digit image.

    digit_roi -- 2D binary image (non-zero = foreground), tightly cropped
                 around a single digit.

    Returns a list of 7 ints (1 = segment lit), ordered as:
    top, top-left, top-right, center, bottom-left, bottom-right, bottom.
    """
    dig_H, dig_W = digit_roi.shape
    #estimated segment thickness: a quarter of the digit width
    seg_short_side = dig_W // 4
    #((rowA, colA), (rowB, colB)) corner pairs of the 7 segment ROIs
    segments = [
        ((0, 0), (seg_short_side, dig_W)),                                                    # top
        ((0, 0), (dig_H // 2, seg_short_side)),                                               # top-left
        ((0, dig_W - seg_short_side), (dig_H // 2, dig_W)),                                   # top-right
        ((dig_H // 2 - seg_short_side // 2, 0), (dig_H // 2 + seg_short_side // 2, dig_W)),   # center
        ((dig_H // 2, 0), (dig_H, seg_short_side)),                                           # bottom-left
        ((dig_H // 2, dig_W - seg_short_side), (dig_H, dig_W)),                               # bottom-right
        ((dig_H - seg_short_side, 0), (dig_H, dig_W))                                         # bottom
    ]
    #activation state of every segment
    on = [0] * len(segments)
    for (i, ((xA, yA), (xB, yB))) in enumerate(segments):
        #FIX: index the digit_roi parameter - the original indexed the
        #global variable `digit`, silently ignoring the function argument
        segROI = digit_roi[xA:xB, yA:yB]
        #np.count_nonzero matches cv2.countNonZero for 2-D arrays and keeps
        #this helper independent of OpenCV
        total = int(np.count_nonzero(segROI))
        area = (xB - xA) * (yB - yA)
        #mark the segment as "on" when more than half of its area is lit
        #(the small epsilon guards against a zero-sized ROI)
        if total / float(area + 1e-1) > 0.5:
            on[i] = 1
    return on
#function for rotating an image by other angles than 90, 180 or 270
def rotate_image(image, angle):
    """Rotate `image` by `angle` degrees about its centre.

    The output keeps the original width and height, so corners of the
    rotated content may be clipped.
    """
    width, height = image.shape[1::-1]
    centre = tuple(np.array((width, height)) / 2)
    transform = cv2.getRotationMatrix2D(centre, angle, 1.0)
    return cv2.warpAffine(image, transform, (width, height), flags=cv2.INTER_LINEAR)
#trimmean function definition
def trimmean(arr, percent):
    """Return the mean of `arr` with `percent` percent of the extreme values
    removed (percent/2 from each tail).

    arr -- 1D array-like of numbers
    percent -- total percentage of samples to discard (0-100)
    """
    n = len(arr)
    #number of samples to drop from each end
    k = int(round(n * (float(percent) / 100) / 2))
    #FIX: sort before trimming so the *extremes* are actually removed, and
    #start the slice at k - the original arr[k+1:n-k] dropped one sample
    #too many from the low end (and dropped the first sample even for k=0)
    return np.mean(np.sort(np.asarray(arr))[k:n - k])
#read in the image paths to a list
#FIX: build the directory path with os.path.join instead of the hard-coded
#Windows separator r'\RealWorldImages', so the script also runs on POSIX
current_dir = os.getcwd()
image_dir = os.path.join(current_dir, 'RealWorldImages')
img_paths = [os.path.join(image_dir, path) for path in os.listdir(image_dir)]
#load the second image in grayscale (flag 0 = cv2.IMREAD_GRAYSCALE)
img = cv2.imread(img_paths[1], 0)
#blur the image to get rid of high frequency noise (median filter, in-place)
cv2.medianBlur(img,5,img)
#OPTIONAL: resize the image, so it can be displayed with cv2.imshow for ROI selection; recommended for large images, which would not fit on the display screen
img=cv2.resize(img,(int(img.shape[1]/1.2),int(img.shape[0]/1.2)))
#Manual selection of ROI (interactive window; returns (x, y, w, h))
roi = cv2.selectROI(img)
cv2.destroyAllWindows()
#crop the selected ROI out of the grayscale image
img_roi = img[int(roi[1]):int(roi[1] + roi[3]), int(roi[0]):int(roi[0] + roi[2])]
#kernel size for adaptive thresholding (depending on the size of the ROI -> expected No. of digits in ROI 4-6)
ksize=2*(roi[2]//8)+1 #make sure it has odd dimensions
#adaptive thresholding (Gaussian-weighted, inverted: digits become white on black)
img_bw=cv2.adaptiveThreshold(img_roi,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY_INV,ksize,3)
#getting connected components with stats; returns (n_labels, label_image, stats, centroids)
img_bw_CC = cv2.connectedComponentsWithStats(img_bw)
#create and populate a matrix with only the relevant information from CC with stats:
#column 0 = component label, column 1 = pixel count (area), columns 2-3 = bbox width/height
CC_stats=np.zeros((len(img_bw_CC[2]), 4), np.uint32)
CC_stats[:, 0]=range(len(img_bw_CC[2]))
CC_stats[:, 1]= img_bw_CC[2][:, -1]
CC_stats[:, 2:]= img_bw_CC[2][:, 2:-1]
#remove all CC, which have a bounding box that touches the border of the ROI - probably not a digit
#(this also removes the background component, whose bbox spans the whole ROI)
relevant_cc_stats =np.delete(CC_stats, \
np.logical_or(np.logical_or(img_bw_CC[2][:, 0] == 0, img_bw_CC[2][:, 1] == 0), \
np.logical_or(img_bw_CC[2][:, 0] + img_bw_CC[2][:, 2] == img_roi.shape[1], img_bw_CC[2][:, 1] + img_bw_CC[2][:, 3] == img_roi.shape[0])), 0)
#sort the matrix rows according to the pixelcount of regions (from largest -> smallest)
relevant_cc_stats=relevant_cc_stats[np.argsort(-relevant_cc_stats[:, 1])]
#TODO: FIX THE STATISTIC
#trimmed mean of CC sizes (calculated from the middle 40%)
mean=trimmean(relevant_cc_stats[:, 1], 60)
#standard deviation (currently unused by the filtering below)
stddev=np.std(relevant_cc_stats[1:, 1])
#remove all components which are smaller than a quarter of the trimmed mean (NOT QUITE ROBUST)
relevant_cc_stats=relevant_cc_stats[relevant_cc_stats[:, 1] > (mean / 4)]# relevant_cc_stats#relevant_cc_stats#relevant_cc_stats[np.bitwise_and(relevant_cc_stats[:,1]<mean+2*stddev, relevant_cc_stats[:,1]>mean-2*stddev)] ###
#creating an empty image to draw the remaining components
masks=np.zeros((img_bw.shape),dtype=np.uint8)
#OR each surviving component's binary mask into `masks` (in-place dst argument)
for i in range(relevant_cc_stats.shape[0]):
    cv2.bitwise_or(masks, np.array(img_bw_CC[1] == relevant_cc_stats[i, 0], dtype=np.uint8), masks)
#skeletonise the mask (requires the opencv-contrib ximgproc module)
roi_thinned=cv2.ximgproc.thinning(masks)
#NOTE(review): `lines` is never used below - leftover experiment? verify before removing
lines=cv2.HoughLines(roi_thinned,1,np.pi/180,30)
if show_images:
    #display the pre-processing results: thresholded ROI vs. filtered component mask
    _, axs = plt.subplots(1, 2)
    axs[0].imshow(img_bw,cmap='binary')
    # kernel=cv2.getStructuringElement(cv2.MORPH_ELLIPSE,((2*int(mean/50)+1),(2*int(mean/50)+1)))
    # # cv2.morphologyEx(masks,cv2.MORPH_OPEN,kernel,masks)
    axs[1].imshow(masks,cmap='binary')
    plt.show()
#project the binary mask onto the horizontal axis (column-wise pixel sums)
#FIX: the projection was computed twice in a row; the duplicate is removed
ver_proj=np.sum(masks,axis=0)
#Threshold the vector: columns below 1/5 of the mean are suppressed, columns
#at/above the mean saturate to 1; intermediate values stay non-zero and thus
#also become True after the boolean cast (note: the second np.mean() is taken
#over the already-zeroed vector, matching the original evaluation order)
ver_proj[ver_proj<np.mean(ver_proj)/5]=0
ver_proj[ver_proj>=np.mean(ver_proj)]=1
#FIX: np.bool was removed in NumPy 1.24; the builtin bool is the correct dtype
ver_proj=ver_proj.astype(bool)
########################################################################
##################### DIGIT SEGMENTATION ###############################
########################################################################
#getting the borders: indices where the thresholded projection flips
#between digit columns and gap columns
borders=[]
for i in range(1,len(ver_proj)):
    if ver_proj[i-1] ^ ver_proj[i]:
        borders.append(i)
#widths of the regions between consecutive borders
#NOTE(review): `lengths` is not used further down - verify it is still needed
lengths=list(np.diff(borders))
if len(borders)>2:
    #account for the outermost (partial) regions touching the image edges
    if borders[0]>2:
        lengths.insert(0,borders[0])
    if sum(lengths)<masks.shape[1]:
        lengths.append(masks.shape[1]-borders[-1])
else:
    #FIX: raise with a diagnostic message instead of a bare RuntimeError
    raise RuntimeError('digit segmentation failed: too few borders found in the projection (check ROI selection and thresholding)')
#cutting the image to smaller images (one per projection region) and storing these in a list
img_segments=[]
for i in range(1,len(borders)):
    img_segments.append(masks[:,borders[i-1]:borders[i]])
if show_images:
    _, axs = plt.subplots(1, len(img_segments))
    for i in range(len(img_segments)):
        axs[i].imshow(img_segments[i],cmap='binary')
    plt.show()
#distinguishing the digits from the supposedly empty in-between-digit regions:
#a region counts as a digit when more than 1/5 of its pixels are foreground
digits=[]
i=0
for i in range(len(img_segments)):
    if cv2.countNonZero(img_segments[i])>img_segments[i].size/5:
        digits.append(img_segments[i])
if show_images:
    _, axs = plt.subplots(1, len(digits))
    for i in range(len(digits)):
        axs[i].imshow(digits[i],cmap='binary')
    plt.show()
#projecting the rows in sub-images to the vertical axis (row-wise pixel sums per digit)
digits_hor_proj=[np.sum(digit,axis=1) for digit in digits]
#further shrinking the images to get the most tight bounding boxes:
#keep only the rows whose projection exceeds half of the mean row projection
for i in range(len(digits)):
    tmp=np.where(digits_hor_proj[i]>np.mean(digits_hor_proj[i]/2))
    digits[i]=digits[i][int(tmp[0][0]):int(tmp[0][-1])+1,:]
if show_images:
    _, axs = plt.subplots(1, len(digits))
    for i in range(len(digits)):
        axs[i].imshow(digits[i],cmap='binary')
    plt.show()
#dilate every digit with an elliptical kernel sized to 1/5 of the digit
#width, to close gaps between the display segments
#NOTE(review): digits_closed is never used afterwards - verify it is still needed
digits_closed=[]
for i in range(len(digits)):
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (int(digits[i].shape[1]/5), int(digits[i].shape[1]/5)))
    digits_closed.append(cv2.morphologyEx(digits[i],cv2.MORPH_DILATE,kernel))
##########################################################################
####################### DIGIT RECOGNITION ################################
##########################################################################
# define the dictionary of digit segments:
# keys are 7-tuples of segment states (1 = lit) in the order produced by
# search4segments: (top, top-left, top-right, center, bottom-left,
# bottom-right, bottom); values are the decoded decimal digits
DIGITS_LOOKUP = {
    (1, 1, 1, 0, 1, 1, 1): 0,
    # (0, 0, 1, 0, 0, 1, 0): 1,#calculated differently
    (1, 0, 1, 1, 1, 0, 1): 2,
    (1, 0, 1, 1, 0, 1, 1): 3,
    (0, 1, 1, 1, 0, 1, 0): 4,
    (1, 1, 0, 1, 0, 1, 1): 5,
    (1, 1, 0, 1, 1, 1, 1): 6,
    (1, 0, 1, 0, 0, 1, 0): 7,
    (1, 1, 1, 1, 1, 1, 1): 8,
    (1, 1, 1, 1, 0, 1, 1): 9
}
#creating an output list for recognised digits (None marks a detected but
#unrecognised digit, so positional information is preserved)
digit_rec=[]
for digit in digits:
    dig_H, dig_W = digit.shape
    #rotation angles (degrees) to retry with when the upright digit fails
    rot_pos=[-5,-3,3,5]
    #special case for recognising a one: a tall, narrow, mostly-filled blob
    #TODO: expand the image with the one, as it is on its original position in the digit space (left side)
    if dig_H>3*dig_W and cv2.countNonZero(digit)>0.4*dig_W*dig_H:
        digit_rec.append(1)
        continue
    #if the digit does not fill at least 30% of its bounding box, dilate it
    #to close gaps between the display segments
    #FIX: operate on the current `digit` - the original indexed digits[i]
    #with a stale loop index and therefore always dilated the last digit
    if cv2.countNonZero(digit)<=0.3*dig_W*dig_H:
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(int(digit.shape[1] / 5), int(digit.shape[1] / 5)))
        cv2.morphologyEx(digit, cv2.MORPH_DILATE, kernel, digit)
    #try the upright digit first, then each small rotation
    #FIX: every computed segment pattern is now looked up; the original loop
    #computed the pattern for the last rotation but never tested it, and
    #compared the trial counter against a hard-coded 4 instead of len(rot_pos)
    recognised = None
    for angle in [0] + rot_pos:
        probe = digit if angle == 0 else rotate_image(digit, angle)
        on = tuple(search4segments(probe))
        if on in DIGITS_LOOKUP:
            recognised = DIGITS_LOOKUP[on]
            break
    digit_rec.append(recognised)
if show_images:
    #display each digit with its recognised value as the subplot title
    _, axs = plt.subplots(1, len(digits))
    for i in range(len(digits)):
        axs[i].imshow(digits[i],cmap='binary')
        axs[i].title.set_text(digit_rec[i])
    plt.show()
#Print the resulting vector of recognised digits (None = unrecognised)
print(digit_rec)