pytesseract Erase table borders as delicately as possible

565 views Asked by At

I am using OpenCV to erase the table borders in an image so that I can then extract the text from it with pytesseract (Tesseract).

This is the source code I wrote based on this post: "What's the way to remove all lines and borders in an image (keep texts) programmatically?"

import cv2
import os
try:
    from PIL import Image
except ImportError:
     import Image
import pytesseract

# Tell pytesseract where the Tesseract executable lives.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

image_path = "D://ocrtestimg//id.jpg"
image = cv2.imread(image_path)
# cv2.imread does not raise on a missing/unreadable file; it returns None.
# Fail here with a clear message instead of an AttributeError on .copy().
if image is None:
    raise IOError("Could not read image: {}".format(image_path))
# Work on a copy so the loaded image stays untouched.
result = image.copy()

gray = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)
# Inverted binary threshold with Otsu's automatic level: dark pixels
# (text and table lines) become 255 in the mask, the background becomes 0.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Erase table lines in two passes over the same threshold mask: first with a
# wide, 1-pixel-tall kernel (horizontal lines), then with a 1-pixel-wide,
# tall kernel (vertical lines). The morphological opening keeps only the long
# straight runs that match the kernel, and the contours of those runs are
# painted over in white on `result`.
for kernel_size in ((30, 1), (1, 30)):
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    line_mask = cv2.morphologyEx(
        thresh, cv2.MORPH_OPEN, line_kernel, iterations=2
    )
    found = cv2.findContours(
        line_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    # findContours returns either 2 or 3 values depending on the OpenCV
    # version; pick the element that holds the contour list.
    contours = found[0] if len(found) == 2 else found[1]
    for contour in contours:
        # Draw the contour in white, 5 px thick, to cover the line.
        cv2.drawContours(result, [contour], -1, (255, 255, 255), 5)

# Show the threshold mask and the cleaned image, each in its own window.
for window_title, preview in (('thresh', thresh), ('result', result)):
    cv2.imshow(window_title, preview)
# Save the cleaned image, then wait for a key press before continuing.
cv2.imwrite('result.png', result)
cv2.waitKey()

# Reload the cleaned image from disk and convert it to grayscale for OCR.
image = cv2.imread('result.png')
# cv2.imread returns None rather than raising when the file cannot be read.
if image is None:
    raise IOError("Could not read image: result.png")
result = image.copy()
gray = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)

# Temporary file named after the current process id.
filename = "{}.png".format(os.getpid())
print(filename)
# cv2.imwrite reports failure through its return value, not an exception.
if not cv2.imwrite(filename, gray):
    raise IOError("Could not write temporary image: {}".format(filename))

# Simple image to string - OCR
try:
    # Close the image handle before the file is deleted; an open handle can
    # make os.remove fail on Windows.
    with Image.open(filename) as ocr_input:
        text = pytesseract.image_to_string(ocr_input, lang='eng')
finally:
    # Remove the temporary file even when OCR raises.
    os.remove(filename)
print(text)

cv2.imshow("Image", image)
cv2.waitKey(0)

enter image description here

image with borders removed enter image description here

The image quality is poor, so the vertical lines in the table are broken and are not erased. What additional work do I need to do to erase these broken lines? Even if I increase the DPI when importing the image, or enlarge the photo, the broken vertical lines are still not erased.

0

There are 0 answers