I am using pytesseract and OpenCV to erase the table borders in an image so that I can extract the text from it with Tesseract.
This is the source code I wrote, based on this post: "What's the way to remove all lines and borders in image(keep texts) programmatically?"
import os

import cv2
import pytesseract

try:
    from PIL import Image
except ImportError:
    import Image
# Location of the Tesseract executable that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

INPUT_PATH = "D://ocrtestimg//id.jpg"

# Length (in pixels) of the structuring element used to detect table rules.
LINE_LENGTH = 30
# Detected rules are painted over with this colour and stroke width.
ERASE_COLOR = (255, 255, 255)
ERASE_THICKNESS = 5
# Largest break (in pixels) in a vertical rule that is bridged before
# detection. Tune this to the scan; too large a value may start joining
# vertically adjacent text into false lines.
MAX_VERTICAL_GAP = 5


def _erase_lines(mask, canvas, line_kernel_size, gap_kernel_size=None):
    """Paint over long straight lines found in ``mask`` on ``canvas``.

    ``mask`` is the inverted binary image (ink is white). ``canvas`` is the
    colour image that is modified in place. ``line_kernel_size`` is the
    ``(width, height)`` of the rectangular kernel used for the opening that
    isolates the lines. When ``gap_kernel_size`` is given, a closing with
    that kernel is applied first so that small breaks in a line are filled
    and the broken pieces are detected as one line.
    """
    if gap_kernel_size is not None:
        gap_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, gap_kernel_size)
        # Closing fills background gaps narrower than the kernel; without it
        # the opening below discards every fragment shorter than the kernel.
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, gap_kernel)

    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, line_kernel_size)
    lines = cv2.morphologyEx(mask, cv2.MORPH_OPEN, line_kernel, iterations=2)

    found = cv2.findContours(lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version.
    contours = found[0] if len(found) == 2 else found[1]
    for contour in contours:
        # Draw the contour in the erase colour to wipe the line out.
        cv2.drawContours(canvas, [contour], -1, ERASE_COLOR, ERASE_THICKNESS)


def main():
    """Remove table rules from the input image, then OCR what is left."""
    image = cv2.imread(INPUT_PATH)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("Could not read image: {}".format(INPUT_PATH))

    result = image.copy()
    gray = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]

    # Horizontal rules: detected as-is. No gap closing here, because closing
    # along a text row could merge neighbouring letters into a false line.
    _erase_lines(thresh, result, (LINE_LENGTH, 1))
    # Vertical rules: bridge small breaks first so broken rules are found.
    _erase_lines(
        thresh,
        result,
        (1, LINE_LENGTH),
        gap_kernel_size=(1, MAX_VERTICAL_GAP + 1),
    )

    cv2.imshow('thresh', thresh)
    cv2.imshow('result', result)
    cv2.imwrite('result.png', result)
    cv2.waitKey()

    # Simple image to string - OCR. The grayscale array is handed to
    # pytesseract directly, so no temporary file has to be written and
    # cleaned up.
    ocr_input = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(ocr_input, lang='eng')
    print(text)

    cv2.imshow("Image", result)
    cv2.waitKey(0)


if __name__ == "__main__":
    main()
The image quality is poor, so the vertical lines in the table are broken. What additional processing do I need to do to erase these broken lines? Even if I increase the DPI when importing the image, or enlarge the photo, the broken vertical lines are not erased.