I want to use a preprocessed image as the input to `infer_text`, which returns annotations. So, how can I do this — what should I pass to the `infer_text` function?
# Root folder scanned recursively for *.pdf files to preprocess in place.
image_folder = Path("/home/Tasks/NM_spanish/Invoices_pdf")
def get_preprocessed_image(image_path: pathlib.Path) -> "PIL.Image.Image | None":
    """Run ABBYY preprocessing on the file at *image_path*.

    Args:
        image_path: Path to the raw image/PDF file to preprocess.

    Returns:
        The preprocessed image, or ``None`` when preprocessing fails.
        (The original annotation claimed a non-optional image, but the
        except branch falls through and returns ``None`` implicitly —
        made explicit here so callers know to check.)
    """
    try:
        # read_bytes() replaces the manual open("rb")/read() pair.
        raw = image_path.read_bytes()
        return PIL.Image.open(io.BytesIO(pyabbyy.preprocess(raw)))
    except (RuntimeError, AttributeError):
        # Best-effort batch processing: report the failing path and
        # keep going rather than aborting the whole run.
        print(image_path)
        return None
def preprocess_image_folder_(image_folder: pathlib.Path) -> None:
    """Preprocess every ``*.pdf`` under *image_folder*, overwriting each file.

    Files that fail to preprocess or save are reported (printed) and
    skipped, so one bad file does not stop the batch.

    Args:
        image_folder: Directory searched recursively for PDF files.
    """
    for image_path in tqdm.tqdm(list(image_folder.rglob("*.pdf"))):
        image = get_preprocessed_image(image_path)
        if image is None:
            # Preprocessing failed and was already reported; the original
            # code hit this as an AttributeError on None.save(...) instead.
            continue
        try:
            # NOTE(review): this overwrites the source PDF in place —
            # confirm the originals are backed up elsewhere.
            image.save(image_path)
        except (RuntimeError, AttributeError):
            print(image_path)
def infer_text(
    image_path: pathlib.Path, *, preprocess: bool = False
) -> graphanno.GraphAnnotation:
    """OCR the file at *image_path* and wrap the words in a graph annotation.

    Args:
        image_path: Path to the image/PDF file to read.
        preprocess: When ``True``, ask pyabbyy to preprocess the raw bytes
            before OCR, so files that were never run through
            ``preprocess_image_folder_`` can be passed directly.  Defaults
            to ``False`` (original behavior) for already-preprocessed files.

    Returns:
        A ``GraphAnnotation`` with one node per recognized word and three
        all-zero (edgeless) adjacency matrices.
    """
    words = pyabbyy.read_text(image_path.read_bytes(), preprocess=preprocess)
    # One node per word; the box converts pyabbyy's (origin, max) corner
    # coordinates into an origin + width/height representation.
    nodes = tuple(
        graphanno.Node(
            text=word["text"],
            box=geometric.Box(
                origin_x=word["origin_x"],
                origin_y=word["origin_y"],
                width=word["max_x"] - word["origin_x"],
                height=word["max_y"] - word["origin_y"],
            ),
        )
        for word in words
    )
    num_nodes = len(nodes)

    def _empty_adjacency() -> graphanno.Adjacency:
        # Edges are presumably filled in later; start with no connections.
        return graphanno.Adjacency(np.zeros((num_nodes, num_nodes)))

    return graphanno.GraphAnnotation(
        nodes, _empty_adjacency(), _empty_adjacency(), _empty_adjacency()
    )
# Script entry point: preprocess every PDF under image_folder in place.
if __name__ == "__main__":
    preprocess_image_folder_(image_folder)
You want to use a preprocessed image in `infer_text`, but `get_preprocessed_image` expects an image path — which you already receive as the `image_path` parameter of `infer_text`. All you have to do is call `get_preprocessed_image(image_path)` inside `infer_text`, store the result in a variable, and then use that variable wherever you need the image inside `infer_text`. Hope it helps.