DocumentAI - Custom Extractor no entities

46 views Asked by At

I just trained a custom extractor in Document AI. When I test it there, I get the values for the tags that I created. However, when I follow the Sample Request for Python (here's the code sample), I get no entities or texts, just the pages with the paragraphs and lines.

def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    credentials_path: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.ProcessResponse:
    """Send a local file to a Document AI processor and return the response.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor region; used to build the regional API endpoint.
        processor_id: ID of the processor to call.
        processor_version: ID of the processor version to call. Used when
            ``processor_version_id`` is not given.
        file_path: Path of the file to process; read fully into memory.
        mime_type: MIME type of the file, e.g. ``application/pdf``.
        credentials_path: Path to a service-account JSON key file.
        field_mask: Optional field mask forwarded to the request to limit
            which fields of the document are returned.
        processor_version_id: ID of the processor version to call. Takes
            precedence over ``processor_version`` when both are set.
        process_options: Optional extra processing options (for example a
            page selector).

    Returns:
        The value returned by ``client.process_document``. NOTE(review): per
        the client library docs this is a ``ProcessResponse`` and the parsed
        document (including ``entities``) is on its ``document`` attribute.
    """
    credentials = service_account.Credentials.from_service_account_file(
        credentials_path
    )
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        ),
        credentials=credentials,
    )

    # Honour either spelling of the version argument. Previously only
    # `processor_version_id` was consulted, so a version supplied through
    # `processor_version` was silently dropped and the request went to the
    # processor's default version instead of the trained one.
    version_id = processor_version_id or processor_version
    if version_id:
        name = client.processor_version_path(
            project_id, location, processor_id, version_id
        )
    else:
        name = client.processor_path(project_id, location, processor_id)

    # Read the file into memory
    with open(file_path, "rb") as document_file:
        document_content = document_file.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(
            content=document_content, mime_type=mime_type
        ),
        # Forward the caller's mask; it was accepted but ignored before.
        field_mask=field_mask,
        process_options=process_options,
    )

    return client.process_document(request=request)

def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    credentials_path: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> documentai.ProcessResponse:
    """Process page 1 of a file with a Document AI processor.

    Builds process options that select only page 1 and delegates to
    ``process_document``.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor region.
        processor_id: ID of the processor to call.
        file_path: Path of the file to process.
        mime_type: MIME type of the file.
        credentials_path: Path to a service-account JSON key file.
        field_mask: Optional field mask passed through to ``process_document``.
        processor_version_id: Optional ID of the processor version to call;
            when omitted, ``process_document`` targets the processor itself.

    Returns:
        Whatever ``process_document`` returns.
    """
    process_options = documentai.ProcessOptions(
        # Process only specific pages
        individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
            pages=[1]
        ),
    )

    return process_document(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        # `processor_version` is a required parameter of process_document,
        # so it must still be supplied.
        processor_version=processor_version_id,
        # Also pass the id under the keyword process_document actually
        # checks; without this the version was dropped and the request went
        # to the default processor version.
        processor_version_id=processor_version_id,
        file_path=file_path,
        mime_type=mime_type,
        credentials_path=credentials_path,
        field_mask=field_mask,
        process_options=process_options,
    )

Am I missing something that would cause the entities to be empty?

This is the response:

{
    "uri": "",
    "mime_type": "application/pdf",
    "text": "File content here",
    "pages": {
        "page_number": 1,
        "dimension": {
            "width": 1758.0,
            "height": 2275.0,
            "unit": "pixels",
        },
        "layout": {
            "text_anchor": {
                "text_segments": {
                    "end_index": 1434,
                },
            },
            "confidence": 0.9815895557403564,
            "bounding_poly": {
                "vertices": {},
                "vertices": {
                    "x": 1758,
                },
                "vertices": {
                    "x": 1758,
                    "y": 2275,
                },
                "vertices": {
                    "y": 2275,
                },
                "normalized_vertices": {},
                "normalized_vertices": {
                    "x": 1.0,
                },
                "normalized_vertices": {
                    "x": 1.0,
                    "y": 1.0,
                },
                "normalized_vertices": {
                    "y": 1.0,
                },
            },
            "orientation": PAGE_UP,
        },
        "detected_languages": {
            "language_code": "es",
            "confidence": 0.7957947850227356,
        },
        "detected_languages": {
            "language_code": "pt",
            "confidence": 0.045905839651823044,
        },
        "detected_languages": {
            "language_code": "gn",
            "confidence": 0.03847753629088402,
        },
        "detected_languages": {
            "language_code": "en",
            "confidence": 0.03831394389271736,
        },
        "blocks": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "end_index": 13,
                    },
                },
                "confidence": 0.8404966592788696,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.16837315261363983,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.29522183537483215,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.29522183537483215,
                        "y": 0.09758241474628448,
                    },
                    "normalized_vertices": {
                        "x": 0.16837315261363983,
                        "y": 0.09758241474628448,
                    },
                },
                "orientation": PAGE_UP,
            },
        },
        "blocks": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "start_index": 13,
                        "end_index": 34,
                    },
                },
                "confidence": 0.982234001159668,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.17121729254722595,
                        "y": 0.13582417368888855,
                    },
                    "normalized_vertices": {
                        "x": 0.38168373703956604,
                        "y": 0.13846154510974884,
                    },
                    "normalized_vertices": {
                        "x": 0.3811149001121521,
                        "y": 0.15252746641635895,
                    },
                    "normalized_vertices": {
                        "x": 0.1706484705209732,
                        "y": 0.14989010989665985,
                    },
                },
                "orientation": PAGE_UP,
            },
        },
        "blocks": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "start_index": 34,
                        "end_index": 183,
                    },
                },
                "confidence": 0.9857864379882812,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.1706484705209732,
                        "y": 0.1599999964237213,
                    },
                    "normalized_vertices": {
                        "x": 0.7792946696281433,
                        "y": 0.1652747243642807,
                    },
                    "normalized_vertices": {
                        "x": 0.778725802898407,
                        "y": 0.2074725329875946,
                    },
                    "normalized_vertices": {
                        "x": 0.17007963359355927,
                        "y": 0.20219780504703522,
                    },
                },
                "orientation": PAGE_UP,
            },
        },
        "paragraphs": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "end_index": 13,
                    },
                },
                "confidence": 0.8404966592788696,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.16837315261363983,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.29522183537483215,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.29522183537483215,
                        "y": 0.09758241474628448,
                    },
                    "normalized_vertices": {
                        "x": 0.16837315261363983,
                        "y": 0.09758241474628448,
                    },
                },
                "orientation": PAGE_UP,
            },
        },
        "paragraphs": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "start_index": 13,
                        "end_index": 34,
                    },
                },
                "confidence": 0.982234001159668,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.17121729254722595,
                        "y": 0.13582417368888855,
                    },
                    "normalized_vertices": {
                        "x": 0.38168373703956604,
                        "y": 0.13846154510974884,
                    },
                    "normalized_vertices": {
                        "x": 0.3811149001121521,
                        "y": 0.15252746641635895,
                    },
                    "normalized_vertices": {
                        "x": 0.1706484705209732,
                        "y": 0.14989010989665985,
                    },
                },
                "orientation": PAGE_UP,
            },
        },
        "paragraphs": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "start_index": 34,
                        "end_index": 40,
                    },
                },
                "confidence": 0.9939692616462708,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.1706484705209732,
                        "y": 0.1599999964237213,
                    },
                    "normalized_vertices": {
                        "x": 0.20705346763134003,
                        "y": 0.16043956577777863,
                    },
                    "normalized_vertices": {
                        "x": 0.20705346763134003,
                        "y": 0.1683516502380371,
                    },
                    "normalized_vertices": {
                        "x": 0.1706484705209732,
                        "y": 0.1679120808839798,
                    },
                },
                "orientation": PAGE_UP,
            },
        },
        "lines": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "end_index": 13,
                    },
                },
                "confidence": 0.8404966592788696,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.16837315261363983,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.29522183537483215,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.29522183537483215,
                        "y": 0.09758241474628448,
                    },
                    "normalized_vertices": {
                        "x": 0.16837315261363983,
                        "y": 0.09758241474628448,
                    },
                },
                "orientation": PAGE_UP,
            },
            "detected_languages": {
                "language_code": "es",
                "confidence": 1.0,
            },
        },
        "lines": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "start_index": 13,
                        "end_index": 34,
                    },
                },
                "confidence": 0.982234001159668,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.17121729254722595,
                        "y": 0.13582417368888855,
                    },
                    "normalized_vertices": {
                        "x": 0.38168373703956604,
                        "y": 0.13846154510974884,
                    },
                    "normalized_vertices": {
                        "x": 0.3811149001121521,
                        "y": 0.15252746641635895,
                    },
                    "normalized_vertices": {
                        "x": 0.1706484705209732,
                        "y": 0.14989010989665985,
                    },
                },
                "orientation": PAGE_UP,
            },
            "detected_languages": {
                "language_code": "es",
                "confidence": 1.0,
            },
        },
        "lines": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "start_index": 34,
                        "end_index": 40,
                    },
                },
                "confidence": 0.9939692616462708,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.1706484705209732,
                        "y": 0.1599999964237213,
                    },
                    "normalized_vertices": {
                        "x": 0.20705346763134003,
                        "y": 0.16043956577777863,
                    },
                    "normalized_vertices": {
                        "x": 0.20705346763134003,
                        "y": 0.1683516502380371,
                    },
                    "normalized_vertices": {
                        "x": 0.1706484705209732,
                        "y": 0.1679120808839798,
                    },
                },
                "orientation": PAGE_UP,
            },
            "detected_languages": {
                "language_code": "pt",
                "confidence": 1.0,
            },
        },
        "tokens": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "end_index": 1,
                    },
                },
                "confidence": 0.562225878238678,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.16837315261363983,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.17349261045455933,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.17349261045455933,
                        "y": 0.09758241474628448,
                    },
                    "normalized_vertices": {
                        "x": 0.16837315261363983,
                        "y": 0.09758241474628448,
                    },
                },
                "orientation": PAGE_UP,
            },
            "detected_languages": {
                "language_code": "es",
                "confidence": 1.0,
            },
        },
        "tokens": {
            "layout": {
                "text_anchor": {
                    "text_segments": {
                        "start_index": 1,
                        "end_index": 5,
                    },
                },
                "confidence": 0.9125303626060486,
                "bounding_poly": {
                    "normalized_vertices": {
                        "x": 0.17292377352714539,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.2133105844259262,
                        "y": 0.07868131995201111,
                    },
                    "normalized_vertices": {
                        "x": 0.2133105844259262,
                        "y": 0.09758241474628448,
                    },
                    "normalized_vertices": {
                        "x": 0.17292377352714539,
                        "y": 0.09758241474628448,
                    },
                },
                "orientation": PAGE_UP,
            },
            "detected_break": {
                "type_": SPACE,
            },
            "detected_languages": {
                "language_code": "es",
                "confidence": 1.0,
            },
        },
        "image": {
            "content": "",
            "mime_type": "image/png",
            "width": 1758,
            "height": 2275,
        },
    },
},
1

There is 1 answer

0
Holt Skinner On

Make sure that you follow all of the steps as outlined in this guide to train and deploy the Custom Document Extractor processor version.

https://cloud.google.com/document-ai/docs/workbench/build-custom-processor

My guess would be that the processor version wasn't deployed or you aren't sending the request to the trained processor version.

Make sure to either explicitly send the processor version id in the request or set the trained processor version as default.

https://cloud.google.com/document-ai/docs/workbench/build-custom-processor#deploy_the_processor_version