I just trained a custom extractor in Document AI and tested it in the console, where I get values for the labels that I created. However, when I follow the Sample Request
for Python (here's the code sample), I get no entities or field values, just the pages with their paragraphs and lines:
def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    credentials_path: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    """Send a local file to a Document AI processor and return the result.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor region; used to build the regional API endpoint
            ``{location}-documentai.googleapis.com``.
        processor_id: ID of the processor to call.
        processor_version: Processor version ID. Used when
            ``processor_version_id`` is not given.
        file_path: Path of the local file to process.
        mime_type: MIME type of the file at ``file_path``.
        credentials_path: Path to a service-account key file.
        field_mask: Optional field mask forwarded on the request.
        processor_version_id: Processor version ID; takes precedence over
            ``processor_version`` when both are set.
        process_options: Optional processing options forwarded on the request.

    Returns:
        Whatever ``client.process_document`` returns for the request.
        NOTE(review): the annotation says ``documentai.Document``; confirm
        whether callers expect the response object or its ``document`` field.
    """
    credentials = service_account.Credentials.from_service_account_file(
        credentials_path
    )
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        ),
        credentials=credentials,
    )

    # Honor either spelling of the version argument. Previously only
    # `processor_version_id` was consulted, so a caller that supplied
    # `processor_version` silently hit the processor's default version.
    version_id = processor_version_id or processor_version
    if version_id:
        name = client.processor_version_path(
            project_id, location, processor_id, version_id
        )
    else:
        name = client.processor_path(project_id, location, processor_id)

    # Read the file into memory
    with open(file_path, "rb") as document_file:
        document_content = document_file.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(
            content=document_content, mime_type=mime_type
        ),
        # Forward the caller's mask; it used to be accepted and then dropped.
        field_mask=field_mask,
        process_options=process_options,
    )
    return client.process_document(request=request)
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    credentials_path: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> documentai.Document:
    """Process only the first page of a file with ``process_document``.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor region.
        processor_id: ID of the processor to call.
        file_path: Path of the local file to process.
        mime_type: MIME type of the file at ``file_path``.
        credentials_path: Path to a service-account key file.
        field_mask: Optional field mask passed through to ``process_document``.
        processor_version_id: Optional processor version ID to target.

    Returns:
        The value returned by ``process_document``.
    """
    process_options = documentai.ProcessOptions(
        # Process only specific pages
        individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
            pages=[1]
        ),
    )
    return process_document(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        # `processor_version` is a required parameter of process_document, so
        # it is still supplied; the resource path is selected by
        # `processor_version_id`, which must be passed too or the request
        # falls back to the processor's default version.
        processor_version=processor_version_id,
        processor_version_id=processor_version_id,
        file_path=file_path,
        mime_type=mime_type,
        credentials_path=credentials_path,
        field_mask=field_mask,
        process_options=process_options,
    )
Am I missing something that would cause the entities to be empty?
This is the response:
{
"uri": "",
"mime_type": "application/pdf",
"text": "File content here",
"pages": {
"page_number": 1,
"dimension": {
"width": 1758.0,
"height": 2275.0,
"unit": "pixels",
},
"layout": {
"text_anchor": {
"text_segments": {
"end_index": 1434,
},
},
"confidence": 0.9815895557403564,
"bounding_poly": {
"vertices": {},
"vertices": {
"x": 1758,
},
"vertices": {
"x": 1758,
"y": 2275,
},
"vertices": {
"y": 2275,
},
"normalized_vertices": {},
"normalized_vertices": {
"x": 1.0,
},
"normalized_vertices": {
"x": 1.0,
"y": 1.0,
},
"normalized_vertices": {
"y": 1.0,
},
},
"orientation": PAGE_UP,
},
"detected_languages": {
"language_code": "es",
"confidence": 0.7957947850227356,
},
"detected_languages": {
"language_code": "pt",
"confidence": 0.045905839651823044,
},
"detected_languages": {
"language_code": "gn",
"confidence": 0.03847753629088402,
},
"detected_languages": {
"language_code": "en",
"confidence": 0.03831394389271736,
},
"blocks": {
"layout": {
"text_anchor": {
"text_segments": {
"end_index": 13,
},
},
"confidence": 0.8404966592788696,
"bounding_poly": {
"normalized_vertices": {
"x": 0.16837315261363983,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.29522183537483215,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.29522183537483215,
"y": 0.09758241474628448,
},
"normalized_vertices": {
"x": 0.16837315261363983,
"y": 0.09758241474628448,
},
},
"orientation": PAGE_UP,
},
},
"blocks": {
"layout": {
"text_anchor": {
"text_segments": {
"start_index": 13,
"end_index": 34,
},
},
"confidence": 0.982234001159668,
"bounding_poly": {
"normalized_vertices": {
"x": 0.17121729254722595,
"y": 0.13582417368888855,
},
"normalized_vertices": {
"x": 0.38168373703956604,
"y": 0.13846154510974884,
},
"normalized_vertices": {
"x": 0.3811149001121521,
"y": 0.15252746641635895,
},
"normalized_vertices": {
"x": 0.1706484705209732,
"y": 0.14989010989665985,
},
},
"orientation": PAGE_UP,
},
},
"blocks": {
"layout": {
"text_anchor": {
"text_segments": {
"start_index": 34,
"end_index": 183,
},
},
"confidence": 0.9857864379882812,
"bounding_poly": {
"normalized_vertices": {
"x": 0.1706484705209732,
"y": 0.1599999964237213,
},
"normalized_vertices": {
"x": 0.7792946696281433,
"y": 0.1652747243642807,
},
"normalized_vertices": {
"x": 0.778725802898407,
"y": 0.2074725329875946,
},
"normalized_vertices": {
"x": 0.17007963359355927,
"y": 0.20219780504703522,
},
},
"orientation": PAGE_UP,
},
},
"paragraphs": {
"layout": {
"text_anchor": {
"text_segments": {
"end_index": 13,
},
},
"confidence": 0.8404966592788696,
"bounding_poly": {
"normalized_vertices": {
"x": 0.16837315261363983,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.29522183537483215,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.29522183537483215,
"y": 0.09758241474628448,
},
"normalized_vertices": {
"x": 0.16837315261363983,
"y": 0.09758241474628448,
},
},
"orientation": PAGE_UP,
},
},
"paragraphs": {
"layout": {
"text_anchor": {
"text_segments": {
"start_index": 13,
"end_index": 34,
},
},
"confidence": 0.982234001159668,
"bounding_poly": {
"normalized_vertices": {
"x": 0.17121729254722595,
"y": 0.13582417368888855,
},
"normalized_vertices": {
"x": 0.38168373703956604,
"y": 0.13846154510974884,
},
"normalized_vertices": {
"x": 0.3811149001121521,
"y": 0.15252746641635895,
},
"normalized_vertices": {
"x": 0.1706484705209732,
"y": 0.14989010989665985,
},
},
"orientation": PAGE_UP,
},
},
"paragraphs": {
"layout": {
"text_anchor": {
"text_segments": {
"start_index": 34,
"end_index": 40,
},
},
"confidence": 0.9939692616462708,
"bounding_poly": {
"normalized_vertices": {
"x": 0.1706484705209732,
"y": 0.1599999964237213,
},
"normalized_vertices": {
"x": 0.20705346763134003,
"y": 0.16043956577777863,
},
"normalized_vertices": {
"x": 0.20705346763134003,
"y": 0.1683516502380371,
},
"normalized_vertices": {
"x": 0.1706484705209732,
"y": 0.1679120808839798,
},
},
"orientation": PAGE_UP,
},
},
"lines": {
"layout": {
"text_anchor": {
"text_segments": {
"end_index": 13,
},
},
"confidence": 0.8404966592788696,
"bounding_poly": {
"normalized_vertices": {
"x": 0.16837315261363983,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.29522183537483215,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.29522183537483215,
"y": 0.09758241474628448,
},
"normalized_vertices": {
"x": 0.16837315261363983,
"y": 0.09758241474628448,
},
},
"orientation": PAGE_UP,
},
"detected_languages": {
"language_code": "es",
"confidence": 1.0,
},
},
"lines": {
"layout": {
"text_anchor": {
"text_segments": {
"start_index": 13,
"end_index": 34,
},
},
"confidence": 0.982234001159668,
"bounding_poly": {
"normalized_vertices": {
"x": 0.17121729254722595,
"y": 0.13582417368888855,
},
"normalized_vertices": {
"x": 0.38168373703956604,
"y": 0.13846154510974884,
},
"normalized_vertices": {
"x": 0.3811149001121521,
"y": 0.15252746641635895,
},
"normalized_vertices": {
"x": 0.1706484705209732,
"y": 0.14989010989665985,
},
},
"orientation": PAGE_UP,
},
"detected_languages": {
"language_code": "es",
"confidence": 1.0,
},
},
"lines": {
"layout": {
"text_anchor": {
"text_segments": {
"start_index": 34,
"end_index": 40,
},
},
"confidence": 0.9939692616462708,
"bounding_poly": {
"normalized_vertices": {
"x": 0.1706484705209732,
"y": 0.1599999964237213,
},
"normalized_vertices": {
"x": 0.20705346763134003,
"y": 0.16043956577777863,
},
"normalized_vertices": {
"x": 0.20705346763134003,
"y": 0.1683516502380371,
},
"normalized_vertices": {
"x": 0.1706484705209732,
"y": 0.1679120808839798,
},
},
"orientation": PAGE_UP,
},
"detected_languages": {
"language_code": "pt",
"confidence": 1.0,
},
},
"tokens": {
"layout": {
"text_anchor": {
"text_segments": {
"end_index": 1,
},
},
"confidence": 0.562225878238678,
"bounding_poly": {
"normalized_vertices": {
"x": 0.16837315261363983,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.17349261045455933,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.17349261045455933,
"y": 0.09758241474628448,
},
"normalized_vertices": {
"x": 0.16837315261363983,
"y": 0.09758241474628448,
},
},
"orientation": PAGE_UP,
},
"detected_languages": {
"language_code": "es",
"confidence": 1.0,
},
},
"tokens": {
"layout": {
"text_anchor": {
"text_segments": {
"start_index": 1,
"end_index": 5,
},
},
"confidence": 0.9125303626060486,
"bounding_poly": {
"normalized_vertices": {
"x": 0.17292377352714539,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.2133105844259262,
"y": 0.07868131995201111,
},
"normalized_vertices": {
"x": 0.2133105844259262,
"y": 0.09758241474628448,
},
"normalized_vertices": {
"x": 0.17292377352714539,
"y": 0.09758241474628448,
},
},
"orientation": PAGE_UP,
},
"detected_break": {
"type_": SPACE,
},
"detected_languages": {
"language_code": "es",
"confidence": 1.0,
},
},
"image": {
"content": "",
"mime_type": "image/png",
"width": 1758,
"height": 2275,
},
},
},
Make sure that you follow all of the steps as outlined in this guide to train and deploy the Custom Document Extractor processor version.
https://cloud.google.com/document-ai/docs/workbench/build-custom-processor
My guess would be that the processor version wasn't deployed or you aren't sending the request to the trained processor version.
Make sure to either explicitly send the processor version id in the request or set the trained processor version as default.
https://cloud.google.com/document-ai/docs/workbench/build-custom-processor#deploy_the_processor_version