I am trying to create a search solution using Azure AI Search/cognitive search. I need to chunk the data, so that the retrieved text is limited and more relevant. I also need to implement a Hybrid search over the data hence trying out the embedding creation as well.
I tried using splittext and AzureOpenaiEmbedding skillsets however they are not getting indexed. My intention is to use OCR ,Key phrase extraction etc and get them collated using merge skill and then do chunking and embeddings over it. Also I would like to add Incremental indexing [updating the indexes without redoing the whole indexing] as an additional feature.
I am not sure where I am going wrong. any help will be highly appreciated.
The following is what I tried:
def create_skillset(skillset_name, uri, headers):
print("Trying out updated 2 set ------------------------->")
skillset = {
"description": "Skillset created from the portal. skillsetName: azureblob-skillsetn; contentField: merged_content; enrichmentGranularity: document; knowledgeStoreStorageAccount: ;",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Text.V3.EntityRecognitionSkill",
"name": "#1",
"context": "/document/merged_content",
"categories": [
"Product",
"PhoneNumber",
"Person",
"Quantity",
"Organization",
"IPAddress",
"URL",
"Email",
"Event",
"Skill",
"Location",
"PersonType",
"Address",
"DateTime"
],
"defaultLanguageCode": "en",
"inputs": [
{
"name": "text",
"source": "/document/merged_content"
}
],
"outputs": [
{
"name": "persons",
"targetName": "people"
},
{
"name": "organizations",
"targetName": "organizations"
},
{
"name": "locations",
"targetName": "locations"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Text.KeyPhraseExtractionSkill",
"name": "#2",
"context": "/document/merged_content",
"defaultLanguageCode": "en",
"inputs": [
{
"name": "text",
"source": "/document/merged_content"
}
],
"outputs": [
{
"name": "keyPhrases",
"targetName": "keyphrases"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
"textSplitMode": "pages",
"maximumPageLength": 500,
"pageOverlapLength": 50,
"maximumPagesToTake": 1,
"defaultLanguageCode": "en",
"context": "/document",
"inputs": [
{
"name": "text",
"source": "/document/merged_text"
}
],
"outputs": [
{
"name": "textItems",
"targetName": "pages"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Text.TranslationSkill",
"name": "#3",
"context": "/document/merged_content",
"defaultToLanguageCode": "en",
"suggestedFrom": "en",
"inputs": [
{
"name": "text",
"source": "/document/merged_content"
}
],
"outputs": [
{
"name": "translatedText",
"targetName": "translated_text"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Text.MergeSkill",
"name": "#4",
"context": "/document",
"insertPreTag": " ",
"insertPostTag": " ",
"inputs": [
{
"name": "text",
"source": "/document/content"
},
{
"name": "itemsToInsert",
"source": "/document/normalized_images/*/text"
},
{
"name": "offsets",
"source": "/document/normalized_images/*/contentOffset"
}
],
"outputs": [
{
"name": "mergedText",
"targetName": "merged_content"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
"description": "Connects a deployed embedding model.",
"resourceUri": "https://MYRESOURCE.openai.azure.com/",
"deploymentId": "text-embedding-ada-002",
"inputs": [
{
"name": "text",
"source": "/document/merged_content"
}
],
"outputs": [
{
"name": "embedding"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
"name": "#5",
"context": "/document/normalized_images/*",
"lineEnding": "Space",
"defaultLanguageCode": "en",
"inputs": [
{
"name": "image",
"source": "/document/normalized_images/*"
}
],
"outputs": [
{
"name": "text",
"targetName": "text"
},
{
"name": "layoutText",
"targetName": "layoutText"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Vision.ImageAnalysisSkill",
"name": "#6",
"context": "/document/normalized_images/*",
"defaultLanguageCode": "en",
"visualFeatures": [
"tags",
"description"
],
"details": [],
"inputs": [
{
"name": "image",
"source": "/document/normalized_images/*"
}
],
"outputs": [
{
"name": "tags",
"targetName": "imageTags"
},
{
"name": "description",
"targetName": "imageCaption"
}
]
}
],
"cognitiveServices": {
"@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey",
"description": "",
"key": ""
}
}
resp = requests.put(uri, headers=headers, data=json.dumps(skillset), verify=False)
logging.info(f"Message: {resp.status_code}\n"
f"Message: {resp.ok}\n")
def create_index(index_name, uri, headers):
index = {
"fields": [
{
"name": "id",
"type": "Edm.String",
"key": True,
"searchable": True,
"filterable": True,
"facetable": True,
"sortable": True
},
{
"name": "metadata_storage_name",
"type": "Edm.String",
"searchable": True,
"filterable": False,
"facetable": False,
"sortable": False
},
{
"name": "imageCaption",
"type": "Collection(Edm.String)",
"searchable": True,
"filterable": False,
"facetable": False,
"sortable": False
},
{
"name": "imageTags",
"type": "Collection(Edm.String)",
"searchable": True,
"filterable": False,
"facetable": False,
"sortable": False
},
{
"name": "translated_text",
"type": "Edm.String",
"searchable": True,
"filterable": False,
"facetable": False,
"sortable": False
},
{
"name": "keyphrases",
"type": "Collection(Edm.String)",
"searchable": True,
"filterable": False,
"facetable": False,
"sortable": False
},
{
"name": "merged_content",
"type": "Edm.String",
"searchable": True,
"filterable": True,
"facetable": True,
"sortable": True
},
{
"name": "pages",
"type": "Edm.String",
"searchable": True,
"filterable": True,
"facetable": True,
"sortable": True
},
]
}
resp = requests.put(uri, headers=headers, data=json.dumps(index), verify=False)
logging.info(f"Message: {resp.status_code}\n"
f"Message: {resp.ok}\n")
def create_indexer(indexer_name, datasource_name, index_name, skillset_name, uri, headers):
indexer = {
"name": indexer_name,
"dataSourceName" : datasource_name,
"targetIndexName" : index_name,
"skillsetName" : skillset_name,
"fieldMappings" : [
{
"sourceFieldName" : "metadata_storage_path",
"targetFieldName" : "id",
"mappingFunction" : {"name": "base64Encode"}
},
{
"sourceFieldName" : "metadata_storage_name",
"targetFieldName" : "metadata_storage_name",
}
],
"outputFieldMappings" :
[
{
"sourceFieldName" : "/document/merged_content",
"targetFieldName" : "merged_content"
},
{
"sourceFieldName" : "/document/pages",
"targetFieldName" : "pages"
},
{
"sourceFieldName": "/document/merged_content/translated_text",
"targetFieldName": "translated_text"
},
{
"sourceFieldName" : "/document/merged_content/keyphrases",
"targetFieldName" : "keyphrases"
},
{
"sourceFieldName": "/document/normalized_images/*/imageTags/*/name",
"targetFieldName": "imageTags"
},
{
"sourceFieldName": "/document/normalized_images/*/imageCaption",
"targetFieldName": "imageCaption"
}
],
"parameters":
{
"maxFailedItems": 4,
"maxFailedItemsPerBatch": 4,
"configuration":
{
"dataToExtract": "contentAndMetadata",
"parsingMode": "default",
"firstLineContainsHeaders": True,
"delimitedTextDelimiter": ",",
"imageAction": "generateNormalizedImages"
}
}
}
resp = requests.put(uri, headers=headers, data=json.dumps(indexer), verify=False)
logging.info(f"Message: {resp.status_code}\n"
f"Message: {resp.ok}\n")
*** I am calling these functions like this ***
# Delete already exisitng datasource
uri = f"https://{service_name}.search.windows.net/datasources/{datasource_name}?api-version={api_version}"
resp = requests.delete(uri, headers=headers, verify=False)
uri = f"https://{service_name}.search.windows.net/indexers/{indexer_name}?api-version={api_version}"
# Delete already existing indexer
resp = requests.delete(uri, headers=headers, verify=False)
con_str="hjgjhgkj"
blob_service_client = BlobServiceClient.from_connection_string(con_str)
container_client = blob_service_client.get_container_client(container_name)
uri = f"https://{service_name}.search.windows.net/datasources?api-version={api_version}"
create_datasource(datasource_name, uri, headers,container_name)
time.sleep(15)
uri = f"https://{service_name}.search.windows.net/skillsets/{skillset_name}?api-version={api_version}"
#Delete already existing skillset
resp = requests.delete(uri, headers=headers, verify=False)
create_skillset(skillset_name, uri, headers)
uri = f"https://{service_name}.search.windows.net/indexes/{index_name}?api-version={api_version}"
# Delete already existing index
resp = requests.delete(uri, headers=headers, verify=False)
create_index(index_name, uri, headers)
uri = f"https://{service_name}.search.windows.net/indexers/{indexer_name}?api-version={api_version}"
create_indexer(indexer_name, datasource_name, index_name, skillset_name, uri, headers)
uri = f"https://{service_name}.search.windows.net/indexers/{indexer_name}/status?api-version={api_version}"
resp = requests.get(uri, headers=headers, verify=False)
The error i get is as follows:
{"error":{"code":"InvalidRequestParameter","message":"The request is invalid. Details: skillset : One or more skills are invalid. Details: Unexpected properties found on Skill.","details":[{"code":"InvalidSkillset","message":"One or more skills are invalid. Details: Unexpected properties found on Skill. Parameters: skillset"}]}}
the spittext and openai embeddings are the two new skillsets that i added, rest of it are working, I followed the azure documentation while creating these skillsets. Please advice on where am I going wrong.
The issue is that, the following 2 parameters are not available, even though it is thus mentioned in azure documentation. take these out and that will fix the splittext skill issue:
"maximumPagesToTake": 1,
for AzureOpenAIEmbeddingSkill, it is available only on 2023-10-01-preview api. you just need to change the api parameter while creating the skillset, it should work.