we want to sort the data using few fileds before applying terms aggregation, tried using terms and sort sub aggregation using which sort is working however terms aggregation is giving duplicate records.
elastic search version: 8.7.1
sample elatic search doc :
{
"itemDetails": {
"itemId": "3076",
"itemId2": "1003918865",
"usecase": "habc",
"usecaseId": "xyz"
},
"metaData": {
"cId": "96ff54507c2d018e5c767785c705a5b2",
"date1": "2023-09-29T12:29:54",
"date2": "2023-09-29T12:30:09"
}
other properties....
}
In the above doc first i would like to sort the records using date1 and date2 then need to get the distinct records using "cId".
using below query able to sort the data based on date1 and date2 however we are getting records with duplicate cId:
Query1:
POST /index/_search?typed_keys=true
{
"size": 0,
"query": {
"bool": {
"must": [
{
"terms": {
"itemDetails.itemId": [
"3076"
]
}
},
{
"terms": {
"itemDetails.usecase": [
"habc"
]
}
},
{
"range": {
"metaData.date1": {
"lte": "2023-09-30T19:55:54.611Z",
"gte": "2023-09-27T19:55:54.611Z"
}
}
}
]
}
},
"aggs": {
"sortby_date1": {
"terms": {
"field": "metaData.date1",
"order": {
"_key": "desc"
},
"size": 6
},
"aggs": {
"sortby_date2": {
"terms": {
"field": "metaData.date2",
"order": {
"_key": "desc"
}
},
"aggs": {
"groupby_cId": {
"terms": {
"field": "metaData.cId"
},
"aggs": {
"top_doc": {
"top_hits": {
"size": 1,
"_source": {
"includes": [
"itemDetails.itemId",
"itemDetails.usecase",
"metaData.cId",
"metaData.date1"
]
}
}
}
}
}
}
}
}
}
}
}
Query2 : Terms aggregation not giving unique records when used with sort.
{
"query": {
"bool": {
"must": [
{
"terms": {
"itemDetails.itemId": [
"3077"
]
}
},
{
"terms": {
"itemDetails.usecase": [
"xyz"
]
}
},
{
"range": {
"metaData.date1": {
"lte": "2023-09-30T19:55:54.611Z",
"gte": "2023-09-27T19:55:54.611Z"
}
}
}
]
}
},
"sort": [
{
"metaData.date1": {
"order": "desc"
}
},
{
"metaData.date2": {
"order": "desc"
}
}
],
"aggs": {
"distinct_cIds": {
"terms": {
"field": "metaData.cId"
},
"aggs": {
"top_doc": {
"top_hits": {
"size": 1,
"_source": {
"includes": [
"itemDetails.itemId",
"itemDetails.usecase",
"metaData.cId",
"metaData.date1"
]
}
}
}
}
}
}
}
current response using the query mentioned in the answer:
{"aggregations": {
"sterms#distinct_cIds": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "96ff54507c2d01",
"doc_count": 101,
"top_hits#top_doc": {
"hits": {
"total": {
"value": 101,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "index",
"_id": "BN0vhoAyg",
"_score": null,
"_source": {
"itemDetails": {
"itemId": "3077",
"usecase": "xyz"
},
"metaData": {
"cId": "96ff54507c2d01",
"date1": "2023-09-29T12:29:54",
"date2": "2023-09-29T13:30:57"
}
},
"sort": [
1695990594000,
1695994257000
]
}
]
}
}
},
{
"key": "846bc8708b746b23674",
"doc_count": 2,
"top_hits#top_doc": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "index",
"_id": "BN0vhoAyg1",
"_score": null,
"_source": {
"itemDetails": {
"itemId": "3077",
"usecase": "xyz"
},
"metaData": {
"cId": "846bc8708b746b23674",
"date1": "2023-09-29T03:34:08",
"date2": "2023-09-29T13:30:57"
}
},
"sort": [
1695990594000,
1695994257000
]
}
]
}
}
},
{
"key": "27053a17a1919f",
"doc_count": 2,
"top_hits#top_doc": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "index",
"_id": "BN0vhoAyg1",
"_score": null,
"_source": {
"itemDetails": {
"itemId": "3077",
"usecase": "xyz"
},
"metaData": {
"cId": "27053a17a1919f",
"date1": "2023-09-27T21:58:33",
"date2": "2023-09-27T21:58:34"
}
},
"sort": [
1695990594000,
1695994257000
]
}
]
}
}
},
{
"key": "74dc2fff7f4b5635b",
"doc_count": 2,
"top_hits#top_doc": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "index",
"_id": "BN0vhoAyg1",
"_score": null,
"_source": {
"itemDetails": {
"itemId": "3077",
"usecase": "xyz"
},
"metaData": {
"cId": "74dc2fff7f4b5635b",
"date1": "2023-09-29T23:36:38",
"date2": "2023-09-29T23:36:41",
}
},
"sort": [
1695990594000,
1695994257000
]
}
]
}
}
}
]
}
}}
Expected Response
now we are getting latest records using date1 and date2 in case of duplicates, we would also need latest record among unique cIds based on date1 and date2, record with "cId": "74dc2fff7f4b5635b" should come at the top as it contains latest date1 and date2 compare to other cIds.
You're almost there. You should simply aggregate by
cId
to get a unique record for each and then return the top hit sorted by descendingdate1
anddate2
.Note that the top level sort is only applied to hits, and has no effect on the aggregations. When you set size to 0 because you're only interested in aggregation results (i.e. unique cId), the top-level sort is of no value to you, but you can add it to the
top_hits
aggregation: