how to sort the data before applying terms aggregation in elastic search

70 views Asked by At

we want to sort the data using few fileds before applying terms aggregation, tried using terms and sort sub aggregation using which sort is working however terms aggregation is giving duplicate records.

elastic search version: 8.7.1

sample elatic search doc :

{
  "itemDetails": {
    "itemId": "3076",
    "itemId2": "1003918865",
    "usecase": "habc",
    "usecaseId": "xyz"
  },
  "metaData": {
    "cId": "96ff54507c2d018e5c767785c705a5b2",
    "date1": "2023-09-29T12:29:54",
    "date2": "2023-09-29T12:30:09"
  }
other properties....
}

In the above doc first i would like to sort the records using date1 and date2 then need to get the distinct records using "cId".

using below query able to sort the data based on date1 and date2 however we are getting records with duplicate cId:

Query1:

POST /index/_search?typed_keys=true
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "terms": {
            "itemDetails.itemId": [
              "3076"
            ]
          }
        },
        {
          "terms": {
            "itemDetails.usecase": [
              "habc"
            ]
          }
        },
        {
          "range": {
            "metaData.date1": {
              "lte": "2023-09-30T19:55:54.611Z",
              "gte": "2023-09-27T19:55:54.611Z"
            }
          }
        }
      ]
    }
  },
  "aggs": {
    "sortby_date1": {
      "terms": {
        "field": "metaData.date1",
        "order": {
          "_key": "desc"
        },
        "size": 6
      },
      "aggs": {
        "sortby_date2": {
          "terms": {
            "field": "metaData.date2",
            "order": {
              "_key": "desc"
            }
          },
          "aggs": {
            "groupby_cId": {
              "terms": {
                "field": "metaData.cId"
              },
              "aggs": {
                "top_doc": {
                  "top_hits": {
                    "size": 1,
                    "_source": {
                      "includes": [
                        "itemDetails.itemId",
                        "itemDetails.usecase",
                        "metaData.cId",
                        "metaData.date1"
                      
                      ]
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

Query2 : Terms aggregation not giving unique records when used with sort.

{
  "query": {
    "bool": {
      "must": [
        {
          "terms": {
            "itemDetails.itemId": [
              "3077"
            ]
          }
        },
        {
          "terms": {
            "itemDetails.usecase": [
              "xyz"
            ]
          }
        },
        {
          "range": {
            "metaData.date1": {
              "lte": "2023-09-30T19:55:54.611Z",
              "gte": "2023-09-27T19:55:54.611Z"
            }
          }
        }
      ]
    }
  },
  "sort": [
    {
      "metaData.date1": {
        "order": "desc"
      }
    },
    {
      "metaData.date2": {
        "order": "desc"
      }
    }
  ],
  "aggs": {
    "distinct_cIds": {
      "terms": {
        "field": "metaData.cId"
      },
      "aggs": {
        "top_doc": {
          "top_hits": {
            "size": 1,
            "_source": {
              "includes": [
                        "itemDetails.itemId",
                        "itemDetails.usecase",
                        "metaData.cId",
                        "metaData.date1"
              ]
            }
          }
        }
      }
    }
  }
}

current response using the query mentioned in the answer:

{"aggregations": {
    "sterms#distinct_cIds": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "96ff54507c2d01",
          "doc_count": 101,
          "top_hits#top_doc": {
            "hits": {
              "total": {
                "value": 101,
                "relation": "eq"
              },
              "max_score": null,
              "hits": [
                {
                  "_index": "index",
                  "_id": "BN0vhoAyg",
                  "_score": null,
                  "_source": {
                    "itemDetails": {
                      "itemId": "3077",
                      "usecase": "xyz"
                    },
                    "metaData": {
                      "cId": "96ff54507c2d01",
                      "date1": "2023-09-29T12:29:54",
                      "date2": "2023-09-29T13:30:57"
                    }
                  },
                  "sort": [
                    1695990594000,
                    1695994257000
                  ]
                }
              ]
            }
          }
        },
        {
          "key": "846bc8708b746b23674",
          "doc_count": 2,
          "top_hits#top_doc": {
            "hits": {
              "total": {
                "value": 2,
                "relation": "eq"
              },
              "max_score": null,
              "hits": [
                {
                  "_index": "index",
                  "_id": "BN0vhoAyg1",
                  "_score": null,
                  "_source": {
                    "itemDetails": {
                      "itemId": "3077",
                      "usecase": "xyz"
                    },
                    "metaData": {
                      "cId": "846bc8708b746b23674",
                      "date1": "2023-09-29T03:34:08",
                      "date2": "2023-09-29T13:30:57"
                    }
                  },
                  "sort": [
                    1695990594000,
                    1695994257000
                  ]
                }
              ]
            }
          }
        },
        {
          "key": "27053a17a1919f",
          "doc_count": 2,
          "top_hits#top_doc": {
            "hits": {
              "total": {
                "value": 2,
                "relation": "eq"
              },
              "max_score": null,
              "hits": [
                {
                  "_index": "index",
                  "_id": "BN0vhoAyg1",
                  "_score": null,
                  "_source": {
                    "itemDetails": {
                      "itemId": "3077",
                      "usecase": "xyz"
                    },
                    "metaData": {
                      "cId": "27053a17a1919f",
                      "date1": "2023-09-27T21:58:33",
                      "date2": "2023-09-27T21:58:34"
                    }
                  },
                  "sort": [
                    1695990594000,
                    1695994257000
                  ]
                }
              ]
            }
          }
        },
        {
          "key": "74dc2fff7f4b5635b",
          "doc_count": 2,
          "top_hits#top_doc": {
            "hits": {
              "total": {
                "value": 2,
                "relation": "eq"
              },
              "max_score": null,
              "hits": [
                {
                  "_index": "index",
                  "_id": "BN0vhoAyg1",
                  "_score": null,
                  "_source": {
                    "itemDetails": {
                      "itemId": "3077",
                      "usecase": "xyz"
                    },
                    "metaData": {
                      "cId": "74dc2fff7f4b5635b",
                      "date1": "2023-09-29T23:36:38",
                      "date2": "2023-09-29T23:36:41",
                    }
                  },
                  "sort": [
                    1695990594000,
                    1695994257000
                  ]
                }
              ]
            }
          }
        }
      ]
    }
  }}

Expected Response

now we are getting latest records using date1 and date2 in case of duplicates, we would also need latest record among unique cIds based on date1 and date2, record with "cId": "74dc2fff7f4b5635b" should come at the top as it contains latest date1 and date2 compare to other cIds.

1

There are 1 answers

13
Val On BEST ANSWER

You're almost there. You should simply aggregate by cId to get a unique record for each and then return the top hit sorted by descending date1 and date2.

Note that the top level sort is only applied to hits, and has no effect on the aggregations. When you set size to 0 because you're only interested in aggregation results (i.e. unique cId), the top-level sort is of no value to you, but you can add it to the top_hits aggregation:

POST /index/_search?typed_keys=true
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "terms": {
            "itemDetails.itemId": [
              "3076"
            ]
          }
        },
        {
          "terms": {
            "itemDetails.usecase": [
              "habc"
            ]
          }
        },
        {
          "range": {
            "metaData.date1": {
              "lte": "2023-09-30T19:55:54.611Z",
              "gte": "2023-09-27T19:55:54.611Z"
            }
          }
        }
      ]
    }
  },
  "aggs": {
    "distinct_cIds": {
      "terms": {
        "field": "metaData.cId"
        "order": { "max_date1": "desc" }
      },
      "aggs": {
        "max_date1": { 
          "max": { "field": "metaData.date1" } 
        },
        "top_doc": {
          "top_hits": {
            "size": 1,
            "sort": [
               {"metaData.date1": "desc"},           <---- add this sort
               {"metaData.date2": "desc"}            <---- add this sort
            ],
            "_source": {
              "includes": [
                        "itemDetails.itemId",
                        "itemDetails.usecase",
                        "metaData.cId",
                        "metaData.date1"
              ]
            }
          }
        }
      }
    }
  }
}