ElasticSearch / Solr dis_max query skipping matching clauses

172 views Asked by At

I have an Elasticsearch index on one shard, for which I'm running a [dis_max][4] query that, given some user details

(First Name, Last Name, Date of Birth, Address, Phone, Username, Email etc.)

queries users from an index combining a set of criteria/matching clauses.

E.g.

  • match username ([fuzzy][1], boosted 2x)
  • should match first and last name ([bool][3] combining [match-term][2] query for FN and LN, boosted 1.1x)
  • must match FN, LN and DOB ([bool][3] combining [fuzzy][1] for FN and LN and [match-term][2] for DOB, boosted 3x)
  • match phone ([match-term][2] boosted 2x)

etc.

See query below (with obscured input data):

{
   "from":0,
   "size":100,
   "explain": true,
   "query":{
      "dis_max":{
         "tie_breaker":0.5,
         "queries":[
            {
               "fuzzy":{
                  "username":{
                     "value":"xxx",
                     "fuzziness":"AUTO",
                     "prefix_length":0,
                     "max_expansions":50,
                     "transpositions":false,
                     "boost":2.0
                  }
               }
            },
            {
               "term":{
                  "email":{
                     "value":"gmail.com",
                     "boost":1.0
                  }
               }
            },
            {
               "fuzzy":{
                  "email":{
                     "value":"xxx",
                     "fuzziness":"AUTO",
                     "prefix_length":0,
                     "max_expansions":50,
                     "transpositions":false,
                     "boost":1.0
                  }
               }
            },
            {
               "term":{
                  "password-hash":{
                     "value":"xxx",
                     "boost":1.0
                  }
               }
            },
            {
               "term":{
                  "currency-code":{
                     "value":"xxx",
                     "boost":0.5
                  }
               }
            },
            {
               "match":{
                  "first-name":{
                     "query":"xxx",
                     "operator":"OR",
                     "prefix_length":0,
                     "max_expansions":50,
                     "fuzzy_transpositions":true,
                     "lenient":false,
                     "zero_terms_query":"NONE",
                     "auto_generate_synonyms_phrase_query":true,
                     "boost":1.0
                  }
               }
            },
            {
               "match":{
                  "last-name":{
                     "query":"xxx",
                     "operator":"OR",
                     "prefix_length":0,
                     "max_expansions":50,
                     "fuzzy_transpositions":true,
                     "lenient":false,
                     "zero_terms_query":"NONE",
                     "auto_generate_synonyms_phrase_query":true,
                     "boost":1.0
                  }
               }
            },
            {
               "dis_max":{
                  "queries":[
                     {
                        "match":{
                           "first-name":{
                              "query":"xxx",
                              "operator":"OR",
                              "prefix_length":0,
                              "max_expansions":50,
                              "fuzzy_transpositions":true,
                              "lenient":false,
                              "zero_terms_query":"NONE",
                              "auto_generate_synonyms_phrase_query":true,
                              "boost":1.0
                           }
                        }
                     },
                     {
                        "match":{
                           "last-name":{
                              "query":"xxx",
                              "operator":"OR",
                              "prefix_length":0,
                              "max_expansions":50,
                              "fuzzy_transpositions":true,
                              "lenient":false,
                              "zero_terms_query":"NONE",
                              "auto_generate_synonyms_phrase_query":true,
                              "boost":1.0
                           }
                        }
                     }
                  ],
                  "boost":1.1
               }
            },
            {
               "match":{
                  "date-of-birth":{
                     "query":"xxxx-xx-xx",
                     "operator":"OR",
                     "prefix_length":0,
                     "max_expansions":50,
                     "fuzzy_transpositions":true,
                     "lenient":false,
                     "zero_terms_query":"NONE",
                     "auto_generate_synonyms_phrase_query":true,
                     "boost":1.0
                  }
               }
            },
            {
               "bool":{
                  "must":[
                     {
                    "fuzzy" : {
                        "first-name" : {
                        "value" : "xxx",
                        "fuzziness" : "AUTO",
                        "prefix_length" : 0,
                        "max_expansions" : 50,
                        "transpositions" : true,
                        "boost" : 1.5
                        }
                    }
                     },
                     {
                        "fuzzy" : {
                            "last-name" : {
                            "value" : "xxx",
                            "fuzziness" : "AUTO",
                            "prefix_length" : 0,
                            "max_expansions" : 50,
                            "transpositions" : true,
                            "boost" : 1.5
                            }
                        }
                     },
                     {
                        "match":{
                           "date-of-birth":{
                              "query":"xxxx-xx-xx",
                              "operator":"OR",
                              "prefix_length":0,
                              "max_expansions":50,
                              "fuzzy_transpositions":true,
                              "lenient":false,
                              "zero_terms_query":"NONE",
                              "auto_generate_synonyms_phrase_query":true,
                              "boost":1.0
                           }
                        }
                     }
                  ],
                  "adjust_pure_negative":true,
                  "boost":3.0
               }
            },
            {
               "match":{
                  "address":{
                     "query":"xxx",
                     "operator":"OR",
                     "analyzer":"whitespace",
                     "fuzziness":"AUTO",
                     "prefix_length":0,
                     "max_expansions":50,
                     "minimum_should_match":"60%",
                     "fuzzy_transpositions":true,
                     "lenient":false,
                     "zero_terms_query":"NONE",
                     "auto_generate_synonyms_phrase_query":true,
                     "boost":1.1
                  }
               }
            },
            {
               "match":{
                  "city":{
                     "query":"xxx",
                     "operator":"OR",
                     "prefix_length":0,
                     "max_expansions":50,
                     "fuzzy_transpositions":true,
                     "lenient":false,
                     "zero_terms_query":"NONE",
                     "auto_generate_synonyms_phrase_query":true,
                     "boost":1.0
                  }
               }
            },
            {
               "term":{
                  "postal-code":{
                     "value":"xxx ",
                     "boost":1.5
                  }
               }
            },
            {
               "boosting":{
                  "positive":{
                     "match":{
                        "address":{
                           "query":"xxx",
                           "operator":"OR",
                           "analyzer":"whitespace",
                           "fuzziness":"AUTO",
                           "prefix_length":0,
                           "max_expansions":50,
                           "minimum_should_match":"60%",
                           "fuzzy_transpositions":true,
                           "lenient":false,
                           "zero_terms_query":"NONE",
                           "auto_generate_synonyms_phrase_query":true,
                           "boost":1.1
                        }
                     }
                  },
                  "negative":{
                     "bool":{
                        "must_not":[
                           {
                              "term":{
                                 "postal-code":{
                                    "value":"xxx ",
                                    "boost":1.5
                                 }
                              }
                           }
                        ],
                        "adjust_pure_negative":true,
                        "boost":1.0
                     }
                  },
                  "negative_boost":0.7,
                  "boost":1.0
               }
            },
            {
               "term":{
                  "country-code":{
                     "value":"xxx",
                     "boost":1.0
                  }
               }
            },
            {
               "match":{
                  "phone":{
                     "query":"xxx",
                     "operator":"OR",
                     "prefix_length":0,
                     "max_expansions":50,
                     "fuzzy_transpositions":true,
                     "lenient":false,
                     "zero_terms_query":"NONE",
                     "auto_generate_synonyms_phrase_query":true,
                     "boost":2.0
                  }
               }
            },
            {
               "match":{
                  "security-question":{
                     "query":"xxx?",
                     "operator":"OR",
                     "prefix_length":0,
                     "max_expansions":50,
                     "fuzzy_transpositions":true,
                     "lenient":false,
                     "zero_terms_query":"NONE",
                     "auto_generate_synonyms_phrase_query":true,
                     "boost":1.0
                  }
               }
            },
            {
               "match":{
                  "security-answer":{
                     "query":"xxx ",
                     "operator":"OR",
                     "prefix_length":0,
                     "max_expansions":50,
                     "fuzzy_transpositions":true,
                     "lenient":false,
                     "zero_terms_query":"NONE",
                     "auto_generate_synonyms_phrase_query":true,
                     "boost":1.0
                  }
               }
            }
         ],
         "boost":1.0
      }
   }
}

All criteria contribute to the score, and I've set tie_breaker to 0.5 so that the score of a result is the maximum among all the clause scores, plus 0.5 times the sum of the remaining clause scores.

Performing the query with a few input combinations,

  • on some instances I get good scores that make for good matching,
  • on other instances, even though I expect the same or a high enough score, I get a very low score because some of the most relevant matching clauses seem to be skipped.

I have in fact debugged the query execution with "explain": true and in the explanation

  • the first result scores a high value with all query clauses,
  • the second one (which, judging from the data, should score high enough) gets only a low score, and some clauses don't appear in the explanation at all, as if they were excluded/ignored.

I'd like to understand why these clauses would be ignored/skipped in some cases. Is anybody aware whether this could be an issue in the way ES builds queries into Solr?

See the example result below (all data is obscured, but the two results would be quite close in the distinct fields).

{
    "took": 312,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 10000,
            "relation": "gte"
        },
        "max_score": 164.04868,
        "hits": [
            {
                "_shard": "[my-shard][0]",
                "_node": "acccSjFfQnOffqbiHV5nAg",
                "_index": "my-index",
                "_type": "_doc",
                "_id": "360086",
                "_score": 164.04868,
                "_source": {
                    "id": "360086",
                    "user-id": 389872,
                    "username": "xxx",
                    "email": "[email protected]",
                    "password-hash": "xxx",
                    "currency-code": "xxx",
                    "first-name": "xxx",
                    "last-name": "xxx",
                    "date-of-birth": "xxxx-xx-xx",
                    "address": "xxx",
                    "city": "N/A",
                    "postal-code": "xxx",
                    "country-code": "xxx",
                    "phone": "xxx",
                    "security-question": "xxx?",
                    "security-answer": "xxx "
                },
                "_explanation": {
                    "value": 164.04868,
                    "description": "max plus 0.5 times others of:",
                    "details": [
                        {
                            "value": 22.275639,
                            "description": "weight(username:xxx in 3223) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 0.88973737,
                            "description": "weight(email:gmail.com in 3223) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 12.44133,
                            "description": "weight(email:XXX in 3223) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 11.066888,
                            "description": "weight(password-hash:XXX in 3223) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 12.547058,
                            "description": "weight(first-name:XXX in 3223) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 12.619294,
                            "description": "weight(last-name:XXX in 3223) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 13.881224,
                            "description": "max of:",
                            "details": [
                                {
                                    "value": 13.801764,
                                    "description": "weight(first-name:xxx in 3223) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                },
                                {
                                    "value": 13.881224,
                                    "description": "weight(last-name:xxx in 3223) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                }
                            ]
                        },
                        {
                            "value": 1.0,
                            "description": "date-of-birth:[XXX TO XXX]",
                            "details": []
                        },
                        {
                            "value": 92.82605,
                            "description": "sum of:",
                            "details": [
                                {
                                    "value": 46.42945,
                                    "description": "sum of:",
                                    "details": [
                                        {
                                            "value": 46.42945,
                                            "description": "weight(first-name:XXX in 3223) [PerFieldSimilarity], result of:",
                                            "details": [...]
                                        }
                                    ]
                                },
                                {
                                    "value": 43.3966,
                                    "description": "sum of:",
                                    "details": [
                                        {
                                            "value": 43.3966,
                                            "description": "weight(last-name:XXX in 3223) [PerFieldSimilarity], result of:",
                                            "details": [...]
                                        }
                                    ]
                                },
                                {
                                    "value": 3.0,
                                    "description": "date-of-birth:[XXX TO XXX]^3.0",
                                    "details": []
                                }
                            ]
                        },
                        {
                            "value": 17.182709,
                            "description": "weight(postal-code:XXX  in 3223) [PerFieldSimilarity], result of:",
                            "details": []
                        },
                        {
                            "value": 0.6058445,
                            "description": "weight(country-code:XXX in 3223) [PerFieldSimilarity], result of:",
                            "details": []
                        },
                        {
                            "value": 24.692732,
                            "description": "weight(phone:XXX in 3223) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 4.2287235,
                            "description": "sum of:",
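                            "details": [
                                {
                                    "value": 1.5324217E-6,
                                    "description": "weight(security-question:XXX in 3223) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                },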
                                {
                                    "value": 0.7475863,
                                    "description": "weight(security-question:XXX in 3223) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                },
                                {
                                    "value": 1.7405679,
                                    "description": "weight(security-question:XXX in 3223) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                },
                                {
                                    "value": 1.7405679,
                                    "description": "weight(security-question:XXX in 3223) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                }
                            ]
                        },
                        {
                            "value": 9.014062,
                            "description": "weight(security-answer:XXX in 3223) [PerFieldSimilarity], result of:",
                            "details": [...]
                        }
                    ]
                }
            },
            {
                "_shard": "[my-shard][0]",
                "_node": "acccSjFfQnOffqbiHV5nAg",
                "_index": "my-index",
                "_type": "_doc",
                "_id": "359895",
                "_score": 40.7084,
                "_source": {
                    "id": "359895",
                    "user-id": 389681,
                    "username": "XXX",
                    "email": "[email protected]",
                    "password-hash": "XXX",
                    "currency-code": "XXX",
                    "first-name": "XXX",
                    "last-name": "XXX",
                    "date-of-birth": "1973-03-01",
                    "address": "XXX",
                    "city": "N/A",
                    "postal-code": "XXX ",
                    "country-code": "XXX",
                    "phone": "XXX",
                    "security-question": "XXX?",
                    "security-answer": "XXX"
                },
                "_explanation": {
                    "value": 40.7084,
                    "description": "max plus 0.5 times others of:",
                    "details": [
                        {
                            "value": 1.0,
                            "description": "date-of-birth:[XXX TO XXX]",
                            "details": []
                        },
                        {
                            "value": 17.182709,
                            "description": "weight(postal-code:XXX  in 3183) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 0.6058445,
                            "description": "weight(country-code:XXX in 3183) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 24.692732,
                            "description": "weight(phone:XXX in 3183) [PerFieldSimilarity], result of:",
                            "details": [...]
                        },
                        {
                            "value": 4.2287235,
                            "description": "sum of:",
                            "details": [
                                {
                                    "value": 1.5324217E-6,
                                    "description": "weight(security-question:XXX in 3183) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                },
                                {
                                    "value": 0.7475863,
                                    "description": "weight(security-question:XXX in 3183) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                },
                                {
                                    "value": 1.7405679,
                                    "description": "weight(security-question:XXX in 3183) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                },
                                {
                                    "value": 1.7405679,
                                    "description": "weight(security-question:XXX in 3183) [PerFieldSimilarity], result of:",
                                    "details": [...]
                                }
                            ]
                        },
                        {
                            "value": 9.014062,
                            "description": "weight(security-answer:XXX in 3183) [PerFieldSimilarity], result of:",
                            "details": [...]
                        }
                    ]
                }
            }
        ]
    }
}

0

There are 0 answers