ES suggest, search all words in index item (not only the first word)

569 views Asked by At

Based on this answer (the first option) I've created this index:

    'settings' => array(
        'analysis' => array(
            'analyzer' => array(
                'stop_analyzer' => array( 
                    'type' => 'custom',
                    'tokenizer' => 'standard',
                    'filter' => array(
                        'lowercase',
                        'english_stop'
                    )
                )
            ),
            "filter" => array(
                "english_stop" => array(
                    "type" => "stop",
                    "stopwords" => "_english_"
                )
            )
        )
    ),
    'mappings' => array(
        'properties' => array(
            'texts' => array(
                'type' => 'completion',
                "analyzer" => "stop_analyzer",
                "search_analyzer" => "stop_analyzer", 
                'preserve_position_increments' => false
            ),
        ),
    )

This works perfectly when I begin the suggest search with or without a stop word. However, if, for example, my index contains "This is the text" and I search for "text", I don't get any results. What's the proper way to do this? I'd rather not use N-gram.

My search query:

'suggest' => array(
    'suggestion' => array(
        'prefix'=> 'text',
        'completion' => array(
            'field' => 'texts'
        )
    )
)
2

There are 2 answers

0
ESCoder On BEST ANSWER

Based on the user's comment, I am adding another answer that searches all the words using n-grams. The previous method works perfectly, but regex queries are quite expensive.

Adding a working example with index mapping, index data, search query, and search result

Index Mapping:

{
  "settings": {
    "analysis": {
      "filter": {
        "my_custom_stop_words_filter": {
          "type": "stop",
          "ignore_case": true,
          "stopwords": [
            "and",
            "is",
            "the"
          ]
        },
        "ngram_filter": {
          "type": "ngram",
          "min_gram": 4,
          "max_gram": 20
        }
      },
      "analyzer": {
        "ngram_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "ngram_filter",
            "my_custom_stop_words_filter"
          ]
        }
      }
    },
    "max_ngram_diff": 50
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "ngram_analyzer",
        "search_analyzer": "standard"
      }
    }
  }
}

Analyze API

POST /<index_name>/_analyze
{
  "analyzer" : "ngram_analyzer",
  "text" : "This is the text"
}

The following tokens are generated:

{
    "tokens": [
        {
            "token": "this",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "text",
            "start_offset": 12,
            "end_offset": 16,
            "type": "<ALPHANUM>",
            "position": 3
        }
    ]
}

Index Data:

{
  "title": [
    "This is the text"
  ]
}

Search Query:

{
    "query": {
        "match": {
           "title": "text"
        }
    }
}

Search Result:

"hits": [
            {
                "_index": "stof_29753971",
                "_type": "_doc",
                "_id": "1",
                "_score": 0.41978103,
                "_source": {
                    "title": [
                        "This is the text"
                    ]
                }
            }
        ]
0
ESCoder On

The best way to get a completion suggester to match in the middle of fields is to use an n-gram filter.

But since you don't want to use n-gram, you can try out the below approach:

You can run multiple suggesters in a single request: one based on the prefix, and another that uses a regex to match in the middle of the field.

Adding a working example with index mapping, data, search query, and search result

Index Mapping:

{
  "settings": {
    "analysis": {
      "filter": {
        "my_custom_stop_words_filter": {
          "type": "stop",
          "ignore_case": true,
          "stopwords": [ "and", "is", "the" ]
        }
      },
      "analyzer": {
        "autocomplete": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "my_custom_stop_words_filter"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "keyword"
      },
      "suggest": {
        "type": "completion",
        "analyzer": "autocomplete",
        "search_analyzer": "standard"
      }
    }
  }
}

Index Data:

{
  "suggest": [
    {
      "input": "This is the text"
    }
  ]
}
{
  "suggest": [
    {
      "input": "Software Manager"
    }
  ]
}

Search Query:

{
    "suggest": {
        "suggest-exact": {
            "prefix": "text",
            "completion": {
                "field": "suggest",
                "skip_duplicates": true
            }
        },
        "suggest-regex": {
            "regex": ".*text.*",
            "completion": {
                "field": "suggest",
                "skip_duplicates": true
            }
        }
    }
}

Search Result:

"suggest": {
    "suggest-exact": [
      {
        "text": "text",
        "offset": 0,
        "length": 4,
        "options": []
      }
    ],
    "suggest-regex": [
      {
        "text": ".*text.*",
        "offset": 0,
        "length": 8,
        "options": [
          {
            "text": "This is the text",
            "_index": "test",
            "_type": "_doc",
            "_id": "1",
            "_score": 1.0,
            "_source": {
              "suggest": [
                {
                  "input": "This is the text"
                }
              ]
            }
          }
        ]
      }
    ]
  }