A set of analyzers aimed at analyzing specific language text. The
following types are supported:
arabic
,
armenian
,
basque
,
bengali
,
brazilian
,
bulgarian
,
catalan
,
cjk
,
czech
,
danish
,
dutch
,
english
,
finnish
,
french
,
galician
,
german
,
greek
,
hindi
,
hungarian
,
indonesian
,
irish
,
italian
,
latvian
,
lithuanian
,
norwegian
,
persian
,
portuguese
,
romanian
,
russian
,
sorani
,
spanish
,
swedish
,
turkish
,
thai
.
All analyzers support setting custom stopwords
either internally in
the config, or by using an external stopwords file by setting
stopwords_path
. Check Stop Analyzer for
more details.
The stem_exclusion
parameter allows you to specify an array
of lowercase words that should not be stemmed. Internally, this
functionality is implemented by adding the
keyword_marker
token filter
with the keywords
set to the value of the stem_exclusion
parameter.
The following analyzers support setting custom stem_exclusion
list:
arabic
, armenian
, basque
, bengali
, bulgarian
, catalan
, czech
,
dutch
, english
, finnish
, french
, galician
,
german
, hindi
, hungarian
, indonesian
, irish
, italian
, latvian
,
lithuanian
, norwegian
, portuguese
, romanian
, russian
, sorani
,
spanish
, swedish
, turkish
.
The built-in language analyzers can be reimplemented as custom
analyzers
(as described below) in order to customize their behaviour.
If you do not intend to exclude words from being stemmed (the
equivalent of the stem_exclusion
parameter above), then you should remove
the keyword_marker
token filter from the custom analyzer configuration.
The arabic
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /arabic_example { "settings": { "analysis": { "filter": { "arabic_stop": { "type": "stop", "stopwords": "_arabic_" }, "arabic_keywords": { "type": "keyword_marker", "keywords": ["مثال"] }, "arabic_stemmer": { "type": "stemmer", "language": "arabic" } }, "analyzer": { "rebuilt_arabic": { "tokenizer": "standard", "filter": [ "lowercase", "decimal_digit", "arabic_stop", "arabic_normalization", "arabic_keywords", "arabic_stemmer" ] } } } } }
The armenian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /armenian_example { "settings": { "analysis": { "filter": { "armenian_stop": { "type": "stop", "stopwords": "_armenian_" }, "armenian_keywords": { "type": "keyword_marker", "keywords": ["օրինակ"] }, "armenian_stemmer": { "type": "stemmer", "language": "armenian" } }, "analyzer": { "rebuilt_armenian": { "tokenizer": "standard", "filter": [ "lowercase", "armenian_stop", "armenian_keywords", "armenian_stemmer" ] } } } } }
The basque
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /basque_example { "settings": { "analysis": { "filter": { "basque_stop": { "type": "stop", "stopwords": "_basque_" }, "basque_keywords": { "type": "keyword_marker", "keywords": ["Adibidez"] }, "basque_stemmer": { "type": "stemmer", "language": "basque" } }, "analyzer": { "rebuilt_basque": { "tokenizer": "standard", "filter": [ "lowercase", "basque_stop", "basque_keywords", "basque_stemmer" ] } } } } }
The bengali
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /bengali_example { "settings": { "analysis": { "filter": { "bengali_stop": { "type": "stop", "stopwords": "_bengali_" }, "bengali_keywords": { "type": "keyword_marker", "keywords": ["উদাহরণ"] }, "bengali_stemmer": { "type": "stemmer", "language": "bengali" } }, "analyzer": { "rebuilt_bengali": { "tokenizer": "standard", "filter": [ "lowercase", "decimal_digit", "bengali_keywords", "indic_normalization", "bengali_normalization", "bengali_stop", "bengali_stemmer" ] } } } } }
The brazilian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /brazilian_example { "settings": { "analysis": { "filter": { "brazilian_stop": { "type": "stop", "stopwords": "_brazilian_" }, "brazilian_keywords": { "type": "keyword_marker", "keywords": ["exemplo"] }, "brazilian_stemmer": { "type": "stemmer", "language": "brazilian" } }, "analyzer": { "rebuilt_brazilian": { "tokenizer": "standard", "filter": [ "lowercase", "brazilian_stop", "brazilian_keywords", "brazilian_stemmer" ] } } } } }
The bulgarian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /bulgarian_example { "settings": { "analysis": { "filter": { "bulgarian_stop": { "type": "stop", "stopwords": "_bulgarian_" }, "bulgarian_keywords": { "type": "keyword_marker", "keywords": ["пример"] }, "bulgarian_stemmer": { "type": "stemmer", "language": "bulgarian" } }, "analyzer": { "rebuilt_bulgarian": { "tokenizer": "standard", "filter": [ "lowercase", "bulgarian_stop", "bulgarian_keywords", "bulgarian_stemmer" ] } } } } }
The catalan
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /catalan_example { "settings": { "analysis": { "filter": { "catalan_elision": { "type": "elision", "articles": [ "d", "l", "m", "n", "s", "t"], "articles_case": true }, "catalan_stop": { "type": "stop", "stopwords": "_catalan_" }, "catalan_keywords": { "type": "keyword_marker", "keywords": ["exemple"] }, "catalan_stemmer": { "type": "stemmer", "language": "catalan" } }, "analyzer": { "rebuilt_catalan": { "tokenizer": "standard", "filter": [ "catalan_elision", "lowercase", "catalan_stop", "catalan_keywords", "catalan_stemmer" ] } } } } }
You may find that icu_analyzer
in the ICU analysis plugin works better
for CJK text than the cjk
analyzer. Experiment with your text and queries.
The cjk
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /cjk_example { "settings": { "analysis": { "filter": { "english_stop": { "type": "stop", "stopwords": [ "a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "www" ] } }, "analyzer": { "rebuilt_cjk": { "tokenizer": "standard", "filter": [ "cjk_width", "lowercase", "cjk_bigram", "english_stop" ] } } } } }
The czech
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /czech_example { "settings": { "analysis": { "filter": { "czech_stop": { "type": "stop", "stopwords": "_czech_" }, "czech_keywords": { "type": "keyword_marker", "keywords": ["příklad"] }, "czech_stemmer": { "type": "stemmer", "language": "czech" } }, "analyzer": { "rebuilt_czech": { "tokenizer": "standard", "filter": [ "lowercase", "czech_stop", "czech_keywords", "czech_stemmer" ] } } } } }
The danish
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /danish_example { "settings": { "analysis": { "filter": { "danish_stop": { "type": "stop", "stopwords": "_danish_" }, "danish_keywords": { "type": "keyword_marker", "keywords": ["eksempel"] }, "danish_stemmer": { "type": "stemmer", "language": "danish" } }, "analyzer": { "rebuilt_danish": { "tokenizer": "standard", "filter": [ "lowercase", "danish_stop", "danish_keywords", "danish_stemmer" ] } } } } }
The dutch
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /dutch_example { "settings": { "analysis": { "filter": { "dutch_stop": { "type": "stop", "stopwords": "_dutch_" }, "dutch_keywords": { "type": "keyword_marker", "keywords": ["voorbeeld"] }, "dutch_stemmer": { "type": "stemmer", "language": "dutch" }, "dutch_override": { "type": "stemmer_override", "rules": [ "fiets=>fiets", "bromfiets=>bromfiets", "ei=>eier", "kind=>kinder" ] } }, "analyzer": { "rebuilt_dutch": { "tokenizer": "standard", "filter": [ "lowercase", "dutch_stop", "dutch_keywords", "dutch_override", "dutch_stemmer" ] } } } } }
The english
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /english_example { "settings": { "analysis": { "filter": { "english_stop": { "type": "stop", "stopwords": "_english_" }, "english_keywords": { "type": "keyword_marker", "keywords": ["example"] }, "english_stemmer": { "type": "stemmer", "language": "english" }, "english_possessive_stemmer": { "type": "stemmer", "language": "possessive_english" } }, "analyzer": { "rebuilt_english": { "tokenizer": "standard", "filter": [ "english_possessive_stemmer", "lowercase", "english_stop", "english_keywords", "english_stemmer" ] } } } } }
The finnish
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /finnish_example { "settings": { "analysis": { "filter": { "finnish_stop": { "type": "stop", "stopwords": "_finnish_" }, "finnish_keywords": { "type": "keyword_marker", "keywords": ["esimerkki"] }, "finnish_stemmer": { "type": "stemmer", "language": "finnish" } }, "analyzer": { "rebuilt_finnish": { "tokenizer": "standard", "filter": [ "lowercase", "finnish_stop", "finnish_keywords", "finnish_stemmer" ] } } } } }
The french
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /french_example { "settings": { "analysis": { "filter": { "french_elision": { "type": "elision", "articles_case": true, "articles": [ "l", "m", "t", "qu", "n", "s", "j", "d", "c", "jusqu", "quoiqu", "lorsqu", "puisqu" ] }, "french_stop": { "type": "stop", "stopwords": "_french_" }, "french_keywords": { "type": "keyword_marker", "keywords": ["Exemple"] }, "french_stemmer": { "type": "stemmer", "language": "light_french" } }, "analyzer": { "rebuilt_french": { "tokenizer": "standard", "filter": [ "french_elision", "lowercase", "french_stop", "french_keywords", "french_stemmer" ] } } } } }
The galician
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /galician_example { "settings": { "analysis": { "filter": { "galician_stop": { "type": "stop", "stopwords": "_galician_" }, "galician_keywords": { "type": "keyword_marker", "keywords": ["exemplo"] }, "galician_stemmer": { "type": "stemmer", "language": "galician" } }, "analyzer": { "rebuilt_galician": { "tokenizer": "standard", "filter": [ "lowercase", "galician_stop", "galician_keywords", "galician_stemmer" ] } } } } }
The german
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /german_example { "settings": { "analysis": { "filter": { "german_stop": { "type": "stop", "stopwords": "_german_" }, "german_keywords": { "type": "keyword_marker", "keywords": ["Beispiel"] }, "german_stemmer": { "type": "stemmer", "language": "light_german" } }, "analyzer": { "rebuilt_german": { "tokenizer": "standard", "filter": [ "lowercase", "german_stop", "german_keywords", "german_normalization", "german_stemmer" ] } } } } }
The greek
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /greek_example { "settings": { "analysis": { "filter": { "greek_stop": { "type": "stop", "stopwords": "_greek_" }, "greek_lowercase": { "type": "lowercase", "language": "greek" }, "greek_keywords": { "type": "keyword_marker", "keywords": ["παράδειγμα"] }, "greek_stemmer": { "type": "stemmer", "language": "greek" } }, "analyzer": { "rebuilt_greek": { "tokenizer": "standard", "filter": [ "greek_lowercase", "greek_stop", "greek_keywords", "greek_stemmer" ] } } } } }
The hindi
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /hindi_example { "settings": { "analysis": { "filter": { "hindi_stop": { "type": "stop", "stopwords": "_hindi_" }, "hindi_keywords": { "type": "keyword_marker", "keywords": ["उदाहरण"] }, "hindi_stemmer": { "type": "stemmer", "language": "hindi" } }, "analyzer": { "rebuilt_hindi": { "tokenizer": "standard", "filter": [ "lowercase", "decimal_digit", "hindi_keywords", "indic_normalization", "hindi_normalization", "hindi_stop", "hindi_stemmer" ] } } } } }
The hungarian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /hungarian_example { "settings": { "analysis": { "filter": { "hungarian_stop": { "type": "stop", "stopwords": "_hungarian_" }, "hungarian_keywords": { "type": "keyword_marker", "keywords": ["példa"] }, "hungarian_stemmer": { "type": "stemmer", "language": "hungarian" } }, "analyzer": { "rebuilt_hungarian": { "tokenizer": "standard", "filter": [ "lowercase", "hungarian_stop", "hungarian_keywords", "hungarian_stemmer" ] } } } } }
The indonesian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /indonesian_example { "settings": { "analysis": { "filter": { "indonesian_stop": { "type": "stop", "stopwords": "_indonesian_" }, "indonesian_keywords": { "type": "keyword_marker", "keywords": ["contoh"] }, "indonesian_stemmer": { "type": "stemmer", "language": "indonesian" } }, "analyzer": { "rebuilt_indonesian": { "tokenizer": "standard", "filter": [ "lowercase", "indonesian_stop", "indonesian_keywords", "indonesian_stemmer" ] } } } } }
The irish
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /irish_example { "settings": { "analysis": { "filter": { "irish_hyphenation": { "type": "stop", "stopwords": [ "h", "n", "t" ], "ignore_case": true }, "irish_elision": { "type": "elision", "articles": [ "d", "m", "b" ], "articles_case": true }, "irish_stop": { "type": "stop", "stopwords": "_irish_" }, "irish_lowercase": { "type": "lowercase", "language": "irish" }, "irish_keywords": { "type": "keyword_marker", "keywords": ["sampla"] }, "irish_stemmer": { "type": "stemmer", "language": "irish" } }, "analyzer": { "rebuilt_irish": { "tokenizer": "standard", "filter": [ "irish_hyphenation", "irish_elision", "irish_lowercase", "irish_stop", "irish_keywords", "irish_stemmer" ] } } } } }
The italian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /italian_example { "settings": { "analysis": { "filter": { "italian_elision": { "type": "elision", "articles": [ "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell", "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d" ], "articles_case": true }, "italian_stop": { "type": "stop", "stopwords": "_italian_" }, "italian_keywords": { "type": "keyword_marker", "keywords": ["esempio"] }, "italian_stemmer": { "type": "stemmer", "language": "light_italian" } }, "analyzer": { "rebuilt_italian": { "tokenizer": "standard", "filter": [ "italian_elision", "lowercase", "italian_stop", "italian_keywords", "italian_stemmer" ] } } } } }
The latvian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /latvian_example { "settings": { "analysis": { "filter": { "latvian_stop": { "type": "stop", "stopwords": "_latvian_" }, "latvian_keywords": { "type": "keyword_marker", "keywords": ["piemērs"] }, "latvian_stemmer": { "type": "stemmer", "language": "latvian" } }, "analyzer": { "rebuilt_latvian": { "tokenizer": "standard", "filter": [ "lowercase", "latvian_stop", "latvian_keywords", "latvian_stemmer" ] } } } } }
The lithuanian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /lithuanian_example { "settings": { "analysis": { "filter": { "lithuanian_stop": { "type": "stop", "stopwords": "_lithuanian_" }, "lithuanian_keywords": { "type": "keyword_marker", "keywords": ["pavyzdys"] }, "lithuanian_stemmer": { "type": "stemmer", "language": "lithuanian" } }, "analyzer": { "rebuilt_lithuanian": { "tokenizer": "standard", "filter": [ "lowercase", "lithuanian_stop", "lithuanian_keywords", "lithuanian_stemmer" ] } } } } }
The norwegian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /norwegian_example { "settings": { "analysis": { "filter": { "norwegian_stop": { "type": "stop", "stopwords": "_norwegian_" }, "norwegian_keywords": { "type": "keyword_marker", "keywords": ["eksempel"] }, "norwegian_stemmer": { "type": "stemmer", "language": "norwegian" } }, "analyzer": { "rebuilt_norwegian": { "tokenizer": "standard", "filter": [ "lowercase", "norwegian_stop", "norwegian_keywords", "norwegian_stemmer" ] } } } } }
The persian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /persian_example { "settings": { "analysis": { "char_filter": { "zero_width_spaces": { "type": "mapping", "mappings": [ "\\u200C=>\\u0020"] } }, "filter": { "persian_stop": { "type": "stop", "stopwords": "_persian_" } }, "analyzer": { "rebuilt_persian": { "tokenizer": "standard", "char_filter": [ "zero_width_spaces" ], "filter": [ "lowercase", "decimal_digit", "arabic_normalization", "persian_normalization", "persian_stop" ] } } } } }
The portuguese
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /portuguese_example { "settings": { "analysis": { "filter": { "portuguese_stop": { "type": "stop", "stopwords": "_portuguese_" }, "portuguese_keywords": { "type": "keyword_marker", "keywords": ["exemplo"] }, "portuguese_stemmer": { "type": "stemmer", "language": "light_portuguese" } }, "analyzer": { "rebuilt_portuguese": { "tokenizer": "standard", "filter": [ "lowercase", "portuguese_stop", "portuguese_keywords", "portuguese_stemmer" ] } } } } }
The romanian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /romanian_example { "settings": { "analysis": { "filter": { "romanian_stop": { "type": "stop", "stopwords": "_romanian_" }, "romanian_keywords": { "type": "keyword_marker", "keywords": ["exemplu"] }, "romanian_stemmer": { "type": "stemmer", "language": "romanian" } }, "analyzer": { "rebuilt_romanian": { "tokenizer": "standard", "filter": [ "lowercase", "romanian_stop", "romanian_keywords", "romanian_stemmer" ] } } } } }
The russian
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /russian_example { "settings": { "analysis": { "filter": { "russian_stop": { "type": "stop", "stopwords": "_russian_" }, "russian_keywords": { "type": "keyword_marker", "keywords": ["пример"] }, "russian_stemmer": { "type": "stemmer", "language": "russian" } }, "analyzer": { "rebuilt_russian": { "tokenizer": "standard", "filter": [ "lowercase", "russian_stop", "russian_keywords", "russian_stemmer" ] } } } } }
The sorani
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /sorani_example { "settings": { "analysis": { "filter": { "sorani_stop": { "type": "stop", "stopwords": "_sorani_" }, "sorani_keywords": { "type": "keyword_marker", "keywords": ["mînak"] }, "sorani_stemmer": { "type": "stemmer", "language": "sorani" } }, "analyzer": { "rebuilt_sorani": { "tokenizer": "standard", "filter": [ "sorani_normalization", "lowercase", "decimal_digit", "sorani_stop", "sorani_keywords", "sorani_stemmer" ] } } } } }
The spanish
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /spanish_example { "settings": { "analysis": { "filter": { "spanish_stop": { "type": "stop", "stopwords": "_spanish_" }, "spanish_keywords": { "type": "keyword_marker", "keywords": ["ejemplo"] }, "spanish_stemmer": { "type": "stemmer", "language": "light_spanish" } }, "analyzer": { "rebuilt_spanish": { "tokenizer": "standard", "filter": [ "lowercase", "spanish_stop", "spanish_keywords", "spanish_stemmer" ] } } } } }
The swedish
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /swedish_example { "settings": { "analysis": { "filter": { "swedish_stop": { "type": "stop", "stopwords": "_swedish_" }, "swedish_keywords": { "type": "keyword_marker", "keywords": ["exempel"] }, "swedish_stemmer": { "type": "stemmer", "language": "swedish" } }, "analyzer": { "rebuilt_swedish": { "tokenizer": "standard", "filter": [ "lowercase", "swedish_stop", "swedish_keywords", "swedish_stemmer" ] } } } } }
The turkish
analyzer could be reimplemented as a custom
analyzer as follows:
PUT /turkish_example { "settings": { "analysis": { "filter": { "turkish_stop": { "type": "stop", "stopwords": "_turkish_" }, "turkish_lowercase": { "type": "lowercase", "language": "turkish" }, "turkish_keywords": { "type": "keyword_marker", "keywords": ["örnek"] }, "turkish_stemmer": { "type": "stemmer", "language": "turkish" } }, "analyzer": { "rebuilt_turkish": { "tokenizer": "standard", "filter": [ "apostrophe", "turkish_lowercase", "turkish_stop", "turkish_keywords", "turkish_stemmer" ] } } } } }
The thai
analyzer could be reimplemented as a custom
analyzer as follows: