Skip to content
Merged
16 changes: 16 additions & 0 deletions docs/changelog/136915.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
pr: 136915
summary: Implement native synthetic source for normalized keywords
area: Mapping
type: breaking
issues: []
breaking:
title: Implement native synthetic source for normalized keywords
area: Mapping
details: "This adds a new mapping parameter `normalizer_skip_store_original_value` to keyword fields. When this\
\ parameter is set, and synthetic_source is enabled, keyword fields with configured normalizers will not store the\
\ original non-normalized value in _ignored_source, and will instead use the normalized value to reconstruct the\
\ source. This parameter is enabled by default for the built-in `lowercase` normalizer, and is disabled by default for\
\ other custom normalizers."
impact: "Keyword fields using the `lowercase` normalizer will return the normalized value in the source when synthetic\
\ source is enabled."
notable: false
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,270 @@ keyword with normalizer:
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]

---
keyword with non-lowercase normalizer:
- do:
indices.create:
index: test-keyword-with-normalizer
body:
settings:
analysis:
normalizer:
test_normalizer:
type: custom
filter:
- asciifolding
- uppercase
index:
mapping.source.mode: synthetic
mappings:
properties:
keyword:
type: keyword
normalizer: test_normalizer
keyword_with_skip_store:
type: keyword
normalizer: test_normalizer
normalizer_skip_store_original_value: true
keyword_with_ignore_above:
type: keyword
normalizer: test_normalizer
ignore_above: 10
keyword_without_doc_values:
type: keyword
normalizer: test_normalizer
doc_values: false

- do:
index:
index: test-keyword-with-normalizer
id: 1
body:
keyword: "the Quìck Brøwn Fox jumps over the låzy Dog"
keyword_with_skip_store: "the Quìck Brøwn Fox jumps over the låzy Dog"
keyword_with_ignore_above: "the Quìck Brøwn Fox jumps over the låzy Dog"
keyword_without_doc_values: "the Quìck Brøwn Fox jumps over the låzy Dog"

- do:
index:
index: test-keyword-with-normalizer
id: 2
body:
keyword: "The five BØXING wìzårds jümp Quickly"
keyword_with_skip_store: "The five BØXING wìzårds jümp Quickly"
keyword_with_ignore_above: "The five BØXING wìzårds jümp Quickly"
keyword_without_doc_values: "The five BØXING wìzårds jümp Quickly"

- do:
index:
index: test-keyword-with-normalizer
id: 3
body:
keyword: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
keyword_with_skip_store: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
keyword_with_ignore_above: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]

- do:
mget:
index: test-keyword-with-normalizer
body:
ids: [ 1, 2, 3 ]
- match: { docs.0._index: "test-keyword-with-normalizer" }
- match: { docs.0._id: "1" }
- match:
docs.0._source:
keyword: "the Quìck Brøwn Fox jumps over the låzy Dog"
keyword_with_skip_store: "THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG"
keyword_with_ignore_above: "the Quìck Brøwn Fox jumps over the låzy Dog"
keyword_without_doc_values: "the Quìck Brøwn Fox jumps over the låzy Dog"

- match: { docs.1._index: "test-keyword-with-normalizer" }
- match: { docs.1._id: "2" }
- match:
docs.1._source:
keyword: "The five BØXING wìzårds jümp Quickly"
keyword_with_skip_store: "THE FIVE BOXING WIZARDS JUMP QUICKLY"
keyword_with_ignore_above: "The five BØXING wìzårds jümp Quickly"
keyword_without_doc_values: "The five BØXING wìzårds jümp Quickly"

- match: { docs.2._index: "test-keyword-with-normalizer" }
- match: { docs.2._id: "3" }
- match:
docs.2._source:
keyword: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
keyword_with_skip_store: [ "DO OR DO NOT, THERE IS NO TRY", "MAY THE FORCE BE WITH YOU!" ]
keyword_with_ignore_above: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]

---
keyword with normalizer and skip store original value:
- do:
indices.create:
index: test-keyword-with-normalizer
body:
settings:
analysis:
normalizer:
lowercase:
type: custom
filter:
- lowercase
index:
mapping.source.mode: synthetic
mappings:
properties:
keyword:
type: keyword
normalizer: lowercase
normalizer_skip_store_original_value: true
keyword_with_ignore_above:
type: keyword
normalizer: lowercase
normalizer_skip_store_original_value: true
ignore_above: 10
keyword_without_doc_values:
type: keyword
normalizer: lowercase
normalizer_skip_store_original_value: true
doc_values: false

- do:
index:
index: test-keyword-with-normalizer
id: 1
body:
keyword: "the quick brown fox jumps over the lazy dog"
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"

- do:
index:
index: test-keyword-with-normalizer
id: 2
body:
keyword: "the five boxing wizards jump quickly"
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
keyword_without_doc_values: "The five BOXING wizards jump Quickly"

- do:
index:
index: test-keyword-with-normalizer
id: 3
body:
keyword: [ "may the force be with you!", "do or do not, there is no try" ]
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]

- do:
mget:
index: test-keyword-with-normalizer
body:
ids: [ 1, 2, 3 ]
- match: { docs.0._index: "test-keyword-with-normalizer" }
- match: { docs.0._id: "1" }
- match:
docs.0._source:
keyword: "the quick brown fox jumps over the lazy dog"
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"

- match: { docs.1._index: "test-keyword-with-normalizer" }
- match: { docs.1._id: "2" }
- match:
docs.1._source:
keyword: "the five boxing wizards jump quickly"
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
keyword_without_doc_values: "The five BOXING wizards jump Quickly"

- match: { docs.2._index: "test-keyword-with-normalizer" }
- match: { docs.2._id: "3" }
- match:
docs.2._source:
keyword: [ "do or do not, there is no try", "may the force be with you!" ]
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]

---

keyword with built-in normalizer:
- do:
indices.create:
index: test-keyword-with-normalizer
body:
settings:
index:
mapping.source.mode: synthetic
mappings:
properties:
keyword:
type: keyword
normalizer: lowercase
keyword_with_ignore_above:
type: keyword
normalizer: lowercase
ignore_above: 10
keyword_without_doc_values:
type: keyword
normalizer: lowercase
doc_values: false

- do:
index:
index: test-keyword-with-normalizer
id: 1
body:
keyword: "the quick brown fox jumps over the lazy dog"
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"

- do:
index:
index: test-keyword-with-normalizer
id: 2
body:
keyword: "the five boxing wizards jump quickly"
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
keyword_without_doc_values: "The five BOXING wizards jump Quickly"

- do:
index:
index: test-keyword-with-normalizer
id: 3
body:
keyword: [ "may the force be with you!", "do or do not, there is no try" ]
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]

- do:
mget:
index: test-keyword-with-normalizer
body:
ids: [ 1, 2, 3 ]
- match: { docs.0._index: "test-keyword-with-normalizer" }
- match: { docs.0._id: "1" }
- match:
docs.0._source:
keyword: "the quick brown fox jumps over the lazy dog"
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"

- match: { docs.1._index: "test-keyword-with-normalizer" }
- match: { docs.1._id: "2" }
- match:
docs.1._source:
keyword: "the five boxing wizards jump quickly"
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
keyword_without_doc_values: "The five BOXING wizards jump Quickly"

- match: { docs.2._index: "test-keyword-with-normalizer" }
- match: { docs.2._id: "3" }
- match:
docs.2._source:
keyword: [ "do or do not, there is no try", "may the force be with you!" ]
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe also add a test here that uses a custom normalizer that does something other than lowercasing (e.g. asciifolding or uppercase), and check that we retain the original value for the keyword field?

---
stored text:
- requires:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.LowercaseNormalizer;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.FieldData;
import org.elasticsearch.index.fielddata.FieldDataContext;
Expand Down Expand Up @@ -191,6 +192,7 @@ public static final class Builder extends FieldMapper.DimensionBuilder {
);

private final Parameter<String> normalizer;
private final Parameter<Boolean> normalizerSkipStoreOriginalValue;

private final Parameter<Boolean> splitQueriesOnWhitespace = Parameter.boolParam(
"split_queries_on_whitespace",
Expand Down Expand Up @@ -278,6 +280,14 @@ private Builder(
m -> toType(m).normalizerName,
null
).acceptsNull();
this.normalizerSkipStoreOriginalValue = Parameter.boolParam(
"normalizer_skip_store_original_value",
false,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't it ok to go from false -> true here? Although probably not ok the other way around.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that would work. Going from false -> true would cause previously indexed documents to start returning their normalized values, and the original values stored in _ignored_source would be unused.

I'm reluctant to allow it though, as I think it might create some unnecessary confusion. It's simpler to reason about and to debug if the values are all stored the same way throughout the life of the index.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, let's keep this mapping attribute immutable.

m -> ((KeywordFieldMapper) m).isNormalizerSkipStoreOriginalValue(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we allow customers to set this in the first place when synthetic source is not enabled? They might find it confusing if they enable it but it doesn't work for whatever reason. In reality, they just don't have synthetic source enabled.

Just a question. I'm not sure how we normally deal with parameters unique to synthetic source.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a good point, although I'm not sure I would want to completely disallow it and make setting it on a non-synthetic index a fatal invalid mapping exception. Maybe just a warning?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should add validations or warnings. A cluster can start with synthetic source, but then fall back to basic and then use stored source, this shouldn't cause any warning or failures.

I think this mapping should just be a no-op in case source mode isn't synthetic.

() -> "lowercase".equals(normalizer.getValue())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] do you think it makes sense to extract "lowercase" into an enum?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I'll try and extract it into some form of constant.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked into this, and the built-in lowercase normalizer is registered with a string literal "lowercase" (code). I could extract that out into a constant, then reference the constant here, but I'm reluctant to touch the AnalysisModule in this PR.

I think it's probably fine to just leave the string literal here, but if we do want to extract out the constant, I think it'd make sense as a follow-up PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The normalizer mapping attribute can contain any value, because custom normalizers can be defined via index settings. So I think it is best to keep this as a string.

&& indexAnalyzers.getNormalizer(normalizer.getValue()).analyzer() instanceof LowercaseNormalizer
).setSerializerCheck((includeDefaults, isConfigured, value) -> includeDefaults || isConfigured || value);

this.script.precludesParameters(nullValue);
addScriptValidation(script, indexed, hasDocValues);

Expand Down Expand Up @@ -407,6 +417,7 @@ protected Parameter<?>[] getParameters() {
hasNorms,
similarity,
normalizer,
normalizerSkipStoreOriginalValue,
splitQueriesOnWhitespace,
script,
onScriptError,
Expand Down Expand Up @@ -1110,6 +1121,7 @@ public Query automatonQuery(
private final String indexOptions;
private final FieldType fieldType;
private final String normalizerName;
private final boolean normalizerSkipStoreOriginalValue;
private final boolean splitQueriesOnWhitespace;
private final Script script;
private final ScriptCompiler scriptCompiler;
Expand Down Expand Up @@ -1140,6 +1152,7 @@ private KeywordFieldMapper(
this.indexOptions = builder.indexOptions.getValue();
this.fieldType = freezeAndDeduplicateFieldType(fieldType);
this.normalizerName = builder.normalizer.getValue();
this.normalizerSkipStoreOriginalValue = builder.normalizerSkipStoreOriginalValue.getValue();
this.splitQueriesOnWhitespace = builder.splitQueriesOnWhitespace.getValue();
this.script = builder.script.get();
this.indexAnalyzers = builder.indexAnalyzers;
Expand All @@ -1164,6 +1177,10 @@ public String getOffsetFieldName() {
return offsetsFieldName;
}

/**
 * Reports the value of the {@code normalizer_skip_store_original_value} mapping parameter
 * that this mapper captured from its builder at construction time.
 *
 * @return {@code true} if the parameter is set for this field, {@code false} otherwise
 */
public boolean isNormalizerSkipStoreOriginalValue() {
    return this.normalizerSkipStoreOriginalValue;
}

protected void parseCreateField(DocumentParserContext context) throws IOException {
var value = context.parser().optimizedTextOrNull();

Expand Down Expand Up @@ -1343,9 +1360,8 @@ boolean hasNormalizer() {

@Override
protected SyntheticSourceSupport syntheticSourceSupport() {
if (hasNormalizer()) {
// NOTE: no matter if we have doc values or not we use fallback synthetic source
// to store the original value whose doc values would be altered by the normalizer
if (hasNormalizer() && normalizerSkipStoreOriginalValue == false) {
// NOTE: we use fallback synthetic source to store the original value since the doc values would be altered by the normalizer
return SyntheticSourceSupport.FALLBACK;
}

Expand Down
Loading