Skip to content
Merged
16 changes: 16 additions & 0 deletions docs/changelog/136915.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
pr: 136915
summary: Implement native synthetic source for normalized keywords
area: Mapping
type: breaking
issues: []
breaking:
title: Implement native synthetic source for normalized keywords
area: Mapping
details: "This adds a new mapping parameter `normalizer_skip_store_original_value` to keyword fields. When this\
\ parameter is set, and synthetic_source is enabled, keyword fields with configured normalizers will not store the\
\ original non-normalized value in _ignored_source, and will instead use the normalized value to reconstruct the\
\ source. This parameter enabled by default for the built-in `lowercase` normalizer, and is disabled by default for\
\ other custom normalizers."
impact: "Keyword fields using the `lowercase` normalizer will return the normalized value in the source when synthetic\
\ source is enabled."
notable: false
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,17 @@ keyword with normalizer:
keyword:
type: keyword
normalizer: lowercase
normalizer_skip_store_original_value: false
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be configured if it defaults to false anyways?

Copy link
Contributor Author

@jordan-powers jordan-powers Oct 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It defaults to true for the lowercase normalizer, so I have to explicitly configure it to false here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just noticed this test is defining a custom normalizer called "lowercase". Currently the default logic will trigger for any normalizer called "lowercase", no matter if it's the built-in one or a custom one shadowing the built-in one. I'll look into what it would take to only set the default for the built-in one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, this should be addressed as of 6c7f6bf

keyword_with_ignore_above:
type: keyword
normalizer: lowercase
ignore_above: 10
normalizer_skip_store_original_value: false
keyword_without_doc_values:
type: keyword
normalizer: lowercase
doc_values: false
normalizer_skip_store_original_value: false

- do:
index:
Expand Down Expand Up @@ -138,6 +141,94 @@ keyword with normalizer:
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]

---
keyword with normalizer and skip store original value:
- do:
indices.create:
index: test-keyword-with-normalizer
body:
settings:
analysis:
normalizer:
lowercase:
type: custom
filter:
- lowercase
index:
mapping.source.mode: synthetic
mappings:
properties:
keyword:
type: keyword
normalizer: lowercase
normalizer_skip_store_original_value: true
keyword_with_ignore_above:
type: keyword
normalizer: lowercase
normalizer_skip_store_original_value: true
ignore_above: 10
keyword_without_doc_values:
type: keyword
normalizer: lowercase
normalizer_skip_store_original_value: true
doc_values: false

- do:
index:
index: test-keyword-with-normalizer
id: 1
body:
keyword: "the quick brown fox jumps over the lazy dog"
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"

- do:
index:
index: test-keyword-with-normalizer
id: 2
body:
keyword: "the five boxing wizards jump quickly"
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
keyword_without_doc_values: "The five BOXING wizards jump Quickly"

- do:
index:
index: test-keyword-with-normalizer
id: 3
body:
keyword: [ "may the force be with you!", "do or do not, there is no try" ]
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]

- do:
mget:
index: test-keyword-with-normalizer
body:
ids: [ 1, 2, 3 ]
- match: { docs.0._index: "test-keyword-with-normalizer" }
- match: { docs.0._id: "1" }
- match:
docs.0._source:
keyword: "the quick brown fox jumps over the lazy dog"
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"

- match: { docs.1._index: "test-keyword-with-normalizer" }
- match: { docs.1._id: "2" }
- match:
docs.1._source:
keyword: "the five boxing wizards jump quickly"
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
keyword_without_doc_values: "The five BOXING wizards jump Quickly"

- match: { docs.2._index: "test-keyword-with-normalizer" }
- match: { docs.2._id: "3" }
- match:
docs.2._source:
keyword: [ "do or do not, there is no try", "may the force be with you!" ]
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe also add a test here that uses a custom normalizer that does something else than lowercasing? (e.g. asciifolding or uppercase) And check that we retain original value for keyword field?

---
stored text:
- requires:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ public static final class Builder extends FieldMapper.DimensionBuilder {
);

private final Parameter<String> normalizer;
private final Parameter<Boolean> normalizerSkipStoreOriginalValue;

private final Parameter<Boolean> splitQueriesOnWhitespace = Parameter.boolParam(
"split_queries_on_whitespace",
Expand Down Expand Up @@ -278,6 +279,13 @@ private Builder(
m -> toType(m).normalizerName,
null
).acceptsNull();
this.normalizerSkipStoreOriginalValue = Parameter.boolParam(
"normalizer_skip_store_original_value",
false,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't it ok to go from false -> true here? Although probably not ok the other way around.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that would work. Going from false -> true would cause previously indexed documents to start returning their normalized values, and the original values stored in _ignored_source would be unused.

I'm reluctant to allow it though, as I think it might create some unnecessary confusion. It's simpler to reason about and to debug if the values are all stored the same way throughout the life of the index.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, let's keep this mapping attribute immutable.

m -> ((KeywordFieldMapper) m).isNormalizerSkipStoreOriginalValue(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we allow customers to set this in the first place when synthetic source is not enabled? They might find it confusing if they enable it but it doesn't work for whatever reason. In reality, they just don't have synthetic source enabled.

Just a question. I'm not sure how we normally deal with parameters unique to synthetic source.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a good point, although I'm not sure I would want to completely disallow it and make setting it on a non-synthetic index a fatal invalid mapping exception. Maybe just a warning?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should add validations or warnings. A cluster can start with synthetic source, but then fall back to basic and then use stored source, this shouldn't cause any warning or failures.

I think this mapping should just be a no-op in case source mode isn't synthetic.

() -> "lowercase".equals(normalizer.getValue())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] do you think it makes sense to extract "lowercase" into an enum?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I'll try and extract it into some form of constant.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked into this, and the built-in lowercase normalizer is registered with a string literal "lowercase" (code). I could extract that out into a constant, then reference the constant here, but I'm reluctant to touch the AnalysisModule in this PR.

I think it's probably fine to just leave the string literal here, but if we do want to extract out the constant, I think it'd make sense as a follow-up PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Normalizer mapping attribute can contain any value, because it is allowed to define custom normalizers via index settings. So I think it is best to keep this as a string.

).setSerializerCheck((includeDefaults, isConfigured, value) -> includeDefaults || isConfigured || value);

this.script.precludesParameters(nullValue);
addScriptValidation(script, indexed, hasDocValues);

Expand Down Expand Up @@ -407,6 +415,7 @@ protected Parameter<?>[] getParameters() {
hasNorms,
similarity,
normalizer,
normalizerSkipStoreOriginalValue,
splitQueriesOnWhitespace,
script,
onScriptError,
Expand Down Expand Up @@ -1110,6 +1119,7 @@ public Query automatonQuery(
private final String indexOptions;
private final FieldType fieldType;
private final String normalizerName;
private final boolean normalizerSkipStoreOriginalValue;
private final boolean splitQueriesOnWhitespace;
private final Script script;
private final ScriptCompiler scriptCompiler;
Expand Down Expand Up @@ -1140,6 +1150,7 @@ private KeywordFieldMapper(
this.indexOptions = builder.indexOptions.getValue();
this.fieldType = freezeAndDeduplicateFieldType(fieldType);
this.normalizerName = builder.normalizer.getValue();
this.normalizerSkipStoreOriginalValue = builder.normalizerSkipStoreOriginalValue.getValue();
this.splitQueriesOnWhitespace = builder.splitQueriesOnWhitespace.getValue();
this.script = builder.script.get();
this.indexAnalyzers = builder.indexAnalyzers;
Expand All @@ -1164,6 +1175,10 @@ public String getOffsetFieldName() {
return offsetsFieldName;
}

public boolean isNormalizerSkipStoreOriginalValue() {
return normalizerSkipStoreOriginalValue;
}

protected void parseCreateField(DocumentParserContext context) throws IOException {
var value = context.parser().optimizedTextOrNull();

Expand Down Expand Up @@ -1343,9 +1358,8 @@ boolean hasNormalizer() {

@Override
protected SyntheticSourceSupport syntheticSourceSupport() {
if (hasNormalizer()) {
// NOTE: no matter if we have doc values or not we use fallback synthetic source
// to store the original value whose doc values would be altered by the normalizer
if (hasNormalizer() && normalizerSkipStoreOriginalValue == false) {
// NOTE: we use fallback synthetic source to store the original value since the doc values would be altered by the normalizer
return SyntheticSourceSupport.FALLBACK;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,48 @@ public void testNormalizerNamedDefault() throws IOException {
assertEquals(new BytesRef("foo"), doc.rootDoc().getField("field2").binaryValue());
}

public void testNormalizerSyntheticSource() throws IOException {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] name could be more descriptive, like the other tests below it. Maybe testNormalizerSyntheticSourceWhenSkipStoreOriginalValueDisabled?

MapperService mapper = createSytheticSourceMapperService(
fieldMapping(
b -> b.field("type", "keyword").field("normalizer", "lowercase").field("normalizer_skip_store_original_value", false)
)
);
assertEquals("{\"field\":\"AbC\"}", syntheticSource(mapper.documentMapper(), b -> b.field("field", "AbC")));
}

public void testNormalizerSyntheticSourceSkipStoreOriginalValue() throws IOException {
MapperService mapper = createSytheticSourceMapperService(
fieldMapping(
b -> b.field("type", "keyword").field("normalizer", "lowercase").field("normalizer_skip_store_original_value", true)
)
);
assertEquals("{\"field\":\"abc\"}", syntheticSource(mapper.documentMapper(), b -> b.field("field", "AbC")));
}

public void testSkipStoreOriginalValueForLowercaseNormalizer() throws IOException {
MapperService mapper = createSytheticSourceMapperService(
fieldMapping(b -> b.field("type", "keyword").field("normalizer", "lowercase"))
);

var keywordMapper = mapper.mappingLookup().getMapper("field");
assertThat(keywordMapper, Matchers.instanceOf(KeywordFieldMapper.class));
assertTrue(((KeywordFieldMapper) keywordMapper).isNormalizerSkipStoreOriginalValue());

assertEquals("{\"field\":\"abc\"}", syntheticSource(mapper.documentMapper(), b -> b.field("field", "AbC")));
}

public void testSkipStoreOriginalValueForCustomNormalizer() throws IOException {
MapperService mapper = createSytheticSourceMapperService(
fieldMapping(b -> b.field("type", "keyword").field("normalizer", "other_lowercase"))
);

var keywordMapper = mapper.mappingLookup().getMapper("field");
assertThat(keywordMapper, Matchers.instanceOf(KeywordFieldMapper.class));
assertFalse(((KeywordFieldMapper) keywordMapper).isNormalizerSkipStoreOriginalValue());

assertEquals("{\"field\":\"AbC\"}", syntheticSource(mapper.documentMapper(), b -> b.field("field", "AbC")));
}

public void testParsesKeywordNestedEmptyObjectStrict() throws IOException {
DocumentMapper defaultMapper = createDocumentMapper(fieldMapping(this::minimalMapping));

Expand Down Expand Up @@ -623,7 +665,11 @@ public void testParsesKeywordNullStrict() throws IOException {
}

public void testUpdateNormalizer() throws IOException {
MapperService mapperService = createMapperService(fieldMapping(b -> b.field("type", "keyword").field("normalizer", "lowercase")));
MapperService mapperService = createMapperService(
fieldMapping(
b -> b.field("type", "keyword").field("normalizer", "lowercase").field("normalizer_skip_store_original_value", false)
)
);
IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
() -> merge(mapperService, fieldMapping(b -> b.field("type", "keyword").field("normalizer", "other_lowercase")))
Expand Down Expand Up @@ -841,6 +887,7 @@ public void testLegacyField() throws Exception {
b.startObject("mykeyw");
b.field("type", "keyword");
b.field("normalizer", "lowercase");
b.field("normalizer_skip_store_original_value", false);
b.endObject();
}));
assertThat(service.fieldType("mykeyw"), instanceOf(KeywordFieldMapper.KeywordFieldType.class));
Expand Down