Skip to content

Commit c09b318

Browse files
jordan-powers authored and fzowl committed
Implement native synthetic source for normalized keywords (elastic#136915)
Currently, when a synthetic source index has a keyword field with a normalizer, the original, non-normalized value of the field is stored in _ignored_source so that the original source can be reconstructed. However, this can create significant storage overhead, as we are essentially storing the value twice. This PR adds a new boolean keyword mapper parameter `normalizer_skip_store_original_value`. When this parameter is set to true, the original value is not stored in _ignored_source and is instead discarded; the source is then reconstructed from the normalized value. For custom normalizers, this parameter defaults to false and the original value is stored. For the built-in lowercase normalizer, however, the parameter defaults to true and the original value is not stored. This is a breaking change, as keyword field mappers with the lowercase normalizer previously defaulted to storing the original value.
1 parent ef08ab2 commit c09b318

File tree

4 files changed

+347
-4
lines changed

4 files changed

+347
-4
lines changed

docs/changelog/136915.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
pr: 136915
2+
summary: Implement native synthetic source for normalized keywords
3+
area: Mapping
4+
type: breaking
5+
issues: []
6+
breaking:
7+
title: Implement native synthetic source for normalized keywords
8+
area: Mapping
9+
details: "This adds a new mapping parameter `normalizer_skip_store_original_value` to keyword fields. When this\
10+
\ parameter is set, and synthetic_source is enabled, keyword fields with configured normalizers will not store the\
11+
\ original non-normalized value in _ignored_source, and will instead use the normalized value to reconstruct the\
12+
\ source. This parameter is enabled by default for the built-in `lowercase` normalizer, and is disabled by default for\
13+
\ other custom normalizers."
14+
impact: "Keyword fields using the `lowercase` normalizer will return the normalized value in the source when synthetic\
15+
\ source is enabled."
16+
notable: false

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/mget/90_synthetic_source.yml

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,270 @@ keyword with normalizer:
138138
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
139139
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
140140

141+
---
142+
keyword with non-lowercase normalizer:
143+
- do:
144+
indices.create:
145+
index: test-keyword-with-normalizer
146+
body:
147+
settings:
148+
analysis:
149+
normalizer:
150+
test_normalizer:
151+
type: custom
152+
filter:
153+
- asciifolding
154+
- uppercase
155+
index:
156+
mapping.source.mode: synthetic
157+
mappings:
158+
properties:
159+
keyword:
160+
type: keyword
161+
normalizer: test_normalizer
162+
keyword_with_skip_store:
163+
type: keyword
164+
normalizer: test_normalizer
165+
normalizer_skip_store_original_value: true
166+
keyword_with_ignore_above:
167+
type: keyword
168+
normalizer: test_normalizer
169+
ignore_above: 10
170+
keyword_without_doc_values:
171+
type: keyword
172+
normalizer: test_normalizer
173+
doc_values: false
174+
175+
- do:
176+
index:
177+
index: test-keyword-with-normalizer
178+
id: 1
179+
body:
180+
keyword: "the Quìck Brøwn Fox jumps over the låzy Dog"
181+
keyword_with_skip_store: "the Quìck Brøwn Fox jumps over the låzy Dog"
182+
keyword_with_ignore_above: "the Quìck Brøwn Fox jumps over the låzy Dog"
183+
keyword_without_doc_values: "the Quìck Brøwn Fox jumps over the låzy Dog"
184+
185+
- do:
186+
index:
187+
index: test-keyword-with-normalizer
188+
id: 2
189+
body:
190+
keyword: "The five BØXING wìzårds jümp Quickly"
191+
keyword_with_skip_store: "The five BØXING wìzårds jümp Quickly"
192+
keyword_with_ignore_above: "The five BØXING wìzårds jümp Quickly"
193+
keyword_without_doc_values: "The five BØXING wìzårds jümp Quickly"
194+
195+
- do:
196+
index:
197+
index: test-keyword-with-normalizer
198+
id: 3
199+
body:
200+
keyword: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
201+
keyword_with_skip_store: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
202+
keyword_with_ignore_above: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
203+
keyword_without_doc_values: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
204+
205+
- do:
206+
mget:
207+
index: test-keyword-with-normalizer
208+
body:
209+
ids: [ 1, 2, 3 ]
210+
- match: { docs.0._index: "test-keyword-with-normalizer" }
211+
- match: { docs.0._id: "1" }
212+
- match:
213+
docs.0._source:
214+
keyword: "the Quìck Brøwn Fox jumps over the låzy Dog"
215+
keyword_with_skip_store: "THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG"
216+
keyword_with_ignore_above: "the Quìck Brøwn Fox jumps over the låzy Dog"
217+
keyword_without_doc_values: "the Quìck Brøwn Fox jumps over the låzy Dog"
218+
219+
- match: { docs.1._index: "test-keyword-with-normalizer" }
220+
- match: { docs.1._id: "2" }
221+
- match:
222+
docs.1._source:
223+
keyword: "The five BØXING wìzårds jümp Quickly"
224+
keyword_with_skip_store: "THE FIVE BOXING WIZARDS JUMP QUICKLY"
225+
keyword_with_ignore_above: "The five BØXING wìzårds jümp Quickly"
226+
keyword_without_doc_values: "The five BØXING wìzårds jümp Quickly"
227+
228+
- match: { docs.2._index: "test-keyword-with-normalizer" }
229+
- match: { docs.2._id: "3" }
230+
- match:
231+
docs.2._source:
232+
keyword: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
233+
keyword_with_skip_store: [ "DO OR DO NOT, THERE IS NO TRY", "MAY THE FORCE BE WITH YOU!" ]
234+
keyword_with_ignore_above: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
235+
keyword_without_doc_values: [ "Mây the FORCE bè with Yoü!", "Do or Do Not, There is no Try" ]
236+
237+
---
238+
keyword with normalizer and skip store original value:
239+
- do:
240+
indices.create:
241+
index: test-keyword-with-normalizer
242+
body:
243+
settings:
244+
analysis:
245+
normalizer:
246+
lowercase:
247+
type: custom
248+
filter:
249+
- lowercase
250+
index:
251+
mapping.source.mode: synthetic
252+
mappings:
253+
properties:
254+
keyword:
255+
type: keyword
256+
normalizer: lowercase
257+
normalizer_skip_store_original_value: true
258+
keyword_with_ignore_above:
259+
type: keyword
260+
normalizer: lowercase
261+
normalizer_skip_store_original_value: true
262+
ignore_above: 10
263+
keyword_without_doc_values:
264+
type: keyword
265+
normalizer: lowercase
266+
normalizer_skip_store_original_value: true
267+
doc_values: false
268+
269+
- do:
270+
index:
271+
index: test-keyword-with-normalizer
272+
id: 1
273+
body:
274+
keyword: "the quick brown fox jumps over the lazy dog"
275+
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
276+
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"
277+
278+
- do:
279+
index:
280+
index: test-keyword-with-normalizer
281+
id: 2
282+
body:
283+
keyword: "the five boxing wizards jump quickly"
284+
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
285+
keyword_without_doc_values: "The five BOXING wizards jump Quickly"
286+
287+
- do:
288+
index:
289+
index: test-keyword-with-normalizer
290+
id: 3
291+
body:
292+
keyword: [ "may the force be with you!", "do or do not, there is no try" ]
293+
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
294+
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
295+
296+
- do:
297+
mget:
298+
index: test-keyword-with-normalizer
299+
body:
300+
ids: [ 1, 2, 3 ]
301+
- match: { docs.0._index: "test-keyword-with-normalizer" }
302+
- match: { docs.0._id: "1" }
303+
- match:
304+
docs.0._source:
305+
keyword: "the quick brown fox jumps over the lazy dog"
306+
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
307+
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"
308+
309+
- match: { docs.1._index: "test-keyword-with-normalizer" }
310+
- match: { docs.1._id: "2" }
311+
- match:
312+
docs.1._source:
313+
keyword: "the five boxing wizards jump quickly"
314+
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
315+
keyword_without_doc_values: "The five BOXING wizards jump Quickly"
316+
317+
- match: { docs.2._index: "test-keyword-with-normalizer" }
318+
- match: { docs.2._id: "3" }
319+
- match:
320+
docs.2._source:
321+
keyword: [ "do or do not, there is no try", "may the force be with you!" ]
322+
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
323+
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
324+
325+
---
326+
327+
keyword with built-in normalizer:
328+
- do:
329+
indices.create:
330+
index: test-keyword-with-normalizer
331+
body:
332+
settings:
333+
index:
334+
mapping.source.mode: synthetic
335+
mappings:
336+
properties:
337+
keyword:
338+
type: keyword
339+
normalizer: lowercase
340+
keyword_with_ignore_above:
341+
type: keyword
342+
normalizer: lowercase
343+
ignore_above: 10
344+
keyword_without_doc_values:
345+
type: keyword
346+
normalizer: lowercase
347+
doc_values: false
348+
349+
- do:
350+
index:
351+
index: test-keyword-with-normalizer
352+
id: 1
353+
body:
354+
keyword: "the quick brown fox jumps over the lazy dog"
355+
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
356+
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"
357+
358+
- do:
359+
index:
360+
index: test-keyword-with-normalizer
361+
id: 2
362+
body:
363+
keyword: "the five boxing wizards jump quickly"
364+
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
365+
keyword_without_doc_values: "The five BOXING wizards jump Quickly"
366+
367+
- do:
368+
index:
369+
index: test-keyword-with-normalizer
370+
id: 3
371+
body:
372+
keyword: [ "may the force be with you!", "do or do not, there is no try" ]
373+
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
374+
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
375+
376+
- do:
377+
mget:
378+
index: test-keyword-with-normalizer
379+
body:
380+
ids: [ 1, 2, 3 ]
381+
- match: { docs.0._index: "test-keyword-with-normalizer" }
382+
- match: { docs.0._id: "1" }
383+
- match:
384+
docs.0._source:
385+
keyword: "the quick brown fox jumps over the lazy dog"
386+
keyword_with_ignore_above: "the Quick Brown Fox jumps over the lazy Dog"
387+
keyword_without_doc_values: "the Quick Brown Fox jumps over the lazy Dog"
388+
389+
- match: { docs.1._index: "test-keyword-with-normalizer" }
390+
- match: { docs.1._id: "2" }
391+
- match:
392+
docs.1._source:
393+
keyword: "the five boxing wizards jump quickly"
394+
keyword_with_ignore_above: "The five BOXING wizards jump Quickly"
395+
keyword_without_doc_values: "The five BOXING wizards jump Quickly"
396+
397+
- match: { docs.2._index: "test-keyword-with-normalizer" }
398+
- match: { docs.2._id: "3" }
399+
- match:
400+
docs.2._source:
401+
keyword: [ "do or do not, there is no try", "may the force be with you!" ]
402+
keyword_with_ignore_above: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
403+
keyword_without_doc_values: [ "May the FORCE be with You!", "Do or Do Not, There is no Try" ]
404+
141405
---
142406
stored text:
143407
- requires:

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
import org.elasticsearch.index.IndexVersion;
4848
import org.elasticsearch.index.IndexVersions;
4949
import org.elasticsearch.index.analysis.IndexAnalyzers;
50+
import org.elasticsearch.index.analysis.LowercaseNormalizer;
5051
import org.elasticsearch.index.analysis.NamedAnalyzer;
5152
import org.elasticsearch.index.fielddata.FieldData;
5253
import org.elasticsearch.index.fielddata.FieldDataContext;
@@ -191,6 +192,7 @@ public static final class Builder extends FieldMapper.DimensionBuilder {
191192
);
192193

193194
private final Parameter<String> normalizer;
195+
private final Parameter<Boolean> normalizerSkipStoreOriginalValue;
194196

195197
private final Parameter<Boolean> splitQueriesOnWhitespace = Parameter.boolParam(
196198
"split_queries_on_whitespace",
@@ -278,6 +280,14 @@ private Builder(
278280
m -> toType(m).normalizerName,
279281
null
280282
).acceptsNull();
283+
this.normalizerSkipStoreOriginalValue = Parameter.boolParam(
284+
"normalizer_skip_store_original_value",
285+
false,
286+
m -> ((KeywordFieldMapper) m).isNormalizerSkipStoreOriginalValue(),
287+
() -> "lowercase".equals(normalizer.getValue())
288+
&& indexAnalyzers.getNormalizer(normalizer.getValue()).analyzer() instanceof LowercaseNormalizer
289+
).setSerializerCheck((includeDefaults, isConfigured, value) -> includeDefaults || isConfigured || value);
290+
281291
this.script.precludesParameters(nullValue);
282292
addScriptValidation(script, indexed, hasDocValues);
283293

@@ -407,6 +417,7 @@ protected Parameter<?>[] getParameters() {
407417
hasNorms,
408418
similarity,
409419
normalizer,
420+
normalizerSkipStoreOriginalValue,
410421
splitQueriesOnWhitespace,
411422
script,
412423
onScriptError,
@@ -1110,6 +1121,7 @@ public Query automatonQuery(
11101121
private final String indexOptions;
11111122
private final FieldType fieldType;
11121123
private final String normalizerName;
1124+
private final boolean normalizerSkipStoreOriginalValue;
11131125
private final boolean splitQueriesOnWhitespace;
11141126
private final Script script;
11151127
private final ScriptCompiler scriptCompiler;
@@ -1140,6 +1152,7 @@ private KeywordFieldMapper(
11401152
this.indexOptions = builder.indexOptions.getValue();
11411153
this.fieldType = freezeAndDeduplicateFieldType(fieldType);
11421154
this.normalizerName = builder.normalizer.getValue();
1155+
this.normalizerSkipStoreOriginalValue = builder.normalizerSkipStoreOriginalValue.getValue();
11431156
this.splitQueriesOnWhitespace = builder.splitQueriesOnWhitespace.getValue();
11441157
this.script = builder.script.get();
11451158
this.indexAnalyzers = builder.indexAnalyzers;
@@ -1164,6 +1177,10 @@ public String getOffsetFieldName() {
11641177
return offsetsFieldName;
11651178
}
11661179

1180+
public boolean isNormalizerSkipStoreOriginalValue() {
1181+
return normalizerSkipStoreOriginalValue;
1182+
}
1183+
11671184
protected void parseCreateField(DocumentParserContext context) throws IOException {
11681185
var value = context.parser().optimizedTextOrNull();
11691186

@@ -1343,9 +1360,8 @@ boolean hasNormalizer() {
13431360

13441361
@Override
13451362
protected SyntheticSourceSupport syntheticSourceSupport() {
1346-
if (hasNormalizer()) {
1347-
// NOTE: no matter if we have doc values or not we use fallback synthetic source
1348-
// to store the original value whose doc values would be altered by the normalizer
1363+
if (hasNormalizer() && normalizerSkipStoreOriginalValue == false) {
1364+
// NOTE: we use fallback synthetic source to store the original value since the doc values would be altered by the normalizer
13491365
return SyntheticSourceSupport.FALLBACK;
13501366
}
13511367

0 commit comments

Comments
 (0)