Skip to content
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
5851594
Reset the last optimised value when stringEnd gets reset too.
gmarouli Sep 22, 2025
8552375
Update docs/changelog/135184.yaml
gmarouli Sep 22, 2025
add871b
Revert "Reset the last optimised value when stringEnd gets reset too."
gmarouli Sep 23, 2025
c9844d1
Use qualified import
gmarouli Sep 23, 2025
0048c84
Add unit test that captures the issue.
gmarouli Sep 23, 2025
57d4c8f
[CI] Update transport version definitions
Sep 23, 2025
3402bc6
Reset the last value properly
gmarouli Sep 23, 2025
4b71f8c
[CI] Update transport version definitions
Sep 23, 2025
222a251
Add comments
gmarouli Sep 23, 2025
3be8275
Update docs/changelog/135184.yaml
gmarouli Sep 24, 2025
34ddd56
Merge branch 'main' into bug-fix-utf8-optimiser
gmarouli Sep 24, 2025
482c33e
Add getText() also in ESCborParserTests
gmarouli Sep 24, 2025
964ef6d
Merge branch 'main' into bug-fix-utf8-optimiser
gmarouli Sep 25, 2025
9a39f67
Add randomised test that compares an optimised and a baseline parseAr…
gmarouli Sep 25, 2025
1a00f09
Revert adding to importing org.hamcrest.Matchers
gmarouli Sep 25, 2025
5caf1f6
Revert adding to importing org.hamcrest.Matchers and remove not neede…
gmarouli Sep 25, 2025
f3e75d1
Delete docs/changelog/135184.yaml
gmarouli Sep 25, 2025
8f5f178
Use `mapOrdered` to simplify the assertion
gmarouli Sep 29, 2025
3cf40d3
Add yaml test
gmarouli Sep 29, 2025
23f6a11
Make `text()` also call `optimisedText()` some times
gmarouli Sep 29, 2025
850c4ac
Merge branch 'main' into bug-fix-utf8-optimiser
gmarouli Sep 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@
import java.util.ArrayList;
import java.util.List;

/**
* Provides the method getValueAsText that is a best-effort optimization for UTF8 fields. If the
* {@link UTF8StreamJsonParser} has already parsed the text, then the caller should fall back to getText.
* This is sufficient because when we call getText, jackson stores the parsed UTF-16 value for future calls.
* Once we've already got the parsed UTF-16 value, the optimization isn't necessary.
*/
public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser implements OptimizedTextCapable {
protected int stringEnd = -1;
protected int stringLength;
Expand Down Expand Up @@ -53,6 +59,7 @@ public ESUTF8StreamJsonParser(
*/
@Override
public Text getValueAsText() throws IOException {
// _tokenIncomplete is true when UTF8StreamJsonParser has already processed this value.
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
if (lastOptimisedValue != null) {
return new Text(new XContentString.UTF8Bytes(lastOptimisedValue), stringLength);
Expand Down Expand Up @@ -148,33 +155,33 @@ protected Text _finishAndReturnText() throws IOException {

@Override
public JsonToken nextToken() throws IOException {
maybeResetCurrentTokenState();
stringEnd = -1;
resetCurrentTokenState();
return super.nextToken();
}

@Override
public boolean nextFieldName(SerializableString str) throws IOException {
maybeResetCurrentTokenState();
stringEnd = -1;
resetCurrentTokenState();
return super.nextFieldName(str);
}

@Override
public String nextFieldName() throws IOException {
maybeResetCurrentTokenState();
stringEnd = -1;
resetCurrentTokenState();
return super.nextFieldName();
}

/**
* Resets the current token state before moving to the next.
* Resets the current token state before moving to the next. It resets the _inputPtr and the
* _tokenIncomplete only if {@link UTF8StreamJsonParser#getText()} or {@link UTF8StreamJsonParser#getValueAsString()}
* hasn't run yet.
*/
private void maybeResetCurrentTokenState() {
private void resetCurrentTokenState() {
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
_inputPtr = stringEnd;
_tokenIncomplete = false;
lastOptimisedValue = null;
}
lastOptimisedValue = null;
stringEnd = -1;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.exc.InputCoercionException;
import com.fasterxml.jackson.core.exc.StreamConstraintsException;
import com.fasterxml.jackson.core.filter.FilteringParserDelegate;
import com.fasterxml.jackson.core.io.JsonEOFException;

import org.elasticsearch.core.IOUtils;
Expand All @@ -26,6 +27,7 @@
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentString;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.xcontent.provider.OptimizedTextCapable;
import org.elasticsearch.xcontent.provider.XContentParserConfigurationImpl;
import org.elasticsearch.xcontent.support.AbstractXContentParser;

Expand Down Expand Up @@ -143,7 +145,19 @@ public String text() throws IOException {

@Override
public XContentString optimizedText() throws IOException {
// TODO: enable utf-8 parsing optimization once verified it is completely safe
if (currentToken().isValue() == false) {
throwOnNoText();
}
var parser = this.parser;
if (parser instanceof FilteringParserDelegate delegate) {
parser = delegate.delegate();
}
if (parser instanceof OptimizedTextCapable optimizedTextCapableParser) {
var bytesRef = optimizedTextCapableParser.getValueAsText();
if (bytesRef != null) {
return bytesRef;
}
}
return new Text(text());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.util.Map;

import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.nullValue;

public class ESCborParserTests extends ESTestCase {

Expand Down Expand Up @@ -54,8 +55,15 @@ private void testStringValue(String expected) throws IOException {
assertThat(text.hasBytes(), equalTo(true));
assertThat(text.stringLength(), equalTo(expected.length()));
assertThat(text.string(), equalTo(expected));
// Retrieve twice
assertThat(parser.getValueAsText().string(), equalTo(expected));
assertThat(parser.getValueAsString(), equalTo(expected));
// Use the getText() to ensure _tokenIncomplete works
assertThat(parser.getText(), equalTo(expected));
// The optimisation is not used after the getText()
assertThat(parser.getValueAsText(), nullValue());
// Original CBOR getValueAsString works.
assertThat(parser.getValueAsString(), equalTo(expected));
assertThat(parser.nextToken(), equalTo(JsonToken.END_OBJECT));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,23 @@
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;

import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xcontent.FilterXContentParserWrapper;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentString;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.xcontent.provider.XContentParserConfigurationImpl;
import org.hamcrest.Matchers;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class ESUTF8StreamJsonParserTests extends ESTestCase {

Expand Down Expand Up @@ -272,12 +281,17 @@ public void testGetValueRandomized() throws IOException {
var text = parser.getValueAsText();
assertTextRef(text.bytes(), currVal);
assertThat(text.stringLength(), Matchers.equalTo(currVal.length()));

// Retrieve it twice to ensure it works as expected
// Retrieve it a second time
text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());
assertTextRef(text.bytes(), currVal);
assertThat(text.stringLength(), Matchers.equalTo(currVal.length()));
// Use getText()
assertThat(parser.getText(), Matchers.equalTo(text.string()));
// After retrieving it with getText() we do not use the optimised value anymore.
assertThat(parser.getValueAsText(), Matchers.nullValue());
} else {
assertThat(parser.getText(), Matchers.notNullValue());
assertThat(parser.getValueAsText(), Matchers.nullValue());
assertThat(parser.getValueAsString(), Matchers.equalTo(currVal));
// Retrieve it twice to ensure it works as expected
Expand All @@ -288,4 +302,88 @@ public void testGetValueRandomized() throws IOException {
});
}

/**
* This test compares the retrieval of an optimised text against the baseline.
*/
public void testOptimisedParser() throws Exception {
for (int i = 0; i < 200; i++) {
String json = randomJsonInput(randomIntBetween(1, 6));
try (
var baselineParser = XContentHelper.createParser(
XContentParserConfiguration.EMPTY,
new BytesArray(json),
XContentType.JSON
);
var optimisedParser = TestXContentParser.create(json)
) {
var expected = baselineParser.mapOrdered();
var actual = optimisedParser.mapOrdered();
assertThat(expected, Matchers.equalTo(actual));
}
}
}

private String randomJsonInput(int depth) {
StringBuilder sb = new StringBuilder();
sb.append('{');
int numberOfFields = randomIntBetween(1, 10);
for (int i = 0; i < numberOfFields; i++) {
sb.append("\"k-").append(randomAlphanumericOfLength(10)).append("\":");
if (depth == 0 || randomBoolean()) {
if (randomIntBetween(0, 9) == 0) {
sb.append(
IntStream.range(0, randomIntBetween(1, 10))
.mapToObj(ignored -> randomUTF8Value())
.collect(Collectors.joining(",", "[", "]"))
);
} else {
sb.append(randomUTF8Value());
}
} else {
sb.append(randomJsonInput(depth - 1));
}
if (i < numberOfFields - 1) {
sb.append(',');
}
}
sb.append("}");
return sb.toString();
}

private String randomUTF8Value() {
return "\"" + buildRandomInput(randomIntBetween(10, 50)).input + "\"";
}

/**
* This XContentParser introduces a random mix of getText() and getOptimisedText()
* to simulate different access patterns for optimised fields.
*/
private static class TestXContentParser extends FilterXContentParserWrapper {

TestXContentParser(XContentParser delegate) {
super(delegate);
}

static TestXContentParser create(String input) throws IOException {
JsonFactory factory = new ESJsonFactoryBuilder().build();
assertThat(factory, Matchers.instanceOf(ESJsonFactory.class));

JsonParser parser = factory.createParser(StandardCharsets.UTF_8.encode(input).array());
assertThat(parser, Matchers.instanceOf(ESUTF8StreamJsonParser.class));
return new TestXContentParser(new JsonXContentParser(XContentParserConfigurationImpl.EMPTY, parser));
}

@Override
public XContentString optimizedText() throws IOException {
int extraCalls = randomIntBetween(0, 5);
for (int i = 0; i < extraCalls; i++) {
if (randomBoolean()) {
super.optimizedText();
} else {
super.text();
}
}
return super.optimizedText();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Keyword with escaped characters as multi-field:
- requires:
cluster_features: [ "mapper.multi_field.unicode_optimisation_fix" ]
reason: "requires a fix (#134770)"
reason: "Captures the scenarios in #134770 & 135256"
- do:
indices.create:
index: test
Expand All @@ -14,14 +14,20 @@ Keyword with escaped characters as multi-field:
fields:
bar:
type: keyword
bar-text:
type: text
my-ip:
type: ip

# Ensure the IP is correctly parsed after a multi-field mapping that combines optimised and non-optimised fields
- do:
index:
index: test
id: "1"
refresh: true
body:
foo: "c:\\windows\\system32\\svchost.exe"
my-ip: "127.0.0.1"

- do:
search:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ public void testOptimizedTextHasBytes() throws Exception {
assertSame(XContentParser.Token.FIELD_NAME, parser.nextToken());
assertTrue(parser.nextToken().isValue());
Text text = (Text) parser.optimizedText();
// TODO: uncomment after utf8 optimized parsing has been enabled again:
// assertTrue(text.hasBytes());
assertTrue(text.hasBytes());
assertThat(text.string(), equalTo("foo"));
}
}
Expand Down