Skip to content

Fix text_structure finder to be able to work with nested NDJSON samples. #131530

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -333,27 +333,101 @@ static Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> guessMapp
.filter(Objects::nonNull)
.collect(Collectors.toList());

Tuple<Map<String, String>, FieldStats> mappingAndFieldStats = guessMappingAndCalculateFieldStats(
explanation,
fieldName,
fieldValues,
timeoutChecker,
ecsCompatibility,
timestampFormatOverride
);
if (mappingAndFieldStats != null) {
if (mappingAndFieldStats.v1() != null) {
mappings.put(fieldName, mappingAndFieldStats.v1());
}
if (mappingAndFieldStats.v2() != null) {
fieldStats.put(fieldName, mappingAndFieldStats.v2());
// Process nested fields dynamically (recursively if needed)
if (isNestedField(fieldValues)) {
// Recursively process nested fields
List<Map<String, ?>> nestedFieldValues = extractNestedFieldValues(sampleRecords, fieldName);

Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> nestedResult = guessMappingsAndCalculateFieldStats(
explanation,
nestedFieldValues,
timeoutChecker,
ecsCompatibility,
timestampFormatOverride
);
// Create a nested mapping for the parent field and merge the nested field mappings
mappings.put(fieldName, createNestedMapping(nestedResult.v1(), determineNestedFieldType(nestedFieldValues))); // Apply
// type:
// nested for
// the parent
// field
fieldStats.putAll(nestedResult.v2());

} else {
// For non-nested fields, process them normally
Tuple<Map<String, String>, FieldStats> mappingAndFieldStats = guessMappingAndCalculateFieldStats(
explanation,
fieldName,
fieldValues,
timeoutChecker,
ecsCompatibility,
timestampFormatOverride
);
if (mappingAndFieldStats != null) {
if (mappingAndFieldStats.v1() != null) {
mappings.put(fieldName, mappingAndFieldStats.v1());
}
if (mappingAndFieldStats.v2() != null) {
fieldStats.put(fieldName, mappingAndFieldStats.v2());
}
}
}
}

return new Tuple<>(mappings, fieldStats);
}

/**
* Extracts the nested field values for a given field from a list of sample records.
*
* @param sampleRecords The list of records, where each record is a map containing field names as keys.
* @param fieldName The name of the field whose values are to be extracted from each record.
* @return A list of Maps representing the nested field values for the specified field.
*/
private static List<Map<String, ?>> extractNestedFieldValues(List<Map<String, ?>> sampleRecords, String fieldName) {
@SuppressWarnings("unchecked")
List<Map<String, ?>> extractedFieldValue = sampleRecords.stream()
.map(record -> record.get(fieldName))
.filter(Objects::nonNull)
.filter(val -> val instanceof Map)
.map(val -> (Map<String, ?>) val)
.collect(Collectors.toList());
return extractedFieldValue;
}

/**
* Creates a mapping for a nested field, either as a "nested" or "object" field type.
*
* @param nestedFieldMappings A map containing the field mappings for the nested fields.
* @param nestingType The type of the nested field (either "nested" or "object").
* @return A map representing the field mapping for the nested field with the specified type.
*/
private static Object createNestedMapping(SortedMap<String, Object> nestedFieldMappings, String nestingType) {
SortedMap<String, Object> nestedMapping = new TreeMap<>();
nestedMapping.put(MAPPING_TYPE_SETTING, nestingType);
nestedMapping.put(MAPPING_PROPERTIES_SETTING, nestedFieldMappings);
return nestedMapping;
}

/**
* @param fieldValues value of a field in the sample records
* @return boolean for whether the field is nested (i.e., Map or List of Maps)
*/
static boolean isNestedField(List<Object> fieldValues) {
return fieldValues.stream().anyMatch(val -> val instanceof Map || val instanceof List);
}

/**
* @param nestedFieldValues value of a nested field in the sample records
* @return "nested" or "object" based on the data structure
* @TODO: If the field values contain a List of Maps and need to be queried independently, set as "nested"
* If not, treat it as a regular "object"
*/
static String determineNestedFieldType(List<Map<String, ?>> nestedFieldValues) {
// just supports object for now
return "object";
}

/**
* Given the sampled records, guess appropriate Elasticsearch mappings.
* @param explanation List of reasons for choosing the overall text structure. This list
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@
import org.elasticsearch.xpack.core.textstructure.structurefinder.TextStructure;

import java.util.Collections;
import java.util.Map;

public class NdJsonTextStructureFinderTests extends TextStructureTestCase {

private final TextStructureFinderFactory factory = new NdJsonTextStructureFinderFactory();

public void testCreateConfigsGivenGoodJson() throws Exception {
testCreateConfigsGivenFlatJson();
testCreateConfigsGivenNestedJson();
}

public void testCreateConfigsGivenFlatJson() throws Exception {
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE, 0.0));

String charset = randomFrom(POSSIBLE_CHARSETS);
Expand Down Expand Up @@ -49,4 +55,49 @@ public void testCreateConfigsGivenGoodJson() throws Exception {
assertEquals(Collections.singletonList("UNIX_MS"), structure.getJodaTimestampFormats());
assertEquals(Collections.singleton("properties"), structure.getMappings().keySet());
}

public void testCreateConfigsGivenNestedJson() throws Exception {
assertTrue(factory.canCreateFromSample(explanation, NESTED_NDJSON_SAMPLE, 0.0));

String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
TextStructureFinder structureFinder = factory.createFromSample(
explanation,
NESTED_NDJSON_SAMPLE,
charset,
hasByteOrderMarker,
TextStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT,
TextStructureOverrides.EMPTY_OVERRIDES,
NOOP_TIMEOUT_CHECKER
);

TextStructure structure = structureFinder.getStructure();
assertEquals(TextStructure.Format.NDJSON, structure.getFormat());
assertEquals(charset, structure.getCharset());

if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
} else {
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
}

assertEquals("timestamp", structure.getTimestampField());
assertEquals(Collections.singletonList("UNIX_MS"), structure.getJodaTimestampFormats());
assertEquals(1, structure.getMappings().size());

// Verify that the 'host' field is nested
@SuppressWarnings("unchecked")
Map<String, Object> props = (Map<String, Object>) structure.getMappings().get("properties");
@SuppressWarnings("unchecked")
Map<String, Object> hostMapping = (Map<String, Object>) props.get("host");
assertNotNull("Host should be a nested field", hostMapping);
assertEquals("object", hostMapping.get("type"));

// Verify 'host' properties
@SuppressWarnings("unchecked")
Map<String, Object> hostProperties = (Map<String, Object>) hostMapping.get("properties");
assertTrue("Host should have 'id' property", hostProperties.containsKey("id"));
assertTrue("Host should have 'category' property", hostProperties.containsKey("category"));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ public abstract class TextStructureTestCase extends ESTestCase {
"class":"ml","method":"core::SomeNoiseMaker","file":"Noisemaker.cc","line":333}
""";

protected static final String NESTED_NDJSON_SAMPLE = """
{"host": {"id": 1, "category": "NETWORKING DEVICE"}, "timestamp": "1478261151445"}
{"host": {"id": 2, "category": "COMPUTE NODE"}, "timestamp": "1478261151445"}
""";

protected static final String PIPE_DELIMITED_SAMPLE = """
2018-01-06 16:56:14.295748|INFO |VirtualServer |1 |listening on 0.0.0.0:9987, :::9987
2018-01-06 17:19:44.465252|INFO |VirtualServer |1 |client 'User1'(id:2) changed default admin channelgroup to 'Guest'(id:8)
Expand Down