Use FallbackSyntheticSourceBlockLoader for text fields (#126237)

This commit is contained in:
Oleksandr Kolomiiets 2025-04-07 09:32:35 -07:00 committed by GitHub
parent 20eb59080c
commit 21ff72bef4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 330 additions and 9 deletions

View File

@ -0,0 +1,5 @@
pr: 126237
summary: Use `FallbackSyntheticSourceBlockLoader` for text fields
area: Mapping
type: enhancement
issues: []

View File

@ -74,6 +74,7 @@ import org.elasticsearch.script.field.TextDocValuesField;
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;
import java.io.IOException;
import java.util.ArrayList;
@ -1016,10 +1017,53 @@ public final class TextFieldMapper extends FieldMapper {
if (isStored()) {
return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name());
}
// _ignored_source field will only be present if text field is not stored
// and there is no syntheticSourceDelegate
if (isSyntheticSource && syntheticSourceDelegate == null) {
return fallbackSyntheticSourceBlockLoader();
}
SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name()));
return new BlockSourceReader.BytesRefsBlockLoader(fetcher, blockReaderDisiLookup(blContext));
}
FallbackSyntheticSourceBlockLoader fallbackSyntheticSourceBlockLoader() {
var reader = new FallbackSyntheticSourceBlockLoader.SingleValueReader<BytesRef>(null) {
@Override
public void convertValue(Object value, List<BytesRef> accumulator) {
if (value != null) {
accumulator.add(new BytesRef(value.toString()));
}
}
@Override
protected void parseNonNullValue(XContentParser parser, List<BytesRef> accumulator) throws IOException {
var text = parser.textOrNull();
if (text != null) {
accumulator.add(new BytesRef(text));
}
}
@Override
public void writeToBlock(List<BytesRef> values, BlockLoader.Builder blockBuilder) {
var bytesRefBuilder = (BlockLoader.BytesRefBuilder) blockBuilder;
for (var value : values) {
bytesRefBuilder.appendBytesRef(value);
}
}
};
return new FallbackSyntheticSourceBlockLoader(reader, name()) {
@Override
public Builder builder(BlockFactory factory, int expectedCount) {
return factory.bytesRefs(expectedCount);
}
};
}
/**
* Build an iterator of documents that have the field. This mirrors parseCreateField,
* using whatever

View File

@ -25,9 +25,13 @@ public class KeywordFieldBlockLoaderTests extends BlockLoaderTestCase {
super(FieldType.KEYWORD.toString(), params);
}
@SuppressWarnings("unchecked")
@Override
protected Object expected(Map<String, Object> fieldMapping, Object value, TestContext testContext) {
return expectedValue(fieldMapping, value, params, testContext);
}
@SuppressWarnings("unchecked")
public static Object expectedValue(Map<String, Object> fieldMapping, Object value, Params params, TestContext testContext) {
var nullValue = (String) fieldMapping.get("null_value");
var ignoreAbove = fieldMapping.get("ignore_above") == null
@ -45,7 +49,7 @@ public class KeywordFieldBlockLoaderTests extends BlockLoaderTestCase {
Function<Stream<String>, Stream<BytesRef>> convertValues = s -> s.map(v -> convert(v, nullValue, ignoreAbove))
.filter(Objects::nonNull);
boolean hasDocValues = hasDocValues(fieldMapping, false);
boolean hasDocValues = hasDocValues(fieldMapping, true);
boolean useDocValues = params.preference() == MappedFieldType.FieldExtractPreference.NONE
|| params.preference() == MappedFieldType.FieldExtractPreference.DOC_VALUES
|| params.syntheticSource();
@ -63,7 +67,7 @@ public class KeywordFieldBlockLoaderTests extends BlockLoaderTestCase {
return maybeFoldList(resultList);
}
private BytesRef convert(String value, String nullValue, int ignoreAbove) {
private static BytesRef convert(String value, String nullValue, int ignoreAbove) {
if (value == null) {
if (nullValue != null) {
value = nullValue;

View File

@ -0,0 +1,135 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/
package org.elasticsearch.index.mapper.blockloader;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.mapper.BlockLoaderTestCase;
import org.elasticsearch.logsdb.datageneration.FieldType;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
public class TextFieldBlockLoaderTests extends BlockLoaderTestCase {
public TextFieldBlockLoaderTests(Params params) {
super(FieldType.TEXT.toString(), params);
}
@SuppressWarnings("unchecked")
@Override
protected Object expected(Map<String, Object> fieldMapping, Object value, TestContext testContext) {
if (fieldMapping.getOrDefault("store", false).equals(true)) {
return valuesInSourceOrder(value);
}
var fields = (Map<String, Object>) fieldMapping.get("fields");
if (fields != null) {
var keywordMultiFieldMapping = (Map<String, Object>) fields.get("kwd");
boolean docValues = hasDocValues(keywordMultiFieldMapping, true);
boolean index = keywordMultiFieldMapping.getOrDefault("index", true).equals(true);
boolean store = keywordMultiFieldMapping.getOrDefault("store", false).equals(true);
Object ignoreAbove = keywordMultiFieldMapping.get("ignore_above");
// See TextFieldMapper.SyntheticSourceHelper#usingSyntheticSourceDelegate
// and TextFieldMapper#canUseSyntheticSourceDelegateForLoading().
boolean usingSyntheticSourceDelegate = docValues || store;
boolean canUseSyntheticSourceDelegateForLoading = usingSyntheticSourceDelegate && ignoreAbove == null && (index || store);
if (canUseSyntheticSourceDelegateForLoading) {
return KeywordFieldBlockLoaderTests.expectedValue(keywordMultiFieldMapping, value, params, testContext);
}
// Even if multi-field is not eligible for loading it can still be used to produce synthetic source
// and then we load from the synthetic source.
// Synthetic source is actually different from keyword field block loader results
// because synthetic source includes values exceeding ignore_above and block loader doesn't.
// TODO ideally this logic should be in some kind of KeywordFieldSyntheticSourceTest that uses same infra as
// KeywordFieldBlockLoaderTest
// It is here since KeywordFieldBlockLoaderTest does not really need it
if (params.syntheticSource() && testContext.forceFallbackSyntheticSource() == false && usingSyntheticSourceDelegate) {
var nullValue = (String) keywordMultiFieldMapping.get("null_value");
// Due to how TextFieldMapper#blockReaderDisiLookup works this is complicated.
// If we are using lookupMatchingAll() then we'll see all docs, generate synthetic source using syntheticSourceDelegate,
// parse it and see null_value inside.
// But if we are using lookupFromNorms() we will skip the document (since the text field itself does not exist).
// Same goes for lookupFromFieldNames().
boolean textFieldIndexed = (boolean) fieldMapping.getOrDefault("index", true);
if (value == null) {
if (textFieldIndexed == false
&& nullValue != null
&& (ignoreAbove == null || nullValue.length() <= (int) ignoreAbove)) {
return new BytesRef(nullValue);
}
return null;
}
if (value instanceof String s) {
return new BytesRef(s);
}
var values = (List<String>) value;
// See note above about TextFieldMapper#blockReaderDisiLookup.
if (textFieldIndexed && values.stream().allMatch(Objects::isNull)) {
return null;
}
var indexed = values.stream()
.map(s -> s == null ? nullValue : s)
.filter(Objects::nonNull)
.filter(s -> ignoreAbove == null || s.length() <= (int) ignoreAbove)
.map(BytesRef::new)
.collect(Collectors.toList());
if (store == false) {
// using doc_values for synthetic source
indexed = new ArrayList<>(new HashSet<>(indexed));
indexed.sort(BytesRef::compareTo);
}
// ignored values always come last
List<BytesRef> ignored = ignoreAbove == null
? List.of()
: values.stream()
.map(s -> s == null ? nullValue : s)
.filter(Objects::nonNull)
.filter(s -> s.length() > (int) ignoreAbove)
.map(BytesRef::new)
.toList();
indexed.addAll(ignored);
return maybeFoldList(indexed);
}
}
// Loading from _ignored_source or stored _source
return valuesInSourceOrder(value);
}
@SuppressWarnings("unchecked")
private Object valuesInSourceOrder(Object value) {
if (value == null) {
return null;
}
if (value instanceof String s) {
return new BytesRef(s);
}
var resultList = ((List<String>) value).stream().filter(Objects::nonNull).map(BytesRef::new).toList();
return maybeFoldList(resultList);
}
}

View File

@ -23,6 +23,7 @@ import org.elasticsearch.logsdb.datageneration.fields.leaf.KeywordFieldDataGener
import org.elasticsearch.logsdb.datageneration.fields.leaf.LongFieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.fields.leaf.ScaledFloatFieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.fields.leaf.ShortFieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.fields.leaf.TextFieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.fields.leaf.UnsignedLongFieldDataGenerator;
/**
@ -42,7 +43,8 @@ public enum FieldType {
COUNTED_KEYWORD("counted_keyword"),
BOOLEAN("boolean"),
DATE("date"),
GEO_POINT("geo_point");
GEO_POINT("geo_point"),
TEXT("text");
private final String name;
@ -66,6 +68,7 @@ public enum FieldType {
case BOOLEAN -> new BooleanFieldDataGenerator(dataSource);
case DATE -> new DateFieldDataGenerator(dataSource);
case GEO_POINT -> new GeoPointFieldDataGenerator(dataSource);
case TEXT -> new TextFieldDataGenerator(dataSource);
};
}
@ -85,6 +88,7 @@ public enum FieldType {
case "boolean" -> FieldType.BOOLEAN;
case "date" -> FieldType.DATE;
case "geo_point" -> FieldType.GEO_POINT;
case "text" -> FieldType.TEXT;
default -> null;
};
}

View File

@ -35,10 +35,7 @@ public class DefaultMappingParametersHandler implements DataSourceHandler {
return null;
}
var map = new HashMap<String, Object>();
map.put("store", ESTestCase.randomBoolean());
map.put("index", ESTestCase.randomBoolean());
map.put("doc_values", ESTestCase.randomBoolean());
var map = commonMappingParameters();
if (ESTestCase.randomBoolean()) {
map.put(Mapper.SYNTHETIC_SOURCE_KEEP_PARAM, ESTestCase.randomFrom("none", "arrays", "all"));
}
@ -51,6 +48,7 @@ public class DefaultMappingParametersHandler implements DataSourceHandler {
case BOOLEAN -> booleanMapping(map);
case DATE -> dateMapping(map);
case GEO_POINT -> geoPointMapping(map);
case TEXT -> textMapping(request, new HashMap<>());
});
}
@ -190,6 +188,35 @@ public class DefaultMappingParametersHandler implements DataSourceHandler {
};
}
private Supplier<Map<String, Object>> textMapping(
DataSourceRequest.LeafMappingParametersGenerator request,
Map<String, Object> injected
) {
return () -> {
injected.put("store", ESTestCase.randomBoolean());
injected.put("index", ESTestCase.randomBoolean());
if (ESTestCase.randomDouble() <= 0.1) {
var keywordMultiFieldMapping = keywordMapping(request, commonMappingParameters()).get();
keywordMultiFieldMapping.put("type", "keyword");
keywordMultiFieldMapping.remove("copy_to");
injected.put("fields", Map.of("kwd", keywordMultiFieldMapping));
}
return injected;
};
}
private static HashMap<String, Object> commonMappingParameters() {
var map = new HashMap<String, Object>();
map.put("store", ESTestCase.randomBoolean());
map.put("index", ESTestCase.randomBoolean());
map.put("doc_values", ESTestCase.randomBoolean());
return map;
}
@Override
public DataSourceResponse.ObjectMappingParametersGenerator handle(DataSourceRequest.ObjectMappingParametersGenerator request) {
if (request.isNested()) {

View File

@ -0,0 +1,34 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/
package org.elasticsearch.logsdb.datageneration.fields.leaf;
import org.elasticsearch.logsdb.datageneration.FieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.datasource.DataSource;
import org.elasticsearch.logsdb.datageneration.datasource.DataSourceRequest;
import java.util.Map;
import java.util.function.Supplier;
public class TextFieldDataGenerator implements FieldDataGenerator {
private final Supplier<Object> valueGenerator;
public TextFieldDataGenerator(DataSource dataSource) {
var strings = dataSource.get(new DataSourceRequest.StringGenerator());
var nulls = dataSource.get(new DataSourceRequest.NullWrapper());
var arrays = dataSource.get(new DataSourceRequest.ArrayWrapper());
this.valueGenerator = arrays.wrapper().compose(nulls.wrapper()).apply(() -> strings.generator().get());
}
@Override
public Object generateValue(Map<String, Object> fieldMapping) {
return valueGenerator.get();
}
}

View File

@ -61,6 +61,7 @@ interface FieldSpecificMatcher {
put("geo_shape", new ExactMatcher("geo_shape", actualMappings, actualSettings, expectedMappings, expectedSettings));
put("shape", new ExactMatcher("shape", actualMappings, actualSettings, expectedMappings, expectedSettings));
put("geo_point", new GeoPointMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings));
put("text", new TextMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings));
}
};
}
@ -598,6 +599,73 @@ interface FieldSpecificMatcher {
}
}
class TextMatcher implements FieldSpecificMatcher {
private final XContentBuilder actualMappings;
private final Settings.Builder actualSettings;
private final XContentBuilder expectedMappings;
private final Settings.Builder expectedSettings;
TextMatcher(
XContentBuilder actualMappings,
Settings.Builder actualSettings,
XContentBuilder expectedMappings,
Settings.Builder expectedSettings
) {
this.actualMappings = actualMappings;
this.actualSettings = actualSettings;
this.expectedMappings = expectedMappings;
this.expectedSettings = expectedSettings;
}
@Override
@SuppressWarnings("unchecked")
public MatchResult match(
List<Object> actual,
List<Object> expected,
Map<String, Object> actualMapping,
Map<String, Object> expectedMapping
) {
var expectedNormalized = normalize(expected);
var actualNormalized = normalize(actual);
// Match simply as text first.
if (actualNormalized.equals(expectedNormalized)) {
return MatchResult.match();
}
// In some cases synthetic source for text fields is synthesized using the keyword multi field.
// So in this case it's appropriate to match it using keyword matching logic (mainly to cover `null_value`).
var multiFields = (Map<String, Object>) getMappingParameter("fields", actualMapping, expectedMapping);
if (multiFields != null) {
var keywordMatcher = new KeywordMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings);
var keywordFieldMapping = (Map<String, Object>) multiFields.get("kwd");
var keywordMatchResult = keywordMatcher.match(actual, expected, keywordFieldMapping, keywordFieldMapping);
if (keywordMatchResult.isMatch()) {
return MatchResult.match();
}
}
return MatchResult.noMatch(
formatErrorMessage(
actualMappings,
actualSettings,
expectedMappings,
expectedSettings,
"Values of type [text] don't match, " + prettyPrintCollections(actual, expected)
)
);
}
private Set<Object> normalize(List<Object> values) {
if (values == null) {
return Set.of();
}
return values.stream().filter(Objects::nonNull).collect(Collectors.toSet());
}
}
/**
* Generic matcher that supports common matching logic like null values.
*/

View File

@ -46,7 +46,7 @@ class MappingTransforms {
if (entry.getKey().equals("_doc") || entry.getKey().equals("properties")) {
descend(pathFromRoot, (Map<String, Object>) entry.getValue(), flattened);
} else {
if (entry.getValue() instanceof Map<?, ?> map) {
if (entry.getKey().equals("fields") == false && entry.getValue() instanceof Map<?, ?> map) {
var pathToField = pathFromRoot == null ? entry.getKey() : pathFromRoot + "." + entry.getKey();
// Descending to subobject, we need to remember parent mapping