Support returning default index_options for semantic_text fields when include_defaults is true (#129967)

This commit is contained in:
Kathleen DeRusso 2025-06-26 12:31:58 -04:00 committed by GitHub
parent 0a77bdfbb1
commit 81a6eadba2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 394 additions and 39 deletions

View File

@ -0,0 +1,6 @@
pr: 129967
summary: Support returning default `index_options` for `semantic_text` fields when
`include_defaults` is true
area: Search
type: bug
issues: []

View File

@ -17,6 +17,7 @@ import java.util.Set;
import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_EXCLUDE_SUB_FIELDS_FROM_FIELD_CAPS;
import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_INDEX_OPTIONS;
import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS;
import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG;
import static org.elasticsearch.xpack.inference.queries.SemanticKnnVectorQueryRewriteInterceptor.SEMANTIC_KNN_FILTER_FIX;
import static org.elasticsearch.xpack.inference.queries.SemanticKnnVectorQueryRewriteInterceptor.SEMANTIC_KNN_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED;
@ -66,7 +67,8 @@ public class InferenceFeatures implements FeatureSpecification {
SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER,
SEMANTIC_TEXT_EXCLUDE_SUB_FIELDS_FROM_FIELD_CAPS,
SEMANTIC_TEXT_INDEX_OPTIONS,
COHERE_V2_API
COHERE_V2_API,
SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS
);
}
}

View File

@ -69,6 +69,7 @@ import org.elasticsearch.inference.ChunkingSettings;
import org.elasticsearch.inference.InferenceResults;
import org.elasticsearch.inference.MinimalServiceSettings;
import org.elasticsearch.inference.SimilarityMeasure;
import org.elasticsearch.inference.TaskType;
import org.elasticsearch.search.fetch.StoredFieldsSpec;
import org.elasticsearch.search.lookup.Source;
import org.elasticsearch.search.vectors.KnnVectorQueryBuilder;
@ -139,6 +140,9 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
"semantic_text.exclude_sub_fields_from_field_caps"
);
public static final NodeFeature SEMANTIC_TEXT_INDEX_OPTIONS = new NodeFeature("semantic_text.index_options");
/**
 * Feature flag for returning default {@code index_options} on {@code semantic_text} fields when
 * {@code include_defaults} is true. The YAML REST tests gate on this name via {@code cluster_features}.
 */
public static final NodeFeature SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS = new NodeFeature(
"semantic_text.index_options_with_defaults"
);
public static final String CONTENT_TYPE = "semantic_text";
public static final String DEFAULT_ELSER_2_INFERENCE_ID = DEFAULT_ELSER_ID;
@ -166,19 +170,9 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
public static class Builder extends FieldMapper.Builder {
private final ModelRegistry modelRegistry;
private final boolean useLegacyFormat;
private final IndexVersion indexVersionCreated;
private final Parameter<String> inferenceId = Parameter.stringParam(
INFERENCE_ID_FIELD,
false,
mapper -> ((SemanticTextFieldType) mapper.fieldType()).inferenceId,
DEFAULT_ELSER_2_INFERENCE_ID
).addValidator(v -> {
if (Strings.isEmpty(v)) {
throw new IllegalArgumentException(
"[" + INFERENCE_ID_FIELD + "] on mapper [" + leafName() + "] of type [" + CONTENT_TYPE + "] must not be empty"
);
}
}).alwaysSerialize();
private final Parameter<String> inferenceId;
private final Parameter<String> searchInferenceId = Parameter.stringParam(
SEARCH_INFERENCE_ID_FIELD,
@ -193,25 +187,9 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
}
});
private final Parameter<MinimalServiceSettings> modelSettings = new Parameter<>(
MODEL_SETTINGS_FIELD,
true,
() -> null,
(n, c, o) -> SemanticTextField.parseModelSettingsFromMap(o),
mapper -> ((SemanticTextFieldType) mapper.fieldType()).modelSettings,
XContentBuilder::field,
Objects::toString
).acceptsNull().setMergeValidator(SemanticTextFieldMapper::canMergeModelSettings);
private final Parameter<MinimalServiceSettings> modelSettings;
private final Parameter<SemanticTextIndexOptions> indexOptions = new Parameter<>(
INDEX_OPTIONS_FIELD,
true,
() -> null,
(n, c, o) -> parseIndexOptionsFromMap(n, o, c.indexVersionCreated()),
mapper -> ((SemanticTextFieldType) mapper.fieldType()).indexOptions,
XContentBuilder::field,
Objects::toString
).acceptsNull();
private final Parameter<SemanticTextIndexOptions> indexOptions;
@SuppressWarnings("unchecked")
private final Parameter<ChunkingSettings> chunkingSettings = new Parameter<>(
@ -248,6 +226,50 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
super(name);
this.modelRegistry = modelRegistry;
this.useLegacyFormat = InferenceMetadataFieldsMapper.isEnabled(indexSettings.getSettings()) == false;
this.indexVersionCreated = indexSettings.getIndexVersionCreated();
this.inferenceId = Parameter.stringParam(
INFERENCE_ID_FIELD,
false,
mapper -> ((SemanticTextFieldType) mapper.fieldType()).inferenceId,
DEFAULT_ELSER_2_INFERENCE_ID
).addValidator(v -> {
if (Strings.isEmpty(v)) {
throw new IllegalArgumentException(
"[" + INFERENCE_ID_FIELD + "] on mapper [" + leafName() + "] of type [" + CONTENT_TYPE + "] must not be empty"
);
}
}).alwaysSerialize();
this.modelSettings = new Parameter<>(
MODEL_SETTINGS_FIELD,
true,
() -> null,
(n, c, o) -> SemanticTextField.parseModelSettingsFromMap(o),
mapper -> ((SemanticTextFieldType) mapper.fieldType()).modelSettings,
XContentBuilder::field,
Objects::toString
).acceptsNull().setMergeValidator(SemanticTextFieldMapper::canMergeModelSettings);
this.indexOptions = new Parameter<>(
INDEX_OPTIONS_FIELD,
true,
() -> null,
(n, c, o) -> parseIndexOptionsFromMap(n, o, c.indexVersionCreated()),
mapper -> ((SemanticTextFieldType) mapper.fieldType()).indexOptions,
(b, n, v) -> {
if (v == null) {
MinimalServiceSettings resolvedModelSettings = modelSettings.get() != null
? modelSettings.get()
: modelRegistry.getMinimalServiceSettings(inferenceId.get());
b.field(INDEX_OPTIONS_FIELD, defaultIndexOptions(indexVersionCreated, resolvedModelSettings));
} else {
b.field(INDEX_OPTIONS_FIELD, v);
}
},
Objects::toString
).acceptsNull();
this.inferenceFieldBuilder = c -> {
// Resolve the model setting from the registry if it has not been set yet.
var resolvedModelSettings = modelSettings.get() != null ? modelSettings.get() : getResolvedModelSettings(c, false);
@ -365,8 +387,11 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
validateServiceSettings(modelSettings.get(), resolvedModelSettings);
}
if (context.getMergeReason() != MapperService.MergeReason.MAPPING_RECOVERY && indexOptions.get() != null) {
validateIndexOptions(indexOptions.get(), inferenceId.getValue(), resolvedModelSettings);
// If index_options are specified by the user, we will validate them against the model settings to ensure compatibility.
// We do not serialize or otherwise store model settings at this time; this happens when the underlying vector field is created.
SemanticTextIndexOptions builderIndexOptions = indexOptions.get();
if (context.getMergeReason() != MapperService.MergeReason.MAPPING_RECOVERY && builderIndexOptions != null) {
validateIndexOptions(builderIndexOptions, inferenceId.getValue(), resolvedModelSettings);
}
final String fullName = context.buildFullName(leafName());
@ -1166,6 +1191,9 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
}
denseVectorMapperBuilder.dimensions(modelSettings.dimensions());
denseVectorMapperBuilder.elementType(modelSettings.elementType());
// Here is where we persist index_options. If they are specified by the user, we will use those index_options,
// otherwise we will determine if we can set default index options. If we can't, we won't persist any index_options
// and the field will use the defaults for the dense_vector field.
if (indexOptions != null) {
DenseVectorFieldMapper.DenseVectorIndexOptions denseVectorIndexOptions =
(DenseVectorFieldMapper.DenseVectorIndexOptions) indexOptions.indexOptions();
@ -1208,7 +1236,6 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
// As embedding models for text perform better with BBQ, we aggressively default semantic_text fields to use optimized index
// options
if (indexVersionDefaultsToBbqHnsw(indexVersionCreated)) {
DenseVectorFieldMapper.DenseVectorIndexOptions defaultBbqHnswIndexOptions = defaultBbqHnswDenseVectorIndexOptions();
return defaultBbqHnswIndexOptions.validate(modelSettings.elementType(), modelSettings.dimensions(), false)
? defaultBbqHnswIndexOptions
@ -1230,11 +1257,24 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
return new DenseVectorFieldMapper.BBQHnswIndexOptions(m, efConstruction, rescoreVector);
}
static SemanticTextIndexOptions defaultBbqHnswSemanticTextIndexOptions() {
return new SemanticTextIndexOptions(
SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR,
defaultBbqHnswDenseVectorIndexOptions()
);
/**
 * Resolves the default {@code index_options} for a semantic_text field backed by the given model.
 *
 * @param indexVersionCreated the version the owning index was created with
 * @param modelSettings the resolved model settings; may be {@code null}
 * @return the default index options, or {@code null} when {@code modelSettings} is {@code null}, the task type is
 *         not {@link TaskType#TEXT_EMBEDDING}, or no default dense vector index options are available
 */
static SemanticTextIndexOptions defaultIndexOptions(IndexVersion indexVersionCreated, MinimalServiceSettings modelSettings) {
    // Only text embedding models have semantic_text index options to default.
    if (modelSettings == null || modelSettings.taskType() != TaskType.TEXT_EMBEDDING) {
        return null;
    }

    DenseVectorFieldMapper.DenseVectorIndexOptions denseDefaults = defaultDenseVectorIndexOptions(indexVersionCreated, modelSettings);
    if (denseDefaults == null) {
        return null;
    }
    return new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR, denseDefaults);
}
private static boolean canMergeModelSettings(MinimalServiceSettings previous, MinimalServiceSettings current, Conflicts conflicts) {

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
/**
* Represents index options for a semantic_text field.
@ -50,6 +51,25 @@ public class SemanticTextIndexOptions implements ToXContent {
return indexOptions;
}
/**
 * Two instances are equal when they are of the exact same class and carry the same type and index options.
 */
@Override
public boolean equals(Object other) {
    if (this == other) {
        return true;
    }
    if (other == null) {
        return false;
    }
    if (other.getClass() != getClass()) {
        return false;
    }
    SemanticTextIndexOptions that = (SemanticTextIndexOptions) other;
    if (type != that.type) {
        return false;
    }
    return Objects.equals(indexOptions, that.indexOptions);
}
/**
 * Hash consistent with {@link #equals(Object)}, combining the type and the index options.
 */
@Override
public int hashCode() {
    // Same 31-based combination that Objects.hash(type, indexOptions) produces.
    int result = 31 + Objects.hashCode(type);
    return 31 * result + Objects.hashCode(indexOptions);
}
public enum SupportedIndexOptions {
DENSE_VECTOR("dense_vector") {
@Override

View File

@ -833,3 +833,147 @@ setup:
type: int8_flat
- match: { status: 400 }
---
"Displaying default index_options with and without include_defaults":
- requires:
cluster_features: "semantic_text.index_options_with_defaults"
reason: Index options defaults support introduced in 9.2.0
# Semantic text defaults to BBQ HNSW starting in 8.19.0/9.1.0
- do:
indices.create:
index: test-index-options-dense
body:
settings:
index:
mapping:
semantic_text:
use_legacy_format: false
mappings:
properties:
semantic_field:
type: semantic_text
inference_id: dense-inference-id-compatible-with-bbq
- do:
indices.get_mapping:
index: test-index-options-dense
- not_exists: test-index-options-dense.mappings.properties.semantic_field.index_options
- do:
indices.get_field_mapping:
index: test-index-options-dense
fields: semantic_field
include_defaults: true
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" }
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 }
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 }
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 }
# Validate that actually specifying the same values as our defaults will still serialize the user provided index_options
- do:
indices.create:
index: test-index-options-dense2
body:
settings:
index:
mapping:
semantic_text:
use_legacy_format: false
mappings:
properties:
semantic_field:
type: semantic_text
inference_id: dense-inference-id-compatible-with-bbq
index_options:
dense_vector:
type: bbq_hnsw
m: 16
ef_construction: 100
rescore_vector:
oversample: 3
- do:
indices.get_mapping:
index: test-index-options-dense2
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.type": "bbq_hnsw" }
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.m": 16 }
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.ef_construction": 100 }
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 }
- do:
indices.get_field_mapping:
index: test-index-options-dense2
fields: semantic_field
include_defaults: true
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" }
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 }
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 }
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 }
# Indices not compatible with BBQ for whatever reason will fall back to whatever `dense_vector` defaults are.
- do:
indices.create:
index: test-index-options-dense-no-bbq
body:
settings:
index:
mapping:
semantic_text:
use_legacy_format: false
mappings:
properties:
semantic_field:
type: semantic_text
inference_id: dense-inference-id
- do:
indices.get_mapping:
index: test-index-options-dense-no-bbq
- not_exists: test-index-options-dense-no-bbq.mappings.properties.semantic_field.index_options
# get_field_mapping nests each field under mappings.<path>.mapping.<leaf>, unlike get_mapping's
# mappings.properties.<field>. Assert on that shape so the check inspects the returned field mapping.
- do:
    indices.get_field_mapping:
      index: test-index-options-dense-no-bbq
      fields: semantic_field
      include_defaults: true

- not_exists: test-index-options-dense-no-bbq.mappings.semantic_field.mapping.semantic_field.index_options
# Sparse embedding models do not have index options for semantic_text in 8.19/9.1.
- do:
indices.create:
index: test-index-options-sparse
body:
settings:
index:
mapping:
semantic_text:
use_legacy_format: false
mappings:
properties:
semantic_field:
type: semantic_text
inference_id: sparse-inference-id
- do:
indices.get_mapping:
index: test-index-options-sparse
- not_exists: test-index-options-sparse.mappings.properties.semantic_field.index_options
# get_field_mapping nests each field under mappings.<path>.mapping.<leaf>, unlike get_mapping's
# mappings.properties.<field>. Assert on that shape so the check inspects the returned field mapping.
- do:
    indices.get_field_mapping:
      index: test-index-options-sparse
      fields: semantic_field
      include_defaults: true

- not_exists: test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options

View File

@ -736,3 +736,146 @@ setup:
type: int8_flat
- match: { status: 400 }
---
"Displaying default index_options with and without include_defaults":
- requires:
cluster_features: "semantic_text.index_options_with_defaults"
reason: Index options defaults support introduced in 9.2.0
# Semantic text defaults to BBQ HNSW starting in 8.19.0/9.1.0
- do:
indices.create:
index: test-index-options-dense
body:
settings:
index:
mapping:
semantic_text:
use_legacy_format: true
mappings:
properties:
semantic_field:
type: semantic_text
inference_id: dense-inference-id-compatible-with-bbq
- do:
indices.get_mapping:
index: test-index-options-dense
- not_exists: test-index-options-dense.mappings.properties.semantic_field.index_options
- do:
indices.get_field_mapping:
index: test-index-options-dense
fields: semantic_field
include_defaults: true
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" }
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 }
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 }
- match: { "test-index-options-dense.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 }
# Validate that actually specifying the same values as our defaults will still serialize the user provided index_options
- do:
indices.create:
index: test-index-options-dense2
body:
settings:
index:
mapping:
semantic_text:
use_legacy_format: true
mappings:
properties:
semantic_field:
type: semantic_text
inference_id: dense-inference-id-compatible-with-bbq
index_options:
dense_vector:
type: bbq_hnsw
m: 16
ef_construction: 100
rescore_vector:
oversample: 3
- do:
indices.get_mapping:
index: test-index-options-dense2
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.type": "bbq_hnsw" }
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.m": 16 }
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.ef_construction": 100 }
- match: { "test-index-options-dense2.mappings.properties.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 }
- do:
indices.get_field_mapping:
index: test-index-options-dense2
fields: semantic_field
include_defaults: true
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.type": "bbq_hnsw" }
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.m": 16 }
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.ef_construction": 100 }
- match: { "test-index-options-dense2.mappings.semantic_field.mapping.semantic_field.index_options.dense_vector.rescore_vector.oversample": 3 }
# Indices not compatible with BBQ for whatever reason will fall back to whatever `dense_vector` defaults are.
- do:
indices.create:
index: test-index-options-dense-no-bbq
body:
settings:
index:
mapping:
semantic_text:
use_legacy_format: true
mappings:
properties:
semantic_field:
type: semantic_text
inference_id: dense-inference-id
- do:
indices.get_mapping:
index: test-index-options-dense-no-bbq
- not_exists: test-index-options-dense-no-bbq.mappings.properties.semantic_field.index_options
# get_field_mapping nests each field under mappings.<path>.mapping.<leaf>, unlike get_mapping's
# mappings.properties.<field>. Assert on that shape so the check inspects the returned field mapping.
- do:
    indices.get_field_mapping:
      index: test-index-options-dense-no-bbq
      fields: semantic_field
      include_defaults: true

- not_exists: test-index-options-dense-no-bbq.mappings.semantic_field.mapping.semantic_field.index_options
# Sparse embedding models do not have index options for semantic_text in 8.19/9.1.
- do:
indices.create:
index: test-index-options-sparse
body:
settings:
index:
mapping:
semantic_text:
use_legacy_format: true
mappings:
properties:
semantic_field:
type: semantic_text
inference_id: sparse-inference-id
- do:
indices.get_mapping:
index: test-index-options-sparse
- not_exists: test-index-options-sparse.mappings.properties.semantic_field.index_options
# get_field_mapping nests each field under mappings.<path>.mapping.<leaf>, unlike get_mapping's
# mappings.properties.<field>. Assert on that shape so the check inspects the returned field mapping.
- do:
    indices.get_field_mapping:
      index: test-index-options-sparse
      fields: semantic_field
      include_defaults: true

- not_exists: test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options