Add synthetic vectors support for sparse_vector (#130756)

This change adds support for synthetic vectors (introduced in #130382) to the sparse_vector field type.
Jim Ferenczi 2025-07-07 20:30:02 +01:00 committed by GitHub
parent 83076c2dcd
commit 6d81ff94b0
9 changed files with 511 additions and 84 deletions
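
In short: when index.mapping.synthetic_vectors is enabled, sparse_vector values are no longer kept in the returned _source by default and are patched back from the index only on request. A minimal sketch of the setup (index and field names are illustrative, mirroring the new YAML test below):

    indices.create:
      index: test
      body:
        settings:
          index.mapping.synthetic_vectors: true
        mappings:
          properties:
            emb:
              type: sparse_vector

A search then omits emb from _source unless the request sets _source.exclude_vectors: false, and the vectors stay retrievable through the fields API; the new YAML test below exercises all three paths.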

New file: YAML REST test (synthetic vectors for sparse_vector)

@@ -0,0 +1,380 @@
setup:
- requires:
reason: 'synthetic vectors are required'
test_runner_features: [ capabilities ]
capabilities:
- method: GET
path: /_search
capabilities: [ synthetic_vectors_setting ]
- skip:
features: "headers"
- do:
indices.create:
index: test
body:
settings:
index.mapping.synthetic_vectors: true
mappings:
properties:
name:
type: keyword
emb:
type: sparse_vector
nested:
type: nested
properties:
paragraph_id:
type: keyword
emb:
type: sparse_vector
- do:
index:
index: test
id: "1"
body:
name: cow.jpg
emb:
token_1: 2.0
token_2: 3.0
- do:
index:
index: test
id: "2"
body:
name: moose.jpg
nested:
- paragraph_id: 0
emb:
token_1: 2.0
token_2: 3.0
- paragraph_id: 2
emb:
token_3: 2.0
token_2: 3.0
- paragraph_id: 3
emb:
token_3: 2.0
token_7: 3.0
token_1: 4.0
- do:
index:
index: test
id: "3"
body:
name: rabbit.jpg
emb:
token_3: 2.0
token_9: 3.0
token_2: 4.0
- do:
index:
index: test
id: "4"
body:
name: zoolander.jpg
nested:
- paragraph_id: 0
emb:
token_3: 2.0
token_7: 3.0
token_1: 4.0
- paragraph_id: 1
- paragraph_id: 2
emb:
token_8: 2.0
- do:
indices.refresh: {}
---
"exclude synthetic vectors":
- do:
search:
index: test
body:
sort: ["name"]
- match: { hits.hits.0._id: "1"}
- match: { hits.hits.0._source.name: "cow.jpg"}
- not_exists: hits.hits.0._source.emb
- match: { hits.hits.1._id: "2"}
- match: { hits.hits.1._source.name: "moose.jpg"}
- length: { hits.hits.1._source.nested: 3 }
- not_exists: hits.hits.1._source.nested.0.emb
- match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
- not_exists: hits.hits.1._source.nested.1.emb
- match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
- not_exists: hits.hits.1._source.nested.2.emb
- match: { hits.hits.1._source.nested.2.paragraph_id: 3 }
- match: { hits.hits.2._id: "3" }
- match: { hits.hits.2._source.name: "rabbit.jpg" }
- not_exists: hits.hits.2._source.emb
- match: { hits.hits.3._id: "4" }
- match: { hits.hits.3._source.name: "zoolander.jpg" }
- length: { hits.hits.3._source.nested: 3 }
- not_exists: hits.hits.3._source.nested.0.emb
- match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
- match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
- not_exists: hits.hits.3._source.nested.2.emb
- match: { hits.hits.3._source.nested.2.paragraph_id: 2 }
---
"include synthetic vectors":
- do:
search:
index: test
body:
_source:
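          # vectors are stripped from _source by default; request them back explicitly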
exclude_vectors: false
sort: ["name"]
- match: { hits.hits.0._id: "1"}
- match: { hits.hits.0._source.name: "cow.jpg"}
- exists: hits.hits.0._source.emb
- match: { hits.hits.1._id: "2"}
- match: { hits.hits.1._source.name: "moose.jpg"}
- length: { hits.hits.1._source.nested: 3 }
- exists: hits.hits.1._source.nested.0.emb
- match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
- exists: hits.hits.1._source.nested.1.emb
- match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
- exists: hits.hits.1._source.nested.2.emb
- match: { hits.hits.1._source.nested.2.paragraph_id: 3 }
- match: { hits.hits.2._id: "3" }
- match: { hits.hits.2._source.name: "rabbit.jpg" }
- exists: hits.hits.2._source.emb
- match: { hits.hits.3._id: "4" }
- match: { hits.hits.3._source.name: "zoolander.jpg" }
- length: { hits.hits.3._source.nested: 3 }
- exists: hits.hits.3._source.nested.0.emb
- length: { hits.hits.3._source.nested.0.emb: 3 }
- match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
- do:
search:
index: test
body:
_source:
exclude_vectors: false
includes: nested.emb
sort: ["name"]
- match: { hits.hits.0._id: "1"}
- length: { hits.hits.0._source: 0}
- match: { hits.hits.1._id: "2"}
- length: { hits.hits.1._source: 1 }
- length: { hits.hits.1._source.nested: 3 }
- exists: hits.hits.1._source.nested.0.emb
- not_exists: hits.hits.1._source.nested.0.paragraph_id
- exists: hits.hits.1._source.nested.1.emb
- not_exists: hits.hits.1._source.nested.1.paragraph_id
- exists: hits.hits.1._source.nested.2.emb
- not_exists: hits.hits.1._source.nested.2.paragraph_id
- match: { hits.hits.2._id: "3" }
- length: { hits.hits.2._source: 0}
- match: { hits.hits.3._id: "4" }
- length: { hits.hits.3._source: 1 }
- length: { hits.hits.3._source.nested: 2 }
- exists: hits.hits.3._source.nested.0.emb
- length: { hits.hits.3._source.nested.0.emb: 3 }
- not_exists: hits.hits.3._source.nested.0.paragraph_id
- exists: hits.hits.3._source.nested.1.emb
- length: { hits.hits.3._source.nested.1.emb: 1 }
- not_exists: hits.hits.3._source.nested.1.paragraph_id
- do:
headers:
# Force JSON content type so that we use a parser that interprets the embeddings as doubles
Content-Type: application/json
search:
index: test
body:
_source:
exclude_vectors: true
sort: ["name"]
fields: ["emb"]
- match: { hits.hits.0._id: "1"}
- match: { hits.hits.0._source.name: "cow.jpg"}
- not_exists: hits.hits.0._source.emb
- length: { hits.hits.0.fields.emb: 1}
- length: { hits.hits.0.fields.emb.0: 2}
- match: { hits.hits.0.fields.emb.0.token_1: 2.0}
- match: { hits.hits.0.fields.emb.0.token_2: 3.0}
- match: { hits.hits.1._id: "2"}
- match: { hits.hits.1._source.name: "moose.jpg"}
- length: { hits.hits.1._source.nested: 3 }
- not_exists: hits.hits.1._source.nested.0.emb
- match: { hits.hits.2._id: "3" }
- match: { hits.hits.2._source.name: "rabbit.jpg" }
- length: { hits.hits.2.fields.emb: 1}
- length: { hits.hits.2.fields.emb.0: 3}
- match: { hits.hits.2.fields.emb.0.token_2: 4.0}
- match: { hits.hits.2.fields.emb.0.token_3: 2.0}
- match: { hits.hits.2.fields.emb.0.token_9: 3.0}
- match: { hits.hits.3._id: "4" }
- match: { hits.hits.3._source.name: "zoolander.jpg" }
- length: { hits.hits.3._source.nested: 3 }
- not_exists: hits.hits.3._source.nested.0.emb
---
"Bulk partial update with synthetic vectors":
- do:
headers:
# Force JSON content type so that we use a parser that interprets the embeddings as doubles
Content-Type: application/json
bulk:
index: test
_source: true
body:
- '{"update": {"_id": "4"}}'
- >
{
"doc": {
"name": "zoolander2.jpg",
"emb": {
"token_12": 2.0,
"token_13": 1.0
}
}
}
- length: { items.0.update.get._source.emb: 2 }
- match: { items.0.update.get._source.emb.token_12: 2.0 }
- match: { items.0.update.get._source.emb.token_13: 1.0 }
- exists: items.0.update.get._source.nested
- length: { items.0.update.get._source.nested: 3}
- exists: items.0.update.get._source.nested.0.emb
- match: { items.0.update.get._source.nested.0.paragraph_id: 0 }
- length: { items.0.update.get._source.nested.0.emb: 3 }
- not_exists: items.0.update.get._source.nested.1.emb
- match: { items.0.update.get._source.nested.1.paragraph_id: 1 }
- exists: items.0.update.get._source.nested.2.emb
- length: { items.0.update.get._source.nested.2.emb: 1 }
- match: { items.0.update.get._source.nested.2.paragraph_id: 2 }
- set: { items.0.update.get._source.nested: original_nested }
- do:
headers:
# Force JSON content type so that we use a parser that interprets the embeddings as doubles
Content-Type: application/json
get:
_source_exclude_vectors: false
index: test
id: "4"
- match: { _source.name: zoolander2.jpg }
- length: { _source.emb: 2 }
- match: { _source.emb.token_12: 2.0 }
- match: { _source.emb.token_13: 1.0 }
- match: { _source.nested: $original_nested }
- do:
indices.refresh: {}
- do:
headers:
# Force JSON content type so that we use a parser that interprets the embeddings as doubles
Content-Type: application/json
search:
index: test
body:
_source:
"exclude_vectors": false
query:
term:
_id: 4
- match: { hits.total.value: 1 }
- match: { hits.total.relation: eq }
- match: { hits.hits.0._source.name: zoolander2.jpg }
- match: { hits.hits.0._source.nested: $original_nested }
---
"Partial update with synthetic vectors":
- do:
headers:
# Force JSON content type so that we use a parser that interprets the vectors as doubles
Content-Type: application/json
update:
index: test
id: "4"
body:
_source: true
doc: {
"name": "zoolander3.jpg",
"emb": {
"token_3": 2.0,
"token_9": 2.5
}
}
- length: { get._source.emb: 2 }
- match: { get._source.emb.token_3: 2.0 }
- match: { get._source.emb.token_9: 2.5 }
- exists: get._source.nested
- length: { get._source.nested: 3}
- exists: get._source.nested.0.emb
- match: { get._source.nested.0.paragraph_id: 0 }
- length: { get._source.nested.0.emb: 3 }
- not_exists: get._source.nested.1.emb
- match: { get._source.nested.1.paragraph_id: 1 }
- exists: get._source.nested.2.emb
- length: { get._source.nested.2.emb: 1 }
- match: { get._source.nested.2.paragraph_id: 2 }
- set: { get._source.nested: original_nested }
- do:
headers:
# Force JSON content type so that we use a parser that interprets the vectors as doubles
Content-Type: application/json
get:
_source_exclude_vectors: false
index: test
id: "4"
- length: { _source.emb: 2 }
- match: { _source.emb.token_3: 2.0 }
- match: { _source.emb.token_9: 2.5 }
- match: { _source.name: zoolander3.jpg }
- match: { _source.nested: $original_nested }
- do:
indices.refresh: {}
- do:
headers:
# Force JSON content type so that we use a parser that interprets the vectors as doubles
Content-Type: application/json
search:
index: test
body:
_source:
"exclude_vectors": false
query:
term:
_id: 4
- match: { hits.total.value: 1 }
- match: { hits.total.relation: eq }
- match: { hits.hits.0._source.name: zoolander3.jpg }
- match: { hits.hits.0._source.nested: $original_nested }

File: DenseVectorFieldMapper.java

@@ -25,7 +25,6 @@ import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@@ -3028,9 +3027,11 @@ public class DenseVectorFieldMapper extends FieldMapper {
    @Override
    public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
-        return isSyntheticVector
-            ? new SyntheticDenseVectorPatchLoader(new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity))
-            : null;
+        if (isSyntheticVector) {
+            var syntheticField = new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity);
+            return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyVectorAsList);
+        }
+        return null;
    }
@Override
@@ -3127,7 +3128,7 @@
     *
     * @throws IOException if reading fails
     */
-    private Object copyVectorAsList() throws IOException {
+    private List<?> copyVectorAsList() throws IOException {
        assert hasValue : "vector is null for ord=" + ord;
        if (floatValues != null) {
            float[] raw = floatValues.vectorValue(ord);
@@ -3218,29 +3219,6 @@
        }
    }
-    public class SyntheticDenseVectorPatchLoader implements SourceLoader.SyntheticVectorsLoader {
-        private final IndexedSyntheticFieldLoader syntheticFieldLoader;
-        public SyntheticDenseVectorPatchLoader(IndexedSyntheticFieldLoader syntheticFieldLoader) {
-            this.syntheticFieldLoader = syntheticFieldLoader;
-        }
-        public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException {
-            var dvLoader = syntheticFieldLoader.docValuesLoader(context.reader(), null);
-            return (doc, acc) -> {
-                if (dvLoader == null) {
-                    return;
-                }
-                if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) {
-                    // add vectors as list since that's how they're parsed from xcontent.
-                    acc.add(
-                        new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorAsList())
-                    );
-                }
-            };
-        }
-    }
/**
* Interface for a function that takes a int and boolean
*/

File: SparseVectorFieldMapper.java

@@ -63,6 +63,7 @@ import java.util.Map;
import java.util.Objects;
import java.util.stream.Stream;
+import static org.elasticsearch.index.IndexSettings.INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING;
import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;
import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg;
@@ -107,9 +108,12 @@
            Objects::toString
        ).acceptsNull().setSerializerCheck(this::indexOptionsSerializerCheck);
-        public Builder(String name, IndexVersion indexVersionCreated) {
+        private boolean isSyntheticVector;
+        public Builder(String name, IndexVersion indexVersionCreated, boolean isSyntheticVector) {
            super(name);
            this.indexVersionCreated = indexVersionCreated;
+            this.isSyntheticVector = isSyntheticVector;
        }
public Builder setStored(boolean value) {
@@ -129,16 +133,19 @@
            builderIndexOptions = getDefaultIndexOptions(indexVersionCreated);
        }
+        final boolean syntheticVectorFinal = context.isSourceSynthetic() == false && isSyntheticVector;
+        final boolean storedFinal = stored.getValue() || syntheticVectorFinal;
        return new SparseVectorFieldMapper(
            leafName(),
            new SparseVectorFieldType(
                indexVersionCreated,
                context.buildFullName(leafName()),
-                stored.getValue(),
+                storedFinal,
                meta.getValue(),
                builderIndexOptions
            ),
-            builderParams(this, context)
+            builderParams(this, context),
+            syntheticVectorFinal
        );
}
@@ -196,7 +203,11 @@
        throw new IllegalArgumentException(ERROR_MESSAGE_8X);
    }
-    return new Builder(n, c.indexVersionCreated());
+    return new Builder(
+        n,
+        c.indexVersionCreated(),
+        INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING.get(c.getIndexSettings().getSettings())
+    );
}, notInMultiFields(CONTENT_TYPE));
public static final class SparseVectorFieldType extends MappedFieldType {
@@ -302,8 +313,16 @@
        }
    }
-    private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams builderParams) {
+    private final boolean isSyntheticVector;
+    private SparseVectorFieldMapper(
+        String simpleName,
+        MappedFieldType mappedFieldType,
+        BuilderParams builderParams,
+        boolean isSyntheticVector
+    ) {
        super(simpleName, mappedFieldType, builderParams);
+        this.isSyntheticVector = isSyntheticVector;
    }
@Override
@@ -314,6 +333,15 @@
        return super.syntheticSourceSupport();
    }
+    @Override
+    public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
+        if (isSyntheticVector) {
+            var syntheticField = new SparseVectorSyntheticFieldLoader(fullPath(), leafName());
+            return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyAsMap);
+        }
+        return null;
+    }
@Override
public Map<String, NamedAnalyzer> indexAnalyzers() {
return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
@@ -321,7 +349,7 @@
    @Override
    public FieldMapper.Builder getMergeBuilder() {
-        return new Builder(leafName(), this.fieldType().indexVersionCreated).init(this);
+        return new Builder(leafName(), this.fieldType().indexVersionCreated, this.isSyntheticVector).init(this);
}
@Override
@@ -504,9 +532,26 @@
        b.endObject();
    }
+    /**
+     * Returns a deep-copied token map for the current document.
+     *
+     * @throws IOException if reading fails
+     */
+    private Map<String, Float> copyAsMap() throws IOException {
+        assert termsDocEnum != null;
+        Map<String, Float> tokenMap = new LinkedHashMap<>();
+        PostingsEnum reuse = null;
+        do {
+            reuse = termsDocEnum.postings(reuse);
+            reuse.nextDoc();
+            tokenMap.put(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
+        } while (termsDocEnum.next() != null);
+        return tokenMap;
+    }
    @Override
    public String fieldName() {
-        return leafName;
+        return fullPath;
    }
    @Override
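+        // feature values are encoded into the postings' term frequency at index time,
+        // so decode each token's frequency back into its float weight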

New file: SyntheticVectorsPatchFieldLoader.java

@@ -0,0 +1,41 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/
package org.elasticsearch.index.mapper.vectors;
import org.apache.lucene.index.LeafReaderContext;
import org.elasticsearch.core.CheckedSupplier;
import org.elasticsearch.index.mapper.SourceLoader;
import java.io.IOException;
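/**
 * A shared {@link SourceLoader.SyntheticVectorsLoader} implementation that patches a vector
 * field back into the document source. It relies on the field's existing
 * {@link SourceLoader.SyntheticFieldLoader} to advance through doc values, and on the supplied
 * {@code copyObject} to deep-copy the current value in the shape xcontent parsing would produce
 * (e.g. a list for dense vectors, a token-to-weight map for sparse vectors). It replaces the
 * per-field patch loaders previously duplicated in the dense and rank vectors mappers.
 */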
public class SyntheticVectorsPatchFieldLoader implements SourceLoader.SyntheticVectorsLoader {
private final SourceLoader.SyntheticFieldLoader syntheticLoader;
private final CheckedSupplier<Object, IOException> copyObject;
public SyntheticVectorsPatchFieldLoader(
SourceLoader.SyntheticFieldLoader syntheticLoader,
CheckedSupplier<Object, IOException> copyObject
) {
this.syntheticLoader = syntheticLoader;
this.copyObject = copyObject;
}
public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException {
var dvLoader = syntheticLoader.docValuesLoader(context.reader(), null);
return (doc, acc) -> {
if (dvLoader == null) {
return;
}
if (dvLoader.advanceToDoc(doc) && syntheticLoader.hasValue()) {
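                // add the value as a deep copy, in the same shape xcontent parsing produces
                // (e.g. a list of floats or a token-to-weight map)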
acc.add(new SourceLoader.LeafSyntheticVectorPath(syntheticLoader.fieldName(), copyObject.get()));
}
};
}
}

File: SparseVectorFieldMapperTests.java

@@ -26,6 +26,7 @@ import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.mapper.DocumentMapper;
@@ -33,7 +34,6 @@ import org.elasticsearch.index.mapper.DocumentParsingException;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.MapperService;
-import org.elasticsearch.index.mapper.MapperTestCase;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.inference.WeightedToken;
@@ -54,6 +54,7 @@ import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
+import java.util.TreeMap;
import static org.elasticsearch.index.IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT;
import static org.elasticsearch.index.IndexVersions.UPGRADE_TO_LUCENE_10_0_0;
@@ -66,14 +67,13 @@ import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
-public class SparseVectorFieldMapperTests extends MapperTestCase {
+public class SparseVectorFieldMapperTests extends SyntheticVectorsMapperTestCase {
    @Override
    protected Object getSampleValueForDocument() {
-        Map<String, Float> map = new LinkedHashMap<>();
-        map.put("ten", 10f);
-        map.put("twenty", 20f);
-        return map;
+        return new TreeMap<>(
+            randomMap(1, 5, () -> Tuple.tuple(randomAlphaOfLengthBetween(5, 10), Float.valueOf(randomIntBetween(1, 127))))
+        );
    }
}
@Override
@@ -209,26 +209,22 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
        DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping));
        assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString());
-        ParsedDocument doc1 = mapper.parse(source(this::writeField));
+        @SuppressWarnings("unchecked")
+        var expected = (Map<String, Float>) getSampleValueForDocument();
+        ParsedDocument doc1 = mapper.parse(source(b -> b.field("field", expected)));
        List<IndexableField> fields = doc1.rootDoc().getFields("field");
-        assertEquals(2, fields.size());
+        assertEquals(expected.size(), fields.size());
        assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class));
-        FeatureField featureField1 = null;
-        FeatureField featureField2 = null;
        for (IndexableField field : fields) {
-            if (field.stringValue().equals("ten")) {
-                featureField1 = (FeatureField) field;
-            } else if (field.stringValue().equals("twenty")) {
-                featureField2 = (FeatureField) field;
-            } else {
-                throw new UnsupportedOperationException();
+            if (field instanceof FeatureField fField) {
+                var value = expected.remove(fField.stringValue());
+                assertThat(fField.getFeatureValue(), equalTo(value));
+                int freq1 = getFrequency(fField.tokenStream(null, null));
+                assertThat(XFeatureField.decodeFeatureValue(freq1), equalTo(value));
            }
        }
-        int freq1 = getFrequency(featureField1.tokenStream(null, null));
-        int freq2 = getFrequency(featureField2.tokenStream(null, null));
-        assertTrue(freq1 < freq2);
}
public void testDefaultsWithAndWithoutIncludeDefaults() throws Exception {
@@ -460,7 +456,8 @@
        @Override
        public SyntheticSourceExample example(int maxValues) {
-            return new SyntheticSourceExample(getSampleValueForDocument(), getSampleValueForDocument(), b -> {
+            var sample = getSampleValueForDocument();
+            return new SyntheticSourceExample(sample, sample, b -> {
if (withStore) {
minimalStoreMapping(b);
} else {

File: SemanticTextFieldMapper.java

@@ -1169,7 +1169,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFieldMapper {
            boolean useLegacyFormat
        ) {
            return switch (modelSettings.taskType()) {
-                case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD, indexVersionCreated).setStored(
+                case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD, indexVersionCreated, false).setStored(
                    useLegacyFormat == false
                );
case TEXT_EMBEDDING -> {

File: module-info.java (org.elasticsearch.rank.vectors)

@@ -6,6 +6,7 @@
 */
module org.elasticsearch.rank.vectors {
    requires org.elasticsearch.base;
+    requires org.elasticsearch.xcore;
requires org.elasticsearch.painless.spi;
requires org.elasticsearch.server;

File: RankVectorsFieldMapper.java

@@ -10,7 +10,6 @@ package org.elasticsearch.xpack.rank.vectors.mapper;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
@@ -30,6 +29,7 @@ import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
+import org.elasticsearch.index.mapper.vectors.SyntheticVectorsPatchFieldLoader;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.license.LicenseUtils;
import org.elasticsearch.license.XPackLicenseState;
@@ -406,7 +406,11 @@
    @Override
    public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
-        return isSyntheticVector ? new SyntheticRankVectorPatchLoader(new DocValuesSyntheticFieldLoader()) : null;
+        if (isSyntheticVector) {
+            var syntheticField = new DocValuesSyntheticFieldLoader();
+            return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyVectorsAsList);
+        }
+        return null;
}
private class DocValuesSyntheticFieldLoader extends SourceLoader.DocValuesBasedSyntheticFieldLoader {
@@ -455,7 +459,12 @@
            b.endArray();
        }
-        private Object copyVectorsAsList() throws IOException {
+        /**
+         * Returns the deep-copied vectors for the current document as a list of lists.
+         *
+         * @throws IOException if reading fails
+         */
+        private List<List<?>> copyVectorsAsList() throws IOException {
assert hasValue : "rank vector is null";
BytesRef ref = values.binaryValue();
ByteBuffer byteBuffer = ByteBuffer.wrap(ref.bytes, ref.offset, ref.length).order(ByteOrder.LITTLE_ENDIAN);
@@ -492,28 +501,4 @@
            return fullPath();
        }
    }
-    private class SyntheticRankVectorPatchLoader implements SourceLoader.SyntheticVectorsLoader {
-        private final DocValuesSyntheticFieldLoader syntheticFieldLoader;
-        private SyntheticRankVectorPatchLoader(DocValuesSyntheticFieldLoader syntheticFieldLoader) {
-            this.syntheticFieldLoader = syntheticFieldLoader;
-        }
-        @Override
-        public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException {
-            var dvLoader = syntheticFieldLoader.docValuesLoader(context.reader(), null);
-            return (doc, acc) -> {
-                if (dvLoader == null) {
-                    return;
-                }
-                if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) {
-                    // add vectors as list since that's how they're parsed from xcontent.
-                    acc.add(
-                        new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorsAsList())
-                    );
-                }
-            };
-        }
-    }
}