Add synthetic vectors support for sparse_vector (#130756)
This change adds support for synthetic vectors (introduced in #130382) to the sparse_vector field type.
This commit is contained in:
parent 83076c2dcd
commit 6d81ff94b0
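For orientation before the diff: with synthetic vectors enabled, vector values are stripped out of the stored _source at index time and patched back in from the index only when a request asks for them (for example with exclude_vectors: false, which the new YAML test below exercises). A toy, runnable sketch of that patch step, using made-up types rather than the actual Elasticsearch classes:

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Toy stand-in for a (field name, reconstructed vector) patch.
record VectorPatch(String fieldName, Object vectorValue) {}

public class SyntheticVectorDemo {
    // Re-insert vectors that were stripped from the stored _source at index time.
    static Map<String, Object> applyPatches(Map<String, Object> storedSource, List<VectorPatch> patches) {
        Map<String, Object> patched = new LinkedHashMap<>(storedSource);
        for (VectorPatch patch : patches) {
            patched.put(patch.fieldName(), patch.vectorValue());
        }
        return patched;
    }

    public static void main(String[] args) {
        // What is kept on disk: the document without its sparse vector.
        Map<String, Object> stored = Map.of("name", "cow.jpg");
        // What the index can reconstruct: the token -> weight map for field "emb".
        List<VectorPatch> patches = List.of(new VectorPatch("emb", Map.of("token_1", 2.0f, "token_2", 3.0f)));
        // Prints {name=cow.jpg, emb={token_1=2.0, token_2=3.0}} (map order may vary)
        System.out.println(applyPatches(stored, patches));
    }
}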
@@ -0,0 +1,380 @@
setup:
  - requires:
      reason: 'synthetic vectors are required'
      test_runner_features: [ capabilities ]
      capabilities:
        - method: GET
          path: /_search
          capabilities: [ synthetic_vectors_setting ]
  - skip:
      features: "headers"

  - do:
      indices.create:
        index: test
        body:
          settings:
            index.mapping.synthetic_vectors: true
          mappings:
            properties:
              name:
                type: keyword
              emb:
                type: sparse_vector

              nested:
                type: nested
                properties:
                  paragraph_id:
                    type: keyword
                  emb:
                    type: sparse_vector

  - do:
      index:
        index: test
        id: "1"
        body:
          name: cow.jpg
          emb:
            token_1: 2.0
            token_2: 3.0

  - do:
      index:
        index: test
        id: "2"
        body:
          name: moose.jpg
          nested:
            - paragraph_id: 0
              emb:
                token_1: 2.0
                token_2: 3.0
            - paragraph_id: 2
              emb:
                token_3: 2.0
                token_2: 3.0
            - paragraph_id: 3
              emb:
                token_3: 2.0
                token_7: 3.0
                token_1: 4.0

  - do:
      index:
        index: test
        id: "3"
        body:
          name: rabbit.jpg
          emb:
            token_3: 2.0
            token_9: 3.0
            token_2: 4.0

  - do:
      index:
        index: test
        id: "4"
        body:
          name: zoolander.jpg
          nested:
            - paragraph_id: 0
              emb:
                token_3: 2.0
                token_7: 3.0
                token_1: 4.0
            - paragraph_id: 1
            - paragraph_id: 2
              emb:
                token_8: 2.0

  - do:
      indices.refresh: {}

---
"exclude synthetic vectors":
  - do:
      search:
        index: test
        body:
          sort: ["name"]

  - match: { hits.hits.0._id: "1"}
  - match: { hits.hits.0._source.name: "cow.jpg"}
  - not_exists: hits.hits.0._source.emb

  - match: { hits.hits.1._id: "2"}
  - match: { hits.hits.1._source.name: "moose.jpg"}
  - length: { hits.hits.1._source.nested: 3 }
  - not_exists: hits.hits.1._source.nested.0.emb
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - not_exists: hits.hits.1._source.nested.1.emb
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - not_exists: hits.hits.1._source.nested.2.emb
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - not_exists: hits.hits.2._source.emb

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - not_exists: hits.hits.3._source.nested.0.emb
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
  - match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
  - not_exists: hits.hits.3._source.nested.2.emb
  - match: { hits.hits.3._source.nested.2.paragraph_id: 2 }

---
"include synthetic vectors":
  - do:
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
          sort: ["name"]

  - match: { hits.hits.0._id: "1"}
  - match: { hits.hits.0._source.name: "cow.jpg"}
  - exists: hits.hits.0._source.emb

  - match: { hits.hits.1._id: "2"}
  - match: { hits.hits.1._source.name: "moose.jpg"}
  - length: { hits.hits.1._source.nested: 3 }
  - exists: hits.hits.1._source.nested.0.emb
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - exists: hits.hits.1._source.nested.1.emb
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - exists: hits.hits.1._source.nested.2.emb
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - exists: hits.hits.2._source.emb

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - exists: hits.hits.3._source.nested.0.emb
  - length: { hits.hits.3._source.nested.0.emb: 3 }
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }

  - do:
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
            includes: nested.emb
          sort: ["name"]

  - match: { hits.hits.0._id: "1"}
  - length: { hits.hits.0._source: 0}

  - match: { hits.hits.1._id: "2"}
  - length: { hits.hits.1._source: 1 }
  - length: { hits.hits.1._source.nested: 3 }
  - exists: hits.hits.1._source.nested.0.emb
  - not_exists: hits.hits.1._source.nested.0.paragraph_id
  - exists: hits.hits.1._source.nested.1.emb
  - not_exists: hits.hits.1._source.nested.1.paragraph_id
  - exists: hits.hits.1._source.nested.2.emb
  - not_exists: hits.hits.1._source.nested.2.paragraph_id

  - match: { hits.hits.2._id: "3" }
  - length: { hits.hits.2._source: 0}

  - match: { hits.hits.3._id: "4" }
  - length: { hits.hits.3._source: 1 }
  - length: { hits.hits.3._source.nested: 2 }
  - exists: hits.hits.3._source.nested.0.emb
  - length: { hits.hits.3._source.nested.0.emb: 3 }
  - not_exists: hits.hits.3._source.nested.0.paragraph_id
  - exists: hits.hits.3._source.nested.1.emb
  - length: { hits.hits.3._source.nested.1.emb: 1 }
  - not_exists: hits.hits.3._source.nested.1.paragraph_id

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            exclude_vectors: true
          sort: ["name"]
          fields: ["emb"]

  - match: { hits.hits.0._id: "1"}
  - match: { hits.hits.0._source.name: "cow.jpg"}
  - not_exists: hits.hits.0._source.emb
  - length: { hits.hits.0.fields.emb: 1}
  - length: { hits.hits.0.fields.emb.0: 2}
  - match: { hits.hits.0.fields.emb.0.token_1: 2.0}
  - match: { hits.hits.0.fields.emb.0.token_2: 3.0}

  - match: { hits.hits.1._id: "2"}
  - match: { hits.hits.1._source.name: "moose.jpg"}
  - length: { hits.hits.1._source.nested: 3 }
  - not_exists: hits.hits.1._source.nested.0.emb

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - length: { hits.hits.2.fields.emb: 1}
  - length: { hits.hits.2.fields.emb.0: 3}
  - match: { hits.hits.2.fields.emb.0.token_2: 4.0}
  - match: { hits.hits.2.fields.emb.0.token_3: 2.0}
  - match: { hits.hits.2.fields.emb.0.token_9: 3.0}

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - not_exists: hits.hits.3._source.nested.0.emb

---
"Bulk partial update with synthetic vectors":
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      bulk:
        index: test
        _source: true
        body:
          - '{"update": {"_id": "4"}}'
          - >
            {
              "doc": {
                "name": "zoolander2.jpg",
                "emb": {
                  "token_12": 2.0,
                  "token_13": 1.0
                }
              }
            }

  - length: { items.0.update.get._source.emb: 2 }
  - match: { items.0.update.get._source.emb.token_12: 2.0 }
  - match: { items.0.update.get._source.emb.token_13: 1.0 }
  - exists: items.0.update.get._source.nested
  - length: { items.0.update.get._source.nested: 3}
  - exists: items.0.update.get._source.nested.0.emb
  - match: { items.0.update.get._source.nested.0.paragraph_id: 0 }
  - length: { items.0.update.get._source.nested.0.emb: 3 }
  - not_exists: items.0.update.get._source.nested.1.emb
  - match: { items.0.update.get._source.nested.1.paragraph_id: 1 }
  - exists: items.0.update.get._source.nested.2.emb
  - length: { items.0.update.get._source.nested.2.emb: 1 }
  - match: { items.0.update.get._source.nested.2.paragraph_id: 2 }
  - set: { items.0.update.get._source.nested: original_nested }

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      get:
        _source_exclude_vectors: false
        index: test
        id: "4"

  - match: { _source.name: zoolander2.jpg }
  - length: { _source.emb: 2 }
  - match: { _source.emb.token_12: 2.0 }
  - match: { _source.emb.token_13: 1.0 }
  - match: { _source.nested: $original_nested }

  - do:
      indices.refresh: {}

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            "exclude_vectors": false
          query:
            term:
              _id: 4

  - match: { hits.total.value: 1 }
  - match: { hits.total.relation: eq }
  - match: { hits.hits.0._source.name: zoolander2.jpg }
  - match: { hits.hits.0._source.nested: $original_nested }

---
"Partial update with synthetic vectors":
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      update:
        index: test
        id: "4"
        body:
          _source: true
          doc: {
            "name": "zoolander3.jpg",
            "emb": {
              "token_3": 2.0,
              "token_9": 2.5
            }
          }

  - length: { get._source.emb: 2 }
  - match: { get._source.emb.token_3: 2.0 }
  - match: { get._source.emb.token_9: 2.5 }
  - exists: get._source.nested
  - length: { get._source.nested: 3}
  - exists: get._source.nested.0.emb
  - match: { get._source.nested.0.paragraph_id: 0 }
  - length: { get._source.nested.0.emb: 3 }
  - not_exists: get._source.nested.1.emb
  - match: { get._source.nested.1.paragraph_id: 1 }
  - exists: get._source.nested.2.emb
  - length: { get._source.nested.2.emb: 1 }
  - match: { get._source.nested.2.paragraph_id: 2 }
  - set: { get._source.nested: original_nested }

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      get:
        _source_exclude_vectors: false
        index: test
        id: "4"

  - length: { _source.emb: 2 }
  - match: { _source.emb.token_3: 2.0 }
  - match: { _source.emb.token_9: 2.5 }
  - match: { _source.name: zoolander3.jpg }
  - match: { _source.nested: $original_nested }

  - do:
      indices.refresh: {}

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            "exclude_vectors": false
          query:
            term:
              _id: 4

  - match: { hits.total.value: 1 }
  - match: { hits.total.relation: eq }
  - match: { hits.hits.0._source.name: zoolander3.jpg }
  - match: { hits.hits.0._source.nested: $original_nested }
@@ -25,7 +25,6 @@ import org.apache.lucene.index.FilterLeafReader;
 import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
@@ -3028,9 +3027,11 @@ public class DenseVectorFieldMapper extends FieldMapper {

     @Override
     public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
-        return isSyntheticVector
-            ? new SyntheticDenseVectorPatchLoader(new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity))
-            : null;
+        if (isSyntheticVector) {
+            var syntheticField = new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity);
+            return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyVectorAsList);
+        }
+        return null;
     }

     @Override
@@ -3127,7 +3128,7 @@ public class DenseVectorFieldMapper extends FieldMapper {
      *
      * @throws IOException if reading fails
      */
-    private Object copyVectorAsList() throws IOException {
+    private List<?> copyVectorAsList() throws IOException {
         assert hasValue : "vector is null for ord=" + ord;
         if (floatValues != null) {
             float[] raw = floatValues.vectorValue(ord);
@@ -3218,29 +3219,6 @@ public class DenseVectorFieldMapper extends FieldMapper {
         }
     }

-    public class SyntheticDenseVectorPatchLoader implements SourceLoader.SyntheticVectorsLoader {
-        private final IndexedSyntheticFieldLoader syntheticFieldLoader;
-
-        public SyntheticDenseVectorPatchLoader(IndexedSyntheticFieldLoader syntheticFieldLoader) {
-            this.syntheticFieldLoader = syntheticFieldLoader;
-        }
-
-        public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException {
-            var dvLoader = syntheticFieldLoader.docValuesLoader(context.reader(), null);
-            return (doc, acc) -> {
-                if (dvLoader == null) {
-                    return;
-                }
-                if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) {
-                    // add vectors as list since that's how they're parsed from xcontent.
-                    acc.add(
-                        new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorAsList())
-                    );
-                }
-            };
-        }
-    }
-
     /**
      * Interface for a function that takes an int and a boolean
      */
@@ -63,6 +63,7 @@ import java.util.Map;
 import java.util.Objects;
 import java.util.stream.Stream;

+import static org.elasticsearch.index.IndexSettings.INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING;
 import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;
 import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg;
@@ -107,9 +108,12 @@ public class SparseVectorFieldMapper extends FieldMapper {
             Objects::toString
         ).acceptsNull().setSerializerCheck(this::indexOptionsSerializerCheck);

-        public Builder(String name, IndexVersion indexVersionCreated) {
+        private boolean isSyntheticVector;
+
+        public Builder(String name, IndexVersion indexVersionCreated, boolean isSyntheticVector) {
             super(name);
             this.indexVersionCreated = indexVersionCreated;
+            this.isSyntheticVector = isSyntheticVector;
         }

         public Builder setStored(boolean value) {
@@ -129,16 +133,19 @@ public class SparseVectorFieldMapper extends FieldMapper {
                 builderIndexOptions = getDefaultIndexOptions(indexVersionCreated);
             }

+            final boolean syntheticVectorFinal = context.isSourceSynthetic() == false && isSyntheticVector;
+            final boolean storedFinal = stored.getValue() || syntheticVectorFinal;
             return new SparseVectorFieldMapper(
                 leafName(),
                 new SparseVectorFieldType(
                     indexVersionCreated,
                     context.buildFullName(leafName()),
-                    stored.getValue(),
+                    storedFinal,
                     meta.getValue(),
                     builderIndexOptions
                 ),
-                builderParams(this, context)
+                builderParams(this, context),
+                syntheticVectorFinal
             );
         }

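The two flags computed above gate the whole feature: synthetic-vector patching only applies when the index does not already use synthetic _source, and an active synthetic vector forces the field type to report itself as stored. A minimal runnable sketch of just that logic, with hypothetical helper names (in the diff the computation lives inline in Builder#build):

public class SyntheticVectorFlags {
    // Patching is off when the whole _source is already synthetic.
    static boolean syntheticVector(boolean sourceIsSynthetic, boolean settingEnabled) {
        return sourceIsSynthetic == false && settingEnabled;
    }

    // A synthetic vector forces the field to be treated as stored.
    static boolean stored(boolean explicitlyStored, boolean syntheticVector) {
        return explicitlyStored || syntheticVector;
    }

    public static void main(String[] args) {
        // setting on, classic stored _source: patching active, field stored
        boolean a = syntheticVector(false, true);
        System.out.println(a + " " + stored(false, a)); // true true
        // setting on, synthetic _source: patching off, field stored only if asked
        boolean b = syntheticVector(true, true);
        System.out.println(b + " " + stored(false, b)); // false false
    }
}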
@@ -196,7 +203,11 @@ public class SparseVectorFieldMapper extends FieldMapper {
             throw new IllegalArgumentException(ERROR_MESSAGE_8X);
         }

-        return new Builder(n, c.indexVersionCreated());
+        return new Builder(
+            n,
+            c.indexVersionCreated(),
+            INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING.get(c.getIndexSettings().getSettings())
+        );
     }, notInMultiFields(CONTENT_TYPE));

 public static final class SparseVectorFieldType extends MappedFieldType {
@@ -302,8 +313,16 @@ public class SparseVectorFieldMapper extends FieldMapper {
         }
     }

-    private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams builderParams) {
+    private final boolean isSyntheticVector;
+
+    private SparseVectorFieldMapper(
+        String simpleName,
+        MappedFieldType mappedFieldType,
+        BuilderParams builderParams,
+        boolean isSyntheticVector
+    ) {
         super(simpleName, mappedFieldType, builderParams);
+        this.isSyntheticVector = isSyntheticVector;
     }

     @Override
@@ -314,6 +333,15 @@ public class SparseVectorFieldMapper extends FieldMapper {
         return super.syntheticSourceSupport();
     }

+    @Override
+    public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
+        if (isSyntheticVector) {
+            var syntheticField = new SparseVectorSyntheticFieldLoader(fullPath(), leafName());
+            return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyAsMap);
+        }
+        return null;
+    }
+
     @Override
     public Map<String, NamedAnalyzer> indexAnalyzers() {
         return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
@@ -321,7 +349,7 @@ public class SparseVectorFieldMapper extends FieldMapper {

     @Override
     public FieldMapper.Builder getMergeBuilder() {
-        return new Builder(leafName(), this.fieldType().indexVersionCreated).init(this);
+        return new Builder(leafName(), this.fieldType().indexVersionCreated, this.isSyntheticVector).init(this);
     }

     @Override
@@ -504,9 +532,26 @@ public class SparseVectorFieldMapper extends FieldMapper {
             b.endObject();
         }

+        /**
+         * Returns a deep-copied tokens map for the current document.
+         *
+         * @throws IOException if reading fails
+         */
+        private Map<String, Float> copyAsMap() throws IOException {
+            assert termsDocEnum != null;
+            Map<String, Float> tokenMap = new LinkedHashMap<>();
+            PostingsEnum reuse = null;
+            do {
+                reuse = termsDocEnum.postings(reuse);
+                reuse.nextDoc();
+                tokenMap.put(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
+            } while (termsDocEnum.next() != null);
+            return tokenMap;
+        }
+
         @Override
         public String fieldName() {
-            return leafName;
+            return fullPath;
         }

         @Override
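copyAsMap above rebuilds the token-to-weight map from postings rather than from stored source: each token's weight is encoded in its term frequency via Lucene's FeatureField. A runnable sketch of that round trip; the exact bit-shift encoding reflects a reading of Lucene's FeatureField/decodeFeatureValue and should be treated as an assumption, not the library source:

public class FeatureFreqRoundTrip {

    // Assumed shape of FeatureField's encoding: keep only the high bits
    // of the float weight, dropping the low 15 bits into the term frequency.
    static int encodeAsFreq(float weight) {
        return Float.floatToIntBits(weight) >>> 15;
    }

    // Assumed shape of XFeatureField.decodeFeatureValue: shift the bits back.
    static float decodeFeatureValue(int freq) {
        return Float.intBitsToFloat(freq << 15);
    }

    public static void main(String[] args) {
        for (float weight : new float[] { 2.0f, 3.0f, 2.5f, 1.2345f }) {
            int freq = encodeAsFreq(weight);
            // 2.0 -> 2.0, but 1.2345 -> 1.234375: lossy, order-preserving
            System.out.println(weight + " -> freq " + freq + " -> " + decodeFeatureValue(freq));
        }
    }
}

The test values used in this commit (small integers and halves such as 2.0 or 2.5) survive this truncation exactly, which is why the YAML assertions can match them verbatim.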
@@ -0,0 +1,41 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */

package org.elasticsearch.index.mapper.vectors;

import org.apache.lucene.index.LeafReaderContext;
import org.elasticsearch.core.CheckedSupplier;
import org.elasticsearch.index.mapper.SourceLoader;

import java.io.IOException;

public class SyntheticVectorsPatchFieldLoader implements SourceLoader.SyntheticVectorsLoader {
    private final SourceLoader.SyntheticFieldLoader syntheticLoader;
    private final CheckedSupplier<Object, IOException> copyObject;

    public SyntheticVectorsPatchFieldLoader(
        SourceLoader.SyntheticFieldLoader syntheticLoader,
        CheckedSupplier<Object, IOException> copyObject
    ) {
        this.syntheticLoader = syntheticLoader;
        this.copyObject = copyObject;
    }

    public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException {
        var dvLoader = syntheticLoader.docValuesLoader(context.reader(), null);
        return (doc, acc) -> {
            if (dvLoader == null) {
                return;
            }
            if (dvLoader.advanceToDoc(doc) && syntheticLoader.hasValue()) {
                acc.add(new SourceLoader.LeafSyntheticVectorPath(syntheticLoader.fieldName(), copyObject.get()));
            }
        };
    }
}
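This new class is the core of the refactor: the per-mapper inner loaders (SyntheticDenseVectorPatchLoader above, SyntheticRankVectorPatchLoader below) collapse into one generic loader parameterized by a CheckedSupplier that deep-copies the current value. A stripped-down, runnable sketch of the pattern, with simplified stand-in types instead of the Elasticsearch interfaces:

import java.io.IOException;
import java.util.List;
import java.util.Map;

// Simplified stand-in for org.elasticsearch.core.CheckedSupplier.
interface IOSupplier<T> {
    T get() throws IOException;
}

public class GenericPatchLoaderDemo {
    private final String fieldName;
    private final IOSupplier<Object> copyCurrentValue;

    GenericPatchLoaderDemo(String fieldName, IOSupplier<Object> copyCurrentValue) {
        this.fieldName = fieldName;
        this.copyCurrentValue = copyCurrentValue;
    }

    // The shared loader only knows "copy whatever the current value is";
    // each mapper passes its own copy function as a method reference.
    Map.Entry<String, Object> load() throws IOException {
        return Map.entry(fieldName, copyCurrentValue.get());
    }

    public static void main(String[] args) throws IOException {
        // sparse_vector copies a token -> weight map (copyAsMap) ...
        var sparse = new GenericPatchLoaderDemo("emb", () -> Map.of("token_1", 2.0f));
        // ... while dense and rank vectors copy float lists (copyVectorAsList / copyVectorsAsList).
        var dense = new GenericPatchLoaderDemo("vec", () -> List.of(0.1f, 0.2f));
        System.out.println(sparse.load() + " " + dense.load());
    }
}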
@@ -26,6 +26,7 @@ import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.compress.CompressedXContent;
 import org.elasticsearch.core.CheckedConsumer;
 import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.Tuple;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.mapper.DocumentMapper;
@@ -33,7 +34,6 @@ import org.elasticsearch.index.mapper.DocumentParsingException;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.MapperParsingException;
 import org.elasticsearch.index.mapper.MapperService;
-import org.elasticsearch.index.mapper.MapperTestCase;
 import org.elasticsearch.index.mapper.ParsedDocument;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.inference.WeightedToken;
@@ -54,6 +54,7 @@ import java.util.Collection;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.TreeMap;

 import static org.elasticsearch.index.IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT;
 import static org.elasticsearch.index.IndexVersions.UPGRADE_TO_LUCENE_10_0_0;
@@ -66,14 +67,13 @@ import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;

-public class SparseVectorFieldMapperTests extends MapperTestCase {
+public class SparseVectorFieldMapperTests extends SyntheticVectorsMapperTestCase {

     @Override
     protected Object getSampleValueForDocument() {
-        Map<String, Float> map = new LinkedHashMap<>();
-        map.put("ten", 10f);
-        map.put("twenty", 20f);
-        return map;
+        return new TreeMap<>(
+            randomMap(1, 5, () -> Tuple.tuple(randomAlphaOfLengthBetween(5, 10), Float.valueOf(randomIntBetween(1, 127))))
+        );
     }

     @Override
@@ -209,26 +209,22 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
         DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping));
         assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString());

-        ParsedDocument doc1 = mapper.parse(source(this::writeField));
+        @SuppressWarnings("unchecked")
+        var expected = (Map<String, Float>) getSampleValueForDocument();
+        ParsedDocument doc1 = mapper.parse(source(b -> b.field("field", expected)));

         List<IndexableField> fields = doc1.rootDoc().getFields("field");
-        assertEquals(2, fields.size());
+        assertEquals(expected.size(), fields.size());
         assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class));
-        FeatureField featureField1 = null;
-        FeatureField featureField2 = null;
-
         for (IndexableField field : fields) {
-            if (field.stringValue().equals("ten")) {
-                featureField1 = (FeatureField) field;
-            } else if (field.stringValue().equals("twenty")) {
-                featureField2 = (FeatureField) field;
-            } else {
-                throw new UnsupportedOperationException();
+            if (field instanceof FeatureField fField) {
+                var value = expected.remove(fField.stringValue());
+                assertThat(fField.getFeatureValue(), equalTo(value));
+                int freq1 = getFrequency(fField.tokenStream(null, null));
+                assertThat(XFeatureField.decodeFeatureValue(freq1), equalTo(value));
             }
         }
-
-        int freq1 = getFrequency(featureField1.tokenStream(null, null));
-        int freq2 = getFrequency(featureField2.tokenStream(null, null));
-        assertTrue(freq1 < freq2);
     }

     public void testDefaultsWithAndWithoutIncludeDefaults() throws Exception {
@@ -460,7 +456,8 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {

         @Override
         public SyntheticSourceExample example(int maxValues) {
-            return new SyntheticSourceExample(getSampleValueForDocument(), getSampleValueForDocument(), b -> {
+            var sample = getSampleValueForDocument();
+            return new SyntheticSourceExample(sample, sample, b -> {
                 if (withStore) {
                     minimalStoreMapping(b);
                 } else {
@@ -1169,7 +1169,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
         boolean useLegacyFormat
     ) {
         return switch (modelSettings.taskType()) {
-            case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD, indexVersionCreated).setStored(
+            case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD, indexVersionCreated, false).setStored(
                 useLegacyFormat == false
             );
             case TEXT_EMBEDDING -> {
@@ -6,6 +6,7 @@
 */

 module org.elasticsearch.rank.vectors {
     requires org.elasticsearch.base;
+    requires org.elasticsearch.xcore;
     requires org.elasticsearch.painless.spi;
     requires org.elasticsearch.server;
@@ -10,7 +10,6 @@ package org.elasticsearch.xpack.rank.vectors.mapper;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.FieldExistsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
@@ -30,6 +29,7 @@ import org.elasticsearch.index.mapper.SourceLoader;
 import org.elasticsearch.index.mapper.TextSearchInfo;
 import org.elasticsearch.index.mapper.ValueFetcher;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
+import org.elasticsearch.index.mapper.vectors.SyntheticVectorsPatchFieldLoader;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.license.LicenseUtils;
 import org.elasticsearch.license.XPackLicenseState;
@@ -406,7 +406,11 @@ public class RankVectorsFieldMapper extends FieldMapper {

     @Override
     public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
-        return isSyntheticVector ? new SyntheticRankVectorPatchLoader(new DocValuesSyntheticFieldLoader()) : null;
+        if (isSyntheticVector) {
+            var syntheticField = new DocValuesSyntheticFieldLoader();
+            return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyVectorsAsList);
+        }
+        return null;
     }

     private class DocValuesSyntheticFieldLoader extends SourceLoader.DocValuesBasedSyntheticFieldLoader {
@@ -455,7 +459,12 @@ public class RankVectorsFieldMapper extends FieldMapper {
             b.endArray();
         }

-        private Object copyVectorsAsList() throws IOException {
+        /**
+         * Returns a deep copy of the vectors for the current document as a list.
+         *
+         * @throws IOException if reading fails
+         */
+        private List<List<?>> copyVectorsAsList() throws IOException {
             assert hasValue : "rank vector is null";
             BytesRef ref = values.binaryValue();
             ByteBuffer byteBuffer = ByteBuffer.wrap(ref.bytes, ref.offset, ref.length).order(ByteOrder.LITTLE_ENDIAN);
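copyVectorsAsList unpacks the document's binary doc value into per-vector lists, reading little-endian as the hunk shows. A self-contained sketch of that kind of unpacking; the fixed-dims float layout here is an illustrative assumption, not the mapper's actual wire format:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.List;

public class PackedVectorDecode {
    // Split a packed little-endian float buffer into dims-sized vectors.
    static List<List<Float>> decode(byte[] bytes, int dims) {
        ByteBuffer buf = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN);
        List<List<Float>> vectors = new ArrayList<>();
        while (buf.remaining() >= dims * Float.BYTES) {
            List<Float> vector = new ArrayList<>(dims);
            for (int i = 0; i < dims; i++) {
                vector.add(buf.getFloat());
            }
            vectors.add(vector);
        }
        return vectors;
    }

    public static void main(String[] args) {
        ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
        buf.putFloat(1f).putFloat(2f).putFloat(3f).putFloat(4f);
        System.out.println(decode(buf.array(), 2)); // [[1.0, 2.0], [3.0, 4.0]]
    }
}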
@@ -492,28 +501,4 @@ public class RankVectorsFieldMapper extends FieldMapper {
             return fullPath();
         }
     }
-
-    private class SyntheticRankVectorPatchLoader implements SourceLoader.SyntheticVectorsLoader {
-        private final DocValuesSyntheticFieldLoader syntheticFieldLoader;
-
-        private SyntheticRankVectorPatchLoader(DocValuesSyntheticFieldLoader syntheticFieldLoader) {
-            this.syntheticFieldLoader = syntheticFieldLoader;
-        }
-
-        @Override
-        public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException {
-            var dvLoader = syntheticFieldLoader.docValuesLoader(context.reader(), null);
-            return (doc, acc) -> {
-                if (dvLoader == null) {
-                    return;
-                }
-                if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) {
-                    // add vectors as list since that's how they're parsed from xcontent.
-                    acc.add(
-                        new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorsAsList())
-                    );
-                }
-            };
-        }
-    }
 }