Add synthetic vectors support for sparse_vector (#130756)
This change adds support for synthetic vectors (introduced in #130382) to the sparse_vector field type.
This commit is contained in:
parent 83076c2dcd
commit 6d81ff94b0
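For orientation before the diff: with synthetic vectors enabled, vector values are stripped out of the stored _source at index time and patched back in from the index only when a request asks for them (for example with exclude_vectors: false, which the new YAML test below exercises). A toy, runnable sketch of that patch step, using made-up types rather than the actual Elasticsearch classes:

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Toy stand-in for a (field name, reconstructed vector) patch.
record VectorPatch(String fieldName, Object vectorValue) {}

public class SyntheticVectorDemo {
    // Re-insert vectors that were stripped from the stored _source at index time.
    static Map<String, Object> applyPatches(Map<String, Object> storedSource, List<VectorPatch> patches) {
        Map<String, Object> patched = new LinkedHashMap<>(storedSource);
        for (VectorPatch patch : patches) {
            patched.put(patch.fieldName(), patch.vectorValue());
        }
        return patched;
    }

    public static void main(String[] args) {
        // What is kept on disk: the document without its sparse vector.
        Map<String, Object> stored = Map.of("name", "cow.jpg");
        // What the index can reconstruct: the token -> weight map for field "emb".
        List<VectorPatch> patches = List.of(new VectorPatch("emb", Map.of("token_1", 2.0f, "token_2", 3.0f)));
        // Prints {name=cow.jpg, emb={token_1=2.0, token_2=3.0}} (map order may vary)
        System.out.println(applyPatches(stored, patches));
    }
}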
@@ -0,0 +1,380 @@
setup:
  - requires:
      reason: 'synthetic vectors are required'
      test_runner_features: [ capabilities ]
      capabilities:
        - method: GET
          path: /_search
          capabilities: [ synthetic_vectors_setting ]
  - skip:
      features: "headers"

  - do:
      indices.create:
        index: test
        body:
          settings:
            index.mapping.synthetic_vectors: true
          mappings:
            properties:
              name:
                type: keyword
              emb:
                type: sparse_vector

              nested:
                type: nested
                properties:
                  paragraph_id:
                    type: keyword
                  emb:
                    type: sparse_vector

  - do:
      index:
        index: test
        id: "1"
        body:
          name: cow.jpg
          emb:
            token_1: 2.0
            token_2: 3.0

  - do:
      index:
        index: test
        id: "2"
        body:
          name: moose.jpg
          nested:
            - paragraph_id: 0
              emb:
                token_1: 2.0
                token_2: 3.0
            - paragraph_id: 2
              emb:
                token_3: 2.0
                token_2: 3.0
            - paragraph_id: 3
              emb:
                token_3: 2.0
                token_7: 3.0
                token_1: 4.0

  - do:
      index:
        index: test
        id: "3"
        body:
          name: rabbit.jpg
          emb:
            token_3: 2.0
            token_9: 3.0
            token_2: 4.0

  - do:
      index:
        index: test
        id: "4"
        body:
          name: zoolander.jpg
          nested:
            - paragraph_id: 0
              emb:
                token_3: 2.0
                token_7: 3.0
                token_1: 4.0
            - paragraph_id: 1
            - paragraph_id: 2
              emb:
                token_8: 2.0

  - do:
      indices.refresh: {}

---
"exclude synthetic vectors":
  - do:
      search:
        index: test
        body:
          sort: ["name"]

  - match: { hits.hits.0._id: "1"}
  - match: { hits.hits.0._source.name: "cow.jpg"}
  - not_exists: hits.hits.0._source.emb

  - match: { hits.hits.1._id: "2"}
  - match: { hits.hits.1._source.name: "moose.jpg"}
  - length: { hits.hits.1._source.nested: 3 }
  - not_exists: hits.hits.1._source.nested.0.emb
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - not_exists: hits.hits.1._source.nested.1.emb
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - not_exists: hits.hits.1._source.nested.2.emb
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - not_exists: hits.hits.2._source.emb

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - not_exists: hits.hits.3._source.nested.0.emb
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
  - match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
  - not_exists: hits.hits.3._source.nested.2.emb
  - match: { hits.hits.3._source.nested.2.paragraph_id: 2 }

---
"include synthetic vectors":
  - do:
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
          sort: ["name"]

  - match: { hits.hits.0._id: "1"}
  - match: { hits.hits.0._source.name: "cow.jpg"}
  - exists: hits.hits.0._source.emb

  - match: { hits.hits.1._id: "2"}
  - match: { hits.hits.1._source.name: "moose.jpg"}
  - length: { hits.hits.1._source.nested: 3 }
  - exists: hits.hits.1._source.nested.0.emb
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - exists: hits.hits.1._source.nested.1.emb
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - exists: hits.hits.1._source.nested.2.emb
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - exists: hits.hits.2._source.emb

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - exists: hits.hits.3._source.nested.0.emb
  - length: { hits.hits.3._source.nested.0.emb: 3 }
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }

  - do:
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
            includes: nested.emb
          sort: ["name"]

  - match: { hits.hits.0._id: "1"}
  - length: { hits.hits.0._source: 0}

  - match: { hits.hits.1._id: "2"}
  - length: { hits.hits.1._source: 1 }
  - length: { hits.hits.1._source.nested: 3 }
  - exists: hits.hits.1._source.nested.0.emb
  - not_exists: hits.hits.1._source.nested.0.paragraph_id
  - exists: hits.hits.1._source.nested.1.emb
  - not_exists: hits.hits.1._source.nested.1.paragraph_id
  - exists: hits.hits.1._source.nested.2.emb
  - not_exists: hits.hits.1._source.nested.2.paragraph_id

  - match: { hits.hits.2._id: "3" }
  - length: { hits.hits.2._source: 0}

  - match: { hits.hits.3._id: "4" }
  - length: { hits.hits.3._source: 1 }
  - length: { hits.hits.3._source.nested: 2 }
  - exists: hits.hits.3._source.nested.0.emb
  - length: { hits.hits.3._source.nested.0.emb: 3 }
  - not_exists: hits.hits.3._source.nested.0.paragraph_id
  - exists: hits.hits.3._source.nested.1.emb
  - length: { hits.hits.3._source.nested.1.emb: 1 }
  - not_exists: hits.hits.3._source.nested.1.paragraph_id

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            exclude_vectors: true
          sort: ["name"]
          fields: ["emb"]

  - match: { hits.hits.0._id: "1"}
  - match: { hits.hits.0._source.name: "cow.jpg"}
  - not_exists: hits.hits.0._source.emb
  - length: { hits.hits.0.fields.emb: 1}
  - length: { hits.hits.0.fields.emb.0: 2}
  - match: { hits.hits.0.fields.emb.0.token_1: 2.0}
  - match: { hits.hits.0.fields.emb.0.token_2: 3.0}

  - match: { hits.hits.1._id: "2"}
  - match: { hits.hits.1._source.name: "moose.jpg"}
  - length: { hits.hits.1._source.nested: 3 }
  - not_exists: hits.hits.1._source.nested.0.emb

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - length: { hits.hits.2.fields.emb: 1}
  - length: { hits.hits.2.fields.emb.0: 3}
  - match: { hits.hits.2.fields.emb.0.token_2: 4.0}
  - match: { hits.hits.2.fields.emb.0.token_3: 2.0}
  - match: { hits.hits.2.fields.emb.0.token_9: 3.0}

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - not_exists: hits.hits.3._source.nested.0.emb

---
"Bulk partial update with synthetic vectors":
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      bulk:
        index: test
        _source: true
        body:
          - '{"update": {"_id": "4"}}'
          - >
            {
              "doc": {
                "name": "zoolander2.jpg",
                "emb": {
                  "token_12": 2.0,
                  "token_13": 1.0
                }
              }
            }

  - length: { items.0.update.get._source.emb: 2 }
  - match: { items.0.update.get._source.emb.token_12: 2.0 }
  - match: { items.0.update.get._source.emb.token_13: 1.0 }
  - exists: items.0.update.get._source.nested
  - length: { items.0.update.get._source.nested: 3}
  - exists: items.0.update.get._source.nested.0.emb
  - match: { items.0.update.get._source.nested.0.paragraph_id: 0 }
  - length: { items.0.update.get._source.nested.0.emb: 3 }
  - not_exists: items.0.update.get._source.nested.1.emb
  - match: { items.0.update.get._source.nested.1.paragraph_id: 1 }
  - exists: items.0.update.get._source.nested.2.emb
  - length: { items.0.update.get._source.nested.2.emb: 1 }
  - match: { items.0.update.get._source.nested.2.paragraph_id: 2 }
  - set: { items.0.update.get._source.nested: original_nested }

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      get:
        _source_exclude_vectors: false
        index: test
        id: "4"

  - match: { _source.name: zoolander2.jpg }
  - length: { _source.emb: 2 }
  - match: { _source.emb.token_12: 2.0 }
  - match: { _source.emb.token_13: 1.0 }
  - match: { _source.nested: $original_nested }

  - do:
      indices.refresh: {}

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            "exclude_vectors": false
          query:
            term:
              _id: 4

  - match: { hits.total.value: 1 }
  - match: { hits.total.relation: eq }
  - match: { hits.hits.0._source.name: zoolander2.jpg }
  - match: { hits.hits.0._source.nested: $original_nested }

---
"Partial update with synthetic vectors":
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      update:
        index: test
        id: "4"
        body:
          _source: true
          doc: {
            "name": "zoolander3.jpg",
            "emb": {
              "token_3": 2.0,
              "token_9": 2.5
            }
          }

  - length: { get._source.emb: 2 }
  - match: { get._source.emb.token_3: 2.0 }
  - match: { get._source.emb.token_9: 2.5 }
  - exists: get._source.nested
  - length: { get._source.nested: 3}
  - exists: get._source.nested.0.emb
  - match: { get._source.nested.0.paragraph_id: 0 }
  - length: { get._source.nested.0.emb: 3 }
  - not_exists: get._source.nested.1.emb
  - match: { get._source.nested.1.paragraph_id: 1 }
  - exists: get._source.nested.2.emb
  - length: { get._source.nested.2.emb: 1 }
  - match: { get._source.nested.2.paragraph_id: 2 }
  - set: { get._source.nested: original_nested }

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      get:
        _source_exclude_vectors: false
        index: test
        id: "4"

  - length: { _source.emb: 2 }
  - match: { _source.emb.token_3: 2.0 }
  - match: { _source.emb.token_9: 2.5 }
  - match: { _source.name: zoolander3.jpg }
  - match: { _source.nested: $original_nested }

  - do:
      indices.refresh: {}

  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            "exclude_vectors": false
          query:
            term:
              _id: 4

  - match: { hits.total.value: 1 }
  - match: { hits.total.relation: eq }
  - match: { hits.hits.0._source.name: zoolander3.jpg }
  - match: { hits.hits.0._source.nested: $original_nested }
@@ -25,7 +25,6 @@ import org.apache.lucene.index.FilterLeafReader;
 import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
@@ -3028,9 +3027,11 @@ public class DenseVectorFieldMapper extends FieldMapper {

     @Override
     public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
-        return isSyntheticVector
-            ? new SyntheticDenseVectorPatchLoader(new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity))
-            : null;
+        if (isSyntheticVector) {
+            var syntheticField = new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity);
+            return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyVectorAsList);
+        }
+        return null;
     }

     @Override
@@ -3127,7 +3128,7 @@ public class DenseVectorFieldMapper extends FieldMapper {
      *
      * @throws IOException if reading fails
      */
-    private Object copyVectorAsList() throws IOException {
+    private List<?> copyVectorAsList() throws IOException {
         assert hasValue : "vector is null for ord=" + ord;
         if (floatValues != null) {
             float[] raw = floatValues.vectorValue(ord);
@@ -3218,29 +3219,6 @@ public class DenseVectorFieldMapper extends FieldMapper {
         }
     }

-    public class SyntheticDenseVectorPatchLoader implements SourceLoader.SyntheticVectorsLoader {
-        private final IndexedSyntheticFieldLoader syntheticFieldLoader;
-
-        public SyntheticDenseVectorPatchLoader(IndexedSyntheticFieldLoader syntheticFieldLoader) {
-            this.syntheticFieldLoader = syntheticFieldLoader;
-        }
-
-        public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException {
-            var dvLoader = syntheticFieldLoader.docValuesLoader(context.reader(), null);
-            return (doc, acc) -> {
-                if (dvLoader == null) {
-                    return;
-                }
-                if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) {
-                    // add vectors as list since that's how they're parsed from xcontent.
-                    acc.add(
-                        new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorAsList())
-                    );
-                }
-            };
-        }
-    }
-
     /**
      * Interface for a function that takes an int and a boolean
      */
@@ -63,6 +63,7 @@ import java.util.Map;
 import java.util.Objects;
 import java.util.stream.Stream;

+import static org.elasticsearch.index.IndexSettings.INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING;
 import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;
 import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg;
@@ -107,9 +108,12 @@ public class SparseVectorFieldMapper extends FieldMapper {
             Objects::toString
         ).acceptsNull().setSerializerCheck(this::indexOptionsSerializerCheck);

-        public Builder(String name, IndexVersion indexVersionCreated) {
+        private boolean isSyntheticVector;
+
+        public Builder(String name, IndexVersion indexVersionCreated, boolean isSyntheticVector) {
             super(name);
             this.indexVersionCreated = indexVersionCreated;
+            this.isSyntheticVector = isSyntheticVector;
         }

         public Builder setStored(boolean value) {
@@ -129,16 +133,19 @@ public class SparseVectorFieldMapper extends FieldMapper {
                 builderIndexOptions = getDefaultIndexOptions(indexVersionCreated);
             }

+            final boolean syntheticVectorFinal = context.isSourceSynthetic() == false && isSyntheticVector;
+            final boolean storedFinal = stored.getValue() || syntheticVectorFinal;
             return new SparseVectorFieldMapper(
                 leafName(),
                 new SparseVectorFieldType(
                     indexVersionCreated,
                     context.buildFullName(leafName()),
-                    stored.getValue(),
+                    storedFinal,
                     meta.getValue(),
                     builderIndexOptions
                 ),
-                builderParams(this, context)
+                builderParams(this, context),
+                syntheticVectorFinal
             );
         }

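The two flags computed above gate the whole feature: synthetic-vector patching only applies when the index does not already use synthetic _source, and an active synthetic vector forces the field type to report itself as stored. A minimal runnable sketch of just that logic, with hypothetical helper names (in the diff the computation lives inline in Builder#build):

public class SyntheticVectorFlags {
    // Patching is off when the whole _source is already synthetic.
    static boolean syntheticVector(boolean sourceIsSynthetic, boolean settingEnabled) {
        return sourceIsSynthetic == false && settingEnabled;
    }

    // A synthetic vector forces the field to be treated as stored.
    static boolean stored(boolean explicitlyStored, boolean syntheticVector) {
        return explicitlyStored || syntheticVector;
    }

    public static void main(String[] args) {
        // setting on, classic stored _source: patching active, field stored
        boolean a = syntheticVector(false, true);
        System.out.println(a + " " + stored(false, a)); // true true
        // setting on, synthetic _source: patching off, field stored only if asked
        boolean b = syntheticVector(true, true);
        System.out.println(b + " " + stored(false, b)); // false false
    }
}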
@@ -196,7 +203,11 @@ public class SparseVectorFieldMapper extends FieldMapper {
             throw new IllegalArgumentException(ERROR_MESSAGE_8X);
         }

-        return new Builder(n, c.indexVersionCreated());
+        return new Builder(
+            n,
+            c.indexVersionCreated(),
+            INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING.get(c.getIndexSettings().getSettings())
+        );
     }, notInMultiFields(CONTENT_TYPE));

 public static final class SparseVectorFieldType extends MappedFieldType {
@@ -302,8 +313,16 @@ public class SparseVectorFieldMapper extends FieldMapper {
         }
     }

-    private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams builderParams) {
+    private final boolean isSyntheticVector;
+
+    private SparseVectorFieldMapper(
+        String simpleName,
+        MappedFieldType mappedFieldType,
+        BuilderParams builderParams,
+        boolean isSyntheticVector
+    ) {
         super(simpleName, mappedFieldType, builderParams);
+        this.isSyntheticVector = isSyntheticVector;
     }

     @Override
@@ -314,6 +333,15 @@ public class SparseVectorFieldMapper extends FieldMapper {
         return super.syntheticSourceSupport();
     }

+    @Override
+    public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
+        if (isSyntheticVector) {
+            var syntheticField = new SparseVectorSyntheticFieldLoader(fullPath(), leafName());
+            return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyAsMap);
+        }
+        return null;
+    }
+
     @Override
     public Map<String, NamedAnalyzer> indexAnalyzers() {
         return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
@@ -321,7 +349,7 @@ public class SparseVectorFieldMapper extends FieldMapper {

     @Override
     public FieldMapper.Builder getMergeBuilder() {
-        return new Builder(leafName(), this.fieldType().indexVersionCreated).init(this);
+        return new Builder(leafName(), this.fieldType().indexVersionCreated, this.isSyntheticVector).init(this);
     }

     @Override
@@ -504,9 +532,26 @@ public class SparseVectorFieldMapper extends FieldMapper {
             b.endObject();
         }

+        /**
+         * Returns a deep-copied tokens map for the current document.
+         *
+         * @throws IOException if reading fails
+         */
+        private Map<String, Float> copyAsMap() throws IOException {
+            assert termsDocEnum != null;
+            Map<String, Float> tokenMap = new LinkedHashMap<>();
+            PostingsEnum reuse = null;
+            do {
+                reuse = termsDocEnum.postings(reuse);
+                reuse.nextDoc();
+                tokenMap.put(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
+            } while (termsDocEnum.next() != null);
+            return tokenMap;
+        }
+
         @Override
         public String fieldName() {
-            return leafName;
+            return fullPath;
         }

         @Override
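copyAsMap above rebuilds the token-to-weight map from postings rather than from stored source: each token's weight is encoded in its term frequency via Lucene's FeatureField. A runnable sketch of that round trip; the exact bit-shift encoding reflects a reading of Lucene's FeatureField/decodeFeatureValue and should be treated as an assumption, not the library source:

public class FeatureFreqRoundTrip {

    // Assumed shape of FeatureField's encoding: keep only the high bits
    // of the float weight, dropping the low 15 bits into the term frequency.
    static int encodeAsFreq(float weight) {
        return Float.floatToIntBits(weight) >>> 15;
    }

    // Assumed shape of XFeatureField.decodeFeatureValue: shift the bits back.
    static float decodeFeatureValue(int freq) {
        return Float.intBitsToFloat(freq << 15);
    }

    public static void main(String[] args) {
        for (float weight : new float[] { 2.0f, 3.0f, 2.5f, 1.2345f }) {
            int freq = encodeAsFreq(weight);
            // 2.0 -> 2.0, but 1.2345 -> 1.234375: lossy, order-preserving
            System.out.println(weight + " -> freq " + freq + " -> " + decodeFeatureValue(freq));
        }
    }
}

The test values used in this commit (small integers and halves such as 2.0 or 2.5) survive this truncation exactly, which is why the YAML assertions can match them verbatim.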
@@ -0,0 +1,41 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */

package org.elasticsearch.index.mapper.vectors;

import org.apache.lucene.index.LeafReaderContext;
import org.elasticsearch.core.CheckedSupplier;
import org.elasticsearch.index.mapper.SourceLoader;

import java.io.IOException;

public class SyntheticVectorsPatchFieldLoader implements SourceLoader.SyntheticVectorsLoader {
    private final SourceLoader.SyntheticFieldLoader syntheticLoader;
    private final CheckedSupplier<Object, IOException> copyObject;

    public SyntheticVectorsPatchFieldLoader(
        SourceLoader.SyntheticFieldLoader syntheticLoader,
        CheckedSupplier<Object, IOException> copyObject
    ) {
        this.syntheticLoader = syntheticLoader;
        this.copyObject = copyObject;
    }

    public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException {
        var dvLoader = syntheticLoader.docValuesLoader(context.reader(), null);
        return (doc, acc) -> {
            if (dvLoader == null) {
                return;
            }
            if (dvLoader.advanceToDoc(doc) && syntheticLoader.hasValue()) {
                acc.add(new SourceLoader.LeafSyntheticVectorPath(syntheticLoader.fieldName(), copyObject.get()));
            }
        };
    }
}
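This new class is the core of the refactor: the per-mapper inner loaders (SyntheticDenseVectorPatchLoader above, SyntheticRankVectorPatchLoader below) collapse into one generic loader parameterized by a CheckedSupplier that deep-copies the current value. A stripped-down, runnable sketch of the pattern, with simplified stand-in types instead of the Elasticsearch interfaces:

import java.io.IOException;
import java.util.List;
import java.util.Map;

// Simplified stand-in for org.elasticsearch.core.CheckedSupplier.
interface IOSupplier<T> {
    T get() throws IOException;
}

public class GenericPatchLoaderDemo {
    private final String fieldName;
    private final IOSupplier<Object> copyCurrentValue;

    GenericPatchLoaderDemo(String fieldName, IOSupplier<Object> copyCurrentValue) {
        this.fieldName = fieldName;
        this.copyCurrentValue = copyCurrentValue;
    }

    // The shared loader only knows "copy whatever the current value is";
    // each mapper passes its own copy function as a method reference.
    Map.Entry<String, Object> load() throws IOException {
        return Map.entry(fieldName, copyCurrentValue.get());
    }

    public static void main(String[] args) throws IOException {
        // sparse_vector copies a token -> weight map (copyAsMap) ...
        var sparse = new GenericPatchLoaderDemo("emb", () -> Map.of("token_1", 2.0f));
        // ... while dense and rank vectors copy float lists (copyVectorAsList / copyVectorsAsList).
        var dense = new GenericPatchLoaderDemo("vec", () -> List.of(0.1f, 0.2f));
        System.out.println(sparse.load() + " " + dense.load());
    }
}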
@@ -26,6 +26,7 @@ import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.compress.CompressedXContent;
 import org.elasticsearch.core.CheckedConsumer;
 import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.Tuple;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.mapper.DocumentMapper;
@@ -33,7 +34,6 @@ import org.elasticsearch.index.mapper.DocumentParsingException;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.MapperParsingException;
 import org.elasticsearch.index.mapper.MapperService;
-import org.elasticsearch.index.mapper.MapperTestCase;
 import org.elasticsearch.index.mapper.ParsedDocument;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.inference.WeightedToken;
@@ -54,6 +54,7 @@ import java.util.Collection;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.TreeMap;

 import static org.elasticsearch.index.IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT;
 import static org.elasticsearch.index.IndexVersions.UPGRADE_TO_LUCENE_10_0_0;
@@ -66,14 +67,13 @@ import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;

-public class SparseVectorFieldMapperTests extends MapperTestCase {
+public class SparseVectorFieldMapperTests extends SyntheticVectorsMapperTestCase {

     @Override
     protected Object getSampleValueForDocument() {
-        Map<String, Float> map = new LinkedHashMap<>();
-        map.put("ten", 10f);
-        map.put("twenty", 20f);
-        return map;
+        return new TreeMap<>(
+            randomMap(1, 5, () -> Tuple.tuple(randomAlphaOfLengthBetween(5, 10), Float.valueOf(randomIntBetween(1, 127))))
+        );
     }

     @Override
@@ -209,26 +209,22 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
         DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping));
         assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString());

-        ParsedDocument doc1 = mapper.parse(source(this::writeField));
+        @SuppressWarnings("unchecked")
+        var expected = (Map<String, Float>) getSampleValueForDocument();
+        ParsedDocument doc1 = mapper.parse(source(b -> b.field("field", expected)));

         List<IndexableField> fields = doc1.rootDoc().getFields("field");
-        assertEquals(2, fields.size());
+        assertEquals(expected.size(), fields.size());
         assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class));
-        FeatureField featureField1 = null;
-        FeatureField featureField2 = null;
-
         for (IndexableField field : fields) {
-            if (field.stringValue().equals("ten")) {
-                featureField1 = (FeatureField) field;
-            } else if (field.stringValue().equals("twenty")) {
-                featureField2 = (FeatureField) field;
-            } else {
-                throw new UnsupportedOperationException();
+            if (field instanceof FeatureField fField) {
+                var value = expected.remove(fField.stringValue());
+                assertThat(fField.getFeatureValue(), equalTo(value));
+                int freq1 = getFrequency(fField.tokenStream(null, null));
+                assertThat(XFeatureField.decodeFeatureValue(freq1), equalTo(value));
             }
         }
-
-        int freq1 = getFrequency(featureField1.tokenStream(null, null));
-        int freq2 = getFrequency(featureField2.tokenStream(null, null));
-        assertTrue(freq1 < freq2);
     }

     public void testDefaultsWithAndWithoutIncludeDefaults() throws Exception {
@@ -460,7 +456,8 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {

         @Override
         public SyntheticSourceExample example(int maxValues) {
-            return new SyntheticSourceExample(getSampleValueForDocument(), getSampleValueForDocument(), b -> {
+            var sample = getSampleValueForDocument();
+            return new SyntheticSourceExample(sample, sample, b -> {
                 if (withStore) {
                     minimalStoreMapping(b);
                 } else {
@@ -1169,7 +1169,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
         boolean useLegacyFormat
     ) {
         return switch (modelSettings.taskType()) {
-            case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD, indexVersionCreated).setStored(
+            case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD, indexVersionCreated, false).setStored(
                 useLegacyFormat == false
             );
             case TEXT_EMBEDDING -> {
@@ -6,6 +6,7 @@
 */

 module org.elasticsearch.rank.vectors {
     requires org.elasticsearch.base;
+    requires org.elasticsearch.xcore;
     requires org.elasticsearch.painless.spi;
     requires org.elasticsearch.server;
@@ -10,7 +10,6 @@ package org.elasticsearch.xpack.rank.vectors.mapper;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.FieldExistsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
@@ -30,6 +29,7 @@ import org.elasticsearch.index.mapper.SourceLoader;
 import org.elasticsearch.index.mapper.TextSearchInfo;
 import org.elasticsearch.index.mapper.ValueFetcher;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
+import org.elasticsearch.index.mapper.vectors.SyntheticVectorsPatchFieldLoader;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.license.LicenseUtils;
 import org.elasticsearch.license.XPackLicenseState;
@@ -406,7 +406,11 @@ public class RankVectorsFieldMapper extends FieldMapper {

     @Override
     public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
-        return isSyntheticVector ? new SyntheticRankVectorPatchLoader(new DocValuesSyntheticFieldLoader()) : null;
+        if (isSyntheticVector) {
+            var syntheticField = new DocValuesSyntheticFieldLoader();
+            return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyVectorsAsList);
+        }
+        return null;
     }

     private class DocValuesSyntheticFieldLoader extends SourceLoader.DocValuesBasedSyntheticFieldLoader {
@@ -455,7 +459,12 @@ public class RankVectorsFieldMapper extends FieldMapper {
             b.endArray();
         }

-        private Object copyVectorsAsList() throws IOException {
+        /**
+         * Returns a deep copy of the vectors for the current document as a list.
+         *
+         * @throws IOException if reading fails
+         */
+        private List<List<?>> copyVectorsAsList() throws IOException {
             assert hasValue : "rank vector is null";
             BytesRef ref = values.binaryValue();
             ByteBuffer byteBuffer = ByteBuffer.wrap(ref.bytes, ref.offset, ref.length).order(ByteOrder.LITTLE_ENDIAN);
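copyVectorsAsList unpacks the document's binary doc value into per-vector lists, reading little-endian as the hunk shows. A self-contained sketch of that kind of unpacking; the fixed-dims float layout here is an illustrative assumption, not the mapper's actual wire format:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.List;

public class PackedVectorDecode {
    // Split a packed little-endian float buffer into dims-sized vectors.
    static List<List<Float>> decode(byte[] bytes, int dims) {
        ByteBuffer buf = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN);
        List<List<Float>> vectors = new ArrayList<>();
        while (buf.remaining() >= dims * Float.BYTES) {
            List<Float> vector = new ArrayList<>(dims);
            for (int i = 0; i < dims; i++) {
                vector.add(buf.getFloat());
            }
            vectors.add(vector);
        }
        return vectors;
    }

    public static void main(String[] args) {
        ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
        buf.putFloat(1f).putFloat(2f).putFloat(3f).putFloat(4f);
        System.out.println(decode(buf.array(), 2)); // [[1.0, 2.0], [3.0, 4.0]]
    }
}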
@@ -492,28 +501,4 @@ public class RankVectorsFieldMapper extends FieldMapper {
             return fullPath();
         }
     }
-
-    private class SyntheticRankVectorPatchLoader implements SourceLoader.SyntheticVectorsLoader {
-        private final DocValuesSyntheticFieldLoader syntheticFieldLoader;
-
-        private SyntheticRankVectorPatchLoader(DocValuesSyntheticFieldLoader syntheticFieldLoader) {
-            this.syntheticFieldLoader = syntheticFieldLoader;
-        }
-
-        @Override
-        public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException {
-            var dvLoader = syntheticFieldLoader.docValuesLoader(context.reader(), null);
-            return (doc, acc) -> {
-                if (dvLoader == null) {
-                    return;
-                }
-                if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) {
-                    // add vectors as list since that's how they're parsed from xcontent.
-                    acc.add(
-                        new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorsAsList())
-                    );
-                }
-            };
-        }
-    }
 }