Skip to content

Add synthetic vectors support for rank_vectors #130715

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2853,10 +2853,6 @@ private DenseVectorFieldMapper(
this.isSyntheticVector = isSyntheticVector;
}

public boolean isSyntheticVector() {
return isSyntheticVector;
}

@Override
public DenseVectorFieldType fieldType() {
return (DenseVectorFieldType) super.fieldType();
Expand Down Expand Up @@ -3032,7 +3028,7 @@ public String toString() {

@Override
public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() {
return isSyntheticVector()
return isSyntheticVector
? new SyntheticDenseVectorPatchLoader(new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity))
: null;
}
Expand Down Expand Up @@ -3131,7 +3127,7 @@ public void write(XContentBuilder b) throws IOException {
*
* @throws IOException if reading fails
*/
public Object copyVectorAsList() throws IOException {
private Object copyVectorAsList() throws IOException {
assert hasValue : "vector is null for ord=" + ord;
if (floatValues != null) {
float[] raw = floatValues.vectorValue(ord);
Expand Down Expand Up @@ -3235,8 +3231,7 @@ public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context)
if (dvLoader == null) {
return;
}
dvLoader.advanceToDoc(doc);
if (syntheticFieldLoader.hasValue()) {
if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) {
// add vectors as list since that's how they're parsed from xcontent.
acc.add(
new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorAsList())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,21 @@

import com.carrotsearch.randomizedtesting.generators.RandomPicks;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.KnnByteVectorField;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.VectorUtil;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.codec.CodecService;
Expand All @@ -46,7 +39,6 @@
import org.elasticsearch.index.mapper.MapperBuilderContext;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.MapperTestCase;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.index.mapper.SourceToParse;
import org.elasticsearch.index.mapper.ValueFetcher;
Expand All @@ -61,7 +53,6 @@
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.index.IndexVersionUtils;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentType;
import org.junit.AssumptionViolatedException;

import java.io.IOException;
Expand All @@ -74,18 +65,16 @@
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN;
import static org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase.randomNormalizedVector;
import static org.elasticsearch.index.IndexSettings.SYNTHETIC_VECTORS;
import static org.elasticsearch.index.codec.vectors.IVFVectorsFormat.DYNAMIC_NPROBE;
import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.DEFAULT_OVERSAMPLE;
import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.IVF_FORMAT;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertToXContentEquivalent;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

public class DenseVectorFieldMapperTests extends MapperTestCase {
public class DenseVectorFieldMapperTests extends SyntheticVectorsMapperTestCase {

private static final IndexVersion INDEXED_BY_DEFAULT_PREVIOUS_INDEX_VERSION = IndexVersions.V_8_10_0;
private final ElementType elementType;
Expand All @@ -95,7 +84,7 @@ public class DenseVectorFieldMapperTests extends MapperTestCase {

public DenseVectorFieldMapperTests() {
this.elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT, ElementType.BIT);
this.indexed = randomBoolean();
this.indexed = usually();
this.indexOptionsSet = this.indexed && randomBoolean();
int baseDims = ElementType.BIT == elementType ? 4 * Byte.SIZE : 4;
int randomMultiplier = ElementType.FLOAT == elementType ? randomIntBetween(1, 64) : 1;
Expand Down Expand Up @@ -160,17 +149,25 @@ private void indexMapping(XContentBuilder b, IndexVersion indexVersion) throws I
protected Object getSampleValueForDocument() {
return elementType == ElementType.FLOAT
? convertToList(randomNormalizedVector(this.dims))
: List.of((byte) 1, (byte) 1, (byte) 1, (byte) 1);
: convertToList(randomByteArrayOfLength(elementType == ElementType.BIT ? this.dims / Byte.SIZE : dims));
}

private static List<Float> convertToList(float[] vector) {
public static List<Float> convertToList(float[] vector) {
List<Float> list = new ArrayList<>(vector.length);
for (float v : vector) {
list.add(v);
}
return list;
}

public static List<Byte> convertToList(byte[] vector) {
List<Byte> list = new ArrayList<>(vector.length);
for (byte v : vector) {
list.add(v);
}
return list;
}

@Override
protected void registerParameters(ParameterChecker checker) throws IOException {
checker.registerConflictCheck(
Expand Down Expand Up @@ -2920,249 +2917,6 @@ public void testInvalidVectorDimensions() {
}
}

public void testSyntheticVectorsMinimalValidDocument() throws IOException {
assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS);
for (XContentType type : XContentType.values()) {
BytesReference source = generateRandomDoc(type, true, true, false, false, false);
assertSyntheticVectors(buildVectorMapping(), source, type);
}
}

public void testSyntheticVectorsFullDocument() throws IOException {
assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS);
for (XContentType type : XContentType.values()) {
BytesReference source = generateRandomDoc(type, true, true, true, true, false);
assertSyntheticVectors(buildVectorMapping(), source, type);
}
}

public void testSyntheticVectorsWithUnmappedFields() throws IOException {
assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS);
for (XContentType type : XContentType.values()) {
BytesReference source = generateRandomDoc(type, true, true, true, true, true);
assertSyntheticVectors(buildVectorMapping(), source, type);
}
}

public void testSyntheticVectorsMissingRootFields() throws IOException {
assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS);
for (XContentType type : XContentType.values()) {
BytesReference source = generateRandomDoc(type, false, false, false, false, false);
assertSyntheticVectors(buildVectorMapping(), source, type);
}
}

public void testSyntheticVectorsPartialNestedContent() throws IOException {
assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS);
for (XContentType type : XContentType.values()) {
BytesReference source = generateRandomDoc(type, true, true, true, false, false);
assertSyntheticVectors(buildVectorMapping(), source, type);
}
}

public void testFlatPathDocument() throws IOException {
assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS);
for (XContentType type : XContentType.values()) {
BytesReference source = generateRandomDocWithFlatPath(type);
assertSyntheticVectors(buildVectorMapping(), source, type);
}
}

private static String buildVectorMapping() throws IOException {
try (XContentBuilder builder = XContentBuilder.builder(XContentType.JSON.xContent())) {
builder.startObject(); // root
builder.startObject("_doc");
builder.field("dynamic", "false");

builder.startObject("properties");

// field
builder.startObject("field");
builder.field("type", "keyword");
builder.endObject();

// emb
builder.startObject("emb");
builder.field("type", "dense_vector");
builder.field("dims", 3);
builder.field("similarity", "cosine");
builder.endObject();

// another_field
builder.startObject("another_field");
builder.field("type", "keyword");
builder.endObject();

// obj
builder.startObject("obj");
builder.startObject("properties");

// nested
builder.startObject("nested");
builder.field("type", "nested");
builder.startObject("properties");

// nested.field
builder.startObject("field");
builder.field("type", "keyword");
builder.endObject();

// nested.emb
builder.startObject("emb");
builder.field("type", "dense_vector");
builder.field("dims", 3);
builder.field("similarity", "cosine");
builder.endObject();

// double_nested
builder.startObject("double_nested");
builder.field("type", "nested");
builder.startObject("properties");

// double_nested.field
builder.startObject("field");
builder.field("type", "keyword");
builder.endObject();

// double_nested.emb
builder.startObject("emb");
builder.field("type", "dense_vector");
builder.field("dims", 3);
builder.field("similarity", "cosine");
builder.endObject();

builder.endObject(); // double_nested.properties
builder.endObject(); // double_nested

builder.endObject(); // nested.properties
builder.endObject(); // nested

builder.endObject(); // obj.properties
builder.endObject(); // obj

builder.endObject(); // properties
builder.endObject(); // _doc
builder.endObject(); // root

return Strings.toString(builder);
}
}

private BytesReference generateRandomDoc(
XContentType xContentType,
boolean includeRootField,
boolean includeVector,
boolean includeNested,
boolean includeDoubleNested,
boolean includeUnmapped
) throws IOException {
try (var builder = XContentBuilder.builder(xContentType.xContent())) {
builder.startObject();

if (includeRootField) {
builder.field("field", randomAlphaOfLengthBetween(1, 2));
}

if (includeVector) {
builder.array("emb", new float[] { 1, 2, 3 });
}

if (includeUnmapped) {
builder.field("unmapped_field", "extra");
}

builder.startObject("obj");
if (includeNested) {
builder.startArray("nested");

// Entry with just a field
builder.startObject();
builder.field("field", randomAlphaOfLengthBetween(3, 6));
builder.endObject();

// Empty object
builder.startObject();
builder.endObject();

// Entry with emb and double_nested
if (includeDoubleNested) {
builder.startObject();
builder.array("emb", new float[] { 1, 2, 3 });
builder.field("field", "nested_val");
builder.startArray("double_nested");
for (int i = 0; i < 2; i++) {
builder.startObject();
builder.array("emb", new float[] { 1, 2, 3 });
builder.field("field", "dn_field");
builder.endObject();
}
builder.endArray();
builder.endObject();
}

builder.endArray();
}
builder.endObject();

builder.endObject();
return BytesReference.bytes(builder);
}
}

private BytesReference generateRandomDocWithFlatPath(XContentType xContentType) throws IOException {
try (var builder = XContentBuilder.builder(xContentType.xContent())) {
builder.startObject();

// Root-level fields
builder.field("field", randomAlphaOfLengthBetween(1, 2));
builder.array("emb", new float[] { 1, 2, 3 });
builder.field("another_field", randomAlphaOfLengthBetween(3, 5));

// Simulated flattened "obj.nested"
builder.startObject("obj.nested");

builder.field("field", randomAlphaOfLengthBetween(4, 8));
builder.array("emb", new float[] { 1, 2, 3 });

builder.startArray("double_nested");
for (int i = 0; i < randomIntBetween(1, 2); i++) {
builder.startObject();
builder.field("field", randomAlphaOfLengthBetween(4, 8));
builder.array("emb", new float[] { 1, 2, 3 });
builder.endObject();
}
builder.endArray();

builder.endObject(); // end obj.nested

builder.endObject();
return BytesReference.bytes(builder);
}
}

private void assertSyntheticVectors(String mapping, BytesReference source, XContentType xContentType) throws IOException {
var settings = Settings.builder().put(IndexSettings.INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING.getKey(), true).build();
MapperService mapperService = createMapperService(settings, mapping);
var parsedDoc = mapperService.documentMapper().parse(new SourceToParse("0", source, xContentType));
try (var directory = newDirectory()) {
IndexWriterConfig config = newIndexWriterConfig(random(), new StandardAnalyzer());
try (var iw = new RandomIndexWriter(random(), directory, config)) {
parsedDoc.updateSeqID(0, 1);
parsedDoc.version().setLongValue(0);
iw.addDocuments(parsedDoc.docs());
}
try (var indexReader = wrapInMockESDirectoryReader(DirectoryReader.open(directory))) {
var provider = SourceProvider.fromLookup(
mapperService.mappingLookup(),
null,
mapperService.getMapperMetrics().sourceFieldMetrics()
);
var searchSource = provider.getSource(indexReader.leaves().get(0), parsedDoc.docs().size() - 1);
assertToXContentEquivalent(source, searchSource.internalSourceRef(), xContentType);
}
}
}

@Override
protected IngestScriptSupport ingestScriptSupport() {
throw new AssumptionViolatedException("not supported");
Expand Down
Loading