From b8a1162738e87ea75cbc0fd186f29f4738d44a55 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Tue, 7 Jan 2025 21:17:26 +0530 Subject: [PATCH 01/34] Initial cut of CuVS into Lucene as a Codec in sandbox --- build-tools/build-infra/build.gradle | 1 + gradle/globals.gradle | 1 + lucene/sandbox/build.gradle | 7 + lucene/sandbox/src/java/module-info.java | 5 +- .../vectorsearch/CagraFieldVectorsWriter.java | 35 ++ .../sandbox/vectorsearch/CuVSCodec.java | 31 ++ .../sandbox/vectorsearch/CuVSIndex.java | 56 +++ .../vectorsearch/CuVSKnnFloatVectorQuery.java | 33 ++ .../sandbox/vectorsearch/CuVSSegmentFile.java | 43 +++ .../vectorsearch/CuVSVectorsFormat.java | 70 ++++ .../vectorsearch/CuVSVectorsReader.java | 310 ++++++++++++++++ .../vectorsearch/CuVSVectorsWriter.java | 339 ++++++++++++++++++ .../vectorsearch/PerLeafCuVSKnnCollector.java | 74 ++++ .../vectorsearch/SegmentInputStream.java | 90 +++++ .../lucene/sandbox/vectorsearch/Util.java | 142 ++++++++ .../sandbox/vectorsearch/package-info.java | 1 + .../sandbox/vectorsearch/IntegrationTest.java | 201 +++++++++++ versions.toml | 6 + 18 files changed, 1444 insertions(+), 1 deletion(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java diff --git a/build-tools/build-infra/build.gradle b/build-tools/build-infra/build.gradle index 5cb1426cba97..34d71f7509d3 100644 --- a/build-tools/build-infra/build.gradle +++ b/build-tools/build-infra/build.gradle @@ -22,6 +22,7 @@ plugins { } repositories { + mavenLocal() mavenCentral() } diff --git a/gradle/globals.gradle b/gradle/globals.gradle index bcab6461ea91..25bfddc9bebf 100644 --- a/gradle/globals.gradle +++ b/gradle/globals.gradle @@ -22,6 +22,7 @@ allprojects { // Repositories to fetch dependencies from. 
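+// NOTE: mavenLocal() is listed ahead of mavenCentral() below, presumably so a locally
+// installed cuvs-java artifact can be resolved while it is not yet published publicly.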
repositories { + mavenLocal() mavenCentral() } diff --git a/lucene/sandbox/build.gradle b/lucene/sandbox/build.gradle index 72762fe1c3d2..6d225fd78ba4 100644 --- a/lucene/sandbox/build.gradle +++ b/lucene/sandbox/build.gradle @@ -19,9 +19,16 @@ apply plugin: 'java-library' description = 'Various third party contributions and new ideas' +repositories { + mavenLocal() +} + + dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:queries') moduleApi project(':lucene:facet') moduleTestImplementation project(':lucene:test-framework') + moduleImplementation deps.commons.lang3 + moduleImplementation deps.cuvs } diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index f40a05af433a..b2d45adf4d30 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -20,7 +20,10 @@ requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; - + requires java.logging; + requires com.nvidia.cuvs; + requires org.apache.commons.lang3; + exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; exports org.apache.lucene.sandbox.codecs.quantization; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java new file mode 100644 index 000000000000..21c088bd84f8 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -0,0 +1,35 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.index.FieldInfo; + +public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { + + public final String fieldName; + public final ConcurrentHashMap vectors = new ConcurrentHashMap(); + public int fieldVectorDimension = -1; + + public CagraFieldVectorsWriter(FieldInfo fieldInfo) { + this.fieldName = fieldInfo.getName(); + this.fieldVectorDimension = fieldInfo.getVectorDimension(); + } + + @Override + public long ramBytesUsed() { + return fieldName.getBytes().length + Integer.BYTES + (vectors.size() * fieldVectorDimension * Float.BYTES); + } + + @Override + public void addValue(int docID, float[] vectorValue) throws IOException { + vectors.put(docID, vectorValue); + } + + @Override + public float[] copyValue(float[] vectorValue) { + throw new UnsupportedOperationException(); + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java new file mode 100644 index 000000000000..448803bb7fc4 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -0,0 +1,31 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; + + +public class CuVSCodec extends FilterCodec { + + public CuVSCodec() { + this("CuVSCodec", new Lucene101Codec()); + } + + public CuVSCodec(String name, Codec delegate) { + super(name, delegate); + setKnnFormat(new CuVSVectorsFormat(1, 128, 64, 
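+// arguments map to CuVSVectorsFormat(cuvsWriterThreads=1, intGraphDegree=128, graphDegree=64, mergeStrategy)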
MergeStrategy.NON_TRIVIAL_MERGE)); + } + + KnnVectorsFormat knnFormat = null; + + @Override + public KnnVectorsFormat knnVectorsFormat() { + return knnFormat; + } + + public void setKnnFormat(KnnVectorsFormat format) { + this.knnFormat = format; + } +} \ No newline at end of file diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java new file mode 100644 index 000000000000..1878b6c236bc --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -0,0 +1,56 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.util.List; +import java.util.Objects; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.CagraIndex; + +public class CuVSIndex { + private final CagraIndex cagraIndex; + private final BruteForceIndex bruteforceIndex; + private final List mapping; + private final List vectors; + private final int maxDocs; + + private final String fieldName; + private final String segmentName; + + public CuVSIndex(String segmentName, String fieldName, CagraIndex cagraIndex, List mapping, List vectors, int maxDocs, BruteForceIndex bruteforceIndex) { + this.cagraIndex = Objects.requireNonNull(cagraIndex); + this.bruteforceIndex = Objects.requireNonNull(bruteforceIndex); + this.mapping = Objects.requireNonNull(mapping); + this.vectors = Objects.requireNonNull(vectors); + this.fieldName = Objects.requireNonNull(fieldName); + this.segmentName = Objects.requireNonNull(segmentName); + this.maxDocs = Objects.requireNonNull(maxDocs); + } + + public CagraIndex getCagraIndex() { + return cagraIndex; + } + + public BruteForceIndex getBruteforceIndex() { + return bruteforceIndex; + } + + public List getMapping() { + return mapping; + } + + public String getFieldName() { + return fieldName; + } + + public List getVectors() { + return vectors; + } + + public String getSegmentName() { + return segmentName; + } + + public int getMaxDocs() { + return maxDocs; + } +} \ No newline at end of file diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java new file mode 100644 index 000000000000..1bbae88c5630 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -0,0 +1,33 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; + +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.util.Bits; + +public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { + + final private int iTopK; + final private int searchWidth; + + public CuVSKnnFloatVectorQuery(String field, float[] target, int k, int iTopK, int searchWidth) { + super(field, target, k); + this.iTopK = iTopK; + this.searchWidth = searchWidth; + } + + @Override + protected TopDocs approximateSearch(LeafReaderContext context, Bits acceptDocs, int visitedLimit, KnnCollectorManager knnCollectorManager) throws IOException { + + PerLeafCuVSKnnCollector results = new PerLeafCuVSKnnCollector(k, iTopK, searchWidth); + + LeafReader reader = context.reader(); + reader.searchNearestVectors(field, this.getTargetCopy(), results, null); + return 
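+// searchNearestVectors() routes through CuVSVectorsReader.search(), which fills the
+// per-leaf collector; topDocs() simply wraps the hits it gathered.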
results.topDocs(); + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java new file mode 100644 index 000000000000..9ca0d63ba087 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -0,0 +1,43 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Logger; +import java.util.zip.Deflater; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; + +public class CuVSSegmentFile implements AutoCloseable{ + final private ZipOutputStream zos; + + private Set filesAdded = new HashSet(); + + public CuVSSegmentFile(OutputStream out) { + zos = new ZipOutputStream(out); + zos.setLevel(Deflater.NO_COMPRESSION); + } + + protected Logger log = Logger.getLogger(getClass().getName()); + + public void addFile(String name, byte[] bytes) throws IOException { + log.info("Writing the file: " + name + ", size="+bytes.length + ", space remaining: "+new File("/").getFreeSpace()); + ZipEntry indexFileZipEntry = new ZipEntry(name); + zos.putNextEntry(indexFileZipEntry); + zos.write(bytes, 0, bytes.length); + zos.closeEntry(); + filesAdded.add(name); + } + + public Set getFilesAdded() { + return Collections.unmodifiableSet(filesAdded); + } + + @Override + public void close() throws IOException { + zos.close(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java new file mode 100644 index 000000000000..c17b5258c9d5 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -0,0 +1,70 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; + +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; + +import com.nvidia.cuvs.CuVSResources; + +public class CuVSVectorsFormat extends KnnVectorsFormat { + + public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; + public static final String VECTOR_DATA_EXTENSION = "cag"; + public static final String META_EXTENSION = "cagmf"; + public static final int VERSION_CURRENT = 0; + public final int maxDimensions = 4096; + public final int cuvsWriterThreads; + public final int intGraphDegree; + public final int graphDegree; + public MergeStrategy mergeStrategy; + public static CuVSResources resources; + + public CuVSVectorsFormat() { + super("CuVSVectorsFormat"); + this.cuvsWriterThreads = 1; + this.intGraphDegree = 128; + this.graphDegree = 64; + try { + resources = new CuVSResources(); + } catch (Throwable e) { + e.printStackTrace(); + } + } + + public CuVSVectorsFormat(int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) { + super("CuVSVectorsFormat"); + this.mergeStrategy = mergeStrategy; + this.cuvsWriterThreads = cuvsWriterThreads; + this.intGraphDegree = intGraphDegree; + this.graphDegree = graphDegree; + try { + resources = new CuVSResources(); + } catch (Throwable e) { + e.printStackTrace(); + } + } + + @Override + public CuVSVectorsWriter 
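+// fieldsWriter: creates the per-segment writer, passing along the CAGRA build
+// parameters and the shared CuVSResources handle.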
fieldsWriter(SegmentWriteState state) throws IOException { + return new CuVSVectorsWriter(state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); + } + + @Override + public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException { + try { + return new CuVSVectorsReader(state, resources); + } catch (Throwable e) { + e.printStackTrace(); + } + return null; + } + + @Override + public int getMaxDimensions(String fieldName) { + return maxDimensions; + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java new file mode 100644 index 000000000000..cac870afec6c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -0,0 +1,310 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.lang.StackWalker.StackFrame; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.logging.Logger; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.commons.lang3.SerializationUtils; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.TopKnnCollector; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceQuery; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraQuery; +import com.nvidia.cuvs.CagraSearchParams; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.HnswIndex; +import com.nvidia.cuvs.HnswIndexParams; + +public class CuVSVectorsReader extends KnnVectorsReader { + + protected Logger log = Logger.getLogger(getClass().getName()); + + IndexInput vectorDataReader = null; + public String fileName = null; + public byte[] indexFileBytes; + public int[] docIds; + public float[] vectors; + public SegmentReadState segmentState = null; + public int indexFilePayloadSize = 0; + public long initialFilePointerLoc = 0; + public SegmentInputStream segmentInputStream; + + // Field to List of Indexes + public Map> cuvsIndexes; + + private CuVSResources resources; + + public CuVSVectorsReader(SegmentReadState state, CuVSResources resources) throws Throwable { + + segmentState = state; + this.resources = resources; + + fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, + CuVSVectorsFormat.VECTOR_DATA_EXTENSION); + + vectorDataReader = segmentState.directory.openInput(fileName, segmentState.context); + CodecUtil.readIndexHeader(vectorDataReader); + + initialFilePointerLoc = vectorDataReader.getFilePointer(); + indexFilePayloadSize = (int)vectorDataReader.length() - (int)initialFilePointerLoc; //vectorMetaReader.readInt(); + 
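+// The payload is the zip archive produced by CuVSVectorsWriter, framed by the codec
+// header consumed above and a codec footer. Note the size computed here still includes
+// the trailing footer bytes; the ZipInputStream presumably stops at the end of the
+// archive, so they are never interpreted as zip data.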
segmentInputStream = new SegmentInputStream(vectorDataReader, indexFilePayloadSize, initialFilePointerLoc); + log.info("payloadSize: " + indexFilePayloadSize); + log.info("initialFilePointerLoc: " + initialFilePointerLoc); + + List stackTrace = StackWalker.getInstance().walk(this::getStackTrace); + + boolean isMergeCase = false; + for (StackFrame s : stackTrace) { + if (s.toString().startsWith("org.apache.lucene.index.IndexWriter.merge")) { + isMergeCase = true; + log.info("Reader opening on merge call"); + break; + } + } + + log.info("Source of this segment "+segmentState.segmentSuffix+" is " + segmentState.segmentInfo.getDiagnostics().get(IndexWriter.SOURCE)); + log.info("Loading for " + segmentState.segmentInfo.name + ", mergeCase? " + isMergeCase); + //if (!isMergeCase) { nocommit: TODO: don't load the cagra index for merge case. + log.info("Not the merge case, hence loading for " + segmentState.segmentInfo.name); + this.cuvsIndexes = loadCuVSIndex(getIndexInputStream(), isMergeCase); + //} + } + + @SuppressWarnings({"unchecked"}) + private Map> loadCuVSIndex(ZipInputStream zis, boolean isMergeCase) throws Throwable { + Map> ret = new HashMap>(); + Map cagraIndexes = new HashMap(); + Map bruteforceIndexes = new HashMap(); + Map hnswIndexes = new HashMap(); + Map> mappings = new HashMap>(); + Map> vectors = new HashMap>(); + + Map maxDocs = null; // map of segment, maxDocs + ZipEntry ze; + while ((ze = zis.getNextEntry()) != null) { + String entry = ze.getName(); + + String segmentField = entry.split("\\.")[0]; + String extension = entry.split("\\.")[1]; + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] buffer = new byte[1024]; + int len = 0; + while ((len = zis.read(buffer)) != -1) { + baos.write(buffer, 0, len); + } + + switch (extension) { + case "meta": { + maxDocs = (Map) SerializationUtils.deserialize(baos.toByteArray()); // nocommit use IOUtils + break; + } + case "vec": { + vectors.put(segmentField, (List) SerializationUtils.deserialize(baos.toByteArray())); // nocommit use IOUtils + break; + } + case "map": { + List map = (List) SerializationUtils.deserialize(baos.toByteArray()); // nocommit use IOUtils + mappings.put(segmentField, map); + break; + } + case "cag": { + cagraIndexes.put(segmentField, new CagraIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .build()); + break; + } + case "bf": { + bruteforceIndexes.put(segmentField, new BruteForceIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .build()); + break; + } + case "hnsw": { + HnswIndexParams indexParams = new HnswIndexParams.Builder(resources) + .build(); + hnswIndexes.put(segmentField, new HnswIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .withIndexParams(indexParams) + .build()); + break; + } + } + } + + log.info("Loading cuvsIndexes from segment: " + segmentState.segmentInfo.name); + log.info("Diagnostics for this segment: " + segmentState.segmentInfo.getDiagnostics()); + log.info("Loading map of cagraIndexes: " + cagraIndexes); + log.info("Loading vectors: " + vectors); + log.info("Loading mapping: " + mappings); + + for (String segmentField: cagraIndexes.keySet()) { + log.info("Loading segmentField: " + segmentField); + String segment = segmentField.split("/")[0]; + String field = segmentField.split("/")[1]; + CuVSIndex cuvsIndex = new CuVSIndex(segment, field, cagraIndexes.get(segmentField), mappings.get(segmentField), vectors.get(segmentField), maxDocs.get(segment), 
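+// the brute-force index rides along with CAGRA; search() falls back to it when k > 1024,
+// where it can also apply the acceptDocs prefilter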
bruteforceIndexes.get(segmentField)); + List listOfIndexes = ret.containsKey(field)? ret.get(field): new ArrayList(); + listOfIndexes.add(cuvsIndex); + ret.put(field, listOfIndexes); + } + return ret; + } + + public List getStackTrace(Stream stackFrameStream) { + return stackFrameStream.collect(Collectors.toList()); + } + + public ZipInputStream getIndexInputStream() throws IOException { + segmentInputStream.reset(); + return new ZipInputStream(segmentInputStream); + } + + @Override + public void close() throws IOException { + IOUtils.close(vectorDataReader); + } + + @Override + public void checkIntegrity() throws IOException { + // TODO: Pending implementation + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + throw new UnsupportedOperationException(); + /*return new FloatVectorValues() { + + int pos = -1; + + @Override + public int nextDoc() throws IOException { + pos++; + int size = cuvsIndexes.get(field).get(0).getMapping().size(); + if (pos >= size) return FloatVectorValues.NO_MORE_DOCS; + return cuvsIndexes.get(field).get(0).getMapping().get(pos); + } + + @Override + public int docID() { + return cuvsIndexes.get(field).get(0).getMapping().get(pos); + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public float[] vectorValue() throws IOException { + return cuvsIndexes.get(field).get(0).getVectors().get(pos); + + } + + @Override + public int size() { + return cuvsIndexes.get(field).get(0).getVectors().size(); + } + + @Override + public VectorScorer scorer(float[] query) throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public int dimension() { + // TODO Auto-generated method stub + return cuvsIndexes.get(field).get(0).getVectors().get(0).length; + } + };*/ + } + + @Override + public ByteVectorValues getByteVectorValues(String field) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + PerLeafCuVSKnnCollector cuvsCollector = knnCollector instanceof PerLeafCuVSKnnCollector? ((PerLeafCuVSKnnCollector)knnCollector): new PerLeafCuVSKnnCollector(knnCollector.k(), knnCollector.k(), 1); + TopKnnCollector defaultCollector = knnCollector instanceof TopKnnCollector? ((TopKnnCollector)knnCollector): null; + + int prevDocCount = 0; + + // log.debug("Will try to search all the indexes for segment "+segmentState.segmentInfo.name+", field "+field+": "+cuvsIndexes); + for (CuVSIndex cuvsIndex: cuvsIndexes.get(field)) { + try { + Map result = new HashMap(); + if (cuvsCollector.k() <= 1024) { + CagraSearchParams searchParams = new CagraSearchParams.Builder(resources) + .withItopkSize(cuvsCollector.iTopK) + .withSearchWidth(cuvsCollector.searchWidth) + .build(); + + CagraQuery query = new CagraQuery.Builder() + .withTopK(cuvsCollector.k()) + .withSearchParams(searchParams) + .withMapping(cuvsIndex.getMapping()) + .withQueryVectors(new float[][] {target}) + .build(); + + CagraIndex cagraIndex = cuvsIndex.getCagraIndex(); + assert (cagraIndex != null); + log.info("k is " + cuvsCollector.k()); + result = cagraIndex.search(query).getResults().get(0); // List expected to have only one entry because of single query "target". 
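+// result maps per-index doc ids to scores; the ids are shifted by prevDocCount below so
+// hits from consecutive CuVS indexes of this segment do not collide.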
+ log.info("INTERMEDIATE RESULT FROM CUVS: " + result + ", prevDocCount=" + prevDocCount); + } else { + BruteForceQuery bruteforceQuery = new BruteForceQuery.Builder() + .withQueryVectors(new float[][] { target }) + .withPrefilter(((FixedBitSet)acceptDocs).getBits()) + .withTopK(cuvsCollector.k()) + .build(); + + BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex(); + result = bruteforceIndex.search(bruteforceQuery).getResults().get(0); + } + + for(Entry kv : result.entrySet()) { + if (defaultCollector != null) { + defaultCollector.collect(prevDocCount + kv.getKey(), kv.getValue()); + } + cuvsCollector.collect(prevDocCount + kv.getKey(), kv.getValue()); + } + + } catch (Throwable e) { + e.printStackTrace(); + } + prevDocCount += cuvsIndex.getMaxDocs(); + } + } + + @Override + public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java new file mode 100644 index 000000000000..1da7ca0f9e6c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -0,0 +1,339 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.logging.Logger; + +import org.apache.commons.lang3.SerializationUtils; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Sorter.DocMap; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.IOUtils; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceIndexParams; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraIndexParams; +import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; +import com.nvidia.cuvs.CuVSResources; + +public class CuVSVectorsWriter extends KnnVectorsWriter { + + protected Logger log = Logger.getLogger(getClass().getName()); + + private List fieldVectorWriters = new ArrayList<>(); + private IndexOutput cuVSIndex = null; + private SegmentWriteState segmentWriteState = null; + private String cuVSDataFilename = null; + + private CagraIndex cagraIndex; + private CagraIndex cagraIndexForHnsw; + + private int cuvsWriterThreads; + private int intGraphDegree; + private int graphDegree; + private MergeStrategy mergeStrategy; + private CuVSResources resources; + + public enum MergeStrategy { + TRIVIAL_MERGE, NON_TRIVIAL_MERGE + }; + + public CuVSVectorsWriter(SegmentWriteState state, int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy, CuVSResources resources) + throws IOException { + super(); + this.segmentWriteState = state; + this.mergeStrategy = mergeStrategy; + this.cuvsWriterThreads = cuvsWriterThreads; + this.intGraphDegree = intGraphDegree; + this.graphDegree = graphDegree; + this.resources = resources; + + 
cuVSDataFilename = IndexFileNames.segmentFileName(this.segmentWriteState.segmentInfo.name, this.segmentWriteState.segmentSuffix, CuVSVectorsFormat.VECTOR_DATA_EXTENSION); + } + + @Override + public long ramBytesUsed() { + return 0; + } + + @Override + public void close() throws IOException { + IOUtils.close(cuVSIndex); + cuVSIndex = null; + fieldVectorWriters.clear(); + fieldVectorWriters = null; + } + + @Override + public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + CagraFieldVectorsWriter cagraFieldVectorWriter = new CagraFieldVectorsWriter(fieldInfo); + fieldVectorWriters.add(cagraFieldVectorWriter); + return cagraFieldVectorWriter; + } + + private byte[] createCagraIndex(float[][] vectors, List mapping) throws Throwable { + CagraIndexParams indexParams = new CagraIndexParams.Builder(resources) + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(intGraphDegree) + .withGraphDegree(graphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + + log.info("Indexing started: " + System.currentTimeMillis()); + cagraIndex = new CagraIndex.Builder(resources) + .withDataset(vectors) + .withIndexParams(indexParams) + .build(); + log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + vectors.length); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + File tmpFile = File.createTempFile("tmpindex", "cag"); // TODO: Should we make this a file with random names? + cagraIndex.serialize(baos, tmpFile); + return baos.toByteArray(); + } + + private byte[] createBruteForceIndex(float[][] vectors) throws Throwable { + BruteForceIndexParams indexParams = new BruteForceIndexParams.Builder() + .withNumWriterThreads(32) // TODO: Make this configurable later. + .build(); + + log.info("Indexing started: " + System.currentTimeMillis()); + BruteForceIndex index = new BruteForceIndex.Builder(resources) + .withIndexParams(indexParams) + .withDataset(vectors) + .build(); + + log.info("Indexing done: " + System.currentTimeMillis()); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + index.serialize(baos); + return baos.toByteArray(); + } + + private byte[] createHnswIndex(float[][] vectors) throws Throwable { + CagraIndexParams indexParams = new CagraIndexParams.Builder(resources) + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(intGraphDegree) + .withGraphDegree(graphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + + log.info("Indexing started: " + System.currentTimeMillis()); + cagraIndexForHnsw = new CagraIndex.Builder(resources) + .withDataset(vectors) + .withIndexParams(indexParams) + .build(); + log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + vectors.length); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + File tmpFile = File.createTempFile("tmpindex", "hnsw"); + cagraIndexForHnsw.serializeToHNSW(baos, tmpFile); + return baos.toByteArray(); + } + + @SuppressWarnings({"resource", "rawtypes", "unchecked"}) + @Override + public void flush(int maxDoc, DocMap sortMap) throws IOException { + cuVSIndex = this.segmentWriteState.directory.createOutput(cuVSDataFilename, this.segmentWriteState.context); + CodecUtil.writeIndexHeader(cuVSIndex, CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, CuVSVectorsFormat.VERSION_CURRENT, this.segmentWriteState.segmentInfo.getId(), this.segmentWriteState.segmentSuffix); + + + CuVSSegmentFile cuVSFile = new CuVSSegmentFile(new SegmentOutputStream(cuVSIndex, 100000)); + + 
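+// flush packs each field's CAGRA, brute-force and HNSW indexes, plus the raw vectors
+// and docId mapping, as uncompressed zip entries streamed through a buffered adapter
+// onto the Lucene IndexOutput.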
LinkedHashMap metaMap = new LinkedHashMap(); + + for (CagraFieldVectorsWriter field : fieldVectorWriters) { + long start = System.currentTimeMillis(); + + byte[] cagraIndexBytes = null; + byte[] bruteForceIndexBytes = null; + byte[] hnswIndexBytes = null; + try { + log.info("Starting CAGRA indexing, space remaining: "+new File("/").getFreeSpace()); + log.info("Starting CAGRA indexing, docs: " + field.vectors.size()); + + float vectors[][] = new float[field.vectors.size()][field.vectors.get(0).length]; + for (int i = 0; i < vectors.length; i++) { + for (int j = 0; j < vectors[i].length; j++) { + vectors[i][j] = field.vectors.get(i)[j]; + } + } + + cagraIndexBytes = createCagraIndex(vectors, new ArrayList(field.vectors.keySet())); // nocommit + bruteForceIndexBytes = createBruteForceIndex(vectors); + hnswIndexBytes = createHnswIndex(vectors); + } catch (Throwable e) { + e.printStackTrace(); + } + + start = System.currentTimeMillis(); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".cag", cagraIndexBytes); + log.info("time for writing CAGRA index bytes to zip: " + (System.currentTimeMillis() - start)); + + start = System.currentTimeMillis(); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".bf", bruteForceIndexBytes); + log.info("time for writing BRUTEFORCE index bytes to zip: " + (System.currentTimeMillis() - start)); + + start = System.currentTimeMillis(); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".hnsw", hnswIndexBytes); + log.info("time for writing HNSW index bytes to zip: " + (System.currentTimeMillis() - start)); + + start = System.currentTimeMillis(); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".vec", SerializationUtils.serialize(new ArrayList(field.vectors.values()))); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".map", SerializationUtils.serialize(new ArrayList(field.vectors.keySet()))); + log.info("list serializing and writing: " + (System.currentTimeMillis() - start)); + field.vectors.clear(); + } + + metaMap.put(segmentWriteState.segmentInfo.name, maxDoc); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); + cuVSFile.close(); + + CodecUtil.writeFooter(cuVSIndex); + } + + SegmentOutputStream mergeOutputStream = null; + CuVSSegmentFile mergedIndexFile = null; + + @SuppressWarnings("resource") + @Override + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + List segInputStreams = new ArrayList(); + List readers = new ArrayList(); + + for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { + CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; + segInputStreams.add(reader.segmentInputStream); + readers.add(reader); + } + + log.info("Merging one field for segment: " + segmentWriteState.segmentInfo.name); + log.info("Segment files? 
" + Arrays.toString(segmentWriteState.directory.listAll())); + + if (!List.of(segmentWriteState.directory.listAll()).contains(cuVSDataFilename)) { + IndexOutput mergedVectorIndex = segmentWriteState.directory.createOutput(cuVSDataFilename, segmentWriteState.context); + CodecUtil.writeIndexHeader(mergedVectorIndex, CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, + CuVSVectorsFormat.VERSION_CURRENT, segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); + this.mergeOutputStream = new SegmentOutputStream(mergedVectorIndex, 100000); + mergedIndexFile = new CuVSSegmentFile(this.mergeOutputStream); + } + + log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); + + if (mergeStrategy.equals(MergeStrategy.TRIVIAL_MERGE)) { + Util.getMergedArchiveCOS(segInputStreams, segmentWriteState.segmentInfo.name, this.mergeOutputStream + ); + } else if (mergeStrategy.equals(MergeStrategy.NON_TRIVIAL_MERGE)) { + // nocommit: this doesn't merge all the fields + log.info("Readers: "+segInputStreams.size()+", deocMaps: "+mergeState.docMaps.length); + ArrayList docMapList = new ArrayList(); + + for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { + CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; + for (CuVSIndex index: reader.cuvsIndexes.get(fieldInfo.name)) { + log.info("Mapping for segment ("+reader.fileName+"): " + index.getMapping()); + log.info("Mapping for segment ("+reader.fileName+"): " + index.getMapping().size()); + for (int id=0; id mergedVectors = Util.getMergedVectors(segInputStreams, fieldInfo.name, segmentWriteState.segmentInfo.name); + log.info("Final mapping: " + docMapList); + log.info("Final mapping: " + docMapList.size()); + log.info("Merged vectors: " + mergedVectors.size()); + LinkedHashMap metaMap = new LinkedHashMap(); + byte[] cagraIndexBytes = null; + byte[] bruteForceIndexBytes = null; + byte[] hnswIndexBytes = null; + try { + float vectors[][] = new float[mergedVectors.size()][mergedVectors.get(0).length]; + for (int i = 0; i < vectors.length; i++) { + for (int j = 0; j < vectors[i].length; j++) { + vectors[i][j] = mergedVectors.get(i)[j]; + } + } + cagraIndexBytes = createCagraIndex(vectors, new ArrayList()); + bruteForceIndexBytes = createBruteForceIndex(vectors); + hnswIndexBytes = createHnswIndex(vectors); + } catch (Throwable e) { + e.printStackTrace(); + } + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".cag", cagraIndexBytes); + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".bf", bruteForceIndexBytes); + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".hnsw", hnswIndexBytes); + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".vec", SerializationUtils.serialize(mergedVectors)); + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".map", SerializationUtils.serialize(docMapList)); + metaMap.put(segmentWriteState.segmentInfo.name, mergedVectors.size()); + if (mergedIndexFile.getFilesAdded().contains(segmentWriteState.segmentInfo.name + ".meta") == false) { + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); + } + log.info("DocMaps: "+Arrays.toString(mergeState.docMaps)); + + metaMap.clear(); + } + } + + + @Override + public void finish() throws IOException { + if (this.mergeOutputStream!=null) { + mergedIndexFile.close(); + 
CodecUtil.writeFooter(mergeOutputStream.out); + IOUtils.close(mergeOutputStream.out); + this.mergeOutputStream = null; + this.mergedIndexFile = null; + } + } + + public class SegmentOutputStream extends OutputStream { + + IndexOutput out; + int bufferSize; + byte[] buffer; + int p; + + public SegmentOutputStream(IndexOutput out, int bufferSize) throws IOException { + super(); + this.out = out; + this.bufferSize = bufferSize; + this.buffer = new byte[this.bufferSize]; + } + + @Override + public void write(int b) throws IOException { + buffer[p] = (byte) b; + p += 1; + if (p == bufferSize) { + flush(); + } + } + + @Override + public void flush() throws IOException { + out.writeBytes(buffer, p); + p = 0; + } + + @Override + public void close() throws IOException { + this.flush(); + } + + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java new file mode 100644 index 000000000000..d4d19fad7041 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -0,0 +1,74 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHits; + +public class PerLeafCuVSKnnCollector implements KnnCollector { + + public List scoreDocs; + public int topK = 0; + public int iTopK = topK; // TODO getter, no setter + public int searchWidth = 1; // TODO getter, no setter + public int results = 0; + + public PerLeafCuVSKnnCollector(int topK, int iTopK, int searchWidth) { + super(); + this.topK = topK; + this.iTopK = iTopK; + this.searchWidth = searchWidth; + scoreDocs = new ArrayList(); + } + + @Override + public boolean earlyTerminated() { + // TODO: may need implementation + return false; + } + + @Override + public void incVisitedCount(int count) { + // TODO: may need implementation + } + + @Override + public long visitedCount() { + // TODO: may need implementation + return 0; + } + + @Override + public long visitLimit() { + // TODO: may need implementation + return 0; + } + + @Override + public int k() { + return topK; + } + + @Override + @SuppressWarnings("cast") + public boolean collect(int docId, float similarity) { + scoreDocs.add(new ScoreDoc(docId, 1f/(float)(similarity))); + return true; + } + + @Override + public float minCompetitiveSimilarity() { + // TODO: may need implementation + return 0; + } + + @Override + public TopDocs topDocs() { + return new TopDocs(new TotalHits(scoreDocs.size(), TotalHits.Relation.EQUAL_TO), + scoreDocs.toArray(new ScoreDoc[scoreDocs.size()])); + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java new file mode 100644 index 000000000000..a352269fbb1b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -0,0 +1,90 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.lucene.store.IndexInput; + +public class SegmentInputStream extends InputStream { + + /** + * + */ + private final IndexInput indexInput; + public final long initialFilePointerPosition; + public final long limit; + public 
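+// pos tracks bytes consumed from this slice; reads are clamped so at most `limit` bytes
+// past initialFilePointerPosition are returned.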
long pos = 0; + + // TODO: This input stream needs to be modified to enable buffering. + public SegmentInputStream(IndexInput indexInput, long limit, long initialFilePointerPosition) throws IOException { + super(); + this.indexInput = indexInput; + this.initialFilePointerPosition = initialFilePointerPosition; + this.limit = limit; + + this.indexInput.seek(initialFilePointerPosition); + } + + @Override + public int read() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int read(byte[] b, int off, int len) { + try { + long avail = limit - pos; + if (pos >= limit) { + return -1; + } + if (len > avail) { + len = (int) avail; + } + if (len <= 0) { + return 0; + } + indexInput.readBytes(b, off, len); + pos += len; + return len; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public int read(byte[] b) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void reset() throws IOException { + indexInput.seek(initialFilePointerPosition); + pos = 0; + } + + @Override + public long skip(long n) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean markSupported() { + return true; + } + + @Override + public void mark(int readlimit) { + throw new UnsupportedOperationException(); + } + + @Override + public void close() { + // Do nothing for now. + } + + @Override + public int available() { + throw new UnsupportedOperationException(); + } + +} \ No newline at end of file diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java new file mode 100644 index 000000000000..a8200e7b897b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -0,0 +1,142 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Logger; +import java.util.zip.Deflater; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipOutputStream; + +public class Util { + + public static ByteArrayOutputStream getZipEntryBAOS(String fileName, SegmentInputStream segInputStream) + throws IOException { + segInputStream.reset(); + ZipInputStream zipInputStream = new ZipInputStream(segInputStream); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + boolean fileFound = false; + ZipEntry zipEntry; + while (zipInputStream.available() == 1 && ((zipEntry = zipInputStream.getNextEntry()) != null)) { + if (zipEntry.getName().equals(fileName)) { + fileFound = true; + byte[] buffer = new byte[1024]; + int length; + while ((length = zipInputStream.read(buffer)) != -1) { + baos.write(buffer, 0, length); + } + } + } + if (!fileFound) throw new FileNotFoundException(); + return baos; + } + + private static final Logger log = Logger.getLogger(Util.class.getName()); + + public static ArrayList getMergedVectors(List segInputStreams, String fieldName, String mergedSegmentName) + throws IOException { + ZipEntry zs; + ArrayList mergedVectors = new ArrayList(); + log.info("Getting mergedVectors..."); + for 
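+// each segment archive is rewound and rescanned; ".vec" entries belonging to the
+// requested field are deserialized and appended in reader order.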
(SegmentInputStream segInputStream : segInputStreams) { + segInputStream.reset(); + ZipInputStream zipStream = new ZipInputStream(segInputStream); + while ((zs = zipStream.getNextEntry()) != null) { + log.info("Getting mergedVectors... " + zs.getName()); + byte[] buffer = new byte[1024]; + int length; + if (zs.getName().endsWith(".vec")) { + String field = zs.getName().split("\\.")[0].split("/")[1]; + if (fieldName.equals(field)) { + ByteArrayOutputStream baosM = new ByteArrayOutputStream(); + while ((length = zipStream.read(buffer)) != -1) { + baosM.write(buffer, 0, length); + } + List m = deSerializeListInMemory(baosM.toByteArray()); + mergedVectors.addAll(m); + } + } + } + } + return mergedVectors; + } + + public static void getMergedArchiveCOS(List segInputStreams, String mergedSegmentName, + OutputStream os) throws IOException { + ZipOutputStream zos = new ZipOutputStream(os); + ZipEntry zs; + Map mergedMetaMap = new LinkedHashMap(); + for (SegmentInputStream segInputStream : segInputStreams) { + segInputStream.reset(); + ZipInputStream zipStream = new ZipInputStream(segInputStream); + while ((zs = zipStream.getNextEntry()) != null) { + byte[] buffer = new byte[1024]; + int length; + if (zs.getName().endsWith(".meta")) { + ByteArrayOutputStream baosM = new ByteArrayOutputStream(); + while ((length = zipStream.read(buffer)) != -1) { + baosM.write(buffer, 0, length); + } + Map m = deSerializeMapInMemory(baosM.toByteArray()); + mergedMetaMap.putAll(m); + } else { + ZipEntry zipEntry = new ZipEntry(zs.getName()); + zos.putNextEntry(zipEntry); + zos.setLevel(Deflater.NO_COMPRESSION); + while ((length = zipStream.read(buffer)) != -1) { + zos.write(buffer, 0, length); + } + zos.closeEntry(); + } + } + } + // Finally put the merged meta file + ZipEntry mergedMetaZipEntry = new ZipEntry(mergedSegmentName + ".meta"); + zos.putNextEntry(mergedMetaZipEntry); + zos.setLevel(Deflater.NO_COMPRESSION); + new ObjectOutputStream(zos).writeObject(mergedMetaMap); // Java serialization should be avoided + zos.closeEntry(); + zos.close(); + } + + @SuppressWarnings("unchecked") + public static Map deSerializeMapInMemory(byte[] bytes) { + Map map = null; + ObjectInputStream ois = null; + try { + ois = new ObjectInputStream(new ByteArrayInputStream(bytes)); + map = (Map) ois.readObject(); + ois.close(); + } catch (Exception e) { + e.printStackTrace(); + } + + return map; + } + + @SuppressWarnings("unchecked") + public static List deSerializeListInMemory(byte[] bytes) { + List map = null; + ObjectInputStream ois = null; + try { + ois = new ObjectInputStream(new ByteArrayInputStream(bytes)); + map = (List) ois.readObject(); + ois.close(); + } catch (Exception e) { + e.printStackTrace(); + } + + return map; + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java new file mode 100644 index 000000000000..67199edca2f6 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -0,0 +1 @@ +package org.apache.lucene.sandbox.vectorsearch; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java new file mode 100644 index 000000000000..89ee9a3879ba --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java @@ -0,0 +1,201 @@ +package org.apache.lucene.sandbox.vectorsearch; + 
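+// End-to-end test: index random float vectors with CuVSCodec, query via
+// CuVSKnnFloatVectorQuery, and compare against brute-force expected neighbors.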
+import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.TreeMap; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.English; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SuppressSysoutChecks(bugUrl = "prints info from within cuvs") +public class IntegrationTest extends LuceneTestCase { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static IndexSearcher searcher; + private static IndexReader reader; + private static Directory directory; + + public static int DATASET_SIZE_LIMIT = 1000; + public static int DIMENSIONS_LIMIT = 2048; + public static int NUM_QUERIES_LIMIT = 10; + public static int TOP_K_LIMIT = 64; // nocommit This fails beyond 64 + + public static float[][] dataset = null; + + @BeforeClass + public static void beforeClass() throws Exception { + directory = newDirectory(); + + Codec codec = new CuVSCodec(); + + RandomIndexWriter writer = + new RandomIndexWriter( + random(), + directory, + newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) + .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) + .setCodec(codec) + .setMergePolicy(newTieredMergePolicy())); + + log.info("Merge Policy: " + writer.w.getConfig().getMergePolicy()); + + Random random = random(); + int datasetSize = random.nextInt(DATASET_SIZE_LIMIT) + 1; + int dimensions = random.nextInt(DIMENSIONS_LIMIT) + 1; + dataset = generateDataset(random, datasetSize, dimensions); + for (int i = 0; i < datasetSize; i++) { + Document doc = new Document(); + doc.add(new StringField("id", String.valueOf(i), Field.Store.YES)); + doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES)); + boolean skipVector = random.nextInt(10) < 0; // nocommit disable testing with holes for now, there's some bug. 
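+// random.nextInt(10) is never negative, so skipVector is always false here and the
+// "holes" case (documents without vectors) stays disabled until that bug is fixed.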
+ if (!skipVector || datasetSize<100) { // about 10th of the documents shouldn't have a single vector + doc.add(new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new KnnFloatVectorField("vector2", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + } + + writer.addDocument(doc); + } + + reader = writer.getReader(); + searcher = newSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + // nocommit This fails until flat vectors are implemented + reader.close(); + directory.close(); + searcher = null; + reader = null; + directory = null; + log.info("Test finished"); + } + + @Test + public void testVectorSearch() throws IOException { + Random random = random(); + int numQueries = random.nextInt(NUM_QUERIES_LIMIT) + 1; + int topK = Math.min(random.nextInt(TOP_K_LIMIT) + 1, dataset.length); + + if(dataset.length < topK) topK = dataset.length; + + float[][] queries = generateQueries(random, dataset[0].length, numQueries); + List> expected = generateExpectedResults(topK, dataset, queries); + + debugPrintDatasetAndQueries(dataset, queries); + + log.info("Dataset size: {}x{}", dataset.length, dataset[0].length); + log.info("Query size: {}x{}", numQueries, queries[0].length); + log.info("TopK: {}", topK); + + Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); + int correct[] = new int[topK]; + for (int i=0; i> generateExpectedResults(int topK, float[][] dataset, float[][] queries) { + List> neighborsResult = new ArrayList<>(); + int dimensions = dataset[0].length; + + for (float[] query : queries) { + Map distances = new TreeMap<>(); + for (int j = 0; j < dataset.length; j++) { + double distance = 0; + for (int k = 0; k < dimensions; k++) { + distance += (query[k] - dataset[j][k]) * (query[k] - dataset[j][k]); + } + distances.put(j, (distance)); + } + + Map sorted = new TreeMap(distances); + log.info("EXPECTED: " + sorted); + + // Sort by distance and select the topK nearest neighbors + List neighbors = distances.entrySet().stream() + .sorted(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey) + .toList(); + neighborsResult.add(neighbors.subList(0, Math.min(topK * 3, dataset.length))); // generate double the topK results in the expected array + } + + log.info("Expected results generated successfully."); + return neighborsResult; + } +} diff --git a/versions.toml b/versions.toml index 80dc51f39bf2..327848fd10d4 100644 --- a/versions.toml +++ b/versions.toml @@ -4,6 +4,8 @@ asm = "9.6" assertj = "3.21.0" commons-codec = "1.13" commons-compress = "1.19" +commons-lang3 = "3.17.0" +cuvs = "25.02" ecj = "3.36.0" errorprone = "2.18.0" flexmark = "0.61.24" @@ -33,6 +35,7 @@ s2-geometry = "1.0.0" spatial4j = "0.8" xerces = "2.12.0" zstd = "1.5.5-11" +jackson-core = "2.18.2" [libraries] antlr-core = { module = "org.antlr:antlr4", version.ref = "antlr" } @@ -42,6 +45,8 @@ asm-core = { module = "org.ow2.asm:asm", version.ref = "asm" } assertj = { module = "org.assertj:assertj-core", version.ref = "assertj" } commons-codec = { module = "commons-codec:commons-codec", version.ref = "commons-codec" } commons-compress = { module = "org.apache.commons:commons-compress", version.ref = "commons-compress" } +commons-lang3 = { module = "org.apache.commons:commons-lang3", version.ref = "commons-lang3" } +cuvs = { module = "com.nvidia.cuvs:cuvs-java", version.ref = "cuvs" } ecj = { module = "org.eclipse.jdt:ecj", version.ref = "ecj" } errorprone = { module = 
"com.google.errorprone:error_prone_core", version.ref = "errorprone" } flexmark-core = { module = "com.vladsch.flexmark:flexmark", version.ref = "flexmark" } @@ -52,6 +57,7 @@ flexmark-ext-tables = { module = "com.vladsch.flexmark:flexmark-ext-tables", ver groovy = { module = "org.apache.groovy:groovy-all", version.ref = "groovy" } hamcrest = { module = "org.hamcrest:hamcrest", version.ref = "hamcrest" } icu4j = { module = "com.ibm.icu:icu4j", version.ref = "icu4j" } +jackson-core = { module = "com.fasterxml.jackson.core:jackson-core", version.ref = "jackson-core" } javacc = { module = "net.java.dev.javacc:javacc", version.ref = "javacc" } jflex = { module = "de.jflex:jflex", version.ref = "jflex" } jgit = { module = "org.eclipse.jgit:org.eclipse.jgit", version.ref = "jgit" } From 0e9f6d4bc9a98eb33d594409ce8e4b3a6b4b1a06 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Tue, 7 Jan 2025 21:28:17 +0530 Subject: [PATCH 02/34] Test fixes --- .../services/org.apache.lucene.codecs.Codec | 1 + .../org.apache.lucene.codecs.KnnVectorsFormat | 16 ++++++++++++++++ .../{IntegrationTest.java => TestCuVS.java} | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec create mode 100644 lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat rename lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/{IntegrationTest.java => TestCuVS.java} (99%) diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec new file mode 100644 index 000000000000..38b31884377d --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -0,0 +1 @@ +org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat new file mode 100644 index 000000000000..666ee726f986 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java similarity index 99% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index 89ee9a3879ba..15a023d6fbd3 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -34,7 +34,7 @@ import org.slf4j.LoggerFactory; @SuppressSysoutChecks(bugUrl = "prints info from within cuvs") -public class IntegrationTest extends LuceneTestCase { +public class TestCuVS extends LuceneTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); From a95f084e1d5a9d16128bd133e0631b193eed8709 Mon Sep 17 00:00:00 2001 From: Vivek Narang Date: Tue, 7 Jan 2025 12:32:57 -0500 Subject: [PATCH 03/34] fix for getFloatVectorValues --- .../vectorsearch/CuVSVectorsReader.java | 40 ++++--------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index cac870afec6c..837a9229d061 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -196,52 +196,28 @@ public void checkIntegrity() throws IOException { @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { - throw new UnsupportedOperationException(); - /*return new FloatVectorValues() { - - int pos = -1; - - @Override - public int nextDoc() throws IOException { - pos++; - int size = cuvsIndexes.get(field).get(0).getMapping().size(); - if (pos >= size) return FloatVectorValues.NO_MORE_DOCS; - return cuvsIndexes.get(field).get(0).getMapping().get(pos); - } + return new FloatVectorValues() { @Override - public int docID() { - return cuvsIndexes.get(field).get(0).getMapping().get(pos); + public int size() { + return cuvsIndexes.get(field).get(0).getVectors().size(); } @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException(); + public int dimension() { + return cuvsIndexes.get(field).get(0).getVectors().get(0).length; } @Override - public float[] vectorValue() throws IOException { + public float[] vectorValue(int pos) throws IOException { return cuvsIndexes.get(field).get(0).getVectors().get(pos); - } @Override - public int size() { - return cuvsIndexes.get(field).get(0).getVectors().size(); - } - - @Override - public VectorScorer scorer(float[] query) throws IOException { - // TODO Auto-generated method stub + public FloatVectorValues copy() throws IOException { return null; } - - @Override - public int dimension() { - // TODO Auto-generated method stub - return cuvsIndexes.get(field).get(0).getVectors().get(0).length; - } - };*/ + }; } @Override From 9f0d3dd5c05bc37f71c499ab9c79c43cfd2b0bf4 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Fri, 10 Jan 2025 19:39:41 +0530 Subject: [PATCH 04/34] Fixing precommit, ECJ, Rat, spotless, forbiddenApis etc. 
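With this change CuVSCodec no longer breaks codec SPI loading when the cuvs
native library is missing: the constructor catches LibraryNotFoundException,
logs it, and leaves the codec's KnnVectorsFormat unset. A minimal sketch of
how an application can pick up the codec and guard against that case (the
fallback policy here is illustrative, not part of this patch):

    // Sketch only: resolve the codec by the SPI name registered in
    // META-INF/services/org.apache.lucene.codecs.Codec (added in PATCH 02).
    Codec codec = Codec.forName("CuVSCodec");
    if (codec.knnVectorsFormat() == null) {
      // Native cuvs library could not be loaded; fall back to the default.
      codec = Codec.getDefault();
    }
    IndexWriterConfig config = new IndexWriterConfig();
    config.setCodec(codec);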
--- lucene/licenses/commons-LICENSE-ASL.txt | 202 ++++++++++++ lucene/licenses/commons-NOTICE.txt | 197 +++++++++++ lucene/licenses/commons-lang3-3.17.0.jar.sha1 | 1 + lucene/licenses/cuvs-java-25.02.jar.sha1 | 1 + lucene/licenses/cuvs-java-LICENSE-ASL.txt | 202 ++++++++++++ lucene/licenses/cuvs-java-NOTICE.txt | 197 +++++++++++ lucene/sandbox/src/java/module-info.java | 7 +- .../vectorsearch/CagraFieldVectorsWriter.java | 26 +- .../sandbox/vectorsearch/CuVSCodec.java | 34 +- .../sandbox/vectorsearch/CuVSIndex.java | 36 +- .../vectorsearch/CuVSKnnFloatVectorQuery.java | 29 +- .../sandbox/vectorsearch/CuVSSegmentFile.java | 31 +- .../vectorsearch/CuVSVectorsFormat.java | 38 ++- .../vectorsearch/CuVSVectorsReader.java | 268 +++++++++------ .../vectorsearch/CuVSVectorsWriter.java | 307 +++++++++++------- .../vectorsearch/PerLeafCuVSKnnCollector.java | 23 +- .../vectorsearch/SegmentInputStream.java | 28 +- .../lucene/sandbox/vectorsearch/Util.java | 114 ++----- .../sandbox/vectorsearch/package-info.java | 16 + .../services/org.apache.lucene.codecs.Codec | 15 + .../lucene/sandbox/vectorsearch/TestCuVS.java | 95 +++--- versions.lock | 68 ++++ versions.toml | 2 - 23 files changed, 1525 insertions(+), 412 deletions(-) create mode 100644 lucene/licenses/commons-LICENSE-ASL.txt create mode 100644 lucene/licenses/commons-NOTICE.txt create mode 100644 lucene/licenses/commons-lang3-3.17.0.jar.sha1 create mode 100644 lucene/licenses/cuvs-java-25.02.jar.sha1 create mode 100644 lucene/licenses/cuvs-java-LICENSE-ASL.txt create mode 100644 lucene/licenses/cuvs-java-NOTICE.txt diff --git a/lucene/licenses/commons-LICENSE-ASL.txt b/lucene/licenses/commons-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/commons-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/commons-NOTICE.txt b/lucene/licenses/commons-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/commons-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. 
See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. 
These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. 
+ +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. + +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/licenses/commons-lang3-3.17.0.jar.sha1 b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 new file mode 100644 index 000000000000..f64174593b1c --- /dev/null +++ b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 @@ -0,0 +1 @@ +b17d2136f0460dcc0d2016ceefca8723bdf4ee70 diff --git a/lucene/licenses/cuvs-java-25.02.jar.sha1 b/lucene/licenses/cuvs-java-25.02.jar.sha1 new file mode 100644 index 000000000000..e399aed842a5 --- /dev/null +++ b/lucene/licenses/cuvs-java-25.02.jar.sha1 @@ -0,0 +1 @@ +280c6f97d99a8d32500a0c0891db1ccdc49bc17b diff --git a/lucene/licenses/cuvs-java-LICENSE-ASL.txt b/lucene/licenses/cuvs-java-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/cuvs-java-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/cuvs-java-NOTICE.txt b/lucene/licenses/cuvs-java-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/cuvs-java-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. 
See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. 
These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. 
+ +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. + +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index b2d45adf4d30..051c1df0a257 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -23,7 +23,7 @@ requires java.logging; requires com.nvidia.cuvs; requires org.apache.commons.lang3; - + exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; exports org.apache.lucene.sandbox.codecs.quantization; @@ -37,7 +37,12 @@ exports org.apache.lucene.sandbox.facet.iterators; exports org.apache.lucene.sandbox.facet.cutters; exports org.apache.lucene.sandbox.facet.labels; + exports org.apache.lucene.sandbox.vectorsearch; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; + // provides org.apache.lucene.codecs.KnnVectorsFormat with + // org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; + provides org.apache.lucene.codecs.Codec with + org.apache.lucene.sandbox.vectorsearch.CuVSCodec; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index 21c088bd84f8..df8f83966dc3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -1,15 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
 package org.apache.lucene.sandbox.vectorsearch;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.concurrent.ConcurrentHashMap;
-
 import org.apache.lucene.codecs.KnnFieldVectorsWriter;
 import org.apache.lucene.index.FieldInfo;
 
 public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter<float[]> {
 
   public final String fieldName;
-  public final ConcurrentHashMap<Integer, float[]> vectors = new ConcurrentHashMap<Integer, float[]>();
+  public final ConcurrentHashMap<Integer, float[]> vectors =
+      new ConcurrentHashMap<Integer, float[]>();
   public int fieldVectorDimension = -1;
 
   public CagraFieldVectorsWriter(FieldInfo fieldInfo) {
@@ -19,7 +36,9 @@ public CagraFieldVectorsWriter(FieldInfo fieldInfo) {
 
   @Override
   public long ramBytesUsed() {
-    return fieldName.getBytes().length + Integer.BYTES + (vectors.size() * fieldVectorDimension * Float.BYTES);
+    return fieldName.getBytes(StandardCharsets.UTF_8).length
+        + Integer.BYTES
+        + ((long) vectors.size() * fieldVectorDimension * Float.BYTES);
   }
 
   @Override
@@ -31,5 +50,4 @@ public void addValue(int docID, float[] vectorValue) throws IOException {
   public float[] copyValue(float[] vectorValue) {
     throw new UnsupportedOperationException();
   }
-
 }
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java
index 448803bb7fc4..315923d1eeb2 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java
@@ -1,12 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.lucene.sandbox.vectorsearch;
 
+import com.nvidia.cuvs.LibraryNotFoundException;
+import java.util.logging.Logger;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.FilterCodec;
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.lucene101.Lucene101Codec;
 import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy;
-
 public class CuVSCodec extends FilterCodec {
 
   public CuVSCodec() {
@@ -15,17 +32,24 @@ public CuVSCodec() {
 
   public CuVSCodec(String name, Codec delegate) {
     super(name, delegate);
-    setKnnFormat(new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE));
+    KnnVectorsFormat format;
+    try {
+      format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE);
+      setKnnFormat(format);
+    } catch (LibraryNotFoundException ex) {
+      Logger log = Logger.getLogger(CuVSCodec.class.getName());
+      log.severe("Couldn't load native library, possible classloader issue. 
" + ex.getMessage()); + } } - + KnnVectorsFormat knnFormat = null; @Override public KnnVectorsFormat knnVectorsFormat() { return knnFormat; } - + public void setKnnFormat(KnnVectorsFormat format) { this.knnFormat = format; } -} \ No newline at end of file +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 1878b6c236bc..98a2eb9739ac 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -1,10 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; -import java.util.List; -import java.util.Objects; - import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.CagraIndex; +import java.util.List; +import java.util.Objects; public class CuVSIndex { private final CagraIndex cagraIndex; @@ -12,11 +27,18 @@ public class CuVSIndex { private final List mapping; private final List vectors; private final int maxDocs; - + private final String fieldName; private final String segmentName; - public CuVSIndex(String segmentName, String fieldName, CagraIndex cagraIndex, List mapping, List vectors, int maxDocs, BruteForceIndex bruteforceIndex) { + public CuVSIndex( + String segmentName, + String fieldName, + CagraIndex cagraIndex, + List mapping, + List vectors, + int maxDocs, + BruteForceIndex bruteforceIndex) { this.cagraIndex = Objects.requireNonNull(cagraIndex); this.bruteforceIndex = Objects.requireNonNull(bruteforceIndex); this.mapping = Objects.requireNonNull(mapping); @@ -25,7 +47,7 @@ public CuVSIndex(String segmentName, String fieldName, CagraIndex cagraIndex, Li this.segmentName = Objects.requireNonNull(segmentName); this.maxDocs = Objects.requireNonNull(maxDocs); } - + public CagraIndex getCagraIndex() { return cagraIndex; } @@ -53,4 +75,4 @@ public String getSegmentName() { public int getMaxDocs() { return maxDocs; } -} \ No newline at end of file +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index 1bbae88c5630..e4df14208f97 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -1,7 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; import java.io.IOException; - import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.KnnFloatVectorQuery; @@ -11,8 +26,8 @@ public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { - final private int iTopK; - final private int searchWidth; + private final int iTopK; + private final int searchWidth; public CuVSKnnFloatVectorQuery(String field, float[] target, int k, int iTopK, int searchWidth) { super(field, target, k); @@ -21,7 +36,12 @@ public CuVSKnnFloatVectorQuery(String field, float[] target, int k, int iTopK, i } @Override - protected TopDocs approximateSearch(LeafReaderContext context, Bits acceptDocs, int visitedLimit, KnnCollectorManager knnCollectorManager) throws IOException { + protected TopDocs approximateSearch( + LeafReaderContext context, + Bits acceptDocs, + int visitedLimit, + KnnCollectorManager knnCollectorManager) + throws IOException { PerLeafCuVSKnnCollector results = new PerLeafCuVSKnnCollector(k, iTopK, searchWidth); @@ -29,5 +49,4 @@ protected TopDocs approximateSearch(LeafReaderContext context, Bits acceptDocs, reader.searchNearestVectors(field, this.getTargetCopy(), results, null); return results.topDocs(); } - } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java index 9ca0d63ba087..7b850daa6662 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -1,6 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; -import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.util.Collections; @@ -11,8 +26,8 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -public class CuVSSegmentFile implements AutoCloseable{ - final private ZipOutputStream zos; +public class CuVSSegmentFile implements AutoCloseable { + private final ZipOutputStream zos; private Set filesAdded = new HashSet(); @@ -20,18 +35,22 @@ public CuVSSegmentFile(OutputStream out) { zos = new ZipOutputStream(out); zos.setLevel(Deflater.NO_COMPRESSION); } - + protected Logger log = Logger.getLogger(getClass().getName()); public void addFile(String name, byte[] bytes) throws IOException { - log.info("Writing the file: " + name + ", size="+bytes.length + ", space remaining: "+new File("/").getFreeSpace()); + /*log.info( + "Writing the file: " + + name + + ", size=" + + bytes.length);*/ ZipEntry indexFileZipEntry = new ZipEntry(name); zos.putNextEntry(indexFileZipEntry); zos.write(bytes, 0, bytes.length); zos.closeEntry(); filesAdded.add(name); } - + public Set getFilesAdded() { return Collections.unmodifiableSet(filesAdded); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index c17b5258c9d5..e2b5bc2169f5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -1,14 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
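// Note on the CuVSSegmentFile layout above: each CuVS segment payload is a single
// zip stream written with Deflater.NO_COMPRESSION, whose entries are named
// "<segment>/<field>.<ext>" with ext one of cag | bf | hnsw | vec | map, plus one
// "<segment>.meta" entry holding the segment's maxDoc map.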
+ */ package org.apache.lucene.sandbox.vectorsearch; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.LibraryNotFoundException; import java.io.IOException; - import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; -import com.nvidia.cuvs.CuVSResources; - public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; @@ -30,11 +45,13 @@ public CuVSVectorsFormat() { try { resources = new CuVSResources(); } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } } - public CuVSVectorsFormat(int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) { + public CuVSVectorsFormat( + int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) + throws LibraryNotFoundException { super("CuVSVectorsFormat"); this.mergeStrategy = mergeStrategy; this.cuvsWriterThreads = cuvsWriterThreads; @@ -42,14 +59,17 @@ public CuVSVectorsFormat(int cuvsWriterThreads, int intGraphDegree, int graphDeg this.graphDegree = graphDegree; try { resources = new CuVSResources(); + } catch (LibraryNotFoundException ex) { + throw ex; } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } } @Override public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { - return new CuVSVectorsWriter(state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); + return new CuVSVectorsWriter( + state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); } @Override @@ -57,14 +77,12 @@ public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException try { return new CuVSVectorsReader(state, resources); } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } - return null; } @Override public int getMaxDimensions(String fieldName) { return maxDimensions; } - } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 837a9229d061..d7e8a5f19b08 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -1,28 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
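// Illustrative sketch (not part of the patch): constructing the format above with
// explicit CAGRA build parameters, and falling back to a CPU-based format when the
// native cuVS library is absent. The parameter values and the choice of
// Lucene99HnswVectorsFormat as fallback are arbitrary examples.
import com.nvidia.cuvs.LibraryNotFoundException;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;

class CuVSFormatSketch {
  static KnnVectorsFormat pickFormat() {
    try {
      return new CuVSVectorsFormat(
          /* cuvsWriterThreads= */ 4,
          /* intGraphDegree= */ 128,
          /* graphDegree= */ 64,
          CuVSVectorsWriter.MergeStrategy.NON_TRIVIAL_MERGE);
    } catch (LibraryNotFoundException e) {
      // no GPU / native library available: use a CPU HNSW format instead
      return new Lucene99HnswVectorsFormat();
    }
  }
}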
+ */ package org.apache.lucene.sandbox.vectorsearch; +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceQuery; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraQuery; +import com.nvidia.cuvs.CagraSearchParams; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.HnswIndex; +import com.nvidia.cuvs.HnswIndexParams; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.lang.StackWalker.StackFrame; -import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.logging.Logger; import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; - import org.apache.commons.lang3.SerializationUtils; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.TopKnnCollector; @@ -31,18 +51,9 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; -import com.nvidia.cuvs.BruteForceIndex; -import com.nvidia.cuvs.BruteForceQuery; -import com.nvidia.cuvs.CagraIndex; -import com.nvidia.cuvs.CagraQuery; -import com.nvidia.cuvs.CagraSearchParams; -import com.nvidia.cuvs.CuVSResources; -import com.nvidia.cuvs.HnswIndex; -import com.nvidia.cuvs.HnswIndexParams; - public class CuVSVectorsReader extends KnnVectorsReader { - protected Logger log = Logger.getLogger(getClass().getName()); + // protected Logger log = Logger.getLogger(getClass().getName()); IndexInput vectorDataReader = null; public String fileName = null; @@ -53,7 +64,7 @@ public class CuVSVectorsReader extends KnnVectorsReader { public int indexFilePayloadSize = 0; public long initialFilePointerLoc = 0; public SegmentInputStream segmentInputStream; - + // Field to List of Indexes public Map> cuvsIndexes; @@ -64,17 +75,21 @@ public CuVSVectorsReader(SegmentReadState state, CuVSResources resources) throws segmentState = state; this.resources = resources; - fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, - CuVSVectorsFormat.VECTOR_DATA_EXTENSION); + fileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, CuVSVectorsFormat.VECTOR_DATA_EXTENSION); vectorDataReader = segmentState.directory.openInput(fileName, segmentState.context); CodecUtil.readIndexHeader(vectorDataReader); initialFilePointerLoc = vectorDataReader.getFilePointer(); - indexFilePayloadSize = (int)vectorDataReader.length() - (int)initialFilePointerLoc; //vectorMetaReader.readInt(); - segmentInputStream = new SegmentInputStream(vectorDataReader, indexFilePayloadSize, initialFilePointerLoc); - log.info("payloadSize: " + indexFilePayloadSize); - log.info("initialFilePointerLoc: " + initialFilePointerLoc); + indexFilePayloadSize = + (int) vectorDataReader.length() + - (int) initialFilePointerLoc; // vectorMetaReader.readInt(); + segmentInputStream = + new SegmentInputStream(vectorDataReader, indexFilePayloadSize, initialFilePointerLoc); + // log.info("payloadSize: " + indexFilePayloadSize); + // log.info("initialFilePointerLoc: " + 
initialFilePointerLoc); List stackTrace = StackWalker.getInstance().walk(this::getStackTrace); @@ -82,36 +97,39 @@ public CuVSVectorsReader(SegmentReadState state, CuVSResources resources) throws for (StackFrame s : stackTrace) { if (s.toString().startsWith("org.apache.lucene.index.IndexWriter.merge")) { isMergeCase = true; - log.info("Reader opening on merge call"); + // log.info("Reader opening on merge call"); break; } } - - log.info("Source of this segment "+segmentState.segmentSuffix+" is " + segmentState.segmentInfo.getDiagnostics().get(IndexWriter.SOURCE)); + + /*log.info( + "Source of this segment " + + segmentState.segmentSuffix + + " is " + + segmentState.segmentInfo.getDiagnostics().get(IndexWriter.SOURCE)); log.info("Loading for " + segmentState.segmentInfo.name + ", mergeCase? " + isMergeCase); - //if (!isMergeCase) { nocommit: TODO: don't load the cagra index for merge case. - log.info("Not the merge case, hence loading for " + segmentState.segmentInfo.name); - this.cuvsIndexes = loadCuVSIndex(getIndexInputStream(), isMergeCase); - //} + log.info("Not the merge case, hence loading for " + segmentState.segmentInfo.name);*/ + this.cuvsIndexes = loadCuVSIndex(getIndexInputStream(), isMergeCase); } - + @SuppressWarnings({"unchecked"}) - private Map> loadCuVSIndex(ZipInputStream zis, boolean isMergeCase) throws Throwable { + private Map> loadCuVSIndex(ZipInputStream zis, boolean isMergeCase) + throws Throwable { Map> ret = new HashMap>(); Map cagraIndexes = new HashMap(); Map bruteforceIndexes = new HashMap(); Map hnswIndexes = new HashMap(); Map> mappings = new HashMap>(); Map> vectors = new HashMap>(); - + Map maxDocs = null; // map of segment, maxDocs ZipEntry ze; while ((ze = zis.getNextEntry()) != null) { String entry = ze.getName(); - + String segmentField = entry.split("\\.")[0]; String extension = entry.split("\\.")[1]; - + ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] buffer = new byte[1024]; int len = 0; @@ -120,55 +138,76 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i } switch (extension) { - case "meta": { - maxDocs = (Map) SerializationUtils.deserialize(baos.toByteArray()); // nocommit use IOUtils - break; - } - case "vec": { - vectors.put(segmentField, (List) SerializationUtils.deserialize(baos.toByteArray())); // nocommit use IOUtils - break; - } - case "map": { - List map = (List) SerializationUtils.deserialize(baos.toByteArray()); // nocommit use IOUtils - mappings.put(segmentField, map); - break; - } - case "cag": { - cagraIndexes.put(segmentField, new CagraIndex.Builder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .build()); - break; - } - case "bf": { - bruteforceIndexes.put(segmentField, new BruteForceIndex.Builder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .build()); - break; - } - case "hnsw": { - HnswIndexParams indexParams = new HnswIndexParams.Builder(resources) - .build(); - hnswIndexes.put(segmentField, new HnswIndex.Builder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .withIndexParams(indexParams) - .build()); - break; - } + case "meta": + { + maxDocs = (Map) SerializationUtils.deserialize(baos.toByteArray()); + break; + } + case "vec": + { + vectors.put( + segmentField, (List) SerializationUtils.deserialize(baos.toByteArray())); + break; + } + case "map": + { + List map = (List) SerializationUtils.deserialize(baos.toByteArray()); + mappings.put(segmentField, map); + break; + } + case "cag": + { + cagraIndexes.put( + segmentField, + new 
CagraIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .build()); + break; + } + case "bf": + { + bruteforceIndexes.put( + segmentField, + new BruteForceIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .build()); + break; + } + case "hnsw": + { + HnswIndexParams indexParams = new HnswIndexParams.Builder(resources).build(); + hnswIndexes.put( + segmentField, + new HnswIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .withIndexParams(indexParams) + .build()); + break; + } } } - log.info("Loading cuvsIndexes from segment: " + segmentState.segmentInfo.name); + /*log.info("Loading cuvsIndexes from segment: " + segmentState.segmentInfo.name); log.info("Diagnostics for this segment: " + segmentState.segmentInfo.getDiagnostics()); log.info("Loading map of cagraIndexes: " + cagraIndexes); log.info("Loading vectors: " + vectors); - log.info("Loading mapping: " + mappings); + log.info("Loading mapping: " + mappings);*/ - for (String segmentField: cagraIndexes.keySet()) { - log.info("Loading segmentField: " + segmentField); + for (String segmentField : cagraIndexes.keySet()) { + // log.info("Loading segmentField: " + segmentField); String segment = segmentField.split("/")[0]; String field = segmentField.split("/")[1]; - CuVSIndex cuvsIndex = new CuVSIndex(segment, field, cagraIndexes.get(segmentField), mappings.get(segmentField), vectors.get(segmentField), maxDocs.get(segment), bruteforceIndexes.get(segmentField)); - List listOfIndexes = ret.containsKey(field)? ret.get(field): new ArrayList(); + CuVSIndex cuvsIndex = + new CuVSIndex( + segment, + field, + cagraIndexes.get(segmentField), + mappings.get(segmentField), + vectors.get(segmentField), + maxDocs.get(segment), + bruteforceIndexes.get(segmentField)); + List listOfIndexes = + ret.containsKey(field) ? ret.get(field) : new ArrayList(); listOfIndexes.add(cuvsIndex); ret.put(field, listOfIndexes); } @@ -197,22 +236,22 @@ public void checkIntegrity() throws IOException { @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { return new FloatVectorValues() { - + @Override public int size() { return cuvsIndexes.get(field).get(0).getVectors().size(); } - + @Override public int dimension() { return cuvsIndexes.get(field).get(0).getVectors().get(0).length; } - + @Override public float[] vectorValue(int pos) throws IOException { return cuvsIndexes.get(field).get(0).getVectors().get(pos); } - + @Override public FloatVectorValues copy() throws IOException { return null; @@ -226,46 +265,60 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { } @Override - public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - PerLeafCuVSKnnCollector cuvsCollector = knnCollector instanceof PerLeafCuVSKnnCollector? ((PerLeafCuVSKnnCollector)knnCollector): new PerLeafCuVSKnnCollector(knnCollector.k(), knnCollector.k(), 1); - TopKnnCollector defaultCollector = knnCollector instanceof TopKnnCollector? ((TopKnnCollector)knnCollector): null; + public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + PerLeafCuVSKnnCollector cuvsCollector = + knnCollector instanceof PerLeafCuVSKnnCollector + ? ((PerLeafCuVSKnnCollector) knnCollector) + : new PerLeafCuVSKnnCollector(knnCollector.k(), knnCollector.k(), 1); + TopKnnCollector defaultCollector = + knnCollector instanceof TopKnnCollector ? 
((TopKnnCollector) knnCollector) : null; int prevDocCount = 0; - // log.debug("Will try to search all the indexes for segment "+segmentState.segmentInfo.name+", field "+field+": "+cuvsIndexes); - for (CuVSIndex cuvsIndex: cuvsIndexes.get(field)) { + // log.debug("Will try to search all the indexes for segment "+segmentState.segmentInfo.name+", + // field "+field+": "+cuvsIndexes); + for (CuVSIndex cuvsIndex : cuvsIndexes.get(field)) { try { Map result = new HashMap(); if (cuvsCollector.k() <= 1024) { - CagraSearchParams searchParams = new CagraSearchParams.Builder(resources) - .withItopkSize(cuvsCollector.iTopK) - .withSearchWidth(cuvsCollector.searchWidth) - .build(); - - CagraQuery query = new CagraQuery.Builder() - .withTopK(cuvsCollector.k()) - .withSearchParams(searchParams) - .withMapping(cuvsIndex.getMapping()) - .withQueryVectors(new float[][] {target}) - .build(); - + CagraSearchParams searchParams = + new CagraSearchParams.Builder(resources) + .withItopkSize(cuvsCollector.iTopK) + .withSearchWidth(cuvsCollector.searchWidth) + .build(); + + CagraQuery query = + new CagraQuery.Builder() + .withTopK(cuvsCollector.k()) + .withSearchParams(searchParams) + .withMapping(cuvsIndex.getMapping()) + .withQueryVectors(new float[][] {target}) + .build(); + CagraIndex cagraIndex = cuvsIndex.getCagraIndex(); assert (cagraIndex != null); - log.info("k is " + cuvsCollector.k()); - result = cagraIndex.search(query).getResults().get(0); // List expected to have only one entry because of single query "target". - log.info("INTERMEDIATE RESULT FROM CUVS: " + result + ", prevDocCount=" + prevDocCount); + // log.info("k is " + cuvsCollector.k()); + result = + cagraIndex + .search(query) + .getResults() + .get(0); // List expected to have only one entry because of single query "target". 
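// Note: searches with k <= 1024 go through the CAGRA graph index (itopk size and
// search width taken from the collector), while larger k falls through to the
// brute-force index in the else-branch below, which also applies the acceptDocs
// bits as a prefilter.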
+ // log.info("INTERMEDIATE RESULT FROM CUVS: " + result + ", prevDocCount=" + + // prevDocCount); } else { - BruteForceQuery bruteforceQuery = new BruteForceQuery.Builder() - .withQueryVectors(new float[][] { target }) - .withPrefilter(((FixedBitSet)acceptDocs).getBits()) - .withTopK(cuvsCollector.k()) - .build(); + BruteForceQuery bruteforceQuery = + new BruteForceQuery.Builder() + .withQueryVectors(new float[][] {target}) + .withPrefilter(((FixedBitSet) acceptDocs).getBits()) + .withTopK(cuvsCollector.k()) + .build(); BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex(); result = bruteforceIndex.search(bruteforceQuery).getResults().get(0); } - - for(Entry kv : result.entrySet()) { + + for (Entry kv : result.entrySet()) { if (defaultCollector != null) { defaultCollector.collect(prevDocCount + kv.getKey(), kv.getValue()); } @@ -273,14 +326,15 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits } } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } prevDocCount += cuvsIndex.getMaxDocs(); } } - + @Override - public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { throw new UnsupportedOperationException(); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 1da7ca0f9e6c..d5c155ca7212 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -1,16 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceIndexParams; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraIndexParams; +import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; +import com.nvidia.cuvs.CuVSResources; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.OutputStream; -import java.lang.invoke.MethodHandles; import java.util.ArrayList; -import java.util.Arrays; import java.util.LinkedHashMap; import java.util.List; -import java.util.logging.Logger; - import org.apache.commons.lang3.SerializationUtils; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; @@ -22,17 +40,11 @@ import org.apache.lucene.index.Sorter.DocMap; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; - -import com.nvidia.cuvs.BruteForceIndex; -import com.nvidia.cuvs.BruteForceIndexParams; -import com.nvidia.cuvs.CagraIndex; -import com.nvidia.cuvs.CagraIndexParams; -import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; -import com.nvidia.cuvs.CuVSResources; +import org.apache.lucene.util.SuppressForbidden; public class CuVSVectorsWriter extends KnnVectorsWriter { - protected Logger log = Logger.getLogger(getClass().getName()); + // protected Logger log = Logger.getLogger(getClass().getName()); private List fieldVectorWriters = new ArrayList<>(); private IndexOutput cuVSIndex = null; @@ -41,7 +53,7 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private CagraIndex cagraIndex; private CagraIndex cagraIndexForHnsw; - + private int cuvsWriterThreads; private int intGraphDegree; private int graphDegree; @@ -49,10 +61,17 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private CuVSResources resources; public enum MergeStrategy { - TRIVIAL_MERGE, NON_TRIVIAL_MERGE + TRIVIAL_MERGE, + NON_TRIVIAL_MERGE }; - public CuVSVectorsWriter(SegmentWriteState state, int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy, CuVSResources resources) + public CuVSVectorsWriter( + SegmentWriteState state, + int cuvsWriterThreads, + int intGraphDegree, + int graphDegree, + MergeStrategy mergeStrategy, + CuVSResources resources) throws IOException { super(); this.segmentWriteState = state; @@ -62,7 +81,11 @@ public CuVSVectorsWriter(SegmentWriteState state, int cuvsWriterThreads, int int this.graphDegree = graphDegree; this.resources = resources; - cuVSDataFilename = IndexFileNames.segmentFileName(this.segmentWriteState.segmentInfo.name, this.segmentWriteState.segmentSuffix, CuVSVectorsFormat.VECTOR_DATA_EXTENSION); + cuVSDataFilename = + IndexFileNames.segmentFileName( + this.segmentWriteState.segmentInfo.name, + this.segmentWriteState.segmentSuffix, + CuVSVectorsFormat.VECTOR_DATA_EXTENSION); } @Override @@ -85,58 +108,65 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException return cagraFieldVectorWriter; } + @SuppressForbidden(reason = "A temporary java.util.File is needed for Cagra's serialization") private byte[] createCagraIndex(float[][] vectors, List mapping) throws Throwable { - CagraIndexParams indexParams = new CagraIndexParams.Builder(resources) - .withNumWriterThreads(cuvsWriterThreads) - .withIntermediateGraphDegree(intGraphDegree) - .withGraphDegree(graphDegree) - .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) - .build(); - - log.info("Indexing started: " + System.currentTimeMillis()); - 
cagraIndex = new CagraIndex.Builder(resources) - .withDataset(vectors) - .withIndexParams(indexParams) - .build(); - log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + vectors.length); + CagraIndexParams indexParams = + new CagraIndexParams.Builder(resources) + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(intGraphDegree) + .withGraphDegree(graphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + + // log.info("Indexing started: " + System.currentTimeMillis()); + cagraIndex = + new CagraIndex.Builder(resources).withDataset(vectors).withIndexParams(indexParams).build(); + // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + + // vectors.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); - File tmpFile = File.createTempFile("tmpindex", "cag"); // TODO: Should we make this a file with random names? + File tmpFile = + File.createTempFile( + "tmpindex", "cag"); // TODO: Should we make this a file with random names? cagraIndex.serialize(baos, tmpFile); return baos.toByteArray(); } + @SuppressForbidden(reason = "A temporary java.util.File is needed for BruteForce's serialization") private byte[] createBruteForceIndex(float[][] vectors) throws Throwable { - BruteForceIndexParams indexParams = new BruteForceIndexParams.Builder() - .withNumWriterThreads(32) // TODO: Make this configurable later. - .build(); - - log.info("Indexing started: " + System.currentTimeMillis()); - BruteForceIndex index = new BruteForceIndex.Builder(resources) - .withIndexParams(indexParams) - .withDataset(vectors) - .build(); - - log.info("Indexing done: " + System.currentTimeMillis()); + BruteForceIndexParams indexParams = + new BruteForceIndexParams.Builder() + .withNumWriterThreads(32) // TODO: Make this configurable later. 
+ .build(); + + // log.info("Indexing started: " + System.currentTimeMillis()); + BruteForceIndex index = + new BruteForceIndex.Builder(resources) + .withIndexParams(indexParams) + .withDataset(vectors) + .build(); + + // log.info("Indexing done: " + System.currentTimeMillis()); ByteArrayOutputStream baos = new ByteArrayOutputStream(); index.serialize(baos); return baos.toByteArray(); } - + + @SuppressForbidden(reason = "A temporary java.util.File is needed for HNSW's serialization") private byte[] createHnswIndex(float[][] vectors) throws Throwable { - CagraIndexParams indexParams = new CagraIndexParams.Builder(resources) - .withNumWriterThreads(cuvsWriterThreads) - .withIntermediateGraphDegree(intGraphDegree) - .withGraphDegree(graphDegree) - .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) - .build(); - - log.info("Indexing started: " + System.currentTimeMillis()); - cagraIndexForHnsw = new CagraIndex.Builder(resources) - .withDataset(vectors) - .withIndexParams(indexParams) - .build(); - log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + vectors.length); + CagraIndexParams indexParams = + new CagraIndexParams.Builder(resources) + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(intGraphDegree) + .withGraphDegree(graphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + + // log.info("Indexing started: " + System.currentTimeMillis()); + cagraIndexForHnsw = + new CagraIndex.Builder(resources).withDataset(vectors).withIndexParams(indexParams).build(); + // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + + // vectors.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); File tmpFile = File.createTempFile("tmpindex", "hnsw"); @@ -147,64 +177,82 @@ private byte[] createHnswIndex(float[][] vectors) throws Throwable { @SuppressWarnings({"resource", "rawtypes", "unchecked"}) @Override public void flush(int maxDoc, DocMap sortMap) throws IOException { - cuVSIndex = this.segmentWriteState.directory.createOutput(cuVSDataFilename, this.segmentWriteState.context); - CodecUtil.writeIndexHeader(cuVSIndex, CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, CuVSVectorsFormat.VERSION_CURRENT, this.segmentWriteState.segmentInfo.getId(), this.segmentWriteState.segmentSuffix); - + cuVSIndex = + this.segmentWriteState.directory.createOutput( + cuVSDataFilename, this.segmentWriteState.context); + CodecUtil.writeIndexHeader( + cuVSIndex, + CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, + CuVSVectorsFormat.VERSION_CURRENT, + this.segmentWriteState.segmentInfo.getId(), + this.segmentWriteState.segmentSuffix); CuVSSegmentFile cuVSFile = new CuVSSegmentFile(new SegmentOutputStream(cuVSIndex, 100000)); LinkedHashMap metaMap = new LinkedHashMap(); for (CagraFieldVectorsWriter field : fieldVectorWriters) { - long start = System.currentTimeMillis(); + // long start = System.currentTimeMillis(); byte[] cagraIndexBytes = null; byte[] bruteForceIndexBytes = null; byte[] hnswIndexBytes = null; try { - log.info("Starting CAGRA indexing, space remaining: "+new File("/").getFreeSpace()); - log.info("Starting CAGRA indexing, docs: " + field.vectors.size()); - + // log.info("Starting CAGRA indexing, space remaining: " + new File("/").getFreeSpace()); + // log.info("Starting CAGRA indexing, docs: " + field.vectors.size()); + float vectors[][] = new float[field.vectors.size()][field.vectors.get(0).length]; for (int i = 0; i < vectors.length; i++) { for (int j = 0; j < vectors[i].length; j++) { vectors[i][j] = 
field.vectors.get(i)[j]; } } - - cagraIndexBytes = createCagraIndex(vectors, new ArrayList(field.vectors.keySet())); // nocommit + + cagraIndexBytes = createCagraIndex(vectors, new ArrayList(field.vectors.keySet())); bruteForceIndexBytes = createBruteForceIndex(vectors); hnswIndexBytes = createHnswIndex(vectors); } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } - - start = System.currentTimeMillis(); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".cag", cagraIndexBytes); - log.info("time for writing CAGRA index bytes to zip: " + (System.currentTimeMillis() - start)); - - start = System.currentTimeMillis(); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".bf", bruteForceIndexBytes); - log.info("time for writing BRUTEFORCE index bytes to zip: " + (System.currentTimeMillis() - start)); - - start = System.currentTimeMillis(); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".hnsw", hnswIndexBytes); - log.info("time for writing HNSW index bytes to zip: " + (System.currentTimeMillis() - start)); - - start = System.currentTimeMillis(); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".vec", SerializationUtils.serialize(new ArrayList(field.vectors.values()))); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".map", SerializationUtils.serialize(new ArrayList(field.vectors.keySet()))); - log.info("list serializing and writing: " + (System.currentTimeMillis() - start)); + + // start = System.currentTimeMillis(); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".cag", cagraIndexBytes); + // log.info( + // "time for writing CAGRA index bytes to zip: " + (System.currentTimeMillis() - start)); + + // start = System.currentTimeMillis(); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".bf", bruteForceIndexBytes); + /*log.info( + "time for writing BRUTEFORCE index bytes to zip: " + + (System.currentTimeMillis() - start));*/ + + // start = System.currentTimeMillis(); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".hnsw", hnswIndexBytes); + // log.info("time for writing HNSW index bytes to zip: " + (System.currentTimeMillis() - + // start)); + + // start = System.currentTimeMillis(); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".vec", + SerializationUtils.serialize(new ArrayList(field.vectors.values()))); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".map", + SerializationUtils.serialize(new ArrayList(field.vectors.keySet()))); + // log.info("list serializing and writing: " + (System.currentTimeMillis() - start)); field.vectors.clear(); } metaMap.put(segmentWriteState.segmentInfo.name, maxDoc); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); cuVSFile.close(); - + CodecUtil.writeFooter(cuVSIndex); } - + SegmentOutputStream mergeOutputStream = null; CuVSSegmentFile mergedIndexFile = null; @@ -220,43 +268,50 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE readers.add(reader); } - log.info("Merging one field for segment: " + segmentWriteState.segmentInfo.name); - log.info("Segment files? 
" + Arrays.toString(segmentWriteState.directory.listAll())); + // log.info("Merging one field for segment: " + segmentWriteState.segmentInfo.name); + // log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); if (!List.of(segmentWriteState.directory.listAll()).contains(cuVSDataFilename)) { - IndexOutput mergedVectorIndex = segmentWriteState.directory.createOutput(cuVSDataFilename, segmentWriteState.context); - CodecUtil.writeIndexHeader(mergedVectorIndex, CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, - CuVSVectorsFormat.VERSION_CURRENT, segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); - this.mergeOutputStream = new SegmentOutputStream(mergedVectorIndex, 100000); + IndexOutput mergedVectorIndex = + segmentWriteState.directory.createOutput(cuVSDataFilename, segmentWriteState.context); + CodecUtil.writeIndexHeader( + mergedVectorIndex, + CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, + CuVSVectorsFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + this.mergeOutputStream = new SegmentOutputStream(mergedVectorIndex, 100000); mergedIndexFile = new CuVSSegmentFile(this.mergeOutputStream); } - - log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); + + // log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); if (mergeStrategy.equals(MergeStrategy.TRIVIAL_MERGE)) { - Util.getMergedArchiveCOS(segInputStreams, segmentWriteState.segmentInfo.name, this.mergeOutputStream - ); + throw new UnsupportedOperationException(); } else if (mergeStrategy.equals(MergeStrategy.NON_TRIVIAL_MERGE)) { - // nocommit: this doesn't merge all the fields - log.info("Readers: "+segInputStreams.size()+", deocMaps: "+mergeState.docMaps.length); + // log.info("Readers: " + segInputStreams.size() + ", deocMaps: " + + // mergeState.docMaps.length); ArrayList docMapList = new ArrayList(); for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { - CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; - for (CuVSIndex index: reader.cuvsIndexes.get(fieldInfo.name)) { - log.info("Mapping for segment ("+reader.fileName+"): " + index.getMapping()); - log.info("Mapping for segment ("+reader.fileName+"): " + index.getMapping().size()); - for (int id=0; id mergedVectors = Util.getMergedVectors(segInputStreams, fieldInfo.name, segmentWriteState.segmentInfo.name); - log.info("Final mapping: " + docMapList); - log.info("Final mapping: " + docMapList.size()); - log.info("Merged vectors: " + mergedVectors.size()); + + ArrayList mergedVectors = + Util.getMergedVectors( + segInputStreams, fieldInfo.name, segmentWriteState.segmentInfo.name); + // log.info("Final mapping: " + docMapList); + // log.info("Final mapping: " + docMapList.size()); + // log.info("Merged vectors: " + mergedVectors.size()); LinkedHashMap metaMap = new LinkedHashMap(); byte[] cagraIndexBytes = null; byte[] bruteForceIndexBytes = null; @@ -272,27 +327,36 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE bruteForceIndexBytes = createBruteForceIndex(vectors); hnswIndexBytes = createHnswIndex(vectors); } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".cag", cagraIndexBytes); - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".bf", bruteForceIndexBytes); - 
mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".hnsw", hnswIndexBytes); - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".vec", SerializationUtils.serialize(mergedVectors)); - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".map", SerializationUtils.serialize(docMapList)); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".cag", cagraIndexBytes); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".bf", + bruteForceIndexBytes); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".hnsw", hnswIndexBytes); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".vec", + SerializationUtils.serialize(mergedVectors)); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".map", + SerializationUtils.serialize(docMapList)); metaMap.put(segmentWriteState.segmentInfo.name, mergedVectors.size()); - if (mergedIndexFile.getFilesAdded().contains(segmentWriteState.segmentInfo.name + ".meta") == false) { - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); + if (mergedIndexFile.getFilesAdded().contains(segmentWriteState.segmentInfo.name + ".meta") + == false) { + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); } - log.info("DocMaps: "+Arrays.toString(mergeState.docMaps)); + // log.info("DocMaps: " + Arrays.toString(mergeState.docMaps)); metaMap.clear(); } } - @Override public void finish() throws IOException { - if (this.mergeOutputStream!=null) { + if (this.mergeOutputStream != null) { mergedIndexFile.close(); CodecUtil.writeFooter(mergeOutputStream.out); IOUtils.close(mergeOutputStream.out); @@ -334,6 +398,5 @@ public void flush() throws IOException { public void close() throws IOException { this.flush(); } - } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index d4d19fad7041..3c96aa37325b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -1,8 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; import java.util.ArrayList; import java.util.List; - import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; @@ -55,7 +70,7 @@ public int k() { @Override @SuppressWarnings("cast") public boolean collect(int docId, float similarity) { - scoreDocs.add(new ScoreDoc(docId, 1f/(float)(similarity))); + scoreDocs.add(new ScoreDoc(docId, 1f / (float) (similarity))); return true; } @@ -67,8 +82,8 @@ public float minCompetitiveSimilarity() { @Override public TopDocs topDocs() { - return new TopDocs(new TotalHits(scoreDocs.size(), TotalHits.Relation.EQUAL_TO), + return new TopDocs( + new TotalHits(scoreDocs.size(), TotalHits.Relation.EQUAL_TO), scoreDocs.toArray(new ScoreDoc[scoreDocs.size()])); } - } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java index a352269fbb1b..787d7c81cc61 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -1,22 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; import java.io.IOException; import java.io.InputStream; - import org.apache.lucene.store.IndexInput; public class SegmentInputStream extends InputStream { - /** - * - */ + /** */ private final IndexInput indexInput; + public final long initialFilePointerPosition; public final long limit; public long pos = 0; // TODO: This input stream needs to be modified to enable buffering. - public SegmentInputStream(IndexInput indexInput, long limit, long initialFilePointerPosition) throws IOException { + public SegmentInputStream(IndexInput indexInput, long limit, long initialFilePointerPosition) + throws IOException { super(); this.indexInput = indexInput; this.initialFilePointerPosition = initialFilePointerPosition; @@ -86,5 +101,4 @@ public void close() { public int available() { throw new UnsupportedOperationException(); } - -} \ No newline at end of file +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index a8200e7b897b..1ffb75037609 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -1,33 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
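// Note on the collector above: cuVS reports a distance (lower is better) while
// Lucene collectors expect a similarity score (higher is better), so collect()
// stores 1f / similarity, which ranks smaller distances first.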
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.FileNotFoundException; import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.OutputStream; -import java.lang.invoke.MethodHandles; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; -import java.util.logging.Logger; -import java.util.zip.Deflater; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import java.util.zip.ZipOutputStream; +import org.apache.commons.lang3.SerializationUtils; public class Util { - public static ByteArrayOutputStream getZipEntryBAOS(String fileName, SegmentInputStream segInputStream) - throws IOException { + public static ByteArrayOutputStream getZipEntryBAOS( + String fileName, SegmentInputStream segInputStream) throws IOException { segInputStream.reset(); ZipInputStream zipInputStream = new ZipInputStream(segInputStream); ByteArrayOutputStream baos = new ByteArrayOutputStream(); boolean fileFound = false; ZipEntry zipEntry; - while (zipInputStream.available() == 1 && ((zipEntry = zipInputStream.getNextEntry()) != null)) { + while (zipInputStream.available() == 1 + && ((zipEntry = zipInputStream.getNextEntry()) != null)) { if (zipEntry.getName().equals(fileName)) { fileFound = true; byte[] buffer = new byte[1024]; @@ -41,18 +49,19 @@ public static ByteArrayOutputStream getZipEntryBAOS(String fileName, SegmentInpu return baos; } - private static final Logger log = Logger.getLogger(Util.class.getName()); + // private static final Logger log = Logger.getLogger(Util.class.getName()); - public static ArrayList getMergedVectors(List segInputStreams, String fieldName, String mergedSegmentName) + public static ArrayList getMergedVectors( + List segInputStreams, String fieldName, String mergedSegmentName) throws IOException { ZipEntry zs; ArrayList mergedVectors = new ArrayList(); - log.info("Getting mergedVectors..."); + // log.info("Getting mergedVectors..."); for (SegmentInputStream segInputStream : segInputStreams) { segInputStream.reset(); ZipInputStream zipStream = new ZipInputStream(segInputStream); while ((zs = zipStream.getNextEntry()) != null) { - log.info("Getting mergedVectors... " + zs.getName()); + // log.info("Getting mergedVectors... 
" + zs.getName()); byte[] buffer = new byte[1024]; int length; if (zs.getName().endsWith(".vec")) { @@ -62,7 +71,7 @@ public static ArrayList getMergedVectors(List segIn while ((length = zipStream.read(buffer)) != -1) { baosM.write(buffer, 0, length); } - List m = deSerializeListInMemory(baosM.toByteArray()); + List m = SerializationUtils.deserialize(baosM.toByteArray()); mergedVectors.addAll(m); } } @@ -70,73 +79,4 @@ public static ArrayList getMergedVectors(List segIn } return mergedVectors; } - - public static void getMergedArchiveCOS(List segInputStreams, String mergedSegmentName, - OutputStream os) throws IOException { - ZipOutputStream zos = new ZipOutputStream(os); - ZipEntry zs; - Map mergedMetaMap = new LinkedHashMap(); - for (SegmentInputStream segInputStream : segInputStreams) { - segInputStream.reset(); - ZipInputStream zipStream = new ZipInputStream(segInputStream); - while ((zs = zipStream.getNextEntry()) != null) { - byte[] buffer = new byte[1024]; - int length; - if (zs.getName().endsWith(".meta")) { - ByteArrayOutputStream baosM = new ByteArrayOutputStream(); - while ((length = zipStream.read(buffer)) != -1) { - baosM.write(buffer, 0, length); - } - Map m = deSerializeMapInMemory(baosM.toByteArray()); - mergedMetaMap.putAll(m); - } else { - ZipEntry zipEntry = new ZipEntry(zs.getName()); - zos.putNextEntry(zipEntry); - zos.setLevel(Deflater.NO_COMPRESSION); - while ((length = zipStream.read(buffer)) != -1) { - zos.write(buffer, 0, length); - } - zos.closeEntry(); - } - } - } - // Finally put the merged meta file - ZipEntry mergedMetaZipEntry = new ZipEntry(mergedSegmentName + ".meta"); - zos.putNextEntry(mergedMetaZipEntry); - zos.setLevel(Deflater.NO_COMPRESSION); - new ObjectOutputStream(zos).writeObject(mergedMetaMap); // Java serialization should be avoided - zos.closeEntry(); - zos.close(); - } - - @SuppressWarnings("unchecked") - public static Map deSerializeMapInMemory(byte[] bytes) { - Map map = null; - ObjectInputStream ois = null; - try { - ois = new ObjectInputStream(new ByteArrayInputStream(bytes)); - map = (Map) ois.readObject(); - ois.close(); - } catch (Exception e) { - e.printStackTrace(); - } - - return map; - } - - @SuppressWarnings("unchecked") - public static List deSerializeListInMemory(byte[] bytes) { - List map = null; - ObjectInputStream ois = null; - try { - ois = new ObjectInputStream(new ByteArrayInputStream(bytes)); - map = (List) ois.readObject(); - ois.close(); - } catch (Exception e) { - e.printStackTrace(); - } - - return map; - } - } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java index 67199edca2f6..ce9cd8cc52d2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -1 +1,17 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 38b31884377d..6f0a89e365d1 100644 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -1 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index 15a023d6fbd3..70325a3aa294 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -1,14 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
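// Illustrative sketch (not part of the patch): with the META-INF service file above
// on the classpath or module path, the codec is discoverable by name through
// Lucene's SPI lookup, so tests and applications need not instantiate it directly.
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexWriterConfig;

class CodecLookupSketch {
  static IndexWriterConfig cuvsConfig() {
    // resolved via NamedSPILoader from the service registration
    return new IndexWriterConfig().setCodec(Codec.forName("CuVSCodec"));
  }
}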
+ */ package org.apache.lucene.sandbox.vectorsearch; import java.io.IOException; -import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Random; import java.util.TreeMap; - +import java.util.logging.Logger; import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -30,13 +45,11 @@ import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; @SuppressSysoutChecks(bugUrl = "prints info from within cuvs") public class TestCuVS extends LuceneTestCase { - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + protected static Logger log = Logger.getLogger(TestCuVS.class.getName()); private static IndexSearcher searcher; private static IndexReader reader; @@ -45,16 +58,16 @@ public class TestCuVS extends LuceneTestCase { public static int DATASET_SIZE_LIMIT = 1000; public static int DIMENSIONS_LIMIT = 2048; public static int NUM_QUERIES_LIMIT = 10; - public static int TOP_K_LIMIT = 64; // nocommit This fails beyond 64 + public static int TOP_K_LIMIT = 64; // TODO This fails beyond 64 public static float[][] dataset = null; @BeforeClass public static void beforeClass() throws Exception { directory = newDirectory(); - + Codec codec = new CuVSCodec(); - + RandomIndexWriter writer = new RandomIndexWriter( random(), @@ -63,7 +76,7 @@ public static void beforeClass() throws Exception { .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) .setCodec(codec) .setMergePolicy(newTieredMergePolicy())); - + log.info("Merge Policy: " + writer.w.getConfig().getMergePolicy()); Random random = random(); @@ -74,8 +87,10 @@ public static void beforeClass() throws Exception { Document doc = new Document(); doc.add(new StringField("id", String.valueOf(i), Field.Store.YES)); doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES)); - boolean skipVector = random.nextInt(10) < 0; // nocommit disable testing with holes for now, there's some bug. - if (!skipVector || datasetSize<100) { // about 10th of the documents shouldn't have a single vector + boolean skipVector = + random.nextInt(10) < 0; // disable testing with holes for now, there's some bug. 
+ if (!skipVector + || datasetSize < 100) { // about 10th of the documents shouldn't have a single vector doc.add(new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); doc.add(new KnnFloatVectorField("vector2", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); } @@ -90,7 +105,6 @@ public static void beforeClass() throws Exception { @AfterClass public static void afterClass() throws Exception { - // nocommit This fails until flat vectors are implemented reader.close(); directory.close(); searcher = null; @@ -105,46 +119,30 @@ public void testVectorSearch() throws IOException { int numQueries = random.nextInt(NUM_QUERIES_LIMIT) + 1; int topK = Math.min(random.nextInt(TOP_K_LIMIT) + 1, dataset.length); - if(dataset.length < topK) topK = dataset.length; + if (dataset.length < topK) topK = dataset.length; float[][] queries = generateQueries(random, dataset[0].length, numQueries); List> expected = generateExpectedResults(topK, dataset, queries); - - debugPrintDatasetAndQueries(dataset, queries); - log.info("Dataset size: {}x{}", dataset.length, dataset[0].length); - log.info("Query size: {}x{}", numQueries, queries[0].length); - log.info("TopK: {}", topK); + log.info("Dataset size: " + dataset.length + "x" + dataset[0].length); + log.info("Query size: " + numQueries + "x" + queries[0].length); + log.info("TopK: " + topK); Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); int correct[] = new int[topK]; - for (int i=0; i> generateExpectedResults(int topK, float[][] dataset, float[][] queries) { + + private static List> generateExpectedResults( + int topK, float[][] dataset, float[][] queries) { List> neighborsResult = new ArrayList<>(); int dimensions = dataset[0].length; @@ -186,13 +185,19 @@ private static List> generateExpectedResults(int topK, float[][] d Map sorted = new TreeMap(distances); log.info("EXPECTED: " + sorted); - + // Sort by distance and select the topK nearest neighbors - List neighbors = distances.entrySet().stream() - .sorted(Map.Entry.comparingByValue()) - .map(Map.Entry::getKey) - .toList(); - neighborsResult.add(neighbors.subList(0, Math.min(topK * 3, dataset.length))); // generate double the topK results in the expected array + List neighbors = + distances.entrySet().stream() + .sorted(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey) + .toList(); + neighborsResult.add( + neighbors.subList( + 0, + Math.min( + topK * 3, + dataset.length))); // generate double the topK results in the expected array } log.info("Expected results generated successfully."); diff --git a/versions.lock b/versions.lock index 26de44f99e2d..dfa465a1b3fe 100644 --- a/versions.lock +++ b/versions.lock @@ -4,6 +4,7 @@ "main_dependencies" : { "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "fa9ef26b,refs=4", "com.ibm.icu:icu4j:74.2" : "47ea4550,refs=6", + "com.nvidia.cuvs:cuvs-java:25.02" : "0129b4f0,refs=6", "commons-codec:commons-codec:1.13" : "e9962aab,refs=4", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.1" : "fa9ef26b,refs=4", @@ -11,6 +12,7 @@ "net.sourceforge.nekohtml:nekohtml:1.9.17" : "5ce8cdc6,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "d9953130,refs=4", "org.apache.commons:commons-compress:1.19" : "5ce8cdc6,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "0129b4f0,refs=6", "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", "org.apache.opennlp:opennlp-tools:2.3.2" : "2f760bab,refs=4", "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", @@ -46,6 +48,7 
@@ "com.google.j2objc:j2objc-annotations:1.3" : "6897bc09,refs=38", "com.google.protobuf:protobuf-java:3.19.2" : "6897bc09,refs=38", "com.ibm.icu:icu4j:74.2" : "ffa00415,refs=8", + "com.nvidia.cuvs:cuvs-java:25.02" : "7ac6f8d9,refs=9", "commons-codec:commons-codec:1.13" : "733734f0,refs=6", "io.github.java-diff-utils:java-diff-utils:4.0" : "6897bc09,refs=38", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", @@ -55,6 +58,7 @@ "net.sourceforge.nekohtml:nekohtml:1.9.17" : "6f16ff86,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "6fbc4021,refs=5", "org.apache.commons:commons-compress:1.19" : "6f16ff86,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "7ac6f8d9,refs=9", "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", "org.apache.opennlp:opennlp-tools:2.3.2" : "b91715f0,refs=6", "org.assertj:assertj-core:3.21.0" : "b7ba1646,refs=2", @@ -79,6 +83,32 @@ } }, "because" : { + "0129b4f0" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "152d9f78" : [ { "configuration" : "annotationProcessor", @@ -405,6 +435,44 @@ "projectPath" : ":lucene:analysis:morfologik" } ], + "7ac6f8d9" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "85a1e4c6" : [ { "configuration" : "compileClasspath", diff --git a/versions.toml b/versions.toml index 327848fd10d4..d0db5fd20d9d 100644 --- a/versions.toml +++ b/versions.toml @@ -35,7 +35,6 @@ s2-geometry = "1.0.0" spatial4j = "0.8" xerces = "2.12.0" zstd = "1.5.5-11" -jackson-core = "2.18.2" [libraries] antlr-core = { module = "org.antlr:antlr4", version.ref = "antlr" } @@ -57,7 +56,6 @@ flexmark-ext-tables = { module = "com.vladsch.flexmark:flexmark-ext-tables", ver groovy = { module = "org.apache.groovy:groovy-all", version.ref = "groovy" } hamcrest = { module = "org.hamcrest:hamcrest", version.ref = "hamcrest" } icu4j = { module = "com.ibm.icu:icu4j", version.ref = "icu4j" } -jackson-core = { module = "com.fasterxml.jackson.core:jackson-core", version.ref = "jackson-core" } javacc = { module = "net.java.dev.javacc:javacc", version.ref = "javacc" } jflex = { module = "de.jflex:jflex", version.ref = "jflex" } jgit = { module = "org.eclipse.jgit:org.eclipse.jgit", version.ref = "jgit" } From 4bca45fb3b237790e32f9e8e7b3c2f0108c5bd84 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Fri, 17 Jan 2025 21:03:06 +0530 Subject: [PATCH 
05/34] Adding Javadocs to some public methods --- lucene/licenses/cuvs-java-25.02.jar.sha1 | 2 +- .../sandbox/vectorsearch/CagraFieldVectorsWriter.java | 3 +++ .../apache/lucene/sandbox/vectorsearch/CuVSCodec.java | 3 +++ .../apache/lucene/sandbox/vectorsearch/CuVSIndex.java | 3 +++ .../sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java | 3 +++ .../lucene/sandbox/vectorsearch/CuVSSegmentFile.java | 3 +++ .../lucene/sandbox/vectorsearch/CuVSVectorsFormat.java | 3 +++ .../lucene/sandbox/vectorsearch/CuVSVectorsReader.java | 3 +++ .../lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 9 +++++++++ .../sandbox/vectorsearch/PerLeafCuVSKnnCollector.java | 3 +++ .../lucene/sandbox/vectorsearch/SegmentInputStream.java | 3 +++ .../org/apache/lucene/sandbox/vectorsearch/Util.java | 3 +++ .../apache/lucene/sandbox/vectorsearch/package-info.java | 4 ++++ 13 files changed, 44 insertions(+), 1 deletion(-) diff --git a/lucene/licenses/cuvs-java-25.02.jar.sha1 b/lucene/licenses/cuvs-java-25.02.jar.sha1 index e399aed842a5..42b4dae43805 100644 --- a/lucene/licenses/cuvs-java-25.02.jar.sha1 +++ b/lucene/licenses/cuvs-java-25.02.jar.sha1 @@ -1 +1 @@ -280c6f97d99a8d32500a0c0891db1ccdc49bc17b +870f2aed1a4633489cc9c3d33128683e668a0f30 diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index df8f83966dc3..6940b9bfeea6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -22,6 +22,9 @@ import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.index.FieldInfo; +/** + * CuVS based fields writer + */ public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { public final String fieldName; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index 315923d1eeb2..1e3c85d746ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -24,6 +24,9 @@ import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; +/** + * CuVS based codec for GPU based vector search + */ public class CuVSCodec extends FilterCodec { public CuVSCodec() { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 98a2eb9739ac..6d2a4e281911 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -21,6 +21,9 @@ import java.util.List; import java.util.Objects; +/** + * This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) 
+ */ public class CuVSIndex { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index e4df14208f97..e4ce49fb84f7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -24,6 +24,9 @@ import org.apache.lucene.search.knn.KnnCollectorManager; import org.apache.lucene.util.Bits; +/** + * Query for CuVS + */ public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { private final int iTopK; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java index 7b850daa6662..e6be4726f16e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -26,6 +26,9 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; +/** + * Methods to deal with a CuVS composite file inside a segment + */ public class CuVSSegmentFile implements AutoCloseable { private final ZipOutputStream zos; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index e2b5bc2169f5..e3928a31b050 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -24,6 +24,9 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; +/** + * CuVS based KnnVectorsFormat for GPU acceleration + */ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index d7e8a5f19b08..b41e5c08f177 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -51,6 +51,9 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; +/** + * KnnVectorsReader instance associated with CuVS format + */ public class CuVSVectorsReader extends KnnVectorsReader { // protected Logger log = Logger.getLogger(getClass().getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index d5c155ca7212..bb40b7119a0e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -42,6 +42,9 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.SuppressForbidden; +/** + * KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU + */ public class CuVSVectorsWriter extends 
KnnVectorsWriter { // protected Logger log = Logger.getLogger(getClass().getName()); @@ -60,6 +63,9 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private MergeStrategy mergeStrategy; private CuVSResources resources; + /** + * Merge strategy used for CuVS + */ public enum MergeStrategy { TRIVIAL_MERGE, NON_TRIVIAL_MERGE @@ -365,6 +371,9 @@ public void finish() throws IOException { } } + /** + * OutputStream for writing into an IndexOutput + */ public class SegmentOutputStream extends OutputStream { IndexOutput out; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index 3c96aa37325b..a1473c4acf20 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -23,6 +23,9 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; +/** + * KnnCollector for CuVS + */ public class PerLeafCuVSKnnCollector implements KnnCollector { public List scoreDocs; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java index 787d7c81cc61..47c6d3c3cedf 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -20,6 +20,9 @@ import java.io.InputStream; import org.apache.lucene.store.IndexInput; +/** + * InputStream semantics for reading from an IndexInput + */ public class SegmentInputStream extends InputStream { /** */ diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index 1ffb75037609..dfe60b29ea27 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -25,6 +25,9 @@ import java.util.zip.ZipInputStream; import org.apache.commons.lang3.SerializationUtils; +/** + * Some Utils used in CuVS integration + */ public class Util { public static ByteArrayOutputStream getZipEntryBAOS( diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java index ce9cd8cc52d2..a11c94e7224b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -14,4 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +/** + * CuVS based fast vector search + */ package org.apache.lucene.sandbox.vectorsearch; From 63374b1633f518a504b580343434e0bce8e3f5a7 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Mon, 27 Jan 2025 10:49:08 +0000 Subject: [PATCH 06/34] fix maxDocs checks in CuVSIndex --- .../org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 6d2a4e281911..adb820bbb4c1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -48,7 +48,10 @@ public CuVSIndex( this.vectors = Objects.requireNonNull(vectors); this.fieldName = Objects.requireNonNull(fieldName); this.segmentName = Objects.requireNonNull(segmentName); - this.maxDocs = Objects.requireNonNull(maxDocs); + if (maxDocs < 0) { + throw new IllegalArgumentException("negative maxDocs:" +maxDocs); + } + this.maxDocs = maxDocs; } public CagraIndex getCagraIndex() { From f75c50e3316a1802e9c46f04c37126ca77a8c4ff Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Mon, 27 Jan 2025 10:50:22 +0000 Subject: [PATCH 07/34] tidy - just to remove noise --- .../vectorsearch/CagraFieldVectorsWriter.java | 4 +--- .../lucene/sandbox/vectorsearch/CuVSCodec.java | 4 +--- .../lucene/sandbox/vectorsearch/CuVSIndex.java | 6 ++---- .../vectorsearch/CuVSKnnFloatVectorQuery.java | 4 +--- .../lucene/sandbox/vectorsearch/CuVSSegmentFile.java | 4 +--- .../sandbox/vectorsearch/CuVSVectorsFormat.java | 4 +--- .../sandbox/vectorsearch/CuVSVectorsReader.java | 4 +--- .../sandbox/vectorsearch/CuVSVectorsWriter.java | 12 +++--------- .../vectorsearch/PerLeafCuVSKnnCollector.java | 4 +--- .../sandbox/vectorsearch/SegmentInputStream.java | 4 +--- .../org/apache/lucene/sandbox/vectorsearch/Util.java | 4 +--- .../lucene/sandbox/vectorsearch/package-info.java | 4 +--- 12 files changed, 15 insertions(+), 43 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index 6940b9bfeea6..de2c7315f033 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -22,9 +22,7 @@ import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.index.FieldInfo; -/** - * CuVS based fields writer - */ +/** CuVS based fields writer */ public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { public final String fieldName; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index 1e3c85d746ef..32ca1077887c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -24,9 +24,7 @@ import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; -/** - * CuVS based codec for GPU based vector search - */ +/** CuVS based codec for GPU based vector search */ public class CuVSCodec extends FilterCodec { public CuVSCodec() { diff 
--git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index adb820bbb4c1..9258a04fc5c2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -21,9 +21,7 @@ import java.util.List; import java.util.Objects; -/** - * This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) - */ +/** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) */ public class CuVSIndex { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; @@ -49,7 +47,7 @@ public CuVSIndex( this.fieldName = Objects.requireNonNull(fieldName); this.segmentName = Objects.requireNonNull(segmentName); if (maxDocs < 0) { - throw new IllegalArgumentException("negative maxDocs:" +maxDocs); + throw new IllegalArgumentException("negative maxDocs:" + maxDocs); } this.maxDocs = maxDocs; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index e4ce49fb84f7..2f6c636590ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -24,9 +24,7 @@ import org.apache.lucene.search.knn.KnnCollectorManager; import org.apache.lucene.util.Bits; -/** - * Query for CuVS - */ +/** Query for CuVS */ public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { private final int iTopK; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java index e6be4726f16e..ddbf8fc9d29e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -26,9 +26,7 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -/** - * Methods to deal with a CuVS composite file inside a segment - */ +/** Methods to deal with a CuVS composite file inside a segment */ public class CuVSSegmentFile implements AutoCloseable { private final ZipOutputStream zos; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index e3928a31b050..d2f6c78417f5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -24,9 +24,7 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; -/** - * CuVS based KnnVectorsFormat for GPU acceleration - */ +/** CuVS based KnnVectorsFormat for GPU acceleration */ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index b41e5c08f177..d65dfbd288cc 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -51,9 +51,7 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; -/** - * KnnVectorsReader instance associated with CuVS format - */ +/** KnnVectorsReader instance associated with CuVS format */ public class CuVSVectorsReader extends KnnVectorsReader { // protected Logger log = Logger.getLogger(getClass().getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index bb40b7119a0e..2f595def8446 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -42,9 +42,7 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.SuppressForbidden; -/** - * KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU - */ +/** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ public class CuVSVectorsWriter extends KnnVectorsWriter { // protected Logger log = Logger.getLogger(getClass().getName()); @@ -63,9 +61,7 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private MergeStrategy mergeStrategy; private CuVSResources resources; - /** - * Merge strategy used for CuVS - */ + /** Merge strategy used for CuVS */ public enum MergeStrategy { TRIVIAL_MERGE, NON_TRIVIAL_MERGE @@ -371,9 +367,7 @@ public void finish() throws IOException { } } - /** - * OutputStream for writing into an IndexOutput - */ + /** OutputStream for writing into an IndexOutput */ public class SegmentOutputStream extends OutputStream { IndexOutput out; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index a1473c4acf20..ffba5f0c0f1f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -23,9 +23,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; -/** - * KnnCollector for CuVS - */ +/** KnnCollector for CuVS */ public class PerLeafCuVSKnnCollector implements KnnCollector { public List scoreDocs; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java index 47c6d3c3cedf..73fba879f6ad 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -20,9 +20,7 @@ import java.io.InputStream; import org.apache.lucene.store.IndexInput; -/** - * InputStream semantics for reading from an IndexInput - */ +/** InputStream semantics for reading from an IndexInput */ public class SegmentInputStream extends InputStream { /** */ diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index dfe60b29ea27..35eaf35bc920 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -25,9 +25,7 @@ import java.util.zip.ZipInputStream; import org.apache.commons.lang3.SerializationUtils; -/** - * Some Utils used in CuVS integration - */ +/** Some Utils used in CuVS integration */ public class Util { public static ByteArrayOutputStream getZipEntryBAOS( diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java index a11c94e7224b..86c56b909dd1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -15,7 +15,5 @@ * limitations under the License. */ -/** - * CuVS based fast vector search - */ +/** CuVS based fast vector search */ package org.apache.lucene.sandbox.vectorsearch; From 3bc4469b9d7a73d7bc03d75a839a4663aaa4e34f Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 29 Jan 2025 13:44:19 +0000 Subject: [PATCH 08/34] Add temp permissions, etc, to allow testing to succeed. With this I can minimally get the following to complete successfully: gradlew :lucene:sandbox:test --tests "org.apache.lucene.sandbox.vectorsearch.*" --- gradle/testing/defaults-tests.gradle | 2 +- gradle/testing/randomization/policies/tests.policy | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index 14e64647d667..be0004b72378 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -143,7 +143,7 @@ allprojects { ':lucene:codecs', ":lucene:distribution.tests", ":lucene:test-framework" - ] ? 
'ALL-UNNAMED' : 'org.apache.lucene.core,com.nvidia.cuvs') def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties") def tempDir = layout.projectDirectory.dir(testsTmpDir.toString()) diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index f8e09ba03661..41cb5d60e44e 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -80,6 +80,12 @@ grant { permission java.io.FilePermission "${hunspell.corpora}${/}-", "read"; permission java.io.FilePermission "${hunspell.dictionaries}", "read"; permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read"; + + // TODO: these are just temporary to allow testing with cuvs-java + permission java.lang.RuntimePermission "getenv.CUVS_JAVA_SO_PATH"; + permission java.io.FilePermission "${/}-", "read"; + // For temporary files to communicate with cuvs + permission java.io.FilePermission "${/}tmp${/}-", "write,delete"; }; // Permissions for jacoco code coverage From 705283f6b0bc5650a68bf8928b93831afe98ac25 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 29 Jan 2025 13:51:18 +0000 Subject: [PATCH 09/34] package-private where possible --- .../lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java | 2 +- .../java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java | 2 +- .../lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java | 2 +- .../org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java | 2 +- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java | 2 +- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java | 2 +- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 2 +- .../lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java | 2 +- .../apache/lucene/sandbox/vectorsearch/SegmentInputStream.java | 2 +- .../src/java/org/apache/lucene/sandbox/vectorsearch/Util.java | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index de2c7315f033..e712d69c1ef1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -23,7 +23,7 @@ import org.apache.lucene.index.FieldInfo; /** CuVS based fields writer */ -public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { +/*package-private*/ class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { public final String fieldName; public final ConcurrentHashMap vectors = diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 9258a04fc5c2..7b8c19996195 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -22,7 +22,7 @@ import java.util.Objects; /** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) 
*/ -public class CuVSIndex { +/*package-private*/ class CuVSIndex { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; private final List mapping; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index 2f6c636590ef..efa4ce51e77a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -25,7 +25,7 @@ import org.apache.lucene.util.Bits; /** Query for CuVS */ -public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { +/*package-private*/ class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { private final int iTopK; private final int searchWidth; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java index ddbf8fc9d29e..9b12cdf61012 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -27,7 +27,7 @@ import java.util.zip.ZipOutputStream; /** Methods to deal with a CuVS composite file inside a segment */ -public class CuVSSegmentFile implements AutoCloseable { +/*package-private*/ class CuVSSegmentFile implements AutoCloseable { private final ZipOutputStream zos; private Set filesAdded = new HashSet(); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index d2f6c78417f5..96f1c889be5d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -25,7 +25,7 @@ import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; /** CuVS based KnnVectorsFormat for GPU acceleration */ -public class CuVSVectorsFormat extends KnnVectorsFormat { +/*package-private*/ class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; public static final String VECTOR_DATA_EXTENSION = "cag"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index d65dfbd288cc..f23255792b84 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -52,7 +52,7 @@ import org.apache.lucene.util.IOUtils; /** KnnVectorsReader instance associated with CuVS format */ -public class CuVSVectorsReader extends KnnVectorsReader { +/*package-private*/ class CuVSVectorsReader extends KnnVectorsReader { // protected Logger log = Logger.getLogger(getClass().getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 2f595def8446..c652f5333a74 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -43,7 +43,7 @@ import org.apache.lucene.util.SuppressForbidden; /** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ -public class CuVSVectorsWriter extends KnnVectorsWriter { +/*package-private*/ class CuVSVectorsWriter extends KnnVectorsWriter { // protected Logger log = Logger.getLogger(getClass().getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index ffba5f0c0f1f..23d524cef182 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -24,7 +24,7 @@ import org.apache.lucene.search.TotalHits; /** KnnCollector for CuVS */ -public class PerLeafCuVSKnnCollector implements KnnCollector { +/*package-private*/ class PerLeafCuVSKnnCollector implements KnnCollector { public List scoreDocs; public int topK = 0; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java index 73fba879f6ad..8f81c8bb7f15 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -21,7 +21,7 @@ import org.apache.lucene.store.IndexInput; /** InputStream semantics for reading from an IndexInput */ -public class SegmentInputStream extends InputStream { +/*package-private*/ class SegmentInputStream extends InputStream { /** */ private final IndexInput indexInput; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index 35eaf35bc920..a19e7d4681a5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -26,7 +26,7 @@ import org.apache.commons.lang3.SerializationUtils; /** Some Utils used in CuVS integration */ -public class Util { +/*package-private*/ class Util { public static ByteArrayOutputStream getZipEntryBAOS( String fileName, SegmentInputStream segInputStream) throws IOException { From 834e5607289de9a09d42eb3cbc8df8ca97bf4928 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 2 Feb 2025 15:19:55 +0000 Subject: [PATCH 10/34] minimal update for the new cuvs-java api modifications --- lucene/licenses/cuvs-java-25.02.0.jar.sha1 | 1 + lucene/licenses/cuvs-java-25.02.jar.sha1 | 1 - .../sandbox/vectorsearch/CuVSCodec.java | 4 +-- .../vectorsearch/CuVSVectorsFormat.java | 10 +++---- .../vectorsearch/CuVSVectorsReader.java | 8 ++--- .../vectorsearch/CuVSVectorsWriter.java | 29 ++++++++++--------- versions.lock | 4 +-- versions.toml | 2 +- 8 files changed, 30 insertions(+), 29 deletions(-) create mode 100644 lucene/licenses/cuvs-java-25.02.0.jar.sha1 delete mode 100644 lucene/licenses/cuvs-java-25.02.jar.sha1 diff --git a/lucene/licenses/cuvs-java-25.02.0.jar.sha1 b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 new file mode 100644 index 000000000000..f4abed6a16c0 --- /dev/null +++ b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 @@ -0,0 +1 @@ +bee6c3f5bfdc4a4d21a079f8fc2837c42eb37560 diff --git 
a/lucene/licenses/cuvs-java-25.02.jar.sha1 b/lucene/licenses/cuvs-java-25.02.jar.sha1 deleted file mode 100644 index 42b4dae43805..000000000000 --- a/lucene/licenses/cuvs-java-25.02.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -870f2aed1a4633489cc9c3d33128683e668a0f30 diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index 32ca1077887c..f455a863a9a1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -16,7 +16,7 @@ */ package org.apache.lucene.sandbox.vectorsearch; -import com.nvidia.cuvs.LibraryNotFoundException; +import com.nvidia.cuvs.LibraryException; import java.util.logging.Logger; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; @@ -37,7 +37,7 @@ public CuVSCodec(String name, Codec delegate) { try { format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE); setKnnFormat(format); - } catch (LibraryNotFoundException ex) { + } catch (LibraryException ex) { Logger log = Logger.getLogger(CuVSCodec.class.getName()); log.severe("Couldn't load native library, possible classloader issue. " + ex.getMessage()); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 96f1c889be5d..dfc224bf6309 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -17,7 +17,7 @@ package org.apache.lucene.sandbox.vectorsearch; import com.nvidia.cuvs.CuVSResources; -import com.nvidia.cuvs.LibraryNotFoundException; +import com.nvidia.cuvs.LibraryException; import java.io.IOException; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.index.SegmentReadState; @@ -44,7 +44,7 @@ public CuVSVectorsFormat() { this.intGraphDegree = 128; this.graphDegree = 64; try { - resources = new CuVSResources(); + resources = CuVSResources.create(); } catch (Throwable e) { throw new RuntimeException(e); } @@ -52,15 +52,15 @@ public CuVSVectorsFormat() { public CuVSVectorsFormat( int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) - throws LibraryNotFoundException { + throws LibraryException { super("CuVSVectorsFormat"); this.mergeStrategy = mergeStrategy; this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; try { - resources = new CuVSResources(); - } catch (LibraryNotFoundException ex) { + resources = CuVSResources.create(); + } catch (LibraryException ex) { throw ex; } catch (Throwable e) { throw new RuntimeException(e); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index f23255792b84..b93d7113036d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -160,7 +160,7 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i { cagraIndexes.put( segmentField, - new CagraIndex.Builder(resources) + CagraIndex.newBuilder(resources) .from(new ByteArrayInputStream(baos.toByteArray())) 
.build()); break; @@ -169,17 +169,17 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i { bruteforceIndexes.put( segmentField, - new BruteForceIndex.Builder(resources) + BruteForceIndex.newBuilder(resources) .from(new ByteArrayInputStream(baos.toByteArray())) .build()); break; } case "hnsw": { - HnswIndexParams indexParams = new HnswIndexParams.Builder(resources).build(); + HnswIndexParams indexParams = new HnswIndexParams.Builder().build(); hnswIndexes.put( segmentField, - new HnswIndex.Builder(resources) + HnswIndex.newBuilder(resources) .from(new ByteArrayInputStream(baos.toByteArray())) .withIndexParams(indexParams) .build()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index c652f5333a74..9de52248004f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -23,9 +23,10 @@ import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; import com.nvidia.cuvs.CuVSResources; import java.io.ByteArrayOutputStream; -import java.io.File; import java.io.IOException; import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -55,11 +56,11 @@ private CagraIndex cagraIndex; private CagraIndex cagraIndexForHnsw; - private int cuvsWriterThreads; - private int intGraphDegree; - private int graphDegree; - private MergeStrategy mergeStrategy; - private CuVSResources resources; + private final int cuvsWriterThreads; + private final int intGraphDegree; + private final int graphDegree; + private final MergeStrategy mergeStrategy; + private final CuVSResources resources; /** Merge strategy used for CuVS */ public enum MergeStrategy { @@ -113,7 +114,7 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException @SuppressForbidden(reason = "A temporary java.util.File is needed for Cagra's serialization") private byte[] createCagraIndex(float[][] vectors, List mapping) throws Throwable { CagraIndexParams indexParams = - new CagraIndexParams.Builder(resources) + new CagraIndexParams.Builder() .withNumWriterThreads(cuvsWriterThreads) .withIntermediateGraphDegree(intGraphDegree) .withGraphDegree(graphDegree) @@ -122,13 +123,13 @@ private byte[] createCagraIndex(float[][] vectors, List mapping) throws // log.info("Indexing started: " + System.currentTimeMillis()); cagraIndex = - new CagraIndex.Builder(resources).withDataset(vectors).withIndexParams(indexParams).build(); + CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + // vectors.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); - File tmpFile = - File.createTempFile( + Path tmpFile = + Files.createTempFile( "tmpindex", "cag"); // TODO: Should we make this a file with random names? 
cagraIndex.serialize(baos, tmpFile); return baos.toByteArray(); @@ -143,7 +144,7 @@ private byte[] createBruteForceIndex(float[][] vectors) throws Throwable { // log.info("Indexing started: " + System.currentTimeMillis()); BruteForceIndex index = - new BruteForceIndex.Builder(resources) + BruteForceIndex.newBuilder(resources) .withIndexParams(indexParams) .withDataset(vectors) .build(); @@ -157,7 +158,7 @@ private byte[] createBruteForceIndex(float[][] vectors) throws Throwable { @SuppressForbidden(reason = "A temporary java.util.File is needed for HNSW's serialization") private byte[] createHnswIndex(float[][] vectors) throws Throwable { CagraIndexParams indexParams = - new CagraIndexParams.Builder(resources) + new CagraIndexParams.Builder() .withNumWriterThreads(cuvsWriterThreads) .withIntermediateGraphDegree(intGraphDegree) .withGraphDegree(graphDegree) @@ -166,12 +167,12 @@ private byte[] createHnswIndex(float[][] vectors) throws Throwable { // log.info("Indexing started: " + System.currentTimeMillis()); cagraIndexForHnsw = - new CagraIndex.Builder(resources).withDataset(vectors).withIndexParams(indexParams).build(); + CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + // vectors.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); - File tmpFile = File.createTempFile("tmpindex", "hnsw"); + Path tmpFile = Files.createTempFile("tmpindex", "hnsw"); cagraIndexForHnsw.serializeToHNSW(baos, tmpFile); return baos.toByteArray(); } diff --git a/versions.lock b/versions.lock index dfa465a1b3fe..b9d5fa0a17a1 100644 --- a/versions.lock +++ b/versions.lock @@ -4,7 +4,7 @@ "main_dependencies" : { "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "fa9ef26b,refs=4", "com.ibm.icu:icu4j:74.2" : "47ea4550,refs=6", - "com.nvidia.cuvs:cuvs-java:25.02" : "0129b4f0,refs=6", + "com.nvidia.cuvs:cuvs-java:25.02.0" : "0129b4f0,refs=6", "commons-codec:commons-codec:1.13" : "e9962aab,refs=4", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.1" : "fa9ef26b,refs=4", @@ -48,7 +48,7 @@ "com.google.j2objc:j2objc-annotations:1.3" : "6897bc09,refs=38", "com.google.protobuf:protobuf-java:3.19.2" : "6897bc09,refs=38", "com.ibm.icu:icu4j:74.2" : "ffa00415,refs=8", - "com.nvidia.cuvs:cuvs-java:25.02" : "7ac6f8d9,refs=9", + "com.nvidia.cuvs:cuvs-java:25.02.0" : "7ac6f8d9,refs=9", "commons-codec:commons-codec:1.13" : "733734f0,refs=6", "io.github.java-diff-utils:java-diff-utils:4.0" : "6897bc09,refs=38", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", diff --git a/versions.toml b/versions.toml index d0db5fd20d9d..06c2247422a4 100644 --- a/versions.toml +++ b/versions.toml @@ -5,7 +5,7 @@ assertj = "3.21.0" commons-codec = "1.13" commons-compress = "1.19" commons-lang3 = "3.17.0" -cuvs = "25.02" +cuvs = "25.02.0" ecj = "3.36.0" errorprone = "2.18.0" flexmark = "0.61.24" From 3772c4c46c7996fd00291b9afc997bdd244da1a0 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 2 Feb 2025 15:34:56 +0000 Subject: [PATCH 11/34] add filter cuvs service provider --- lucene/licenses/cuvs-java-25.02.0.jar.sha1 | 2 +- lucene/sandbox/src/java/module-info.java | 2 + .../vectorsearch/FilterCuVSProvider.java | 43 +++++++++++++++++++ .../FilterCuVSServiceProvider.java | 11 +++++ .../com.nvidia.cuvs.spi.CuVSServiceProvider | 16 +++++++ 5 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 
lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java create mode 100644 lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider diff --git a/lucene/licenses/cuvs-java-25.02.0.jar.sha1 b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 index f4abed6a16c0..ccb02e86aa8c 100644 --- a/lucene/licenses/cuvs-java-25.02.0.jar.sha1 +++ b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 @@ -1 +1 @@ -bee6c3f5bfdc4a4d21a079f8fc2837c42eb37560 +0086126edbd145e5d0be65e6157e96e3e8a2ebca diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 051c1df0a257..8b182a6e050c 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -45,4 +45,6 @@ // org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; provides org.apache.lucene.codecs.Codec with org.apache.lucene.sandbox.vectorsearch.CuVSCodec; + provides com.nvidia.cuvs.spi.CuVSServiceProvider with + org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java new file mode 100644 index 000000000000..641ef40acb3b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java @@ -0,0 +1,43 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.HnswIndex; +import com.nvidia.cuvs.spi.CuVSProvider; + +import java.nio.file.Path; + +public class FilterCuVSProvider implements CuVSProvider { + + private final CuVSProvider delegate; + + FilterCuVSProvider(CuVSProvider delegate) { + this.delegate = delegate; + } + + @Override + public Path nativeLibraryPath() { + return CuVSProvider.TMPDIR; + } + + @Override + public CuVSResources newCuVSResources(Path tempPath) throws Throwable { + return delegate.newCuVSResources(tempPath); + } + + @Override + public BruteForceIndex.Builder newBruteForceIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { + return delegate.newBruteForceIndexBuilder(cuVSResources); + } + + @Override + public CagraIndex.Builder newCagraIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { + return delegate.newCagraIndexBuilder(cuVSResources); + } + + @Override + public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { + return delegate.newHnswIndexBuilder(cuVSResources); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java new file mode 100644 index 000000000000..7840b07a86cc --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java @@ -0,0 +1,11 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.spi.CuVSProvider; +import com.nvidia.cuvs.spi.CuVSServiceProvider; + +public class FilterCuVSServiceProvider extends CuVSServiceProvider { + @Override + public CuVSProvider get(CuVSProvider builtinProvider) { + return new FilterCuVSProvider(builtinProvider); + } +} diff --git 
a/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider b/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider new file mode 100644 index 000000000000..5e7ceba19343 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider \ No newline at end of file From 8453bb1832ca09daf1e17c63194e2ce71ef2f5f4 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 2 Feb 2025 19:05:42 +0000 Subject: [PATCH 12/34] cleanup --- .../vectorsearch/FilterCuVSProvider.java | 80 ++++++++++++------- .../FilterCuVSServiceProvider.java | 16 ++++ 2 files changed, 65 insertions(+), 31 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java index 641ef40acb3b..155d9301ab36 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; import com.nvidia.cuvs.BruteForceIndex; @@ -5,39 +21,41 @@ import com.nvidia.cuvs.CuVSResources; import com.nvidia.cuvs.HnswIndex; import com.nvidia.cuvs.spi.CuVSProvider; - import java.nio.file.Path; public class FilterCuVSProvider implements CuVSProvider { - private final CuVSProvider delegate; - - FilterCuVSProvider(CuVSProvider delegate) { - this.delegate = delegate; - } - - @Override - public Path nativeLibraryPath() { - return CuVSProvider.TMPDIR; - } - - @Override - public CuVSResources newCuVSResources(Path tempPath) throws Throwable { - return delegate.newCuVSResources(tempPath); - } - - @Override - public BruteForceIndex.Builder newBruteForceIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { - return delegate.newBruteForceIndexBuilder(cuVSResources); - } - - @Override - public CagraIndex.Builder newCagraIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { - return delegate.newCagraIndexBuilder(cuVSResources); - } - - @Override - public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { - return delegate.newHnswIndexBuilder(cuVSResources); - } + private final CuVSProvider delegate; + + FilterCuVSProvider(CuVSProvider delegate) { + this.delegate = delegate; + } + + @Override + public Path nativeLibraryPath() { + return CuVSProvider.TMPDIR; + } + + @Override + public CuVSResources newCuVSResources(Path tempPath) throws Throwable { + return delegate.newCuVSResources(tempPath); + } + + @Override + public BruteForceIndex.Builder newBruteForceIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newBruteForceIndexBuilder(cuVSResources); + } + + @Override + public CagraIndex.Builder newCagraIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newCagraIndexBuilder(cuVSResources); + } + + @Override + public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newHnswIndexBuilder(cuVSResources); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java index 7840b07a86cc..65dbf5d14737 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; import com.nvidia.cuvs.spi.CuVSProvider; From 2bce954d07477c8871c62c27958ec7c360c9f07d Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Mon, 3 Feb 2025 17:03:09 +0000 Subject: [PATCH 13/34] itr : remove dep on commons lang3, fix visibility issues --- gradle/testing/defaults-tests.gradle | 2 +- lucene/sandbox/src/java/module-info.java | 9 ++- .../vectorsearch/CagraFieldVectorsWriter.java | 5 +- .../vectorsearch/CuVSVectorsFormat.java | 2 +- .../vectorsearch/CuVSVectorsReader.java | 7 +- .../vectorsearch/CuVSVectorsWriter.java | 1 - .../vectorsearch/FilterCuVSProvider.java | 2 +- .../FilterCuVSServiceProvider.java | 1 + .../vectorsearch/SerializationUtils.java | 64 +++++++++++++++++++ .../lucene/sandbox/vectorsearch/Util.java | 1 - .../services/org.apache.lucene.codecs.Codec | 2 +- 11 files changed, 79 insertions(+), 17 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index be0004b72378..b636162ea96d 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -143,7 +143,7 @@ allprojects { ':lucene:codecs', ":lucene:distribution.tests", ":lucene:test-framework" - ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core,com.nvidia.cuvs') + ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core,com.nvidia.cuvs') // TODO: make this sandbox only def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties") def tempDir = layout.projectDirectory.dir(testsTmpDir.toString()) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 8b182a6e050c..822c06e1e431 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -22,7 +22,6 @@ requires org.apache.lucene.facet; requires java.logging; requires com.nvidia.cuvs; - requires org.apache.commons.lang3; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; @@ -41,10 +40,10 @@ provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; - // provides org.apache.lucene.codecs.KnnVectorsFormat with - // org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; - provides org.apache.lucene.codecs.Codec with - org.apache.lucene.sandbox.vectorsearch.CuVSCodec; + provides org.apache.lucene.codecs.KnnVectorsFormat with + org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; +// provides org.apache.lucene.codecs.Codec with +// org.apache.lucene.sandbox.vectorsearch.CuVSCodec; provides com.nvidia.cuvs.spi.CuVSServiceProvider with org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index e712d69c1ef1..66718be698be 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.index.FieldInfo; @@ -37,9 +38,9 @@ public 
CagraFieldVectorsWriter(FieldInfo fieldInfo) { @Override public long ramBytesUsed() { - return fieldName.getBytes(Charset.forName("UTF-8")).length + return fieldName.getBytes(StandardCharsets.UTF_8).length + Integer.BYTES - + (vectors.size() * fieldVectorDimension * Float.BYTES); + + ((long) vectors.size() * fieldVectorDimension * Float.BYTES); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index dfc224bf6309..525dfe0eeb00 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -25,7 +25,7 @@ import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; /** CuVS based KnnVectorsFormat for GPU acceleration */ -/*package-private*/ class CuVSVectorsFormat extends KnnVectorsFormat { +public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; public static final String VECTOR_DATA_EXTENSION = "cag"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index b93d7113036d..4b41ef7f3bb4 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -37,7 +37,6 @@ import java.util.stream.Stream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import org.apache.commons.lang3.SerializationUtils; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.index.ByteVectorValues; @@ -141,18 +140,18 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i switch (extension) { case "meta": { - maxDocs = (Map) SerializationUtils.deserialize(baos.toByteArray()); + maxDocs = SerializationUtils.deserialize(baos.toByteArray()); break; } case "vec": { vectors.put( - segmentField, (List) SerializationUtils.deserialize(baos.toByteArray())); + segmentField, SerializationUtils.deserialize(baos.toByteArray())); break; } case "map": { - List map = (List) SerializationUtils.deserialize(baos.toByteArray()); + List map = SerializationUtils.deserialize(baos.toByteArray()); mappings.put(segmentField, map); break; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 9de52248004f..3400d306cd49 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -30,7 +30,6 @@ import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; -import org.apache.commons.lang3.SerializationUtils; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsWriter; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java index 155d9301ab36..842fdde65dd2 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java @@ -23,7 +23,7 @@ import com.nvidia.cuvs.spi.CuVSProvider; import java.nio.file.Path; -public class FilterCuVSProvider implements CuVSProvider { +/*package-private*/ class FilterCuVSProvider implements CuVSProvider { private final CuVSProvider delegate; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java index 65dbf5d14737..eeb7b6895aa3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java @@ -19,6 +19,7 @@ import com.nvidia.cuvs.spi.CuVSProvider; import com.nvidia.cuvs.spi.CuVSServiceProvider; +/** A provider that creates instances of FilterCuVSProvider. */ public class FilterCuVSServiceProvider extends CuVSServiceProvider { @Override public CuVSProvider get(CuVSProvider builtinProvider) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java new file mode 100644 index 000000000000..5eaf12d83a24 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.Objects; + +/*package-private*/ class SerializationUtils { + + static byte[] serialize(final Serializable obj) { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(64 * 1024); + serialize(obj, baos); + return baos.toByteArray(); + } + + static void serialize(final Serializable obj, final OutputStream outputStream) { + Objects.requireNonNull(outputStream); + try (ObjectOutputStream out = new ObjectOutputStream(outputStream)) { + out.writeObject(obj); + } catch (final IOException ex) { + throw new UncheckedIOException(ex); + } + } + + static T deserialize(final byte[] objectData) { + Objects.requireNonNull(objectData); + return deserialize(new ByteArrayInputStream(objectData)); + } + + static T deserialize(final InputStream inputStream) { + Objects.requireNonNull(inputStream); + try (ObjectInputStream in = new ObjectInputStream(inputStream)) { + @SuppressWarnings("unchecked") + final T obj = (T) in.readObject(); + return obj; + } catch (IOException ex) { + throw new UncheckedIOException(ex); + } catch (ClassNotFoundException ex) { + throw new AssertionError(ex); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index a19e7d4681a5..ba980777b2df 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -23,7 +23,6 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import org.apache.commons.lang3.SerializationUtils; /** Some Utils used in CuVS integration */ /*package-private*/ class Util { diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 6f0a89e365d1..d039758f2603 100644 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file +#org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file From e62112ec6235afec2a6f71aff5cd73a25be0b1c8 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 10:00:12 +0000 Subject: [PATCH 14/34] tidy --- .../vectorsearch/SerializationUtils.java | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java index 5eaf12d83a24..a46db32afea9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java @@ -29,36 +29,36 @@ /*package-private*/ class SerializationUtils { - static byte[] serialize(final Serializable obj) { - final ByteArrayOutputStream baos = new ByteArrayOutputStream(64 * 1024); - serialize(obj, baos); - return baos.toByteArray(); - } + static byte[] serialize(final Serializable obj) { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(64 * 1024); + serialize(obj, baos); + return baos.toByteArray(); + } - static void serialize(final Serializable obj, final OutputStream outputStream) { - Objects.requireNonNull(outputStream); - try (ObjectOutputStream out = new ObjectOutputStream(outputStream)) { - out.writeObject(obj); - } catch (final IOException ex) { - throw new UncheckedIOException(ex); - } + static void serialize(final Serializable obj, final OutputStream outputStream) { + Objects.requireNonNull(outputStream); + try (ObjectOutputStream out = new ObjectOutputStream(outputStream)) { + out.writeObject(obj); + } catch (final IOException ex) { + throw new UncheckedIOException(ex); } + } - static T deserialize(final byte[] objectData) { - Objects.requireNonNull(objectData); - return deserialize(new ByteArrayInputStream(objectData)); - } + static T deserialize(final byte[] objectData) { + Objects.requireNonNull(objectData); + return deserialize(new ByteArrayInputStream(objectData)); + } - static T deserialize(final InputStream inputStream) { - Objects.requireNonNull(inputStream); - try (ObjectInputStream in = new ObjectInputStream(inputStream)) { - @SuppressWarnings("unchecked") - final T obj = (T) in.readObject(); - return obj; - } catch (IOException ex) { - throw new UncheckedIOException(ex); - } catch (ClassNotFoundException ex) { - throw new AssertionError(ex); - } + static T deserialize(final InputStream inputStream) { + Objects.requireNonNull(inputStream); + try (ObjectInputStream in = new ObjectInputStream(inputStream)) { + @SuppressWarnings("unchecked") + final T obj = (T) in.readObject(); + return obj; + } catch (IOException ex) { + throw new UncheckedIOException(ex); + } catch (ClassNotFoundException ex) { + throw new AssertionError(ex); } + } } From 349c7aa30aa047c56809e57e95243f371a91a2e8 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 10:02:13 +0000 Subject: [PATCH 15/34] expose knn format and update test --- lucene/sandbox/src/java/module-info.java | 4 +--- .../vectorsearch/CagraFieldVectorsWriter.java | 1 - .../vectorsearch/CuVSVectorsFormat.java | 18 +++++++++--------- .../vectorsearch/CuVSVectorsReader.java | 3 +-- .../services/org.apache.lucene.codecs.Codec | 16 ---------------- .../lucene/sandbox/vectorsearch/TestCuVS.java | 6 +++--- 6 files changed, 14 insertions(+), 34 deletions(-) delete mode 
100644 lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 822c06e1e431..59e89cfd0bf0 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -40,10 +40,8 @@ provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; - provides org.apache.lucene.codecs.KnnVectorsFormat with + provides org.apache.lucene.codecs.KnnVectorsFormat with org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; -// provides org.apache.lucene.codecs.Codec with -// org.apache.lucene.sandbox.vectorsearch.CuVSCodec; provides com.nvidia.cuvs.spi.CuVSServiceProvider with org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index 66718be698be..183b3c87d431 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -17,7 +17,6 @@ package org.apache.lucene.sandbox.vectorsearch; import java.io.IOException; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.codecs.KnnFieldVectorsWriter; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 525dfe0eeb00..ef8e206fcd48 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -31,6 +31,10 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_EXTENSION = "cag"; public static final String META_EXTENSION = "cagmf"; public static final int VERSION_CURRENT = 0; + public static final int DEFAULT_WRITER_THREADS = 1; + public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; + public static final int DEFAULT_GRAPH_DEGREE = 64; + public final int maxDimensions = 4096; public final int cuvsWriterThreads; public final int intGraphDegree; @@ -39,15 +43,11 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static CuVSResources resources; public CuVSVectorsFormat() { - super("CuVSVectorsFormat"); - this.cuvsWriterThreads = 1; - this.intGraphDegree = 128; - this.graphDegree = 64; - try { - resources = CuVSResources.create(); - } catch (Throwable e) { - throw new RuntimeException(e); - } + this( + DEFAULT_WRITER_THREADS, + DEFAULT_INTERMEDIATE_GRAPH_DEGREE, + DEFAULT_GRAPH_DEGREE, + MergeStrategy.NON_TRIVIAL_MERGE); } public CuVSVectorsFormat( diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 4b41ef7f3bb4..0afbe18b278e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -145,8 +145,7 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i } case "vec": { - vectors.put( - segmentField, 
SerializationUtils.deserialize(baos.toByteArray())); + vectors.put(segmentField, SerializationUtils.deserialize(baos.toByteArray())); break; } case "map": diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec deleted file mode 100644 index d039758f2603..000000000000 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index 70325a3aa294..dd013eed547a 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -66,7 +66,7 @@ public class TestCuVS extends LuceneTestCase { public static void beforeClass() throws Exception { directory = newDirectory(); - Codec codec = new CuVSCodec(); + Codec codec = TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); RandomIndexWriter writer = new RandomIndexWriter( @@ -105,8 +105,8 @@ public static void beforeClass() throws Exception { @AfterClass public static void afterClass() throws Exception { - reader.close(); - directory.close(); + if (reader != null) reader.close(); + if (directory != null) directory.close(); searcher = null; reader = null; directory = null; From 8d8db0b87c1ed02bcfc2f9aed9579c8eb7ac90b8 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 11:52:15 +0000 Subject: [PATCH 16/34] fix initialization of cuvSResources --- .../vectorsearch/CuVSVectorsFormat.java | 57 +++++++++++++++---- .../lucene/sandbox/vectorsearch/TestCuVS.java | 20 +++---- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index ef8e206fcd48..1a20913f312b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -19,6 +19,7 @@ import com.nvidia.cuvs.CuVSResources; import com.nvidia.cuvs.LibraryException; import java.io.IOException; +import java.util.logging.Logger; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -27,6 +28,8 @@ /** CuVS based KnnVectorsFormat for GPU acceleration */ public class CuVSVectorsFormat extends 
KnnVectorsFormat { + private static final Logger LOG = Logger.getLogger(CuVSVectorsFormat.class.getName()); + public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; public static final String VECTOR_DATA_EXTENSION = "cag"; public static final String META_EXTENSION = "cagmf"; @@ -35,12 +38,13 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public static final int DEFAULT_GRAPH_DEGREE = 64; - public final int maxDimensions = 4096; - public final int cuvsWriterThreads; - public final int intGraphDegree; - public final int graphDegree; - public MergeStrategy mergeStrategy; - public static CuVSResources resources; + static CuVSResources resources = cuVSResourcesOrNull(); + + final int maxDimensions = 4096; + final int cuvsWriterThreads; + final int intGraphDegree; + final int graphDegree; + final MergeStrategy mergeStrategy; public CuVSVectorsFormat() { this( @@ -58,23 +62,44 @@ public CuVSVectorsFormat( this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; + } + + private static CuVSResources cuVSResourcesOrNull() { try { resources = CuVSResources.create(); - } catch (LibraryException ex) { - throw ex; - } catch (Throwable e) { - throw new RuntimeException(e); + return resources; + } catch (UnsupportedOperationException uoe) { + LOG.warning("cuvs is not supported on this platform or java version"); + } catch (Throwable t) { + if (t instanceof ExceptionInInitializerError ex) { + t = ex.getCause(); + } + LOG.warning("Exception occurred during creation of cuvs resources. " + t); + } + return null; + } + + /** Tells whether the platform supports cuvs. */ + public static boolean supported() { + return resources != null; + } + + private static void checkSupported() { + if (!supported()) { + throw new UnsupportedOperationException(); } } @Override public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + checkSupported(); return new CuVSVectorsWriter( state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); } @Override public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException { + checkSupported(); try { return new CuVSVectorsReader(state, resources); } catch (Throwable e) { @@ -86,4 +111,16 @@ public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException public int getMaxDimensions(String fieldName) { return maxDimensions; } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("CuVSVectorsFormat("); + sb.append("cuvsWriterThreads=").append(cuvsWriterThreads); + sb.append(", intGraphDegree=").append(intGraphDegree); + sb.append(", graphDegree=").append(graphDegree); + sb.append(", mergeStrategy=").append(mergeStrategy); + sb.append(", resources=").append(resources); + sb.append(")"); + return sb.toString(); + } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index dd013eed547a..57be29050441 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -51,23 +51,23 @@ public class TestCuVS extends LuceneTestCase { protected static Logger log = Logger.getLogger(TestCuVS.class.getName()); - private static IndexSearcher searcher; - private static IndexReader reader; - private static Directory
directory; + static final Codec codec = TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + static IndexSearcher searcher; + static IndexReader reader; + static Directory directory; - public static int DATASET_SIZE_LIMIT = 1000; - public static int DIMENSIONS_LIMIT = 2048; - public static int NUM_QUERIES_LIMIT = 10; - public static int TOP_K_LIMIT = 64; // TODO This fails beyond 64 + static int DATASET_SIZE_LIMIT = 1000; + static int DIMENSIONS_LIMIT = 2048; + static int NUM_QUERIES_LIMIT = 10; + static int TOP_K_LIMIT = 64; // TODO This fails beyond 64 - public static float[][] dataset = null; + public static float[][] dataset; @BeforeClass public static void beforeClass() throws Exception { + assumeTrue("cuvs not supported", CuVSVectorsFormat.supported()); directory = newDirectory(); - Codec codec = TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); - RandomIndexWriter writer = new RandomIndexWriter( random(), From c9d454d5b91f8862e4ab803d19e40a53603fc84e Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 12:51:03 +0000 Subject: [PATCH 17/34] add CuVSVectorsFormat test --- .../vectorsearch/TestCuVSVectorsFormat.java | 42 +++++++++++++++++++ .../index/BaseKnnVectorsFormatTestCase.java | 17 ++++++-- 2 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java new file mode 100644 index 000000000000..ae5b2403a3e5 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.util.List; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.BeforeClass; + +public class TestCuVSVectorsFormat extends BaseKnnVectorsFormatTestCase { + + @BeforeClass + public static void beforeClass() { + assumeTrue("cuvs is not supported", CuVSVectorsFormat.supported()); + } + + @Override + protected Codec getCodec() { + return TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + } + + @Override + protected List supportedVectorEncodings() { + return List.of(VectorEncoding.FLOAT32); + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 97b578e7c5cd..ed1a76133968 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -296,6 +296,7 @@ public KnnVectorsFormat knnVectorsFormat() { } public void testMergingWithDifferentByteKnnFields() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); try (var dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(); Codec codec = getCodec(); @@ -994,6 +995,7 @@ public void testFloatVectorScorerIteration() throws Exception { } public void testByteVectorScorerIteration() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); if (random().nextBoolean()) { iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); @@ -1081,6 +1083,7 @@ public void testEmptyFloatVectorData() throws Exception { } public void testEmptyByteVectorData() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { var doc1 = new Document(); @@ -1112,11 +1115,16 @@ protected VectorSimilarityFunction randomSimilarity() { } /** - * This method is overrideable since old codec versions only support {@link - * VectorEncoding#FLOAT32}. + * The vector encodings supported by the format. Defaults to all VectorEncoding.values(). Override + * if the format only supports a subset of these encodings. */ + protected List supportedVectorEncodings() { + return Arrays.stream(VectorEncoding.values()).toList(); + } + protected VectorEncoding randomVectorEncoding() { - return VectorEncoding.values()[random().nextInt(VectorEncoding.values().length)]; + var encodings = supportedVectorEncodings().toArray(VectorEncoding[]::new); + return encodings[random().nextInt(encodings.length)]; } public void testIndexedValueNotAliased() throws Exception { @@ -1193,6 +1201,7 @@ public void testSortedIndex() throws Exception { } public void testSortedIndexBytes() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); String fieldName = "field"; @@ -1361,6 +1370,7 @@ public void testRandom() throws Exception { * back consistently. 
*/ public void testRandomBytes() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); if (random().nextBoolean()) { iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); } @@ -1875,6 +1885,7 @@ public void testVectorValuesReportCorrectDocs() throws Exception { } public void testMismatchedFields() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); Directory dir1 = newDirectory(); IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); Document doc = new Document(); From ab6beaedb2a16c3718574193f2918515242dd32c Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 12:55:16 +0000 Subject: [PATCH 18/34] fix testWriterRamEstimate --- .../vectorsearch/CuVSVectorsWriter.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 3400d306cd49..3f8301e68119 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; + import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.BruteForceIndexParams; import com.nvidia.cuvs.CagraIndex; @@ -45,6 +47,8 @@ /** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ /*package-private*/ class CuVSVectorsWriter extends KnnVectorsWriter { + private static final long SHALLOW_RAM_BYTES_USED = shallowSizeOfInstance(CuVSVectorsWriter.class); + // protected Logger log = Logger.getLogger(getClass().getName()); private List<CagraFieldVectorsWriter> fieldVectorWriters = new ArrayList<>(); @@ -90,11 +94,6 @@ public CuVSVectorsWriter( CuVSVectorsFormat.VECTOR_DATA_EXTENSION); } - @Override - public long ramBytesUsed() { - return 0; - } - @Override public void close() throws IOException { IOUtils.close(cuVSIndex); @@ -367,6 +366,15 @@ public void finish() throws IOException { } } + @Override + public long ramBytesUsed() { + long total = SHALLOW_RAM_BYTES_USED; + for (var field : fieldVectorWriters) { + total += field.ramBytesUsed(); + } + return total; + } + /** OutputStream for writing into an IndexOutput */ public class SegmentOutputStream extends OutputStream { From 98686ecb27fa79a21926c90648bfba59371eb2b8 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 13:04:36 +0000 Subject: [PATCH 19/34] This change rewrites the cuVS format implementation. After the rewrite all the BaseKnnVectorsFormatTestCase tests pass. There are still some lurking intermittent failures, but the tests pass successfully the majority of the time. Summary of the most significant changes: 1. Use the flat vectors reader/writer to support the raw float32 vectors and ordinal to docId mapping. This is similar to how HNSW is supported in Lucene, and keeps the code aligned with how other formats are layered atop each other. 2. The cuVS indices (Cagra, brute force, and HNSW) are stored directly in the format, so they can be mmap'ed directly. 3. Merges are physical: all raw vectors are retrieved and used to create new cuVS indices. 4. A standard KnnCollector is used; there is no need for a special one for cuVS, unless one wants to customise some very specific parameters. A number of workarounds have been put in place, which will eventually be lifted: 1. Pre-filter and deleted docs oversample the topK, since cuvs-java does not yet support a pre-filter. 2. Ignore Cagra failures when indexing small numbers of docs, failing over to just brute force.
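To make the first workaround concrete: when deleted docs or a filter are present, the search requests k * FILTER_OVER_SAMPLE candidates from the GPU index and post-filters them on the CPU, letting the collector's heap keep the best k. A minimal sketch of the collection step (the helper and its candidates map are illustrative only, not code from this patch; the 1/(1+distance) normalization follows the reader's getScoreNormalizationFunc):

    import java.util.Map;
    import org.apache.lucene.search.KnnCollector;
    import org.apache.lucene.util.Bits;

    final class OverSampleSketch {
      // Collect an oversampled candidate set (docId -> raw distance) into a
      // standard KnnCollector, dropping docs rejected by acceptDocs. Callers
      // are assumed to have asked the GPU index for k * FILTER_OVER_SAMPLE
      // candidates whenever acceptDocs is non-null.
      static void collect(Map<Integer, Float> candidates, Bits acceptDocs, KnnCollector collector) {
        for (Map.Entry<Integer, Float> e : candidates.entrySet()) {
          int docId = e.getKey();
          if (acceptDocs != null && acceptDocs.get(docId) == false) {
            continue; // post-filter: cuvs-java cannot yet pre-filter on the GPU
          }
          float score = 1f / (1f + e.getValue()); // distance -> similarity-style score
          collector.collect(docId, score);
        }
      }
    }

Oversampling trades extra GPU candidates for correctness under filtering; it is a stopgap until cuvs-java supports pre-filtering natively.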
--- .../vectorsearch/CagraFieldVectorsWriter.java | 54 -- .../sandbox/vectorsearch/CuVSFieldWriter.java | 80 +++ .../sandbox/vectorsearch/CuVSIndex.java | 32 +- .../vectorsearch/CuVSVectorsFormat.java | 42 +- .../vectorsearch/CuVSVectorsReader.java | 621 +++++++++++------- .../vectorsearch/CuVSVectorsWriter.java | 579 ++++++++-------- .../vectorsearch/IndexInputInputStream.java | 60 ++ .../vectorsearch/IndexOutputOutputStream.java | 70 ++ .../vectorsearch/SegmentInputStream.java | 105 --- .../vectorsearch/SerializationUtils.java | 64 -- .../lucene/sandbox/vectorsearch/Util.java | 82 --- .../lucene/sandbox/vectorsearch/TestCuVS.java | 4 +- .../vectorsearch/TestCuVSVectorsFormat.java | 89 +++ .../TestIndexOutputOutputStream.java | 102 +++ 14 files changed, 1134 insertions(+), 850 deletions(-) delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java deleted file mode 100644 index 183b3c87d431..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -package org.apache.lucene.sandbox.vectorsearch; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.ConcurrentHashMap; -import org.apache.lucene.codecs.KnnFieldVectorsWriter; -import org.apache.lucene.index.FieldInfo; - -/** CuVS based fields writer */ -/*package-private*/ class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { - - public final String fieldName; - public final ConcurrentHashMap vectors = - new ConcurrentHashMap(); - public int fieldVectorDimension = -1; - - public CagraFieldVectorsWriter(FieldInfo fieldInfo) { - this.fieldName = fieldInfo.getName(); - this.fieldVectorDimension = fieldInfo.getVectorDimension(); - } - - @Override - public long ramBytesUsed() { - return fieldName.getBytes(StandardCharsets.UTF_8).length - + Integer.BYTES - + ((long) vectors.size() * fieldVectorDimension * Float.BYTES); - } - - @Override - public void addValue(int docID, float[] vectorValue) throws IOException { - vectors.put(docID, vectorValue); - } - - @Override - public float[] copyValue(float[] vectorValue) { - throw new UnsupportedOperationException(); - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java new file mode 100644 index 000000000000..61b8f0879202 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; +import org.apache.lucene.index.DocsWithFieldSet; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.RamUsageEstimator; + +/** CuVS based fields writer */ +/*package-private*/ class CuVSFieldWriter extends KnnFieldVectorsWriter { + + private static final long SHALLOW_SIZE = + RamUsageEstimator.shallowSizeOfInstance(CuVSFieldWriter.class); + + private final FieldInfo fieldInfo; + private final FlatFieldVectorsWriter flatFieldVectorsWriter; + private int lastDocID = -1; + + public CuVSFieldWriter( + FieldInfo fieldInfo, FlatFieldVectorsWriter flatFieldVectorsWriter) { + this.fieldInfo = fieldInfo; + this.flatFieldVectorsWriter = flatFieldVectorsWriter; + } + + @Override + public void addValue(int docID, float[] vectorValue) throws IOException { + if (docID == lastDocID) { + throw new IllegalArgumentException( + "VectorValuesField \"" + + fieldInfo.name + + "\" appears more than once in this document (only one value is allowed per field)"); + } + flatFieldVectorsWriter.addValue(docID, vectorValue); + } + + List getVectors() { + return flatFieldVectorsWriter.getVectors(); + } + + FieldInfo fieldInfo() { + return fieldInfo; + } + + DocsWithFieldSet getDocsWithFieldSet() { + return flatFieldVectorsWriter.getDocsWithFieldSet(); + } + + @Override + public float[] copyValue(float[] vectorValue) { + throw new UnsupportedOperationException(); + } + + @Override + public long ramBytesUsed() { + return SHALLOW_SIZE + flatFieldVectorsWriter.ramBytesUsed(); + } + + @Override + public String toString() { + return "CuVSFieldWriter[field name=" + fieldInfo.name + ", number=" + fieldInfo.number + "]"; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 7b8c19996195..0356d53780d1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -18,38 +18,40 @@ import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.CagraIndex; -import java.util.List; +import com.nvidia.cuvs.HnswIndex; import java.util.Objects; /** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) 
*/ -/*package-private*/ class CuVSIndex { +public class CuVSIndex { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; - private final List mapping; - private final List vectors; - private final int maxDocs; + private final HnswIndex hnswIndex; - private final String fieldName; - private final String segmentName; + private int maxDocs; + private String fieldName; + private String segmentName; public CuVSIndex( String segmentName, String fieldName, CagraIndex cagraIndex, - List mapping, - List vectors, int maxDocs, BruteForceIndex bruteforceIndex) { this.cagraIndex = Objects.requireNonNull(cagraIndex); this.bruteforceIndex = Objects.requireNonNull(bruteforceIndex); - this.mapping = Objects.requireNonNull(mapping); - this.vectors = Objects.requireNonNull(vectors); this.fieldName = Objects.requireNonNull(fieldName); this.segmentName = Objects.requireNonNull(segmentName); if (maxDocs < 0) { throw new IllegalArgumentException("negative maxDocs:" + maxDocs); } this.maxDocs = maxDocs; + this.hnswIndex = null; // TODO: + } + + public CuVSIndex(CagraIndex cagraIndex, BruteForceIndex bruteforceIndex, HnswIndex hnswIndex) { + this.cagraIndex = cagraIndex; + this.bruteforceIndex = bruteforceIndex; + this.hnswIndex = hnswIndex; } public CagraIndex getCagraIndex() { @@ -60,18 +62,14 @@ public BruteForceIndex getBruteforceIndex() { return bruteforceIndex; } - public List getMapping() { - return mapping; + public HnswIndex getHNSWIndex() { + return hnswIndex; } public String getFieldName() { return fieldName; } - public List getVectors() { - return vectors; - } - public String getSegmentName() { return segmentName; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 1a20913f312b..0e839bafe792 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -21,6 +21,9 @@ import java.io.IOException; import java.util.logging.Logger; import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; @@ -30,16 +33,29 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { private static final Logger LOG = Logger.getLogger(CuVSVectorsFormat.class.getName()); - public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; - public static final String VECTOR_DATA_EXTENSION = "cag"; - public static final String META_EXTENSION = "cagmf"; - public static final int VERSION_CURRENT = 0; + // TODO: fix Lucene version in name, to the final targeted release, if any + static final String CUVS_META_CODEC_NAME = "Lucene102CuVSVectorsFormatMeta"; + static final String CUVS_META_CODEC_EXT = "vemc"; // ""cagmf"; + static final String CUVS_INDEX_CODEC_NAME = "Lucene102CuVSVectorsFormatIndex"; + static final String CUVS_INDEX_EXT = "vcag"; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + public static final int DEFAULT_WRITER_THREADS = 1; public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public 
static final int DEFAULT_GRAPH_DEGREE = 64; + // The minimum number of vectors in the dataset required before + // we attempt to build a Cagra index + static final int MIN_CAGRA_INDEX_SIZE = 2; + static CuVSResources resources = cuVSResourcesOrNull(); + /** The format for storing, reading, and merging raw vectors on disk. */ + private static final FlatVectorsFormat flatVectorsFormat = + new Lucene99FlatVectorsFormat(DefaultFlatVectorScorer.INSTANCE); + final int maxDimensions = 4096; final int cuvsWriterThreads; final int intGraphDegree; @@ -69,7 +85,7 @@ private static CuVSResources cuVSResourcesOrNull() { resources = CuVSResources.create(); return resources; } catch (UnsupportedOperationException uoe) { - LOG.warning("cuvs is not supported on this platform or java version"); + LOG.warning("cuvs is not supported on this platform or java version: " + uoe.getMessage()); } catch (Throwable t) { if (t instanceof ExceptionInInitializerError ex) { t = ex.getCause(); @@ -93,18 +109,22 @@ private static void checkSupported() { @Override public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { checkSupported(); + var flatWriter = flatVectorsFormat.fieldsWriter(state); return new CuVSVectorsWriter( - state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); + state, + cuvsWriterThreads, + intGraphDegree, + graphDegree, + mergeStrategy, + resources, + flatWriter); } @Override public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException { checkSupported(); - try { - return new CuVSVectorsReader(state, resources); - } catch (Throwable e) { - throw new RuntimeException(e); - } + var flatReader = flatVectorsFormat.fieldsReader(state); + return new CuVSVectorsReader(state, resources, flatReader); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 0afbe18b278e..07b44854f7c2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -16,6 +16,13 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_START; + import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.BruteForceQuery; import com.nvidia.cuvs.CagraIndex; @@ -24,207 +31,252 @@ import com.nvidia.cuvs.CuVSResources; import com.nvidia.cuvs.HnswIndex; import com.nvidia.cuvs.HnswIndexParams; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.lang.StackWalker.StackFrame; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Map.Entry; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; +import java.util.logging.Logger; import 
org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.hnsw.FlatVectorsReader; import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.TopKnnCollector; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.hnsw.IntToIntFunction; /** KnnVectorsReader instance associated with CuVS format */ -/*package-private*/ class CuVSVectorsReader extends KnnVectorsReader { - - // protected Logger log = Logger.getLogger(getClass().getName()); - - IndexInput vectorDataReader = null; - public String fileName = null; - public byte[] indexFileBytes; - public int[] docIds; - public float[] vectors; - public SegmentReadState segmentState = null; - public int indexFilePayloadSize = 0; - public long initialFilePointerLoc = 0; - public SegmentInputStream segmentInputStream; +public class CuVSVectorsReader extends KnnVectorsReader { - // Field to List of Indexes - public Map> cuvsIndexes; + @SuppressWarnings("unused") + private static final Logger log = Logger.getLogger(CuVSVectorsReader.class.getName()); - private CuVSResources resources; + private final CuVSResources resources; + private final FlatVectorsReader flatVectorsReader; // for reading the raw vectors + private final FieldInfos fieldInfos; + private final IntObjectHashMap fields; + private final IntObjectHashMap cuvsIndices; + private final IndexInput cuvsIndexInput; - public CuVSVectorsReader(SegmentReadState state, CuVSResources resources) throws Throwable { - - segmentState = state; + public CuVSVectorsReader( + SegmentReadState state, CuVSResources resources, FlatVectorsReader flatReader) + throws IOException { this.resources = resources; + this.flatVectorsReader = flatReader; + this.fieldInfos = state.fieldInfos; + this.fields = new IntObjectHashMap<>(); - fileName = + String metaFileName = IndexFileNames.segmentFileName( - state.segmentInfo.name, state.segmentSuffix, CuVSVectorsFormat.VECTOR_DATA_EXTENSION); - - vectorDataReader = segmentState.directory.openInput(fileName, segmentState.context); - CodecUtil.readIndexHeader(vectorDataReader); - - initialFilePointerLoc = vectorDataReader.getFilePointer(); - indexFilePayloadSize = - (int) vectorDataReader.length() - - (int) initialFilePointerLoc; // vectorMetaReader.readInt(); - segmentInputStream = - new SegmentInputStream(vectorDataReader, indexFilePayloadSize, initialFilePointerLoc); - // log.info("payloadSize: " + indexFilePayloadSize); - // log.info("initialFilePointerLoc: " + initialFilePointerLoc); - - List stackTrace = StackWalker.getInstance().walk(this::getStackTrace); - - boolean isMergeCase = false; - for (StackFrame s : stackTrace) { - if 
(s.toString().startsWith("org.apache.lucene.index.IndexWriter.merge")) { - isMergeCase = true; - // log.info("Reader opening on merge call"); - break; + state.segmentInfo.name, state.segmentSuffix, CUVS_META_CODEC_EXT); + boolean success = false; + int versionMeta = -1; + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { + Throwable priorException = null; + try { + versionMeta = + CodecUtil.checkIndexHeader( + meta, + CUVS_META_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + readFields(meta); + } catch (Throwable exception) { + priorException = exception; + } finally { + CodecUtil.checkFooter(meta, priorException); + } + var ioContext = state.context.withReadAdvice(ReadAdvice.SEQUENTIAL); + cuvsIndexInput = openCuVSInput(state, versionMeta, ioContext); + cuvsIndices = loadCuVSIndices(); + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); } } - - /*log.info( - "Source of this segment " - + segmentState.segmentSuffix - + " is " - + segmentState.segmentInfo.getDiagnostics().get(IndexWriter.SOURCE)); - log.info("Loading for " + segmentState.segmentInfo.name + ", mergeCase? " + isMergeCase); - log.info("Not the merge case, hence loading for " + segmentState.segmentInfo.name);*/ - this.cuvsIndexes = loadCuVSIndex(getIndexInputStream(), isMergeCase); } - @SuppressWarnings({"unchecked"}) - private Map> loadCuVSIndex(ZipInputStream zis, boolean isMergeCase) - throws Throwable { - Map> ret = new HashMap>(); - Map cagraIndexes = new HashMap(); - Map bruteforceIndexes = new HashMap(); - Map hnswIndexes = new HashMap(); - Map> mappings = new HashMap>(); - Map> vectors = new HashMap>(); - - Map maxDocs = null; // map of segment, maxDocs - ZipEntry ze; - while ((ze = zis.getNextEntry()) != null) { - String entry = ze.getName(); - - String segmentField = entry.split("\\.")[0]; - String extension = entry.split("\\.")[1]; - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - byte[] buffer = new byte[1024]; - int len = 0; - while ((len = zis.read(buffer)) != -1) { - baos.write(buffer, 0, len); + private static IndexInput openCuVSInput( + SegmentReadState state, int versionMeta, IOContext context) throws IOException { + String fileName = + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, CUVS_INDEX_EXT); + IndexInput in = state.directory.openInput(fileName, context); + boolean success = false; + try { + int versionVectorData = + CodecUtil.checkIndexHeader( + in, + CUVS_INDEX_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + checkVersion(versionMeta, versionVectorData, in); + CodecUtil.retrieveChecksum(in); + success = true; + return in; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(in); } + } + } + + private void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) { + int dimension = info.getVectorDimension(); + if (dimension != fieldEntry.dims()) { + throw new IllegalStateException( + "Inconsistent vector dimension for field=\"" + + info.name + + "\"; " + + dimension + + " != " + + fieldEntry.dims()); + } + } - switch (extension) { - case "meta": - { - maxDocs = SerializationUtils.deserialize(baos.toByteArray()); - break; - } - case "vec": - { - vectors.put(segmentField, SerializationUtils.deserialize(baos.toByteArray())); - break; - } - case "map": - { - List map = SerializationUtils.deserialize(baos.toByteArray()); - mappings.put(segmentField, 
map); - break; - } - case "cag": - { - cagraIndexes.put( - segmentField, - CagraIndex.newBuilder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .build()); - break; - } - case "bf": - { - bruteforceIndexes.put( - segmentField, - BruteForceIndex.newBuilder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .build()); - break; - } - case "hnsw": - { - HnswIndexParams indexParams = new HnswIndexParams.Builder().build(); - hnswIndexes.put( - segmentField, - HnswIndex.newBuilder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .withIndexParams(indexParams) - .build()); - break; - } + private void readFields(ChecksumIndexInput meta) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { + FieldInfo info = fieldInfos.fieldInfo(fieldNumber); + if (info == null) { + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); } + FieldEntry fieldEntry = readField(meta, info); + validateFieldEntry(info, fieldEntry); + fields.put(info.number, fieldEntry); + } + } + + // List of vector similarity functions. This list is defined here, in order + // to avoid an undesirable dependency on the declaration and order of values + // in VectorSimilarityFunction. The list values and order must be identical + // to that of {@link o.a.l.c.l.Lucene94FieldInfosFormat#SIMILARITY_FUNCTIONS}. + static final List SIMILARITY_FUNCTIONS = + List.of( + VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT); + + static VectorSimilarityFunction readSimilarityFunction(DataInput input) throws IOException { + int i = input.readInt(); + if (i < 0 || i >= SIMILARITY_FUNCTIONS.size()) { + throw new IllegalArgumentException("invalid distance function: " + i); + } + return SIMILARITY_FUNCTIONS.get(i); + } + + static VectorEncoding readVectorEncoding(DataInput input) throws IOException { + int encodingId = input.readInt(); + if (encodingId < 0 || encodingId >= VectorEncoding.values().length) { + throw new CorruptIndexException("Invalid vector encoding id: " + encodingId, input); } + return VectorEncoding.values()[encodingId]; + } - /*log.info("Loading cuvsIndexes from segment: " + segmentState.segmentInfo.name); - log.info("Diagnostics for this segment: " + segmentState.segmentInfo.getDiagnostics()); - log.info("Loading map of cagraIndexes: " + cagraIndexes); - log.info("Loading vectors: " + vectors); - log.info("Loading mapping: " + mappings);*/ - - for (String segmentField : cagraIndexes.keySet()) { - // log.info("Loading segmentField: " + segmentField); - String segment = segmentField.split("/")[0]; - String field = segmentField.split("/")[1]; - CuVSIndex cuvsIndex = - new CuVSIndex( - segment, - field, - cagraIndexes.get(segmentField), - mappings.get(segmentField), - vectors.get(segmentField), - maxDocs.get(segment), - bruteforceIndexes.get(segmentField)); - List listOfIndexes = - ret.containsKey(field) ? 
ret.get(field) : new ArrayList<CuVSIndex>();
-      listOfIndexes.add(cuvsIndex);
-      ret.put(field, listOfIndexes);
+  private FieldEntry readField(IndexInput input, FieldInfo info) throws IOException {
+    VectorEncoding vectorEncoding = readVectorEncoding(input);
+    VectorSimilarityFunction similarityFunction = readSimilarityFunction(input);
+    if (similarityFunction != info.getVectorSimilarityFunction()) {
+      throw new IllegalStateException(
+          "Inconsistent vector similarity function for field=\""
+              + info.name
+              + "\"; "
+              + similarityFunction
+              + " != "
+              + info.getVectorSimilarityFunction());
     }
-    return ret;
+    return FieldEntry.readEntry(input, vectorEncoding, info.getVectorSimilarityFunction());
   }
 
-  public List getStackTrace(Stream stackFrameStream) {
-    return stackFrameStream.collect(Collectors.toList());
+  private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
+    final FieldInfo info = fieldInfos.fieldInfo(field);
+    final FieldEntry fieldEntry;
+    if (info == null || (fieldEntry = fields.get(info.number)) == null) {
+      throw new IllegalArgumentException("field=\"" + field + "\" not found");
+    }
+    if (fieldEntry.vectorEncoding != expectedEncoding) {
+      throw new IllegalArgumentException(
+          "field=\""
+              + field
+              + "\" is encoded as: "
+              + fieldEntry.vectorEncoding
+              + " expected: "
+              + expectedEncoding);
+    }
+    return fieldEntry;
   }
 
-  public ZipInputStream getIndexInputStream() throws IOException {
-    segmentInputStream.reset();
-    return new ZipInputStream(segmentInputStream);
+  private IntObjectHashMap<CuVSIndex> loadCuVSIndices() throws IOException {
+    var indices = new IntObjectHashMap<CuVSIndex>();
+    for (var e : fields) {
+      var fieldEntry = e.value;
+      int fieldNumber = e.key;
+      var cuvsIndex = loadCuVSIndex(fieldEntry);
+      indices.put(fieldNumber, cuvsIndex);
+    }
+    return indices;
+  }
+
+  private CuVSIndex loadCuVSIndex(FieldEntry fieldEntry) throws IOException {
+    CagraIndex cagraIndex = null;
+    BruteForceIndex bruteForceIndex = null;
+    HnswIndex hnswIndex = null;
+
+    try {
+      long len = fieldEntry.cagraIndexLength();
+      if (len > 0) {
+        long off = fieldEntry.cagraIndexOffset();
+        try (var slice = cuvsIndexInput.slice("cagra index", off, len);
+            var in = new IndexInputInputStream(slice)) {
+          cagraIndex = CagraIndex.newBuilder(resources).from(in).build();
+        }
+      }
+
+      len = fieldEntry.bruteForceIndexLength();
+      if (len > 0) {
+        long off = fieldEntry.bruteForceIndexOffset();
+        try (var slice = cuvsIndexInput.slice("bf index", off, len);
+            var in = new IndexInputInputStream(slice)) {
+          bruteForceIndex = BruteForceIndex.newBuilder(resources).from(in).build();
+        }
+      }
+
+      len = fieldEntry.hnswIndexLength();
+      if (len > 0) {
+        long off = fieldEntry.hnswIndexOffset();
+        try (var slice = cuvsIndexInput.slice("hnsw index", off, len);
+            var in = new IndexInputInputStream(slice)) {
+          var params = new HnswIndexParams.Builder().build();
+          hnswIndex = HnswIndex.newBuilder(resources).withIndexParams(params).from(in).build();
+        }
+      }
+    } catch (Throwable t) {
+      handleThrowable(t);
+    }
+    return new CuVSIndex(cagraIndex, bruteForceIndex, hnswIndex);
   }
 
   @Override
   public void close() throws IOException {
-    IOUtils.close(vectorDataReader);
+    IOUtils.close(flatVectorsReader, cuvsIndexInput);
   }
 
   @Override
@@ -234,106 +286,189 @@ public void checkIntegrity() throws IOException {
 
   @Override
   public FloatVectorValues getFloatVectorValues(String field) throws IOException {
-    return new FloatVectorValues() {
-
-      @Override
-      public int size() {
-        return cuvsIndexes.get(field).get(0).getVectors().size();
-      }
+    return 
flatVectorsReader.getFloatVectorValues(field); + } - @Override - public int dimension() { - return cuvsIndexes.get(field).get(0).getVectors().get(0).length; - } + @Override + public ByteVectorValues getByteVectorValues(String field) { + throw new UnsupportedOperationException("byte vectors not supported"); + } - @Override - public float[] vectorValue(int pos) throws IOException { - return cuvsIndexes.get(field).get(0).getVectors().get(pos); - } + /** Native float to float function */ + public interface FloatToFloatFunction { + float apply(float v); + } - @Override - public FloatVectorValues copy() throws IOException { - return null; - } - }; + static long[] bitsToLongArray(Bits bits) { + if (bits instanceof FixedBitSet fixedBitSet) { + return fixedBitSet.getBits(); + } else { + return FixedBitSet.copyOf(bits).getBits(); + } } - @Override - public ByteVectorValues getByteVectorValues(String field) throws IOException { - throw new UnsupportedOperationException(); + static FloatToFloatFunction getScoreNormalizationFunc(VectorSimilarityFunction sim) { + // TODO: check for different similarities + return score -> (1f / (1f + score)); } + // This is a hack - replace with cuVS bugId/filter support + static final int FILTER_OVER_SAMPLE = 10; + @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - PerLeafCuVSKnnCollector cuvsCollector = - knnCollector instanceof PerLeafCuVSKnnCollector - ? ((PerLeafCuVSKnnCollector) knnCollector) - : new PerLeafCuVSKnnCollector(knnCollector.k(), knnCollector.k(), 1); - TopKnnCollector defaultCollector = - knnCollector instanceof TopKnnCollector ? ((TopKnnCollector) knnCollector) : null; - - int prevDocCount = 0; - - // log.debug("Will try to search all the indexes for segment "+segmentState.segmentInfo.name+", - // field "+field+": "+cuvsIndexes); - for (CuVSIndex cuvsIndex : cuvsIndexes.get(field)) { - try { - Map result = new HashMap(); - if (cuvsCollector.k() <= 1024) { - CagraSearchParams searchParams = - new CagraSearchParams.Builder(resources) - .withItopkSize(cuvsCollector.iTopK) - .withSearchWidth(cuvsCollector.searchWidth) - .build(); - - CagraQuery query = - new CagraQuery.Builder() - .withTopK(cuvsCollector.k()) - .withSearchParams(searchParams) - .withMapping(cuvsIndex.getMapping()) - .withQueryVectors(new float[][] {target}) - .build(); - - CagraIndex cagraIndex = cuvsIndex.getCagraIndex(); - assert (cagraIndex != null); - // log.info("k is " + cuvsCollector.k()); - result = - cagraIndex - .search(query) - .getResults() - .get(0); // List expected to have only one entry because of single query "target". 
-          // log.info("INTERMEDIATE RESULT FROM CUVS: " + result + ", prevDocCount=" +
-          // prevDocCount);
-        } else {
-          BruteForceQuery bruteforceQuery =
-              new BruteForceQuery.Builder()
-                  .withQueryVectors(new float[][] {target})
-                  .withPrefilter(((FixedBitSet) acceptDocs).getBits())
-                  .withTopK(cuvsCollector.k())
-                  .build();
-
-          BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex();
-          result = bruteforceIndex.search(bruteforceQuery).getResults().get(0);
-        }
+    var fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
+    if (fieldEntry.count() == 0 || knnCollector.k() == 0) {
+      return;
+    }
 
-        for (Entry<Integer, Float> kv : result.entrySet()) {
-          if (defaultCollector != null) {
-            defaultCollector.collect(prevDocCount + kv.getKey(), kv.getValue());
-          }
-          cuvsCollector.collect(prevDocCount + kv.getKey(), kv.getValue());
-        }
+    var fieldNumber = fieldInfos.fieldInfo(field).number;
+    // log.info("fieldNumber=" + fieldNumber + ", fieldEntry.count()=" + fieldEntry.count());
+
+    CuVSIndex cuvsIndex = cuvsIndices.get(fieldNumber);
+    if (cuvsIndex == null) {
+      throw new IllegalStateException("no index found for field: " + field);
+    }
 
-      } catch (Throwable e) {
-        throw new RuntimeException(e);
+    int collectorTopK = knnCollector.k();
+    if (acceptDocs != null) {
+      collectorTopK = knnCollector.k() * FILTER_OVER_SAMPLE;
+    }
+    final int topK = Math.min(collectorTopK, fieldEntry.count());
+
+    Map<Integer, Float> result;
+    if (knnCollector.k() <= 1024 && cuvsIndex.getCagraIndex() != null) {
+      // log.info("searching cagra index");
+      CagraSearchParams searchParams =
+          new CagraSearchParams.Builder(resources)
+              .withItopkSize(topK) // TODO: params
+              .withSearchWidth(1)
+              .build();
+
+      var query =
+          new CagraQuery.Builder()
+              .withTopK(topK)
+              .withSearchParams(searchParams)
+              .withMapping(null)
+              .withQueryVectors(new float[][] {target})
+              .build();
+
+      CagraIndex cagraIndex = cuvsIndex.getCagraIndex();
+      List<Map<Integer, Float>> searchResult = null;
+      try {
+        searchResult = cagraIndex.search(query).getResults();
+      } catch (Throwable t) {
+        handleThrowable(t);
+      }
+      // List expected to have only one entry because of single query "target". 
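+      // (handleThrowable rethrows on every path, so searchResult cannot be null here.)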
+      assert searchResult.size() == 1;
+      result = searchResult.getFirst();
+    } else {
+      BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex();
+      assert bruteforceIndex != null;
+      // log.info("searching brute index, with actual topK=" + topK);
+      var queryBuilder =
+          new BruteForceQuery.Builder().withQueryVectors(new float[][] {target}).withTopK(topK);
+      BruteForceQuery query = queryBuilder.build();
+
+      List<Map<Integer, Float>> searchResult = null;
+      try {
+        searchResult = bruteforceIndex.search(query).getResults();
+      } catch (Throwable t) {
+        handleThrowable(t);
+      }
+      assert searchResult.size() == 1;
+      result = searchResult.getFirst();
+    }
+    assert result != null;
+
+    final var rawValues = flatVectorsReader.getFloatVectorValues(field);
+    final Bits acceptedOrds = rawValues.getAcceptOrds(acceptDocs);
+    final var ordToDocFunction = (IntToIntFunction) rawValues::ordToDoc;
+    final var scoreCorrectionFunction = getScoreNormalizationFunc(fieldEntry.similarityFunction);
+
+    for (var entry : result.entrySet()) {
+      int ord = entry.getKey();
+      float score = entry.getValue();
+      if (acceptedOrds == null || acceptedOrds.get(ord)) {
+        if (knnCollector.earlyTerminated()) {
+          break;
+        }
+        assert ord >= 0 : "unexpected ord: " + ord;
+        int doc = ordToDocFunction.apply(ord);
+        float correctedScore = scoreCorrectionFunction.apply(score);
+        knnCollector.incVisitedCount(1);
+        knnCollector.collect(doc, correctedScore);
       }
-      prevDocCount += cuvsIndex.getMaxDocs();
     }
   }
 
   @Override
   public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
       throws IOException {
-    throw new UnsupportedOperationException();
+    throw new UnsupportedOperationException("byte vectors not supported");
+  }
+
+  record FieldEntry(
+      VectorEncoding vectorEncoding,
+      VectorSimilarityFunction similarityFunction,
+      int dims,
+      int count,
+      long cagraIndexOffset,
+      long cagraIndexLength,
+      long bruteForceIndexOffset,
+      long bruteForceIndexLength,
+      long hnswIndexOffset,
+      long hnswIndexLength) {
+
+    static FieldEntry readEntry(
+        IndexInput input,
+        VectorEncoding vectorEncoding,
+        VectorSimilarityFunction similarityFunction)
+        throws IOException {
+      var dims = input.readInt();
+      var count = input.readInt();
+      var cagraIndexOffset = input.readVLong();
+      var cagraIndexLength = input.readVLong();
+      var bruteForceIndexOffset = input.readVLong();
+      var bruteForceIndexLength = input.readVLong();
+      var hnswIndexOffset = input.readVLong();
+      var hnswIndexLength = input.readVLong();
+      return new FieldEntry(
+          vectorEncoding,
+          similarityFunction,
+          dims,
+          count,
+          cagraIndexOffset,
+          cagraIndexLength,
+          bruteForceIndexOffset,
+          bruteForceIndexLength,
+          hnswIndexOffset,
+          hnswIndexLength);
+    }
+  }
+
+  static void checkVersion(int versionMeta, int versionVectorData, IndexInput in)
+      throws CorruptIndexException {
+    if (versionMeta != versionVectorData) {
+      throw new CorruptIndexException(
+          "Format versions mismatch: meta="
+              + versionMeta
+              + ", "
+              + CUVS_META_CODEC_NAME
+              + "="
+              + versionVectorData,
+          in);
+    }
+  }
+
+  static void handleThrowable(Throwable t) throws IOException {
+    switch (t) {
+      case IOException ioe -> throw ioe;
+      case Error error -> throw error;
+      case RuntimeException re -> throw re;
+      case null, default -> throw new RuntimeException("UNEXPECTED: exception type", t);
+    }
   }
 }
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java
index 3f8301e68119..013ee0f40433 100644
--- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -16,6 +16,15 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS; +import static org.apache.lucene.index.VectorEncoding.FLOAT32; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.MIN_CAGRA_INDEX_SIZE; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; import com.nvidia.cuvs.BruteForceIndex; @@ -24,47 +33,55 @@ import com.nvidia.cuvs.CagraIndexParams; import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; import com.nvidia.cuvs.CuVSResources; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Duration; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; +import java.util.Objects; +import java.util.logging.Logger; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; +import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Sorter; import org.apache.lucene.index.Sorter.DocMap; +import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.SuppressForbidden; /** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ -/*package-private*/ class CuVSVectorsWriter extends KnnVectorsWriter { +public class CuVSVectorsWriter extends KnnVectorsWriter { private static final long SHALLOW_RAM_BYTES_USED = shallowSizeOfInstance(CuVSVectorsWriter.class); - // protected Logger log = Logger.getLogger(getClass().getName()); - - private List fieldVectorWriters = new ArrayList<>(); - private IndexOutput cuVSIndex = null; - private SegmentWriteState segmentWriteState = null; - private String cuVSDataFilename = null; - - private CagraIndex cagraIndex; - private CagraIndex cagraIndexForHnsw; + @SuppressWarnings("unused") + private static final Logger log = Logger.getLogger(CuVSVectorsWriter.class.getName()); private final int cuvsWriterThreads; private final int intGraphDegree; private final int graphDegree; + + @SuppressWarnings("unused") private final MergeStrategy mergeStrategy; + private final CuVSResources resources; + private final FlatVectorsWriter flatVectorsWriter; 
// for writing the raw vectors
+  private final List<CuVSFieldWriter> fields = new ArrayList<>();
+  private final IndexOutput meta, cuvsIndex;
+  private boolean finished;
+
   /** Merge strategy used for CuVS */
   public enum MergeStrategy {
     TRIVIAL_MERGE,
@@ -77,337 +94,353 @@ public CuVSVectorsWriter(
       int intGraphDegree,
       int graphDegree,
       MergeStrategy mergeStrategy,
-      CuVSResources resources)
+      CuVSResources resources,
+      FlatVectorsWriter flatVectorsWriter)
       throws IOException {
     super();
-    this.segmentWriteState = state;
     this.mergeStrategy = mergeStrategy;
     this.cuvsWriterThreads = cuvsWriterThreads;
     this.intGraphDegree = intGraphDegree;
     this.graphDegree = graphDegree;
     this.resources = resources;
+    this.flatVectorsWriter = flatVectorsWriter;
 
-    cuVSDataFilename =
+    String metaFileName =
         IndexFileNames.segmentFileName(
-            this.segmentWriteState.segmentInfo.name,
-            this.segmentWriteState.segmentSuffix,
-            CuVSVectorsFormat.VECTOR_DATA_EXTENSION);
+            state.segmentInfo.name, state.segmentSuffix, CUVS_META_CODEC_EXT);
+    String cagraFileName =
+        IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, CUVS_INDEX_EXT);
+
+    boolean success = false;
+    try {
+      meta = state.directory.createOutput(metaFileName, state.context);
+      cuvsIndex = state.directory.createOutput(cagraFileName, state.context);
+      CodecUtil.writeIndexHeader(
+          meta,
+          CUVS_META_CODEC_NAME,
+          VERSION_CURRENT,
+          state.segmentInfo.getId(),
+          state.segmentSuffix);
+      CodecUtil.writeIndexHeader(
+          cuvsIndex,
+          CUVS_INDEX_CODEC_NAME,
+          VERSION_CURRENT,
+          state.segmentInfo.getId(),
+          state.segmentSuffix);
+      success = true;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
   }
 
   @Override
-  public void close() throws IOException {
-    IOUtils.close(cuVSIndex);
-    cuVSIndex = null;
-    fieldVectorWriters.clear();
-    fieldVectorWriters = null;
+  public KnnFieldVectorsWriter<float[]> addField(FieldInfo fieldInfo) throws IOException {
+    var encoding = fieldInfo.getVectorEncoding();
+    if (encoding != FLOAT32) {
+      throw new IllegalArgumentException("expected float32, got:" + encoding);
+    }
+    var writer = Objects.requireNonNull(flatVectorsWriter.addField(fieldInfo));
+    @SuppressWarnings("unchecked")
+    var flatWriter = (FlatFieldVectorsWriter<float[]>) writer;
+    var cuvsFieldWriter = new CuVSFieldWriter(fieldInfo, flatWriter);
+    fields.add(cuvsFieldWriter);
+    return writer;
   }
 
-  @Override
-  public KnnFieldVectorsWriter<float[]> addField(FieldInfo fieldInfo) throws IOException {
-    CagraFieldVectorsWriter cagraFieldVectorWriter = new CagraFieldVectorsWriter(fieldInfo);
-    fieldVectorWriters.add(cagraFieldVectorWriter);
-    return cagraFieldVectorWriter;
+  static String indexMsg(int size, int... 
args) {
+    StringBuilder sb = new StringBuilder("cagra index params");
+    sb.append(": size=").append(size);
+    sb.append(", intGraphDegree=").append(args[0]);
+    sb.append(", actualIntGraphDegree=").append(args[1]);
+    sb.append(", graphDegree=").append(args[2]);
+    sb.append(", actualGraphDegree=").append(args[3]);
+    return sb.toString();
   }
 
-  @SuppressForbidden(reason = "A temporary java.util.File is needed for Cagra's serialization")
-  private byte[] createCagraIndex(float[][] vectors, List<Integer> mapping) throws Throwable {
-    CagraIndexParams indexParams =
-        new CagraIndexParams.Builder()
-            .withNumWriterThreads(cuvsWriterThreads)
-            .withIntermediateGraphDegree(intGraphDegree)
-            .withGraphDegree(graphDegree)
-            .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT)
-            .build();
+  private CagraIndexParams cagraIndexParams(int size) {
+    if (size < 2) {
+      // https://github.com/rapidsai/cuvs/issues/666
+      throw new IllegalArgumentException("cagra index requires at least 2 vectors, got: " + size);
+    }
+    var minIntGraphDegree = Math.min(intGraphDegree, size);
+    var minGraphDegree = Math.min(graphDegree, minIntGraphDegree);
+    // log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree));
+
+    return new CagraIndexParams.Builder()
+        .withNumWriterThreads(cuvsWriterThreads)
+        .withIntermediateGraphDegree(minIntGraphDegree)
+        .withGraphDegree(minGraphDegree)
+        .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT)
+        .build();
+  }
+
+  static long nanosToMillis(long nanos) {
+    return Duration.ofNanos(nanos).toMillis();
+  }
 
-    // log.info("Indexing started: " + System.currentTimeMillis());
-    cagraIndex =
+  private void writeCagraIndex(OutputStream os, float[][] vectors) throws Throwable {
+    if (vectors.length < 2) {
+      throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required");
+    }
+    CagraIndexParams indexParams = cagraIndexParams(vectors.length);
+    // long startTime = System.nanoTime();
+    var index =
         CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build();
-    // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " +
-    // vectors.length);
-
-    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-    Path tmpFile =
-        Files.createTempFile(
-            "tmpindex", "cag"); // TODO: Should we make this a file with random names?
-    cagraIndex.serialize(baos, tmpFile);
-    return baos.toByteArray();
+    // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime);
+    // log.info("Cagra index created: " + elapsedMillis + "ms, documents: " + vectors.length);
+
+    Path tmpFile = Files.createTempFile(resources.tempDirectory(), "tmpindex", "cag");
+    index.serialize(os, tmpFile);
   }
 
-  @SuppressForbidden(reason = "A temporary java.util.File is needed for BruteForce's serialization")
-  private byte[] createBruteForceIndex(float[][] vectors) throws Throwable {
+  private void writeBruteForceIndex(OutputStream os, float[][] vectors) throws Throwable {
     BruteForceIndexParams indexParams =
         new BruteForceIndexParams.Builder()
             .withNumWriterThreads(32) // TODO: Make this configurable later. 
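+            // (Unlike CAGRA, brute-force indexing has no minimum dataset size, so this
+            // path also covers tiny segments for which the CAGRA graph is skipped.)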
.build(); - // log.info("Indexing started: " + System.currentTimeMillis()); + // long startTime = System.nanoTime(); BruteForceIndex index = BruteForceIndex.newBuilder(resources) .withIndexParams(indexParams) .withDataset(vectors) .build(); + // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + // log.info("BruteForce index created: " + elapsedMillis + "ms, documents: " + vectors.length); - // log.info("Indexing done: " + System.currentTimeMillis()); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - index.serialize(baos); - return baos.toByteArray(); + index.serialize(os); } - @SuppressForbidden(reason = "A temporary java.util.File is needed for HNSW's serialization") - private byte[] createHnswIndex(float[][] vectors) throws Throwable { - CagraIndexParams indexParams = - new CagraIndexParams.Builder() - .withNumWriterThreads(cuvsWriterThreads) - .withIntermediateGraphDegree(intGraphDegree) - .withGraphDegree(graphDegree) - .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) - .build(); + private void writeHNSWIndex(OutputStream os, float[][] vectors) throws Throwable { + if (vectors.length < 2) { + throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required"); + } + CagraIndexParams indexParams = cagraIndexParams(vectors.length); - // log.info("Indexing started: " + System.currentTimeMillis()); - cagraIndexForHnsw = + // long startTime = System.nanoTime(); + var index = CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); - // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + - // vectors.length); + // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + // log.info("HNSW index created: " + elapsedMillis + "ms, documents: " + vectors.length); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); Path tmpFile = Files.createTempFile("tmpindex", "hnsw"); - cagraIndexForHnsw.serializeToHNSW(baos, tmpFile); - return baos.toByteArray(); + index.serializeToHNSW(os, tmpFile); } - @SuppressWarnings({"resource", "rawtypes", "unchecked"}) @Override public void flush(int maxDoc, DocMap sortMap) throws IOException { - cuVSIndex = - this.segmentWriteState.directory.createOutput( - cuVSDataFilename, this.segmentWriteState.context); - CodecUtil.writeIndexHeader( - cuVSIndex, - CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, - CuVSVectorsFormat.VERSION_CURRENT, - this.segmentWriteState.segmentInfo.getId(), - this.segmentWriteState.segmentSuffix); - - CuVSSegmentFile cuVSFile = new CuVSSegmentFile(new SegmentOutputStream(cuVSIndex, 100000)); - - LinkedHashMap metaMap = new LinkedHashMap(); - - for (CagraFieldVectorsWriter field : fieldVectorWriters) { - // long start = System.currentTimeMillis(); - - byte[] cagraIndexBytes = null; - byte[] bruteForceIndexBytes = null; - byte[] hnswIndexBytes = null; - try { - // log.info("Starting CAGRA indexing, space remaining: " + new File("/").getFreeSpace()); - // log.info("Starting CAGRA indexing, docs: " + field.vectors.size()); - - float vectors[][] = new float[field.vectors.size()][field.vectors.get(0).length]; - for (int i = 0; i < vectors.length; i++) { - for (int j = 0; j < vectors[i].length; j++) { - vectors[i][j] = field.vectors.get(i)[j]; - } - } - - cagraIndexBytes = createCagraIndex(vectors, new ArrayList(field.vectors.keySet())); - bruteForceIndexBytes = createBruteForceIndex(vectors); - hnswIndexBytes = createHnswIndex(vectors); - } catch (Throwable e) { - throw new RuntimeException(e); + 
flatVectorsWriter.flush(maxDoc, sortMap); + for (var field : fields) { + if (sortMap == null) { + writeField(field); + } else { + writeSortingField(field, sortMap); } - - // start = System.currentTimeMillis(); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".cag", cagraIndexBytes); - // log.info( - // "time for writing CAGRA index bytes to zip: " + (System.currentTimeMillis() - start)); - - // start = System.currentTimeMillis(); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".bf", bruteForceIndexBytes); - /*log.info( - "time for writing BRUTEFORCE index bytes to zip: " - + (System.currentTimeMillis() - start));*/ - - // start = System.currentTimeMillis(); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".hnsw", hnswIndexBytes); - // log.info("time for writing HNSW index bytes to zip: " + (System.currentTimeMillis() - - // start)); - - // start = System.currentTimeMillis(); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".vec", - SerializationUtils.serialize(new ArrayList(field.vectors.values()))); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".map", - SerializationUtils.serialize(new ArrayList(field.vectors.keySet()))); - // log.info("list serializing and writing: " + (System.currentTimeMillis() - start)); - field.vectors.clear(); } - - metaMap.put(segmentWriteState.segmentInfo.name, maxDoc); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); - cuVSFile.close(); - - CodecUtil.writeFooter(cuVSIndex); } - SegmentOutputStream mergeOutputStream = null; - CuVSSegmentFile mergedIndexFile = null; - - @SuppressWarnings("resource") - @Override - public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { - List segInputStreams = new ArrayList(); - List readers = new ArrayList(); + private void writeField(CuVSFieldWriter fieldData) throws IOException { + // TODO: Argh! + float[][] vectors = fieldData.getVectors().toArray(float[][]::new); + writeFieldInternal(fieldData.fieldInfo(), vectors); + } - for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { - CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; - segInputStreams.add(reader.segmentInputStream); - readers.add(reader); - } + private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap) + throws IOException { + DocsWithFieldSet oldDocsWithFieldSet = fieldData.getDocsWithFieldSet(); + final int[] new2OldOrd = new int[oldDocsWithFieldSet.cardinality()]; // new ord to old ord - // log.info("Merging one field for segment: " + segmentWriteState.segmentInfo.name); - // log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); + mapOldOrdToNewOrd(oldDocsWithFieldSet, sortMap, null, new2OldOrd, null); - if (!List.of(segmentWriteState.directory.listAll()).contains(cuVSDataFilename)) { - IndexOutput mergedVectorIndex = - segmentWriteState.directory.createOutput(cuVSDataFilename, segmentWriteState.context); - CodecUtil.writeIndexHeader( - mergedVectorIndex, - CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, - CuVSVectorsFormat.VERSION_CURRENT, - segmentWriteState.segmentInfo.getId(), - segmentWriteState.segmentSuffix); - this.mergeOutputStream = new SegmentOutputStream(mergedVectorIndex, 100000); - mergedIndexFile = new CuVSSegmentFile(this.mergeOutputStream); + // TODO: Argh! 
we need to be able to avoid loading all vectors into contiguous heap memory + float[][] oldVectors = fieldData.getVectors().toArray(float[][]::new); + float[][] newVectors = new float[oldVectors.length][]; + for (int i = 0; i < oldVectors.length; i++) { + newVectors[i] = oldVectors[new2OldOrd[i]]; } + writeFieldInternal(fieldData.fieldInfo(), newVectors); + } - // log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); - - if (mergeStrategy.equals(MergeStrategy.TRIVIAL_MERGE)) { - throw new UnsupportedOperationException(); - } else if (mergeStrategy.equals(MergeStrategy.NON_TRIVIAL_MERGE)) { - // log.info("Readers: " + segInputStreams.size() + ", deocMaps: " + - // mergeState.docMaps.length); - ArrayList docMapList = new ArrayList(); - - for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { - // CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; - // for (CuVSIndex index : reader.cuvsIndexes.get(fieldInfo.name)) { - // log.info("Mapping for segment (" + reader.fileName + "): " + index.getMapping()); - // log.info("Mapping for segment (" + reader.fileName + "): " + - // index.getMapping().size()); - for (int id = 0; id < mergeState.maxDocs[i]; id++) { - docMapList.add(mergeState.docMaps[i].get(id)); + private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws IOException { + long cagraIndexOffset, cagraIndexLength; + long bruteForceIndexOffset, bruteForceIndexLength; + long hnswIndexOffset, hnswIndexLength; + assert vectors.length > 0; + try { + // write the cagra graph + var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + cagraIndexOffset = cuvsIndex.getFilePointer(); + if (vectors.length > MIN_CAGRA_INDEX_SIZE) { + try { + writeCagraIndex(cagraIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); } - // log.info("DocMaps for segment (" + reader.fileName + "): " + docMapList); - // } } - - ArrayList mergedVectors = - Util.getMergedVectors( - segInputStreams, fieldInfo.name, segmentWriteState.segmentInfo.name); - // log.info("Final mapping: " + docMapList); - // log.info("Final mapping: " + docMapList.size()); - // log.info("Merged vectors: " + mergedVectors.size()); - LinkedHashMap metaMap = new LinkedHashMap(); - byte[] cagraIndexBytes = null; - byte[] bruteForceIndexBytes = null; - byte[] hnswIndexBytes = null; - try { - float vectors[][] = new float[mergedVectors.size()][mergedVectors.get(0).length]; - for (int i = 0; i < vectors.length; i++) { - for (int j = 0; j < vectors[i].length; j++) { - vectors[i][j] = mergedVectors.get(i)[j]; - } + cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; + + // write the brute force index + var bruteForceIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + bruteForceIndexOffset = cuvsIndex.getFilePointer(); + writeBruteForceIndex(bruteForceIndexOutputStream, vectors); + bruteForceIndexLength = cuvsIndex.getFilePointer() - bruteForceIndexOffset; + + // write the hnsw index + var hnswIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + hnswIndexOffset = cuvsIndex.getFilePointer(); + if (vectors.length > MIN_CAGRA_INDEX_SIZE) { + try { + writeHNSWIndex(hnswIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); } - cagraIndexBytes = createCagraIndex(vectors, new ArrayList()); - bruteForceIndexBytes = createBruteForceIndex(vectors); - hnswIndexBytes = createHnswIndex(vectors); - } catch (Throwable e) { - 
throw new RuntimeException(e); - } - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".cag", cagraIndexBytes); - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".bf", - bruteForceIndexBytes); - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".hnsw", hnswIndexBytes); - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".vec", - SerializationUtils.serialize(mergedVectors)); - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".map", - SerializationUtils.serialize(docMapList)); - metaMap.put(segmentWriteState.segmentInfo.name, mergedVectors.size()); - if (mergedIndexFile.getFilesAdded().contains(segmentWriteState.segmentInfo.name + ".meta") - == false) { - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); } - // log.info("DocMaps: " + Arrays.toString(mergeState.docMaps)); - - metaMap.clear(); + hnswIndexLength = cuvsIndex.getFilePointer() - hnswIndexOffset; + + // StringBuilder sb = new StringBuilder("writeField "); + // sb.append(": fieldInfo.name=").append(fieldInfo.name); + // sb.append(", fieldInfo.number=").append(fieldInfo.number); + // sb.append(", size=").append(vectors.length); + // sb.append(", cagraIndexLength=").append(cagraIndexLength); + // sb.append(", bruteForceIndexLength=").append(bruteForceIndexLength); + // sb.append(", hnswIndexLength=").append(hnswIndexLength); + // log.info(sb.toString()); + + writeMeta( + fieldInfo, + vectors.length, + cagraIndexOffset, + cagraIndexLength, + bruteForceIndexOffset, + bruteForceIndexLength, + hnswIndexOffset, + hnswIndexLength); + } catch (Throwable t) { + handleThrowable(t); } } - @Override - public void finish() throws IOException { - if (this.mergeOutputStream != null) { - mergedIndexFile.close(); - CodecUtil.writeFooter(mergeOutputStream.out); - IOUtils.close(mergeOutputStream.out); - this.mergeOutputStream = null; - this.mergedIndexFile = null; + private void writeMeta( + FieldInfo field, + int count, + long cagraIndexOffset, + long cagraIndexLength, + long bruteForceIndexOffset, + long bruteForceIndexLength, + long hnswIndexOffset, + long hnswIndexLength) + throws IOException { + meta.writeInt(field.number); + meta.writeInt(field.getVectorEncoding().ordinal()); + meta.writeInt(distFuncToOrd(field.getVectorSimilarityFunction())); + meta.writeInt(field.getVectorDimension()); + meta.writeInt(count); + meta.writeVLong(cagraIndexOffset); + meta.writeVLong(cagraIndexLength); + meta.writeVLong(bruteForceIndexOffset); + meta.writeVLong(bruteForceIndexLength); + meta.writeVLong(hnswIndexOffset); + meta.writeVLong(hnswIndexLength); + } + + static int distFuncToOrd(VectorSimilarityFunction func) { + for (int i = 0; i < SIMILARITY_FUNCTIONS.size(); i++) { + if (SIMILARITY_FUNCTIONS.get(i).equals(func)) { + return (byte) i; + } } + throw new IllegalArgumentException("invalid distance function: " + func); } - @Override - public long ramBytesUsed() { - long total = SHALLOW_RAM_BYTES_USED; - for (var field : fieldVectorWriters) { - total += field.ramBytesUsed(); + // We currently ignore this, until cuVS supports tiered indices + private static final String CANNOT_GENERATE_CAGRA = + """ + Could not generate an intermediate CAGRA graph because the initial \ + kNN graph contains too many invalid or duplicated neighbor nodes. 
\ + This error can occur, for example, if too many overflows occur \ + during the norm computation between the dataset vectors\ + """; + + static void handleThrowableWithIgnore(Throwable t, String msg) throws IOException { + if (t.getMessage().contains(msg)) { + return; } - return total; + handleThrowable(t); } - /** OutputStream for writing into an IndexOutput */ - public class SegmentOutputStream extends OutputStream { + static void handleThrowable(Throwable t) throws IOException { + switch (t) { + case IOException ioe -> throw ioe; + case Error error -> throw error; + case RuntimeException re -> throw re; + case null, default -> throw new RuntimeException("UNEXPECTED: exception type", t); + } + } - IndexOutput out; - int bufferSize; - byte[] buffer; - int p; + private static DocsWithFieldSet getVectorData(FloatVectorValues floatVectorValues, float[][] dst) + throws IOException { + DocsWithFieldSet docsWithField = new DocsWithFieldSet(); + int count = 0; + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { + assert iter.index() == count; + dst[iter.index()] = floatVectorValues.vectorValue(iter.index()); + docsWithField.add(docV); + count++; + } + return docsWithField; + } - public SegmentOutputStream(IndexOutput out, int bufferSize) throws IOException { - super(); - this.out = out; - this.bufferSize = bufferSize; - this.buffer = new byte[this.bufferSize]; + @Override + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + flatVectorsWriter.mergeOneField(fieldInfo, mergeState); + try { + final FloatVectorValues mergedVectorValues = + switch (fieldInfo.getVectorEncoding()) { + case BYTE -> throw new AssertionError("bytes not supported"); + case FLOAT32 -> + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); + }; + + float[][] vectors = new float[mergedVectorValues.size()][mergedVectorValues.dimension()]; + getVectorData(mergedVectorValues, vectors); + writeFieldInternal(fieldInfo, vectors); + } catch (Throwable t) { + handleThrowable(t); } + } - @Override - public void write(int b) throws IOException { - buffer[p] = (byte) b; - p += 1; - if (p == bufferSize) { - flush(); - } + @Override + public void finish() throws IOException { + if (finished) { + throw new IllegalStateException("already finished"); } + finished = true; + flatVectorsWriter.finish(); - @Override - public void flush() throws IOException { - out.writeBytes(buffer, p); - p = 0; + if (meta != null) { + // write end of fields marker + meta.writeInt(-1); + CodecUtil.writeFooter(meta); } + if (cuvsIndex != null) { + CodecUtil.writeFooter(cuvsIndex); + } + } - @Override - public void close() throws IOException { - this.flush(); + @Override + public void close() throws IOException { + IOUtils.close(meta, cuvsIndex, flatVectorsWriter); + } + + @Override + public long ramBytesUsed() { + long total = SHALLOW_RAM_BYTES_USED; + for (var field : fields) { + total += field.ramBytesUsed(); } + return total; } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java new file mode 100644 index 000000000000..4eb8ed558f70 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.vectorsearch;
+
+import java.io.IOException;
+import java.io.InputStream;
+import org.apache.lucene.store.IndexInput;
+
+/** InputStream for reading from an IndexInput. */
+final class IndexInputInputStream extends InputStream {
+
+  final IndexInput in;
+  long pos = 0;
+  final long limit;
+
+  IndexInputInputStream(IndexInput in) {
+    this.in = in;
+    this.limit = in.length();
+  }
+
+  @Override
+  public int read() throws IOException {
+    if (pos >= limit) {
+      return -1;
+    }
+    pos++;
+    return in.readByte() & 0xFF; // mask to an unsigned value, per the InputStream contract
+  }
+
+  @Override
+  public int read(byte[] b, int off, int len) throws IOException {
+    if (len <= 0) {
+      return 0;
+    }
+    if (pos >= limit) {
+      return -1;
+    }
+    long avail = limit - pos;
+    if (len > avail) {
+      len = (int) avail;
+    }
+    in.readBytes(b, off, len);
+    pos += len;
+    return len;
+  }
+}
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java
new file mode 100644
index 000000000000..ffb2b922e4b5
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.OutputStream; +import org.apache.lucene.store.IndexOutput; + +/** OutputStream for writing into an IndexOutput */ +final class IndexOutputOutputStream extends OutputStream { + + static final int DEFAULT_BUFFER_SIZE = 8192; + + final IndexOutput out; + final int bufferSize; + final byte[] buffer; + int idx; + + IndexOutputOutputStream(IndexOutput out) { + this(out, DEFAULT_BUFFER_SIZE); + } + + IndexOutputOutputStream(IndexOutput out, int bufferSize) { + this.out = out; + this.bufferSize = bufferSize; + this.buffer = new byte[bufferSize]; + } + + @Override + public void write(int b) throws IOException { + buffer[idx] = (byte) b; + idx++; + if (idx == bufferSize) { + flush(); + } + } + + @Override + public void write(byte[] b, int offset, int length) throws IOException { + if (idx != 0) { + flush(); + } + out.writeBytes(b, offset, length); + } + + @Override + public void flush() throws IOException { + out.writeBytes(buffer, 0, idx); + idx = 0; + } + + @Override + public void close() throws IOException { + this.flush(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java deleted file mode 100644 index 8f81c8bb7f15..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.sandbox.vectorsearch; - -import java.io.IOException; -import java.io.InputStream; -import org.apache.lucene.store.IndexInput; - -/** InputStream semantics for reading from an IndexInput */ -/*package-private*/ class SegmentInputStream extends InputStream { - - /** */ - private final IndexInput indexInput; - - public final long initialFilePointerPosition; - public final long limit; - public long pos = 0; - - // TODO: This input stream needs to be modified to enable buffering. 
- public SegmentInputStream(IndexInput indexInput, long limit, long initialFilePointerPosition) - throws IOException { - super(); - this.indexInput = indexInput; - this.initialFilePointerPosition = initialFilePointerPosition; - this.limit = limit; - - this.indexInput.seek(initialFilePointerPosition); - } - - @Override - public int read() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int read(byte[] b, int off, int len) { - try { - long avail = limit - pos; - if (pos >= limit) { - return -1; - } - if (len > avail) { - len = (int) avail; - } - if (len <= 0) { - return 0; - } - indexInput.readBytes(b, off, len); - pos += len; - return len; - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - @Override - public int read(byte[] b) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public void reset() throws IOException { - indexInput.seek(initialFilePointerPosition); - pos = 0; - } - - @Override - public long skip(long n) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean markSupported() { - return true; - } - - @Override - public void mark(int readlimit) { - throw new UnsupportedOperationException(); - } - - @Override - public void close() { - // Do nothing for now. - } - - @Override - public int available() { - throw new UnsupportedOperationException(); - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java deleted file mode 100644 index a46db32afea9..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.sandbox.vectorsearch; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.OutputStream; -import java.io.Serializable; -import java.io.UncheckedIOException; -import java.util.Objects; - -/*package-private*/ class SerializationUtils { - - static byte[] serialize(final Serializable obj) { - final ByteArrayOutputStream baos = new ByteArrayOutputStream(64 * 1024); - serialize(obj, baos); - return baos.toByteArray(); - } - - static void serialize(final Serializable obj, final OutputStream outputStream) { - Objects.requireNonNull(outputStream); - try (ObjectOutputStream out = new ObjectOutputStream(outputStream)) { - out.writeObject(obj); - } catch (final IOException ex) { - throw new UncheckedIOException(ex); - } - } - - static T deserialize(final byte[] objectData) { - Objects.requireNonNull(objectData); - return deserialize(new ByteArrayInputStream(objectData)); - } - - static T deserialize(final InputStream inputStream) { - Objects.requireNonNull(inputStream); - try (ObjectInputStream in = new ObjectInputStream(inputStream)) { - @SuppressWarnings("unchecked") - final T obj = (T) in.readObject(); - return obj; - } catch (IOException ex) { - throw new UncheckedIOException(ex); - } catch (ClassNotFoundException ex) { - throw new AssertionError(ex); - } - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java deleted file mode 100644 index ba980777b2df..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.sandbox.vectorsearch; - -import java.io.ByteArrayOutputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; - -/** Some Utils used in CuVS integration */ -/*package-private*/ class Util { - - public static ByteArrayOutputStream getZipEntryBAOS( - String fileName, SegmentInputStream segInputStream) throws IOException { - segInputStream.reset(); - ZipInputStream zipInputStream = new ZipInputStream(segInputStream); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - boolean fileFound = false; - ZipEntry zipEntry; - while (zipInputStream.available() == 1 - && ((zipEntry = zipInputStream.getNextEntry()) != null)) { - if (zipEntry.getName().equals(fileName)) { - fileFound = true; - byte[] buffer = new byte[1024]; - int length; - while ((length = zipInputStream.read(buffer)) != -1) { - baos.write(buffer, 0, length); - } - } - } - if (!fileFound) throw new FileNotFoundException(); - return baos; - } - - // private static final Logger log = Logger.getLogger(Util.class.getName()); - - public static ArrayList getMergedVectors( - List segInputStreams, String fieldName, String mergedSegmentName) - throws IOException { - ZipEntry zs; - ArrayList mergedVectors = new ArrayList(); - // log.info("Getting mergedVectors..."); - for (SegmentInputStream segInputStream : segInputStreams) { - segInputStream.reset(); - ZipInputStream zipStream = new ZipInputStream(segInputStream); - while ((zs = zipStream.getNextEntry()) != null) { - // log.info("Getting mergedVectors... " + zs.getName()); - byte[] buffer = new byte[1024]; - int length; - if (zs.getName().endsWith(".vec")) { - String field = zs.getName().split("\\.")[0].split("/")[1]; - if (fieldName.equals(field)) { - ByteArrayOutputStream baosM = new ByteArrayOutputStream(); - while ((length = zipStream.read(buffer)) != -1) { - baosM.write(buffer, 0, length); - } - List m = SerializationUtils.deserialize(baosM.toByteArray()); - mergedVectors.addAll(m); - } - } - } - } - return mergedVectors; - } -} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index 57be29050441..a20a49be6f53 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -32,6 +32,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; @@ -128,7 +129,8 @@ public void testVectorSearch() throws IOException { log.info("Query size: " + numQueries + "x" + queries[0].length); log.info("TopK: " + topK); - Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); + // Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); + Query query = new KnnFloatVectorQuery("vector", queries[0], topK); int correct[] = new int[topK]; for (int i = 0; i < topK; i++) correct[i] = expected.get(0).get(i); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java index ae5b2403a3e5..96f755b1b98f 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java @@ -16,9 +16,21 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; + import java.util.List; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; import org.junit.BeforeClass; @@ -33,10 +45,87 @@ public static void beforeClass() { @Override protected Codec getCodec() { return TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + // For convenience, to sanitize the test code, one can comment out + // the supported check and use another format, e.g. + // return TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat()); } @Override protected List supportedVectorEncodings() { return List.of(VectorEncoding.FLOAT32); } + + public void testMergeTwoSegsWithASingleDocPerSeg() throws Exception { + float[][] f = new float[][] {randomVector(384), randomVector(384)}; + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + Document doc1 = new Document(); + doc1.add(new StringField("id", "0", Field.Store.NO)); + doc1.add(new KnnFloatVectorField("f", f[0], EUCLIDEAN)); + w.addDocument(doc1); + w.commit(); + Document doc2 = new Document(); + doc2.add(new StringField("id", "1", Field.Store.NO)); + doc2.add(new KnnFloatVectorField("f", f[1], EUCLIDEAN)); + w.addDocument(doc2); + w.flush(); + w.commit(); + + // sanity - verify one doc per leaf + try (DirectoryReader reader = DirectoryReader.open(w)) { + List subReaders = reader.leaves(); + assertEquals(2, subReaders.size()); + assertEquals(1, subReaders.get(0).reader().getFloatVectorValues("f").size()); + assertEquals(1, subReaders.get(1).reader().getFloatVectorValues("f").size()); + } + + // now merge to a single segment + w.forceMerge(1); + + // verify merged content + try (DirectoryReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + FloatVectorValues values = r.getFloatVectorValues("f"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f[1], values.vectorValue(1), 0.0f); + } + } + } + + // Basic test for multiple vectors fields per document + public void testTwoVectorFieldsPerDoc() throws Exception { + float[][] f1 = new float[][] {randomVector(384), randomVector(384)}; + float[][] f2 = new float[][] {randomVector(384), randomVector(384)}; + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + Document doc1 = new Document(); + doc1.add(new StringField("id", "0", Field.Store.NO)); + doc1.add(new KnnFloatVectorField("f1", 
f1[0], EUCLIDEAN)); + doc1.add(new KnnFloatVectorField("f2", f2[0], EUCLIDEAN)); + w.addDocument(doc1); + Document doc2 = new Document(); + doc2.add(new StringField("id", "1", Field.Store.NO)); + doc2.add(new KnnFloatVectorField("f1", f1[1], EUCLIDEAN)); + doc2.add(new KnnFloatVectorField("f2", f2[1], EUCLIDEAN)); + w.addDocument(doc2); + w.forceMerge(1); + + try (DirectoryReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + FloatVectorValues values = r.getFloatVectorValues("f1"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f1[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f1[1], values.vectorValue(1), 0.0f); + + values = r.getFloatVectorValues("f2"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f2[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f2[1], values.vectorValue(1), 0.0f); + } + } + } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java new file mode 100644 index 000000000000..e2e2b7600e9d --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.util.ArrayUtil.copyOfSubArray; + +import java.io.IOException; +import java.util.Random; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestIndexOutputOutputStream extends LuceneTestCase { + + public void testBasic() throws IOException { + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + out.write(0x56); + out.write(new byte[] {0x10, 0x11, 0x12, 0x13, 0x14}); + out.close(); + } + + try (var indexIn = dir.openInput("test", IOContext.DEFAULT)) { + var in = new IndexInputInputStream(indexIn); + // assertEquals(0x56, in.read()); + byte[] ba = new byte[6]; + assertEquals(6, in.read(ba)); + assertArrayEquals(new byte[] {0x56, 0x10, 0x11, 0x12, 0x13, 0x14}, ba); + } + } + } + + public void testGetFilePointer() throws IOException { + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + out.write(0x56); + out.write(new byte[] {0x10, 0x11, 0x12}); + assertEquals(4, indexOut.getFilePointer()); + out.close(); + } + } + } + + public void testWithRandom() throws IOException { + byte[] data = new byte[Math.min(atLeast(10_000), 20_000)]; + Random random = random(); + random.nextBytes(data); + + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + int i = 0; + while (i < data.length) { + if (random.nextBoolean()) { + out.write(data[i]); + i++; + } else { + int numBytes = random.nextInt(Math.min(data.length - i, 100)); + out.write(data, i, numBytes); + i += numBytes; + } + } + out.close(); + } + + try (var indexIn = dir.openInput("test", IOContext.DEFAULT)) { + var in = new IndexInputInputStream(indexIn); + int i = 0; + while (i < data.length) { + if (random.nextBoolean()) { + int b = in.read(); + assertEquals(data[i], b); + i++; + } else { + int numBytes = random.nextInt(Math.min(data.length - i, 100)); + byte[] ba = new byte[numBytes]; + in.read(ba, 0, numBytes); + assertArrayEquals(copyOfSubArray(data, i, i + numBytes), ba); + i += numBytes; + } + } + assertEquals(-1, in.read()); + assertEquals(-1, in.read(new byte[2])); + } + } + } +} From 30206d68d78ed603594071ff3314efc3e75f4290 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 14 Feb 2025 11:35:37 +0000 Subject: [PATCH 20/34] add bug URLs --- .../lucene/sandbox/vectorsearch/CuVSVectorsFormat.java | 4 ---- .../lucene/sandbox/vectorsearch/CuVSVectorsReader.java | 3 ++- .../lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 10 +++++++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 0e839bafe792..cf5ade94d679 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -46,10 +46,6 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public static final int DEFAULT_GRAPH_DEGREE = 64; - // The minimum number of vectors in the dataset required before - // we attempt to build a Cagra index - static 
final int MIN_CAGRA_INDEX_SIZE = 2; - static CuVSResources resources = cuVSResourcesOrNull(); /** The format for storing, reading, and merging raw vectors on disk. */ diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 07b44854f7c2..e044ede4d8cb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -312,7 +312,7 @@ static FloatToFloatFunction getScoreNormalizationFunc(VectorSimilarityFunction s return score -> (1f / (1f + score)); } - // This is a hack - replace with cuVS bugId/filter support + // This is a hack - https://github.com/rapidsai/cuvs/issues/696 static final int FILTER_OVER_SAMPLE = 10; @Override @@ -350,6 +350,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits new CagraQuery.Builder() .withTopK(topK) .withSearchParams(searchParams) + // we don't use ord to doc mapping, https://github.com/rapidsai/cuvs/issues/699 .withMapping(null) .withQueryVectors(new float[][] {target}) .build(); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 013ee0f40433..4e2df540a9c7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -22,7 +22,6 @@ import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; -import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.MIN_CAGRA_INDEX_SIZE; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; @@ -68,6 +67,10 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { @SuppressWarnings("unused") private static final Logger log = Logger.getLogger(CuVSVectorsWriter.class.getName()); + // The minimum number of vectors in the dataset required before + // we attempt to build a Cagra index + static final int MIN_CAGRA_INDEX_SIZE = 2; + private final int cuvsWriterThreads; private final int intGraphDegree; private final int graphDegree; @@ -242,7 +245,7 @@ public void flush(int maxDoc, DocMap sortMap) throws IOException { } private void writeField(CuVSFieldWriter fieldData) throws IOException { - // TODO: Argh! + // TODO: Argh! https://github.com/rapidsai/cuvs/issues/698 float[][] vectors = fieldData.getVectors().toArray(float[][]::new); writeFieldInternal(fieldData.fieldInfo(), vectors); } @@ -254,7 +257,8 @@ private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap) mapOldOrdToNewOrd(oldDocsWithFieldSet, sortMap, null, new2OldOrd, null); - // TODO: Argh! we need to be able to avoid loading all vectors into contiguous heap memory + // TODO: Argh! 
https://github.com/rapidsai/cuvs/issues/698 + // Also will be replaced with the cuVS merge api float[][] oldVectors = fieldData.getVectors().toArray(float[][]::new); float[][] newVectors = new float[oldVectors.length][]; for (int i = 0; i < oldVectors.length; i++) { From 8e9fe16e4776d28a275718bb846ec453716d448b Mon Sep 17 00:00:00 2001 From: Vivek Narang Date: Fri, 21 Feb 2025 16:20:28 -0500 Subject: [PATCH 21/34] Make CuVSKnnFloatVectorQuery public --- .../lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index efa4ce51e77a..2f6c636590ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -25,7 +25,7 @@ import org.apache.lucene.util.Bits; /** Query for CuVS */ -/*package-private*/ class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { +public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { private final int iTopK; private final int searchWidth; From 34afa24001efe8bc272e82195cfeb009de57fcdc Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 21 Feb 2025 14:59:43 +0000 Subject: [PATCH 22/34] assertion and test --- .../lucene/sandbox/vectorsearch/CuVSVectorsReader.java | 1 + .../lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index e044ede4d8cb..97c12798e6fb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -336,6 +336,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits collectorTopK = knnCollector.k() * FILTER_OVER_SAMPLE; } final int topK = Math.min(collectorTopK, fieldEntry.count()); + assert topK > 0 : "Expected topK > 0, got: " + topK; Map<Integer, Float> result; if (knnCollector.k() <= 1024 && cuvsIndex.getCagraIndex() != null) { diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java index 96f755b1b98f..dbbdecf82ec9 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java @@ -125,6 +125,11 @@ public void testTwoVectorFieldsPerDoc() throws Exception { assertEquals(2, values.size()); assertArrayEquals(f2[0], values.vectorValue(0), 0.0f); assertArrayEquals(f2[1], values.vectorValue(1), 0.0f); + + // opportunistically check boundary condition - search with a 0 topK + var topDocs = r.searchNearestVectors("f1", randomVector(384), 0, null, 10); + assertEquals(0, topDocs.scoreDocs.length); + assertEquals(0, topDocs.totalHits.value()); } } } From 8ae25155e4ba940b6f8435a1429e0cf54c8b9e8e Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sat, 22 Feb 2025 20:46:24 +0000 Subject: [PATCH 23/34] plumb infoStream, and add indexType --- .../sandbox/vectorsearch/CuVSCodec.java | 5 +-
.../vectorsearch/CuVSVectorsFormat.java | 15 +- .../vectorsearch/CuVSVectorsWriter.java | 140 ++++++++++++------ 3 files changed, 109 insertions(+), 51 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index f455a863a9a1..3489221908f8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -22,6 +22,7 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.IndexType; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; /** CuVS based codec for GPU based vector search */ @@ -35,7 +36,9 @@ public CuVSCodec(String name, Codec delegate) { super(name, delegate); KnnVectorsFormat format; try { - format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE); + format = + new CuVSVectorsFormat( + 1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA_AND_BRUTE_FORCE); setKnnFormat(format); } catch (LibraryException ex) { Logger log = Logger.getLogger(CuVSCodec.class.getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index cf5ade94d679..705929fd86fe 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -26,6 +26,7 @@ import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.IndexType; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; /** CuVS based KnnVectorsFormat for GPU acceleration */ @@ -45,6 +46,8 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final int DEFAULT_WRITER_THREADS = 1; public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public static final int DEFAULT_GRAPH_DEGREE = 64; + public static final MergeStrategy DEFAULT_MERGE_STRATEGY = MergeStrategy.NON_TRIVIAL_MERGE; + public static final IndexType DEFAULT_INDEX_TYPE = IndexType.CAGRA; static CuVSResources resources = cuVSResourcesOrNull(); @@ -57,23 +60,30 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { final int intGraphDegree; final int graphDegree; final MergeStrategy mergeStrategy; + final CuVSVectorsWriter.IndexType indexType; // the index type to build, when writing public CuVSVectorsFormat() { this( DEFAULT_WRITER_THREADS, DEFAULT_INTERMEDIATE_GRAPH_DEGREE, DEFAULT_GRAPH_DEGREE, - MergeStrategy.NON_TRIVIAL_MERGE); + DEFAULT_MERGE_STRATEGY, + DEFAULT_INDEX_TYPE); } public CuVSVectorsFormat( - int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) + int cuvsWriterThreads, + int intGraphDegree, + int graphDegree, + MergeStrategy mergeStrategy, + IndexType indexType) throws LibraryException { super("CuVSVectorsFormat"); this.mergeStrategy = mergeStrategy; this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; + this.indexType = indexType; } private 
static CuVSResources cuVSResourcesOrNull() { @@ -112,6 +122,7 @@ public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOExceptio intGraphDegree, graphDegree, mergeStrategy, + indexType, resources, flatWriter); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 4e2df540a9c7..e7670484ed14 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -58,6 +58,7 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.InfoStream; /** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ public class CuVSVectorsWriter extends KnnVectorsWriter { @@ -67,6 +68,9 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { @SuppressWarnings("unused") private static final Logger log = Logger.getLogger(CuVSVectorsWriter.class.getName()); + /** The name of the CUVS component for the info-stream */ + public static final String CUVS_COMPONENT = "CUVS"; + // The minimum number of vectors in the dataset required before // we attempt to build a Cagra index static final int MIN_CAGRA_INDEX_SIZE = 2; @@ -75,21 +79,54 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private final int intGraphDegree; private final int graphDegree; + private final CuVSResources resources; + private final IndexType indexType; + @SuppressWarnings("unused") private final MergeStrategy mergeStrategy; - private final CuVSResources resources; - private final FlatVectorsWriter flatVectorsWriter; // for writing the raw vectors private final List fields = new ArrayList<>(); private final IndexOutput meta, cuvsIndex; + private final InfoStream infoStream; private boolean finished; /** Merge strategy used for CuVS */ public enum MergeStrategy { TRIVIAL_MERGE, NON_TRIVIAL_MERGE - }; + } + + /** The CuVS index type. */ + public enum IndexType { + /** Builds a Cagra index. */ + CAGRA(true, false, false), + /** Builds a Brute Force index. */ + BRUTE_FORCE(false, true, false), + /** Builds an HNSW index - suitable for searching on CPU. */ + HNSW(false, false, true), + /** Builds a Cagra and a Brute Force index.
*/ + CAGRA_AND_BRUTE_FORCE(true, true, false); + private final boolean cagra, bruteForce, hnsw; + + IndexType(boolean cagra, boolean bruteForce, boolean hnsw) { + this.cagra = cagra; + this.bruteForce = bruteForce; + this.hnsw = hnsw; + } + + public boolean cagra() { + return cagra; + } + + public boolean bruteForce() { + return bruteForce; + } + + public boolean hnsw() { + return hnsw; + } + } public CuVSVectorsWriter( SegmentWriteState state, @@ -97,16 +134,19 @@ public CuVSVectorsWriter( int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy, + IndexType indexType, CuVSResources resources, FlatVectorsWriter flatVectorsWriter) throws IOException { super(); this.mergeStrategy = mergeStrategy; + this.indexType = indexType; this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; this.resources = resources; this.flatVectorsWriter = flatVectorsWriter; + this.infoStream = state.infoStream; String metaFileName = IndexFileNames.segmentFileName( @@ -183,36 +223,36 @@ static long nanosToMillis(long nanos) { return Duration.ofNanos(nanos).toMillis(); } + private void info(String msg) { + if (infoStream.isEnabled(CUVS_COMPONENT)) { + infoStream.message(CUVS_COMPONENT, msg); + } + } + private void writeCagraIndex(OutputStream os, float[][] vectors) throws Throwable { if (vectors.length < 2) { throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required"); } - CagraIndexParams indexParams = cagraIndexParams(vectors.length); - // long startTime = System.nanoTime(); + CagraIndexParams params = cagraIndexParams(vectors.length); + long startTime = System.nanoTime(); var index = - CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); - // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); - // log.info("Cagra index created: " + elapsedMillis + "ms, documents: " + vectors.length); - + CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(params).build(); + long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + info("Cagra index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); Path tmpFile = Files.createTempFile(resources.tempDirectory(), "tmpindex", "cag"); index.serialize(os, tmpFile); } private void writeBruteForceIndex(OutputStream os, float[][] vectors) throws Throwable { - BruteForceIndexParams indexParams = + BruteForceIndexParams params = new BruteForceIndexParams.Builder() .withNumWriterThreads(32) // TODO: Make this configurable later. 
.build(); - - // long startTime = System.nanoTime(); - BruteForceIndex index = - BruteForceIndex.newBuilder(resources) - .withIndexParams(indexParams) - .withDataset(vectors) - .build(); - // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); - // log.info("BruteForce index created: " + elapsedMillis + "ms, documents: " + vectors.length); - + long startTime = System.nanoTime(); + var index = + BruteForceIndex.newBuilder(resources).withIndexParams(params).withDataset(vectors).build(); + long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + info("bf index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); index.serialize(os); } @@ -221,13 +261,11 @@ private void writeHNSWIndex(OutputStream os, float[][] vectors) throws Throwable throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required"); } CagraIndexParams indexParams = cagraIndexParams(vectors.length); - - // long startTime = System.nanoTime(); + long startTime = System.nanoTime(); var index = CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); - // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); - // log.info("HNSW index created: " + elapsedMillis + "ms, documents: " + vectors.length); - + long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + info("HNSW index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); Path tmpFile = Files.createTempFile("tmpindex", "hnsw"); index.serializeToHNSW(os, tmpFile); } @@ -268,40 +306,46 @@ private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap) } private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws IOException { - long cagraIndexOffset, cagraIndexLength; - long bruteForceIndexOffset, bruteForceIndexLength; - long hnswIndexOffset, hnswIndexLength; + long cagraIndexOffset, cagraIndexLength = 0L; + long bruteForceIndexOffset, bruteForceIndexLength = 0L; + long hnswIndexOffset, hnswIndexLength = 0L; assert vectors.length > 0; try { - // write the cagra graph - var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); cagraIndexOffset = cuvsIndex.getFilePointer(); - if (vectors.length > MIN_CAGRA_INDEX_SIZE) { - try { - writeCagraIndex(cagraIndexOutputStream, vectors); - } catch (Throwable t) { - handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + if (indexType.cagra()) { + if (vectors.length > MIN_CAGRA_INDEX_SIZE) { + try { + var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + writeCagraIndex(cagraIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + } + } else { + // well, no index will be written at all + assert indexType.bruteForce || indexType.hnsw(); } + cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; } - cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; - // write the brute force index - var bruteForceIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); bruteForceIndexOffset = cuvsIndex.getFilePointer(); - writeBruteForceIndex(bruteForceIndexOutputStream, vectors); - bruteForceIndexLength = cuvsIndex.getFilePointer() - bruteForceIndexOffset; + if (indexType.bruteForce()) { + var bruteForceIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + writeBruteForceIndex(bruteForceIndexOutputStream, vectors); + bruteForceIndexLength = cuvsIndex.getFilePointer() - bruteForceIndexOffset; + } - // write the hnsw index - var 
hnswIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); hnswIndexOffset = cuvsIndex.getFilePointer(); - if (vectors.length > MIN_CAGRA_INDEX_SIZE) { - try { - writeHNSWIndex(hnswIndexOutputStream, vectors); - } catch (Throwable t) { - handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + if (indexType.hnsw()) { + var hnswIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + if (vectors.length > MIN_CAGRA_INDEX_SIZE) { + try { + writeHNSWIndex(hnswIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + } } + hnswIndexLength = cuvsIndex.getFilePointer() - hnswIndexOffset; } - hnswIndexLength = cuvsIndex.getFilePointer() - hnswIndexOffset; // StringBuilder sb = new StringBuilder("writeField "); // sb.append(": fieldInfo.name=").append(fieldInfo.name); From e04c2e7f5aa3b0015f4055e2963563dbdbe0573c Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sat, 22 Feb 2025 20:59:05 +0000 Subject: [PATCH 24/34] fix default index TYPE --- .../java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index 3489221908f8..ac94fffaf504 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -38,7 +38,7 @@ public CuVSCodec(String name, Codec delegate) { try { format = new CuVSVectorsFormat( - 1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA_AND_BRUTE_FORCE); + 1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA); setKnnFormat(format); } catch (LibraryException ex) { Logger log = Logger.getLogger(CuVSCodec.class.getName()); From fbb04070f8e9c4bd9a89f292f58b041cb657b104 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 09:22:53 +0000 Subject: [PATCH 25/34] fix workaround for tiny Cagra index --- .../vectorsearch/CuVSVectorsWriter.java | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index e7670484ed14..a9d1bb4d8dda 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -310,19 +310,21 @@ private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws I long bruteForceIndexOffset, bruteForceIndexLength = 0L; long hnswIndexOffset, hnswIndexLength = 0L; assert vectors.length > 0; + + // workaround for the minimum number of vectors for Cagra + final IndexType indexType = + this.indexType.cagra() && vectors.length < MIN_CAGRA_INDEX_SIZE + ? 
IndexType.BRUTE_FORCE + : this.indexType; + try { cagraIndexOffset = cuvsIndex.getFilePointer(); if (indexType.cagra()) { - if (vectors.length > MIN_CAGRA_INDEX_SIZE) { - try { - var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); - writeCagraIndex(cagraIndexOutputStream, vectors); - } catch (Throwable t) { - handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); - } - } else { - // well, no index will be written at all - assert indexType.bruteForce || indexType.hnsw(); + try { + var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + writeCagraIndex(cagraIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); } cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; } From 7f39c0c612210fb747f1cb9502e1505861c9d09d Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 09:23:09 +0000 Subject: [PATCH 26/34] tidy --- .../org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index ac94fffaf504..c3ddc809c4d3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -36,9 +36,7 @@ public CuVSCodec(String name, Codec delegate) { super(name, delegate); KnnVectorsFormat format; try { - format = - new CuVSVectorsFormat( - 1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA); + format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA); setKnnFormat(format); } catch (LibraryException ex) { Logger log = Logger.getLogger(CuVSCodec.class.getName()); From 67ec96beffa10e04298b729d35676dd2f50ca185 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 11:42:13 +0000 Subject: [PATCH 27/34] fix bug where docs are deleted or empty --- .../vectorsearch/CuVSVectorsWriter.java | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index a9d1bb4d8dda..d374fa83ec2c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -57,6 +57,7 @@ import org.apache.lucene.index.Sorter.DocMap; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; @@ -306,13 +307,16 @@ private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap) } private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws IOException { + if (vectors.length == 0) { + writeEmpty(fieldInfo); + return; + } long cagraIndexOffset, cagraIndexLength = 0L; long bruteForceIndexOffset, bruteForceIndexLength = 0L; long hnswIndexOffset, hnswIndexLength = 0L; - assert vectors.length > 0; // workaround for the minimum number of vectors for Cagra - final IndexType indexType = + IndexType indexType = this.indexType.cagra() && vectors.length < MIN_CAGRA_INDEX_SIZE ? 
IndexType.BRUTE_FORCE : this.indexType; @@ -325,6 +329,8 @@ private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws I writeCagraIndex(cagraIndexOutputStream, vectors); } catch (Throwable t) { handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + // workaround for cuVS issue + indexType = IndexType.BRUTE_FORCE; } cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; } @@ -372,6 +378,10 @@ private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws I } } + private void writeEmpty(FieldInfo fieldInfo) throws IOException { + writeMeta(fieldInfo, 0, 0L, 0L, 0L, 0L, 0L, 0L); + } + private void writeMeta( FieldInfo field, int count, @@ -429,7 +439,8 @@ static void handleThrowable(Throwable t) throws IOException { } } - private static DocsWithFieldSet getVectorData(FloatVectorValues floatVectorValues, float[][] dst) + /** Copies the vector values into dst. Returns the actual number of vectors copied. */ + private static int getVectorData(FloatVectorValues floatVectorValues, float[][] dst) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); int count = 0; @@ -440,7 +451,7 @@ private static DocsWithFieldSet getVectorData(FloatVectorValues floatVectorValue docsWithField.add(docV); count++; } - return docsWithField; + return docsWithField.cardinality(); } @Override @@ -455,7 +466,10 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE }; float[][] vectors = new float[mergedVectorValues.size()][mergedVectorValues.dimension()]; - getVectorData(mergedVectorValues, vectors); + int ret = getVectorData(mergedVectorValues, vectors); + if (ret < vectors.length) { + vectors = ArrayUtil.copyOfSubArray(vectors, 0, ret); + } writeFieldInternal(fieldInfo, vectors); } catch (Throwable t) { handleThrowable(t); From 6e86c21a26c5697035897f43afed5f0ae531eb7c Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 11:56:55 +0000 Subject: [PATCH 28/34] clamp intermediate graph degree --- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index d374fa83ec2c..f32c8cbfe05f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -208,9 +208,9 @@ private CagraIndexParams cagraIndexParams(int size) { // https://github.com/rapidsai/cuvs/issues/666 throw new IllegalArgumentException("cagra index must be greater than 2"); } - var minIntGraphDegree = Math.min(intGraphDegree, size); + var minIntGraphDegree = Math.min(intGraphDegree, size - 1); var minGraphDegree = Math.min(graphDegree, minIntGraphDegree); - // log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); + log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); return new CagraIndexParams.Builder() .withNumWriterThreads(cuvsWriterThreads) From b1a84c23105b3fb2e70e83c4ae5bd39e5015e227 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 11:58:22 +0000 Subject: [PATCH 29/34] comment out log mesg --- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index f32c8cbfe05f..90b1ec0ff6a1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -210,7 +210,7 @@ private CagraIndexParams cagraIndexParams(int size) { } var minIntGraphDegree = Math.min(intGraphDegree, size - 1); var minGraphDegree = Math.min(graphDegree, minIntGraphDegree); - log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); + // log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); return new CagraIndexParams.Builder() .withNumWriterThreads(cuvsWriterThreads) From 4dd1f88174609f91877f95c99835f18b60aba1b6 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 13:55:32 +0000 Subject: [PATCH 30/34] make 32 the default GPU index threads --- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 705929fd86fe..7b66ec9ad528 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -43,7 +43,7 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; - public static final int DEFAULT_WRITER_THREADS = 1; + public static final int DEFAULT_WRITER_THREADS = 32; public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public static final int DEFAULT_GRAPH_DEGREE = 64; public static final MergeStrategy DEFAULT_MERGE_STRATEGY = MergeStrategy.NON_TRIVIAL_MERGE; From c4b5c293254184d23ecd30fd7f7c42310a704031 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 13:56:10 +0000 Subject: [PATCH 31/34] remove LibraryException from the API, so consumers don't need cuvs-java --- .../sandbox/vectorsearch/CuVSVectorsFormat.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 7b66ec9ad528..e0d4678aa5fe 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -62,6 +62,11 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { final MergeStrategy mergeStrategy; final CuVSVectorsWriter.IndexType indexType; // the index type to build, when writing + /** + * Creates a CuVSVectorsFormat, with default values. + * + * @throws LibraryException if the native library fails to load + */ public CuVSVectorsFormat() { this( DEFAULT_WRITER_THREADS, @@ -71,13 +76,17 @@ public CuVSVectorsFormat() { DEFAULT_INDEX_TYPE); } + /** + * Creates a CuVSVectorsFormat, with the given threads, graph degree, etc. 
+ * + * @throws LibraryException if the native library fails to load + */ public CuVSVectorsFormat( int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy, - IndexType indexType) - throws LibraryException { + IndexType indexType) { super("CuVSVectorsFormat"); this.mergeStrategy = mergeStrategy; this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; From 8cf5087379e66bb5f327d4f38746b94c7d0071d0 Mon Sep 17 00:00:00 2001 From: Vivek Narang Date: Wed, 26 Feb 2025 15:27:37 -0500 Subject: [PATCH 32/34] De-allocate indexes once serialized. --- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 90b1ec0ff6a1..d72d0bb2430a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -242,6 +242,7 @@ private void writeCagraIndex(OutputStream os, float[][] vectors) throws Throwabl info("Cagra index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); Path tmpFile = Files.createTempFile(resources.tempDirectory(), "tmpindex", "cag"); index.serialize(os, tmpFile); + index.destroyIndex(); } @@ -255,6 +256,7 @@ private void writeBruteForceIndex(OutputStream os, float[][] vectors) throws Thr long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); info("bf index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); index.serialize(os); + index.destroyIndex(); } @@ -269,6 +271,7 @@ private void writeHNSWIndex(OutputStream os, float[][] vectors) throws Throwable info("HNSW index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); Path tmpFile = Files.createTempFile("tmpindex", "hnsw"); index.serializeToHNSW(os, tmpFile); + index.destroyIndex(); } From 3837a109bf268f9d06a70196fc4511f76e4e96d2 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 27 Feb 2025 12:18:32 +0000 Subject: [PATCH 33/34] de-allocate indices on the read side, when closed --- .../sandbox/vectorsearch/CuVSIndex.java | 41 ++++++++++++++++++- .../vectorsearch/CuVSVectorsReader.java | 13 +++++- .../vectorsearch/CuVSVectorsWriter.java | 10 +---- 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 0356d53780d1..d0cfe86d708e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -16,13 +16,17 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsReader.handleThrowable; + import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.CagraIndex; import com.nvidia.cuvs.HnswIndex; +import java.io.Closeable; +import java.io.IOException; import java.util.Objects; /** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.)
*/ -public class CuVSIndex { +public class CuVSIndex implements Closeable { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; private final HnswIndex hnswIndex; @@ -30,6 +34,7 @@ public class CuVSIndex { private int maxDocs; private String fieldName; private String segmentName; + private volatile boolean closed; public CuVSIndex( String segmentName, @@ -55,14 +60,17 @@ public CuVSIndex(CagraIndex cagraIndex, BruteForceIndex bruteforceIndex, HnswInd } public CagraIndex getCagraIndex() { + ensureOpen(); return cagraIndex; } public BruteForceIndex getBruteforceIndex() { + ensureOpen(); return bruteforceIndex; } public HnswIndex getHNSWIndex() { + ensureOpen(); return hnswIndex; } @@ -77,4 +85,35 @@ public String getSegmentName() { public int getMaxDocs() { return maxDocs; } + + private void ensureOpen() { + if (closed) { + throw new IllegalStateException("index is closed"); + } + } + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + destroyIndices(); + } + + private void destroyIndices() throws IOException { + try { + if (cagraIndex != null) { + cagraIndex.destroyIndex(); + } + if (bruteforceIndex != null) { + bruteforceIndex.destroyIndex(); + } + if (hnswIndex != null) { + hnswIndex.destroyIndex(); + } + } catch (Throwable t) { + handleThrowable(t); + } + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 97c12798e6fb..cfb59121e36e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -32,9 +32,12 @@ import com.nvidia.cuvs.HnswIndex; import com.nvidia.cuvs.HnswIndexParams; import java.io.IOException; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.logging.Logger; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; @@ -276,7 +279,15 @@ private CuVSIndex loadCuVSIndex(FieldEntry fieldEntry) throws IOException { @Override public void close() throws IOException { - IOUtils.close(flatVectorsReader, cuvsIndexInput); + var closeableStream = + Stream.concat( + Stream.of(flatVectorsReader, cuvsIndexInput), + stream(cuvsIndices.values().iterator()).map(cursor -> cursor.value)); + IOUtils.close(closeableStream::iterator); + } + + static Stream stream(Iterator iterator) { + return StreamSupport.stream(((Iterable) () -> iterator).spliterator(), false); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index d72d0bb2430a..61f77ee26e7c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -23,6 +23,7 @@ import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; +import static 
org.apache.lucene.sandbox.vectorsearch.CuVSVectorsReader.handleThrowable; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; @@ -433,15 +434,6 @@ static void handleThrowableWithIgnore(Throwable t, String msg) throws IOExceptio handleThrowable(t); } - static void handleThrowable(Throwable t) throws IOException { - switch (t) { - case IOException ioe -> throw ioe; - case Error error -> throw error; - case RuntimeException re -> throw re; - case null, default -> throw new RuntimeException("UNEXPECTED: exception type", t); - } - } - /** Copies the vector values into dst. Returns the actual number of vectors copied. */ private static int getVectorData(FloatVectorValues floatVectorValues, float[][] dst) throws IOException { From e4e1b15cf4992047c78ed79e074d7dfb06f762b8 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Mon, 10 Mar 2025 20:46:52 +0530 Subject: [PATCH 34/34] Fixing scoring normalization for search spanning multiple segments --- .../lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index 23d524cef182..caf9566064e9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -71,7 +71,7 @@ public int k() { @Override @SuppressWarnings("cast") public boolean collect(int docId, float similarity) { - scoreDocs.add(new ScoreDoc(docId, 1f / (float) (similarity))); + scoreDocs.add(new ScoreDoc(docId, similarity)); return true; }
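A note for reviewers on the scoring fix in PATCH 34/34: cuVS search returns raw distances, and CuVSVectorsReader already maps them to Lucene scores through getScoreNormalizationFunc (for EUCLIDEAN, score -> 1f / (1f + score), see PATCH 20/34 above). Assuming collect() is handed that already-normalized score, the collector's extra 1f / similarity both re-inverted the ordering and left each leaf on a scale that cannot be merged with the top hits of other segments. The sketch below only illustrates the arithmetic; the class and method names are invented for this note and are not part of the patch series.

// Illustrative sketch only: shows why normalizing a cuVS distance exactly once
// keeps hits comparable across segments, while an extra 1/score re-inverts them.
public class ScoreNormalizationSketch {

  // EUCLIDEAN normalization used on the read side: smaller distance -> higher score.
  static float normalize(float distance) {
    return 1f / (1f + distance);
  }

  public static void main(String[] args) {
    float d1 = 0.25f; // raw L2 distance of a hit from segment 1
    float d2 = 0.50f; // raw L2 distance of a hit from segment 2

    float s1 = normalize(d1); // 0.8
    float s2 = normalize(d2); // ~0.667
    System.out.println(s1 > s2); // true: the closer vector scores higher, whatever its segment

    // The removed transform stored 1f / s, i.e. 1 + distance, on top of the
    // normalized score, so the farther hit ranked higher after the top-k sort.
    float old1 = 1f / s1; // 1.25
    float old2 = 1f / s2; // 1.5
    System.out.println(old2 > old1); // true: the inversion fixed by PATCH 34/34
  }
}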
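In the same vein, a usage sketch for the configuration surface built up in PATCH 23/34 (IndexType), PATCH 30/34 (32 writer threads by default) and PATCH 31/34 (no checked LibraryException): the five CuVSVectorsFormat constructor arguments follow the order shown in the patches (writer threads, intermediate graph degree, graph degree, merge strategy, index type). Installing the format via TestUtil.alwaysKnnVectorsFormat mirrors TestCuVSVectorsFormat; it is one convenient wiring, assumed here purely for illustration, not code from this series.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat;
import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.IndexType;
import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.tests.util.TestUtil;

public class CuVSFormatUsageSketch {
  public static void main(String[] args) throws Exception {
    // Writer threads, intermediate graph degree, graph degree, merge strategy, index type.
    // CAGRA_AND_BRUTE_FORCE builds both a GPU graph index and a brute-force index.
    var format =
        new CuVSVectorsFormat(
            32, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA_AND_BRUTE_FORCE);
    var config = new IndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(format));
    try (var dir = new ByteBuffersDirectory();
        var writer = new IndexWriter(dir, config)) {
      for (int i = 0; i < 3; i++) {
        Document doc = new Document();
        float[] vector = new float[384];
        vector[i] = 1f; // trivial stand-in vector data
        doc.add(new KnnFloatVectorField("vec", vector, VectorSimilarityFunction.EUCLIDEAN));
        writer.addDocument(doc);
      }
      // A segment with fewer than MIN_CAGRA_INDEX_SIZE vectors falls back to a
      // brute-force index (PATCH 25/34); three vectors suffice for a Cagra graph.
      writer.commit();
    }
  }
}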