apache · jpountz · Feb 21, 2025 · Feb 6, 2025 · Feb 6, 2025 · Feb 6, 2025
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -64,6 +64,10 @@ New Features
 
 * GITHUB#13974: Introducing DocValuesMultiRangeQuery.SortedSetStabbingBuilder into sandbox. (Mikhail Khludnev)
 
+* GITHUB#14204: Added HistogramCollectorManager to efficiently compute a
+  histogram of the distribution of the values of a field, for documents
+  matching a given query. (Adrien Grand)
+
 Improvements
 ---------------------
 

diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java
@@ -34,6 +34,7 @@
   exports org.apache.lucene.sandbox.facet.iterators;
   exports org.apache.lucene.sandbox.facet.cutters;
   exports org.apache.lucene.sandbox.facet.labels;
+  exports org.apache.lucene.sandbox.facet.plain.histograms;
 
   provides org.apache.lucene.codecs.PostingsFormat with
       org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat;

diff --git a/...sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollector.java b/...sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollector.java
@@ -0,0 +1,274 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.facet.plain.histograms;
+
+import java.io.IOException;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.DocValuesSkipper;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.internal.hppc.LongIntHashMap;
+import org.apache.lucene.search.CollectionTerminatedException;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.LeafCollector;
+import org.apache.lucene.search.Scorable;
+import org.apache.lucene.search.ScoreMode;
+
+final class HistogramCollector implements Collector {
+
+  /** Name of the doc-values field whose values are bucketed. */
+  private final String field;
+
+  /** Bucket width; a value {@code v} is counted under key {@code floorDiv(v, bucketWidth)}. */
+  private final long bucketWidth;
+
+  /** Largest number of distinct buckets tolerated before collection fails. */
+  private final int maxBuckets;
+
+  /** Bucket key to doc count, shared by every leaf collector created by this collector. */
+  private final LongIntHashMap counts = new LongIntHashMap();
+
+  /**
+   * Creates a collector that starts with an empty histogram.
+   *
+   * @param field name of the doc-values field to bucket
+   * @param bucketWidth width of each bucket
+   * @param maxBuckets max number of distinct buckets before collection fails
+   */
+  HistogramCollector(String field, long bucketWidth, int maxBuckets) {
+    this.maxBuckets = maxBuckets;
+    this.bucketWidth = bucketWidth;
+    this.field = field;
+  }
+
+  /**
+   * Picks the leaf collector implementation to use for one segment.
+   *
+   * <p>When the doc values cannot be unwrapped to a single-valued view, the naive multi-valued
+   * collector is used. Otherwise the skip-index-based collector is used if the segment exposes a
+   * doc-values skipper and its min/max values span at most 1025 buckets; the naive single-valued
+   * collector is the fallback.
+   *
+   * @throws CollectionTerminatedException if the segment has no {@link FieldInfo} for the field
+   * @throws IllegalStateException if the field's doc-values type is neither NUMERIC nor
+   *     SORTED_NUMERIC
+   */
+  @Override
+  public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
+    final FieldInfo fieldInfo = context.reader().getFieldInfos().fieldInfo(field);
+    if (fieldInfo == null) {
+      // The segment has no values, nothing to do.
+      throw new CollectionTerminatedException();
+    }
+
+    final DocValuesType dvType = fieldInfo.getDocValuesType();
+    final boolean numeric =
+        dvType == DocValuesType.NUMERIC || dvType == DocValuesType.SORTED_NUMERIC;
+    if (numeric == false) {
+      throw new IllegalStateException("Expected numeric field, but got doc-value type: " + dvType);
+    }
+
+    final SortedNumericDocValues multiValued = DocValues.getSortedNumeric(context.reader(), field);
+    final NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued);
+    if (singleValued == null) {
+      return new HistogramNaiveLeafCollector(multiValued, bucketWidth, maxBuckets, counts);
+    }
+
+    final DocValuesSkipper skipIndex = context.reader().getDocValuesSkipper(field);
+    if (skipIndex != null) {
+      final long lowestBucket = Math.floorDiv(skipIndex.minValue(), bucketWidth);
+      final long highestBucket = Math.floorDiv(skipIndex.maxValue(), bucketWidth);
+      // The optimized implementation counts into a dense array rather than a hash table, which
+      // avoids hashing and collision resolution, so it is only picked when the segment covers a
+      // small number of unique buckets.
+      if (highestBucket - lowestBucket <= 1024) {
+        return new HistogramLeafCollector(singleValued, skipIndex, bucketWidth, maxBuckets, counts);
+      }
+    }
+
+    return new HistogramNaiveSingleValuedLeafCollector(
+        singleValued, bucketWidth, maxBuckets, counts);
+  }
+
+  /**
+   * Returns {@link ScoreMode#COMPLETE_NO_SCORES}: none of the leaf collectors created by this
+   * collector read the scorer they are given.
+   */
+  @Override
+  public ScoreMode scoreMode() {
+    return ScoreMode.COMPLETE_NO_SCORES;
+  }
+
+  /**
+   * Returns the histogram accumulated so far, keyed by {@code floorDiv(value, bucketWidth)}. This
+   * is the live map that the leaf collectors write into, not a copy.
+   */
+  LongIntHashMap getCounts() {
+    return counts;
+  }
+
+  /**
+   * Naive histogram {@link LeafCollector} for the multi-valued case: it visits every match, reads
+   * the values of the doc and increments the count of each distinct bucket these values map to.
+   */
+  private static class HistogramNaiveLeafCollector implements LeafCollector {
+
+    private final SortedNumericDocValues values;
+    private final long bucketWidth;
+    private final int maxBuckets;
+    private final LongIntHashMap counts;
+
+    HistogramNaiveLeafCollector(
+        SortedNumericDocValues values, long bucketWidth, int maxBuckets, LongIntHashMap counts) {
+      this.counts = counts;
+      this.maxBuckets = maxBuckets;
+      this.bucketWidth = bucketWidth;
+      this.values = values;
+    }
+
+    @Override
+    public void setScorer(Scorable scorer) throws IOException {
+      // Scores are not used.
+    }
+
+    @Override
+    public void collect(int doc) throws IOException {
+      if (values.advanceExact(doc) == false) {
+        // This doc has no value for the field.
+        return;
+      }
+
+      // The histogram reports doc counts rather than value counts, so a doc must be counted at
+      // most once per bucket. Only the previously seen bucket is remembered, which assumes that
+      // values mapping to the same bucket are returned next to each other (values are expected to
+      // come back in sorted order; see the SortedNumericDocValues contract).
+      long lastCountedBucket = Long.MIN_VALUE;
+      for (int remaining = values.docValueCount(); remaining > 0; --remaining) {
+        final long bucket = Math.floorDiv(values.nextValue(), bucketWidth);
+        if (bucket == lastCountedBucket) {
+          continue;
+        }
+        counts.addTo(bucket, 1);
+        checkMaxBuckets(counts.size(), maxBuckets);
+        lastCountedBucket = bucket;
+      }
+    }
+  }
+
+  /**
+   * Naive histogram {@link LeafCollector} for the single-valued case: it visits every match, reads
+   * the value of the doc if it has one, and increments the count of the bucket this value maps to.
+   */
+  private static class HistogramNaiveSingleValuedLeafCollector implements LeafCollector {
+
+    private final NumericDocValues values;
+    private final long bucketWidth;
+    private final int maxBuckets;
+    private final LongIntHashMap counts;
+
+    HistogramNaiveSingleValuedLeafCollector(
+        NumericDocValues values, long bucketWidth, int maxBuckets, LongIntHashMap counts) {
+      this.counts = counts;
+      this.maxBuckets = maxBuckets;
+      this.bucketWidth = bucketWidth;
+      this.values = values;
+    }
+
+    @Override
+    public void setScorer(Scorable scorer) throws IOException {
+      // Scores are not used.
+    }
+
+    @Override
+    public void collect(int doc) throws IOException {
+      if (values.advanceExact(doc) == false) {
+        // This doc has no value for the field.
+        return;
+      }
+      counts.addTo(Math.floorDiv(values.longValue(), bucketWidth), 1);
+      // The map may just have gained a new key, so re-check the limit after every increment.
+      checkMaxBuckets(counts.size(), maxBuckets);
+    }
+  }
+
+  /**
+   * Optimized histogram {@link LeafCollector} that takes advantage of the doc-values skip index
+   * ({@link DocValuesSkipper}) to speed up collection.
+   */
+  private static class HistogramLeafCollector implements LeafCollector {
+
+    private final NumericDocValues values;
+    private final DocValuesSkipper skipper;
+    private final long bucketWidth;
+    private final int maxBuckets;
+
+    /** Dense per-leaf counts; slot {@code i} holds the count of bucket {@code leafMinBucket + i}. */
+    private final int[] counts;
+
+    /** Bucket of the smallest value reported by the skipper for this leaf. */
+    private final long leafMinBucket;
+
+    /** Collector-wide histogram that the dense counts are merged into. */
+    private final LongIntHashMap collectorCounts;
+
+    /**
+     * Last doc ID (inclusive) covered by the current skipper state. Docs past it trigger a new
+     * call to {@link #advanceSkipper}.
+     */
+    private int upToInclusive = -1;
+
+    /** True when every doc up to {@link #upToInclusive} has a value in one and the same bucket. */
+    private boolean upToSameBucket;
+
+    /** Slot of {@link #counts} to increment while {@link #upToSameBucket} is set. */
+    private int upToBucketIndex;
+
+    HistogramLeafCollector(
+        NumericDocValues values,
+        DocValuesSkipper skipper,
+        long bucketWidth,
+        int maxBuckets,
+        LongIntHashMap collectorCounts) {
+      this.collectorCounts = collectorCounts;
+      this.maxBuckets = maxBuckets;
+      this.bucketWidth = bucketWidth;
+      this.skipper = skipper;
+      this.values = values;
+
+      final long firstBucket = Math.floorDiv(skipper.minValue(), bucketWidth);
+      final long lastBucket = Math.floorDiv(skipper.maxValue(), bucketWidth);
+      this.leafMinBucket = firstBucket;
+      // One slot per bucket between the leaf's lowest and highest bucket, both included.
+      this.counts = new int[Math.toIntExact(lastBucket - firstBucket + 1)];
+    }
+
+    /** No-op: this collector never reads the scorer. */
+    @Override
+    public void setScorer(Scorable scorer) throws IOException {}
+
+    /**
+     * Repositions the skipper for {@code doc} and recomputes {@link #upToInclusive}, {@link
+     * #upToSameBucket} and {@link #upToBucketIndex}.
+     *
+     * <p>After this call, {@link #upToSameBucket} is only set if at least one skipper level reports
+     * as many docs with a value as it has doc IDs, and its min and max values map to the same
+     * bucket. In that case {@link #upToInclusive} is the max doc ID of the last such consecutive
+     * level, starting from level 0.
+     *
+     * @param doc the doc ID being collected, which is greater than {@link #upToInclusive}
+     */
+    private void advanceSkipper(int doc) throws IOException {
+      // Only move the skipper if `doc` is past the end of its current level-0 interval.
+      if (doc > skipper.maxDocID(0)) {
+        skipper.advance(doc);
+      }
+      // Reset; it is only set again below if a level qualifies.
+      upToSameBucket = false;
+
+      if (skipper.minDocID(0) > doc) {
+        // Corner case which happens if `doc` doesn't have a value and is between two intervals of
+        // the doc-value skip index.
+        // Remember the gap so that the skipper is not consulted again before the next interval.
+        upToInclusive = skipper.minDocID(0) - 1;
+        return;
+      }
+
+      // Default: the state is valid until the end of the level-0 interval, and values are looked
+      // up doc by doc.
+      upToInclusive = skipper.maxDocID(0);
+
+      // Now find the highest level where all docs map to the same bucket.
+      // NOTE(review): this presumes that each level covers the doc ID range of the level below it;
+      // confirm against the DocValuesSkipper contract.
+      for (int level = 0; level < skipper.numLevels(); ++level) {
+        int totalDocsAtLevel = skipper.maxDocID(level) - skipper.minDocID(level) + 1;
+        long minBucket = Math.floorDiv(skipper.minValue(level), bucketWidth);
+        long maxBucket = Math.floorDiv(skipper.maxValue(level), bucketWidth);
+
+        if (skipper.docCount(level) == totalDocsAtLevel && minBucket == maxBucket) {
+          // All docs at this level have a value, and all values map to the same bucket.
+          upToInclusive = skipper.maxDocID(level);
+          upToSameBucket = true;
+          upToBucketIndex = (int) (minBucket - this.leafMinBucket);
+        } else {
+          // Stop at the first level that does not qualify, keeping the state of the previous one.
+          break;
+        }
+      }
+    }
+
+    /**
+     * Counts {@code doc} into the dense per-leaf array. The doc's value is only read when the
+     * skipper could not establish that the current run of docs shares a single bucket.
+     */
+    @Override
+    public void collect(int doc) throws IOException {
+      if (upToInclusive < doc) {
+        // The cached skipper state no longer covers this doc.
+        advanceSkipper(doc);
+      }
+
+      if (upToSameBucket) {
+        // The bucket is already known for every doc of the current run, no value lookup needed.
+        counts[upToBucketIndex]++;
+        return;
+      }
+
+      if (values.advanceExact(doc) == false) {
+        // This doc has no value for the field.
+        return;
+      }
+      final int slot = (int) (Math.floorDiv(values.longValue(), bucketWidth) - leafMinBucket);
+      counts[slot]++;
+    }
+
+    /**
+     * Merges the dense per-leaf counts into the collector-wide hash map, then enforces the bucket
+     * limit on that map.
+     *
+     * <p>Slots that stayed at zero are skipped. Adding a zero increment would still insert the key
+     * into the map, which would report buckets that contain no matching doc (the naive collectors
+     * never do this) and would inflate the map size that is compared against {@code maxBuckets}.
+     *
+     * @throws IllegalStateException if the collector-wide map holds more than {@code maxBuckets}
+     *     buckets after the merge
+     */
+    @Override
+    public void finish() throws IOException {
+      // Put counts that we computed in the int[] back into the hash map.
+      for (int i = 0; i < counts.length; ++i) {
+        final int count = counts[i];
+        if (count != 0) {
+          collectorCounts.addTo(leafMinBucket + i, count);
+        }
+      }
+      checkMaxBuckets(collectorCounts.size(), maxBuckets);
+    }
+  }
+
+  /**
+   * Fails collection when more buckets were produced than allowed.
+   *
+   * @param size current number of distinct buckets
+   * @param maxBuckets configured limit; {@code size} may be equal to it but not greater
+   * @throws IllegalStateException if {@code size} is greater than {@code maxBuckets}
+   */
+  private static void checkMaxBuckets(int size, int maxBuckets) {
+    if (size <= maxBuckets) {
+      return;
+    }
+    final String message =
+        "Collected "
+            + size
+            + " buckets, which is more than the configured max number of buckets: "
+            + maxBuckets;
+    throw new IllegalStateException(message);
+  }
+}
diff --git a/.../src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollectorManager.java b/.../src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollectorManager.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.facet.plain.histograms;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Objects;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.internal.hppc.LongIntHashMap;
+import org.apache.lucene.internal.hppc.LongIntHashMap.LongIntCursor;
+import org.apache.lucene.search.CollectorManager;
+
+/**
+ * {@link CollectorManager} that computes a histogram of the distribution of the values of a field.
+ *
+ * <p>It takes a {@code bucketWidth} as a parameter and counts the number of documents that fall
+ * into intervals [0, bucketWidth), [bucketWidth, 2*bucketWidth), etc. The keys of the returned
+ * {@link LongIntHashMap} identify these intervals as the quotient of the integer division by {@code
+ * bucketWidth}. In other words, a key equal to {@code k} maps to values in the interval {@code [k *
+ * bucketWidth, (k+1) * bucketWidth)}.
+ *
+ * <p>This implementation is optimized for the case when {@code field} is part of the index sort and
+ * has a {@link FieldType#setDocValuesSkipIndexType skip index}.
+ *
+ * <p>Note: this collector is inspired by "YU, Muzhi, LIN, Zhaoxiang, SUN, Jinan, et al.
+ * TencentCLS: the cloud log service with high query performances. Proceedings of the VLDB
+ * Endowment, 2022, vol. 15, no 12, p. 3472-3482.", where the authors describe how they run
+ * "histogram queries" by sorting the index by timestamp and pre-computing ranges of doc IDs for
+ * every possible bucket.
+ */
+public final class HistogramCollectorManager
+    implements CollectorManager<HistogramCollector, LongIntHashMap> {
+
+  private static final int DEFAULT_MAX_BUCKETS = 1024;
+
+  private final String field;
+  private final long bucketWidth;
+  private final int maxBuckets;
+
+  /**
+   * Compute a histogram of the distribution of the values of the given {@code field} according to
+   * the given {@code bucketWidth}. The maximum number of buckets is set to the default of 1024.
+   *
+   * @param field name of the field to compute the histogram of, must not be null
+   * @param bucketWidth width of the buckets, must be at least 2
+   */
+  public HistogramCollectorManager(String field, long bucketWidth) {
+    this(field, bucketWidth, DEFAULT_MAX_BUCKETS);
+  }
+
+  /**
+   * Expert constructor.
+   *
+   * @param field name of the field to compute the histogram of, must not be null
+   * @param bucketWidth width of the buckets, must be at least 2
+   * @param maxBuckets Max allowed number of buckets, must be at least 1. Note that this is checked
+   *     at runtime and on a best-effort basis.
+   * @throws NullPointerException if {@code field} is null
+   * @throws IllegalArgumentException if {@code bucketWidth} is less than 2 or {@code maxBuckets}
+   *     is less than 1
+   */
+  public HistogramCollectorManager(String field, long bucketWidth, int maxBuckets) {
+    // Validate every argument first, in declaration order, then assign.
+    Objects.requireNonNull(field);
+    if (bucketWidth < 2) {
+      throw new IllegalArgumentException("bucketWidth must be at least 2, got: " + bucketWidth);
+    }
+    if (maxBuckets < 1) {
+      throw new IllegalArgumentException("maxBuckets must be at least 1, got: " + maxBuckets);
+    }
+    this.field = field;
+    this.bucketWidth = bucketWidth;
+    this.maxBuckets = maxBuckets;
+  }
+
+  /** Creates a collector with its own, initially empty histogram. */
+  @Override
+  public HistogramCollector newCollector() throws IOException {
+    return new HistogramCollector(field, bucketWidth, maxBuckets);
+  }
+
+  /**
+   * Sums the histograms of all collectors into a new map, key by key. The collectors' own maps are
+   * left untouched.
+   */
+  @Override
+  public LongIntHashMap reduce(Collection<HistogramCollector> collectors) throws IOException {
+    final LongIntHashMap merged = new LongIntHashMap();
+    for (HistogramCollector partialCollector : collectors) {
+      final LongIntHashMap partialCounts = partialCollector.getCounts();
+      for (LongIntCursor entry : partialCounts) {
+        merged.addTo(entry.key, entry.value);
+      }
+    }
+    return merged;
+  }
+}