Implement negative position search (#212)

* Parse integers surrounded by double quotes
* Expand getField to include more query types
* Add query type to perform negative position searches
* Use new query type for negative queries
* Add tests for negative position queries
* Add support for range queries & fix cache support
* Uncomment test query & add additional negative query
* Update doc comments to be more accurate
* Calculate field count once rather than twice
* Add sign to NUMBER lexer rule. This enables us to drop quotes around numbers in the `pos` operator.
* Delete unused constructor
* Handle matching start+end position case
* Update edge matching behavior. Also breaks all the test cases out into named tests.
* Remove outdated test
* Fix issue with positionIncrementGap
adsabs · May 16, 2024 · 85c935d · 85c935d
1 parent e60e0ab
commit 85c935d
Show file tree

Hide file tree

Showing 7 changed files with 424 additions and 10 deletions.
diff --git a/montysolr/src/main/antlr/ADS.g b/montysolr/src/main/antlr/ADS.g
@@ -504,7 +504,7 @@ DATE_TOKEN
 
 NUMBER  
   : 
-  INT+ ('.' INT+)?
+  // optional leading sign: a signed value such as -1 lexes as a single NUMBER token,
+  // so negative positions can be passed to `pos` without surrounding quotes
+  ('+'|'-')? INT+ ('.' INT+)?
   ;
 
 fragment M_NUMBER:  

diff --git a/...n/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java b/...n/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java
@@ -20,6 +20,7 @@
 import org.apache.lucene.search.SecondOrderCollector.FinalValueType;
 import org.apache.lucene.search.join.JoinUtil;
 import org.apache.lucene.search.join.ScoreMode;
+import org.apache.lucene.search.spans.SpanNegativeIndexRangeQuery;
 import org.apache.lucene.search.spans.SpanPositionRangeQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.util.FixedBitSet;
@@ -323,18 +324,17 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
                     throw new NestedParseException("Wrong number of arguments");
                 }
 
-                assert start > 0;
-                assert start <= end;
+                assert end != 0;
 
                 SpanConverter converter = new SpanConverter();
                 converter.setWrapNonConvertible(true);
 
                 // a field can have a different positionIncrementGap
                 int positionIncrementGap = 1;
+                String queryField = getField(query);
                 if (fp.getReq() != null) {
                     IndexSchema schema = fp.getReq().getSchema();
-                    String f = getField(query);
-                    SchemaField field = schema.getFieldOrNull(f);
+                    SchemaField field = schema.getFieldOrNull(queryField);
                     if (field != null) {
                         FieldType fType = field.getType();
                         //if (!fType.isMultiValued()) {
@@ -367,7 +367,12 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
                     throw ex;
                 }
 
-                query = new SpanPositionRangeQuery(spanQuery, (start - 1) * positionIncrementGap, end * positionIncrementGap); //lucene counts from zeroes
+                if (start < 0 || end < 0) {
+                    query = new SpanNegativeIndexRangeQuery(spanQuery, queryField, start, end, positionIncrementGap);
+                } else {
+                    query = new SpanPositionRangeQuery(spanQuery, (start - 1) * positionIncrementGap, end * positionIncrementGap); //lucene counts from zeroes
+                }
+
                 if (wrapConstant)
                     query = new ConstantScoreQuery(query);
                 if (boostFactor != 1.0f)
@@ -390,11 +395,32 @@ private String getField(Query query) throws SyntaxError {
                     }
 
                     if (s.size() > 1) {
-                        throw new SyntaxError("`pos` queries cannot handle boolean queries that span multiple fields, " + 
-                            "including virtual field queries. Try using a non-virtual field instead.");
+                        throw new SyntaxError("`pos` queries cannot handle boolean queries that span multiple fields, " +
+                                "including virtual field queries. Try using a non-virtual field instead.");
                     }
 
                     return (String) s.toArray()[0];
+                } else if (query instanceof BoostQuery) {
+                    return getField(((BoostQuery) query).getQuery());
+                } else if (query instanceof ConstantScoreQuery) {
+                    return getField(((ConstantScoreQuery) query).getQuery());
+                } else if (query instanceof MultiTermQuery) {
+                    return ((MultiTermQuery) query).getField();
+                } else if (query instanceof DisjunctionMaxQuery) {
+                    String field = null;
+
+                    for (Query q : ((DisjunctionMaxQuery) query).getDisjuncts()) {
+                        String f = getField(q);
+
+                        if (field == null) {
+                            field = f;
+                        } else if (!field.equals(f)) {
+                            throw new SyntaxError("`pos` queries cannot handle disjunction queries that span multiple fields, " +
+                                    "including virtual field queries. Try using a non-virtual field instead.");
+                        }
+                    }
+
+                    return field;
                 } else {
                     // last resort
                     return query.toString().split(":")[0];

diff --git a/montysolr/src/main/java/org/apache/lucene/search/spans/SpanNegativeIndexRangeQuery.java b/montysolr/src/main/java/org/apache/lucene/search/spans/SpanNegativeIndexRangeQuery.java
@@ -0,0 +1,127 @@
+package org.apache.lucene.search.spans;
+
+import org.apache.lucene.document.Document;
+
+/**
+ * Matches spans that are within a range of positions in the document. This query type supports negative indices,
+ * which are resolved per document against the number of entries the document has for the field.
+ */
+public class SpanNegativeIndexRangeQuery extends SpanPositionAndDocumentQuery {
+
+    // Field name is required to check the total number of entries the document has for that field
+    // This information is not included in the Spans object; it needs to come from the Document
+    protected String fieldName;
+    protected int startPosition;
+    protected int endPosition;
+    private final int positionIncrementGap;
+
+    /**
+     * <p>Constructs a query to perform range matches, where the range position may be negative.
+     * <p>In the case of negative positions, the negative position is added to the total field count
+     * for a given document.
+     * For example: if the field count is 5, and the position is -1, the effective position is 4.
+     * <p>A negative start that still resolves below zero is clipped to the beginning of the field for range
+     * searches; a negative end (or an exact position) that resolves below zero skips the entire document.
+     *
+     * @param match Potential span match iterator
+     * @param fieldName Field to search
+     * @param startPosition The start position of the range. Can be positive or negative, but never 0
+     * @param endPosition The end position of the range. Can be positive or negative, but never 0
+     * @param positionIncrementGap Position increment gap of the field; values greater than 1 cause raw span
+     *                             positions to be converted to entry indices before comparison
+     */
+    public SpanNegativeIndexRangeQuery(SpanQuery match, String fieldName, int startPosition, int endPosition,
+                                       int positionIncrementGap) {
+        super(match);
+
+        this.fieldName = fieldName;
+        this.startPosition = startPosition;
+        this.endPosition = endPosition;
+        this.positionIncrementGap = positionIncrementGap;
+    }
+
+    /**
+     * Decides whether the span the iterator is currently positioned on lies inside the requested range.
+     *
+     * @param spans Span iterator positioned on the candidate match
+     * @param currentDocument Document the candidate belongs to; used to count the entries of {@code fieldName}
+     * @return {@code YES} if the span is inside the range, {@code NO} if this span is rejected but later spans
+     *         of the same document may still match, {@code NO_MORE_IN_CURRENT_DOC} if no later span can match
+     */
+    @Override
+    protected FilterSpans.AcceptStatus acceptPosition(Spans spans, Document currentDocument) {
+        int count = currentDocument.getFields(fieldName).length;
+
+        // NOTE(review): positive bounds are compared as-is against zero-based span positions, while the
+        // positive-only `pos` path subtracts 1 from the start -- confirm mixed-sign ranges behave as intended.
+        int docStartPosition = startPosition;
+        if (startPosition < 0) {
+            // If the start index is still negative, clip it to the beginning of the field
+            // This is similar to Python, where if a = [1,2,3], then a[-10:] == [1,2,3] but a[:-10] == []
+            docStartPosition = Math.max(count + startPosition, 0);
+        }
+        int docEndPosition = endPosition;
+        if (endPosition < 0) {
+            docEndPosition = count + endPosition + 1;
+
+            // If the end position is still negative, there can be no matches in the document
+            // Consider what the user is asking for: "Give me all matches that are _not_ in the last N positions"
+            // If the user is asking for everything except the last 10 positions, and the field only has 5 positions,
+            // then there can be no matches.
+            if (docEndPosition < 0) {
+                return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC;
+            }
+        }
+
+        int spanStart = spans.startPosition();
+        int spanEnd = spans.endPosition();
+        if (positionIncrementGap > 1) {
+            // positionIncrementGap produces sequences of positions like f(n) = n*(positionIncrementGap + 2)
+            // e.g. 0, 102, 204, 306
+            // The end of each position is its start position +1
+            // It is NOT the next start position
+            // e.g. (start, end): (0, 1), (102, 103), (204, 205), (306, 307)
+            spanStart = spans.startPosition() / (positionIncrementGap + 2);
+            spanEnd = (spans.endPosition() - 1) / (positionIncrementGap + 2);
+            if (spanStart == spanEnd && spans.startPosition() != spans.endPosition())
+                spanEnd += 1;
+        }
+
+        if (endPosition == startPosition) {
+            // We are doing an exact position search
+
+            // The requested entry does not exist in this document (e.g. -10 with only 9 entries);
+            // clipping to 0 here would wrongly match the first entry
+            if (startPosition < 0 && count + startPosition < 0)
+                return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC;
+
+            // Already past the target; nothing later in this document can match
+            if (spanStart > docStartPosition)
+                return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC;
+
+            // Not there yet; a later span of the same document may still land on the target,
+            // so only this span is rejected
+            if (spanStart < docStartPosition)
+                return FilterSpans.AcceptStatus.NO;
+
+            return FilterSpans.AcceptStatus.YES;
+        }
+
+        // Too late; beyond the end position
+        if (spanStart > docEndPosition)
+            return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC;
+
+        if (spanEnd > docEndPosition)
+            return FilterSpans.AcceptStatus.NO;
+
+        // Too early; before the start position (spanEnd >= spanStart, so this also covers spans ending early)
+        if (spanStart < docStartPosition)
+            return FilterSpans.AcceptStatus.NO;
+
+        return FilterSpans.AcceptStatus.YES;
+    }
+
+    // Override required otherwise the queries may be cached incorrectly
+    @Override
+    public boolean equals(Object other) {
+        if (!sameClassAs(other))
+            return false;
+
+        SpanNegativeIndexRangeQuery otherQuery = (SpanNegativeIndexRangeQuery) other;
+        return super.equals(other)
+                && startPosition == otherQuery.startPosition
+                && endPosition == otherQuery.endPosition
+                && positionIncrementGap == otherQuery.positionIncrementGap
+                && fieldName.equals(otherQuery.fieldName);
+    }
+
+    // Kept in step with equals() so that queries differing only in range or field hash differently
+    @Override
+    public int hashCode() {
+        int result = super.hashCode();
+        result = 31 * result + fieldName.hashCode();
+        result = 31 * result + startPosition;
+        result = 31 * result + endPosition;
+        result = 31 * result + positionIncrementGap;
+        return result;
+    }
+
+    @Override
+    public String toString(String field) {
+        StringBuilder buffer = new StringBuilder();
+        buffer.append("spanNegativeIndexRange(");
+        buffer.append(match.toString(field));
+        buffer.append(", ");
+        buffer.append(startPosition);
+        buffer.append(", ");
+        buffer.append(endPosition);
+        buffer.append(")");
+        return buffer.toString();
+    }
+}
diff --git a/montysolr/src/main/java/org/apache/lucene/search/spans/SpanPositionAndDocumentQuery.java b/montysolr/src/main/java/org/apache/lucene/search/spans/SpanPositionAndDocumentQuery.java
@@ -0,0 +1,126 @@
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
+
+
+/**
+ * Base class for filtering a SpanQuery based on the position of a match and on the stored document
+ * the match belongs to.
+ **/
+public abstract class SpanPositionAndDocumentQuery extends SpanQuery implements Cloneable {
+    protected SpanQuery match;
+
+    /**
+     * @param match the SpanQuery whose matches are filtered; must not be null
+     */
+    public SpanPositionAndDocumentQuery(SpanQuery match) {
+        this.match = Objects.requireNonNull(match);
+    }
+
+    /**
+     * @return the SpanQuery whose matches are filtered.
+     *
+     * */
+    public SpanQuery getMatch() { return match; }
+
+    @Override
+    public String getField() { return match.getField(); }
+
+    /**
+     * Implementing classes are required to return whether the current position is a match for the passed in
+     * "match" {@link SpanQuery}.
+     *
+     * This is only called if the underlying last {@link Spans#nextStartPosition()} for the
+     * match indicated a valid start position.
+     *
+     * @param spans The {@link Spans} instance, positioned at the spot to check
+     * @param currentDocument The stored document the candidate span belongs to. The same instance is passed
+     *                        for every position checked within one document, so implementations must not
+     *                        modify it.
+     *
+     * @return whether the match is accepted, rejected, or rejected and should move to the next doc.
+     *
+     * @see Spans#nextDoc()
+     *
+     */
+    protected abstract AcceptStatus acceptPosition(Spans spans, Document currentDocument) throws IOException;
+
+    @Override
+    public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
+        SpanWeight matchWeight = match.createWeight(searcher, false, boost);
+        return new SpanPositionCheckWeight(matchWeight, searcher, needsScores ? getTermContexts(matchWeight) : null, boost);
+    }
+
+    /** Weight that delegates to the wrapped query's weight and filters its spans through {@code acceptPosition}. */
+    public class SpanPositionCheckWeight extends SpanWeight {
+
+        final SpanWeight matchWeight;
+
+        public SpanPositionCheckWeight(SpanWeight matchWeight, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
+            super(SpanPositionAndDocumentQuery.this, searcher, terms, boost);
+            this.matchWeight = matchWeight;
+        }
+
+        @Override
+        public void extractTerms(Set<Term> terms) {
+            matchWeight.extractTerms(terms);
+        }
+
+        @Override
+        public boolean isCacheable(LeafReaderContext ctx) {
+            return matchWeight.isCacheable(ctx);
+        }
+
+        @Override
+        public void extractTermContexts(Map<Term, TermContext> contexts) {
+            matchWeight.extractTermContexts(contexts);
+        }
+
+        @Override
+        public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException {
+            Spans matchSpans = matchWeight.getSpans(context, requiredPostings);
+            return (matchSpans == null) ? null : new FilterSpans(matchSpans) {
+                // accept() runs once per candidate position; remember the document loaded for the current
+                // docID so it is read from the index once per document rather than once per position
+                private int cachedDocId = -1;
+                private Document cachedDocument;
+
+                @Override
+                protected AcceptStatus accept(Spans candidate) throws IOException {
+                    int docId = candidate.docID();
+                    if (docId != cachedDocId) {
+                        // assign the document first so a failed load never leaves a stale id/document pair
+                        cachedDocument = context.reader().document(docId);
+                        cachedDocId = docId;
+                    }
+
+                    return acceptPosition(candidate, cachedDocument);
+                }
+            };
+        }
+
+    }
+
+    @Override
+    public Query rewrite(IndexReader reader) throws IOException {
+        SpanQuery rewritten = (SpanQuery) match.rewrite(reader);
+        if (rewritten != match) {
+            try {
+                // clone so subclass state is carried over; only the wrapped query is swapped
+                SpanPositionAndDocumentQuery clone = (SpanPositionAndDocumentQuery) this.clone();
+                clone.match = rewritten;
+                return clone;
+            } catch (CloneNotSupportedException e) {
+                throw new AssertionError(e);
+            }
+        }
+
+        return super.rewrite(reader);
+    }
+
+    /** Returns true iff <code>other</code> is equal to this. */
+    @Override
+    public boolean equals(Object other) {
+        return sameClassAs(other) &&
+                match.equals(((SpanPositionAndDocumentQuery) other).match);
+    }
+
+    @Override
+    public int hashCode() {
+        return classHash() ^ match.hashCode();
+    }
+}
+
diff --git a/montysolr/src/main/java/org/apache/solr/search/AqpFunctionQParser.java b/montysolr/src/main/java/org/apache/solr/search/AqpFunctionQParser.java
@@ -144,7 +144,11 @@ public String parseId() throws SyntaxError {
 
 
     public int parseInt() {
-        return Integer.valueOf(consumeAsString());
+        String val = consumeAsString();
+        // Strip one pair of surrounding double quotes, e.g. "-1" -> -1.
+        // The length check keeps an empty value or a lone quote character from failing inside
+        // charAt/substring; such input falls through and is rejected as a NumberFormatException below.
+        if (val.length() >= 2 && val.charAt(0) == '"' && val.charAt(val.length() - 1) == '"') {
+            val = val.substring(1, val.length() - 1);
+        }
+        // parseInt yields the primitive directly, avoiding the box/unbox round trip of Integer.valueOf
+        return Integer.parseInt(val);
     }
 
     public Float parseFloat() throws SyntaxError {
-Original file line number
+Diff line change
@@ Expand Up / @@ -504,7 +504,7 @@ DATE_TOKEN @@
     NUMBER
       :
-      INT+ ('.' INT+)?
+      ('+'|'-')? INT+ ('.' INT+)?
       ;
     fragment M_NUMBER:
@@ Expand Down @@