Skip to content

Commit

Permalink
Implement negative position search (#212)
Browse files Browse the repository at this point in the history
* Parse integers surrounded by double quotes

* Expand getField to include more query types

* Add query type to perform negative position searches

* Use new query type for negative queries

* Add tests for negative position queries

* Add support for range queries & fix cache support

* Uncomment test query & add additional negative query

* Update doc comments to be more accurate

* Calculate field count once rather than twice

* Add sign to NUMBER lexer rule

This enables us to drop quotes around numbers in the `pos` operator.

* Delete unused constructor

* Handle matching start+end position case

* Update edge matching behavior

Also breaks all the testcases out into named tests

* Remove outdated test

* Fix issue with positionIncrementGap
  • Loading branch information
JCRPaquin authored May 16, 2024
1 parent e60e0ab commit 85c935d
Show file tree
Hide file tree
Showing 7 changed files with 424 additions and 10 deletions.
2 changes: 1 addition & 1 deletion montysolr/src/main/antlr/ADS.g
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ DATE_TOKEN

NUMBER
:
INT+ ('.' INT+)?
('+'|'-')? INT+ ('.' INT+)?
;

fragment M_NUMBER:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.apache.lucene.search.SecondOrderCollector.FinalValueType;
import org.apache.lucene.search.join.JoinUtil;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.spans.SpanNegativeIndexRangeQuery;
import org.apache.lucene.search.spans.SpanPositionRangeQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.FixedBitSet;
Expand Down Expand Up @@ -323,18 +324,17 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
throw new NestedParseException("Wrong number of arguments");
}

assert start > 0;
assert start <= end;
assert end != 0;

SpanConverter converter = new SpanConverter();
converter.setWrapNonConvertible(true);

// a field can have a different positionIncrementGap
int positionIncrementGap = 1;
String queryField = getField(query);
if (fp.getReq() != null) {
IndexSchema schema = fp.getReq().getSchema();
String f = getField(query);
SchemaField field = schema.getFieldOrNull(f);
SchemaField field = schema.getFieldOrNull(queryField);
if (field != null) {
FieldType fType = field.getType();
//if (!fType.isMultiValued()) {
Expand Down Expand Up @@ -367,7 +367,12 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
throw ex;
}

query = new SpanPositionRangeQuery(spanQuery, (start - 1) * positionIncrementGap, end * positionIncrementGap); //lucene counts from zeroes
if (start < 0 || end < 0) {
query = new SpanNegativeIndexRangeQuery(spanQuery, queryField, start, end, positionIncrementGap);
} else {
query = new SpanPositionRangeQuery(spanQuery, (start - 1) * positionIncrementGap, end * positionIncrementGap); //lucene counts from zeroes
}

if (wrapConstant)
query = new ConstantScoreQuery(query);
if (boostFactor != 1.0f)
Expand All @@ -390,11 +395,32 @@ private String getField(Query query) throws SyntaxError {
}

if (s.size() > 1) {
throw new SyntaxError("`pos` queries cannot handle boolean queries that span multiple fields, " +
"including virtual field queries. Try using a non-virtual field instead.");
throw new SyntaxError("`pos` queries cannot handle boolean queries that span multiple fields, " +
"including virtual field queries. Try using a non-virtual field instead.");
}

return (String) s.toArray()[0];
} else if (query instanceof BoostQuery) {
return getField(((BoostQuery) query).getQuery());
} else if (query instanceof ConstantScoreQuery) {
return getField(((ConstantScoreQuery) query).getQuery());
} else if (query instanceof MultiTermQuery) {
return ((MultiTermQuery) query).getField();
} else if (query instanceof DisjunctionMaxQuery) {
String field = null;

for (Query q : ((DisjunctionMaxQuery) query).getDisjuncts()) {
String f = getField(q);

if (field == null) {
field = f;
} else if (!field.equals(f)) {
throw new SyntaxError("`pos` queries cannot handle disjunction queries that span multiple fields, " +
"including virtual field queries. Try using a non-virtual field instead.");
}
}

return field;
} else {
// last resort
return query.toString().split(":")[0];
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package org.apache.lucene.search.spans;

import org.apache.lucene.document.Document;

/**
 * Matches spans that fall inside a range of positions of one field of a document. Either end of
 * the range may be negative, in which case it is counted back from the end of the field.
 */
public class SpanNegativeIndexRangeQuery extends SpanPositionAndDocumentQuery {

    // The field name is needed to count how many entries the document has for that field.
    // The Spans object does not carry this information; it has to come from the Document.
    protected String fieldName;
    protected int startPosition;
    protected int endPosition;
    private final int positionIncrementGap;

    /**
     * <p>Constructs a query to perform range matches, where the range position may be negative.
     * <p>A negative position is added to the number of entries the document has for the field.
     * For example: if the field count is 5, and the start position is -1, the effective start is 4.
     * <p>A negative start that still resolves below zero is clipped to the beginning of the field.
     * A negative end that resolves to a point before the first entry causes the whole document to
     * be skipped.
     *
     * @param match                Potential span match iterator
     * @param fieldName            Field to search
     * @param startPosition        The start position of the range. Can be positive or negative, but never 0
     * @param endPosition          The end position of the range. Can be positive or negative, but never 0
     * @param positionIncrementGap Position increment gap of the field; values greater than 1 make
     *                             raw span positions get mapped back to entry indices before comparing
     */
    public SpanNegativeIndexRangeQuery(SpanQuery match, String fieldName, int startPosition, int endPosition,
                                       int positionIncrementGap) {
        super(match);

        this.fieldName = fieldName;
        this.startPosition = startPosition;
        this.endPosition = endPosition;
        this.positionIncrementGap = positionIncrementGap;
    }

    /**
     * Decides whether the span the iterator is currently positioned on lies inside the requested
     * range, after resolving negative positions against the number of {@code fieldName} entries
     * found in {@code currentDocument}.
     *
     * @param spans           span iterator positioned on the candidate to check
     * @param currentDocument document used to count the entries of the field
     * @return {@code YES} when the span is inside the range, {@code NO} when this span is outside
     *         but a later one may still match, {@code NO_MORE_IN_CURRENT_DOC} when no later span
     *         of this document can match
     */
    @Override
    protected FilterSpans.AcceptStatus acceptPosition(Spans spans, Document currentDocument) {
        int count = currentDocument.getFields(fieldName).length;

        // NOTE(review): a positive startPosition is used as-is here, whereas the non-negative
        // code path elsewhere treats positive positions as 1-based. Confirm that mixing a
        // positive start with a negative end is not off by one.
        int docStartPosition = startPosition;
        if (startPosition < 0) {
            docStartPosition = count + startPosition;

            // If the start index is still negative, clip it to the beginning of the field
            // This is similar to Python, where if a = [1,2,3], then a[-10:] == [1,2,3] but a[:-10] == []
            if (docStartPosition < 0) {
                docStartPosition = 0;
            }
        }

        int docEndPosition = endPosition;
        if (endPosition < 0) {
            // +1 because the end is compared as an exclusive bound: -1 resolves to `count`
            docEndPosition = count + endPosition + 1;

            // An end bound at or before the start of the field leaves no room for any match.
            // Zero must be rejected as well: otherwise an exact query such as (-6, -6) on a field
            // with 5 entries would have its start clipped to 0 and wrongly accept the first entry.
            if (docEndPosition <= 0) {
                return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC;
            }
        }

        int spanStart = spans.startPosition();
        int spanEnd = spans.endPosition();
        if (positionIncrementGap > 1) {
            // positionIncrementGap produces sequences of positions like f(n) = n*(positionIncrementGap + 2)
            // e.g. 0, 102, 204, 306
            // The end of each position is its start position +1
            // It is NOT the next start position
            // e.g. (start, end): (0, 1), (102, 103), (204, 205), (306, 307)
            int stride = positionIncrementGap + 2;
            spanStart = spans.startPosition() / stride;
            spanEnd = (spans.endPosition() - 1) / stride;
            // A non-empty span inside a single entry must still end one past its start
            if (spanStart == spanEnd && spans.startPosition() != spans.endPosition()) {
                spanEnd += 1;
            }
        }

        if (endPosition != startPosition) {
            // Too late; beyond the end position
            if (spanStart > docEndPosition) {
                return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC;
            }

            if (spanEnd > docEndPosition) {
                return FilterSpans.AcceptStatus.NO;
            }

            // Too early; before the start position
            if (spanEnd < docStartPosition) {
                return FilterSpans.AcceptStatus.NO;
            }

            if (spanStart < docStartPosition) {
                return FilterSpans.AcceptStatus.NO;
            }
        } else {
            // We are doing an exact position search.
            // Spans arrive ordered by start position, so a span before the target only rules out
            // itself; the document can be abandoned only once the target has been passed.
            if (spanStart < docStartPosition) {
                return FilterSpans.AcceptStatus.NO;
            }
            if (spanStart > docStartPosition) {
                return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC;
            }
        }

        return FilterSpans.AcceptStatus.YES;
    }

    // Override required otherwise the queries may be cached incorrectly
    @Override
    public boolean equals(Object other) {
        if (!sameClassAs(other)) {
            return false;
        }

        SpanNegativeIndexRangeQuery otherQuery = (SpanNegativeIndexRangeQuery) other;
        return super.equals(other)
                && startPosition == otherQuery.startPosition
                && endPosition == otherQuery.endPosition
                && positionIncrementGap == otherQuery.positionIncrementGap
                && java.util.Objects.equals(fieldName, otherQuery.fieldName);
    }

    // Kept in step with equals() so that queries differing only in range do not share a hash
    @Override
    public int hashCode() {
        int result = super.hashCode();
        result = 31 * result + startPosition;
        result = 31 * result + endPosition;
        result = 31 * result + positionIncrementGap;
        result = 31 * result + java.util.Objects.hashCode(fieldName);
        return result;
    }

    @Override
    public String toString(String field) {
        StringBuilder buffer = new StringBuilder();
        buffer.append("spanNegativeIndexRange(");
        buffer.append(match.toString(field));
        buffer.append(", ");
        buffer.append(startPosition);
        buffer.append(", ");
        buffer.append(endPosition);
        buffer.append(")");
        return buffer.toString();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package org.apache.lucene.search.spans;

import java.io.IOException;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;


/**
 * Base class for filtering a SpanQuery based on the position of a match, with access to the
 * document the match belongs to.
 **/
public abstract class SpanPositionAndDocumentQuery extends SpanQuery implements Cloneable {
    protected SpanQuery match;

    /**
     * @param match the SpanQuery whose matches are filtered; must not be null
     */
    public SpanPositionAndDocumentQuery(SpanQuery match) {
        this.match = Objects.requireNonNull(match);
    }

    /**
     * @return the SpanQuery whose matches are filtered.
     */
    public SpanQuery getMatch() {
        return match;
    }

    @Override
    public String getField() {
        return match.getField();
    }

    /**
     * Implementing classes are required to return whether the current position is a match for the passed in
     * "match" {@link SpanQuery}.
     *
     * This is only called if the underlying last {@link Spans#nextStartPosition()} for the
     * match indicated a valid start position.
     *
     * @param spans The {@link Spans} instance, positioned at the spot to check
     * @param currentDocument The document loaded from the leaf reader for the doc id the
     *                        {@code spans} instance is currently positioned on
     *
     * @return whether the match is accepted, rejected, or rejected and should move to the next doc.
     *
     * @see Spans#nextDoc()
     */
    protected abstract AcceptStatus acceptPosition(Spans spans, Document currentDocument) throws IOException;

    @Override
    public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
        SpanWeight matchWeight = match.createWeight(searcher, false, boost);
        return new SpanPositionCheckWeight(matchWeight, searcher, needsScores ? getTermContexts(matchWeight) : null, boost);
    }

    /**
     * Weight that delegates to the wrapped query's weight and filters its spans through
     * {@link #acceptPosition(Spans, Document)}.
     */
    public class SpanPositionCheckWeight extends SpanWeight {

        final SpanWeight matchWeight;

        public SpanPositionCheckWeight(SpanWeight matchWeight, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
            super(SpanPositionAndDocumentQuery.this, searcher, terms, boost);
            this.matchWeight = matchWeight;
        }

        @Override
        public void extractTerms(Set<Term> terms) {
            matchWeight.extractTerms(terms);
        }

        @Override
        public boolean isCacheable(LeafReaderContext ctx) {
            return matchWeight.isCacheable(ctx);
        }

        @Override
        public void extractTermContexts(Map<Term, TermContext> contexts) {
            matchWeight.extractTermContexts(contexts);
        }

        /**
         * Wraps the spans of the underlying weight so that each candidate is checked by
         * {@link #acceptPosition(Spans, Document)}. Returns null when the underlying weight has
         * no spans for this leaf.
         */
        @Override
        public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException {
            Spans matchSpans = matchWeight.getSpans(context, requiredPostings);
            if (matchSpans == null) {
                return null;
            }

            return new FilterSpans(matchSpans) {
                // A document usually yields several candidate spans in a row; remember the last
                // loaded document so it is fetched from the reader once per doc id rather than
                // once per span.
                private int cachedDocId = -1;
                private Document cachedDocument;

                @Override
                protected AcceptStatus accept(Spans candidate) throws IOException {
                    int docId = candidate.docID();
                    if (docId != cachedDocId) {
                        // Assign the document first: if loading throws, the cache stays consistent
                        cachedDocument = context.reader().document(docId);
                        cachedDocId = docId;
                    }

                    return acceptPosition(candidate, cachedDocument);
                }
            };
        }

    }

    /**
     * Rewrites the wrapped query; when that produces a different query, returns a clone of this
     * query wrapping the rewritten one, otherwise defers to the superclass.
     */
    @Override
    public Query rewrite(IndexReader reader) throws IOException {
        SpanQuery rewritten = (SpanQuery) match.rewrite(reader);
        if (rewritten != match) {
            try {
                SpanPositionAndDocumentQuery clone = (SpanPositionAndDocumentQuery) this.clone();
                clone.match = rewritten;
                return clone;
            } catch (CloneNotSupportedException e) {
                // Cannot happen: this class implements Cloneable
                throw new AssertionError(e);
            }
        }

        return super.rewrite(reader);
    }

    /** Returns true iff <code>other</code> is equal to this. */
    @Override
    public boolean equals(Object other) {
        return sameClassAs(other) &&
                match.equals(((SpanPositionAndDocumentQuery) other).match);
    }

    @Override
    public int hashCode() {
        return classHash() ^ match.hashCode();
    }
}

Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,11 @@ public String parseId() throws SyntaxError {


/**
 * Reads the next value via {@code consumeAsString()} and parses it as an integer. A value
 * wrapped in one pair of double quotes (e.g. {@code "-1"}) has the quotes removed first.
 *
 * @return the parsed integer
 * @throws NumberFormatException if the (unquoted) value is not a valid integer
 */
public int parseInt() {
    String val = consumeAsString();
    // The length check keeps an empty value or a lone quote character from causing a
    // StringIndexOutOfBoundsException; such input now fails as a NumberFormatException below.
    if (val.length() >= 2 && val.charAt(0) == '"' && val.charAt(val.length() - 1) == '"') {
        val = val.substring(1, val.length() - 1);
    }
    return Integer.parseInt(val);
}

public Float parseFloat() throws SyntaxError {
Expand Down
Loading

0 comments on commit 85c935d

Please sign in to comment.