-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement negative position search (#212)
* Parse integers surrounded by double quotes * Expand getField to include more query types * Add query type to perform negative position searches * Use new query type for negative queries * Add tests for negative position queries * Add support for range queries & fix cache support * Uncomment test query & add additional negative query * Update doc comments to be more accurate * Calculate field count once rather than twice * Add sign to NUMBER lexer rule This enables us to drop quotes around numbers in the `pos` operator. * Delete unused constructor * Handle matching start+end position case * Update edge matching behavior Also breaks all the testcases out into named tests * Remove outdated test * Fix issue with positionIncrementGap
- Loading branch information
Showing
7 changed files
with
424 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -504,7 +504,7 @@ DATE_TOKEN | |
|
||
NUMBER | ||
: | ||
INT+ ('.' INT+)? | ||
('+'|'-')? INT+ ('.' INT+)? | ||
; | ||
|
||
fragment M_NUMBER: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
127 changes: 127 additions & 0 deletions
127
montysolr/src/main/java/org/apache/lucene/search/spans/SpanNegativeIndexRangeQuery.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
package org.apache.lucene.search.spans; | ||
|
||
import org.apache.lucene.document.Document; | ||
|
||
/** | ||
* Matches spans that are within a range of positions in the document. This query type supports negative indices. | ||
*/ | ||
public class SpanNegativeIndexRangeQuery extends SpanPositionAndDocumentQuery { | ||
|
||
// Field name is required to check the total number of entries the document has for that field | ||
// This information is not included in the Spans object; it needs to come from the Document | ||
protected String fieldName; | ||
protected int startPosition; | ||
protected int endPosition; | ||
private final int positionIncrementGap; | ||
|
||
/** | ||
* <p>Constructs a query to perform range matches, where the range position may be negative. | ||
* <p>In the case of negative positions, the negative position is added to the total field count | ||
* for a given document. | ||
* For example: if the field count is 5, and the position is -1, the effective position is 4. | ||
* <p>If the effective position is negative, the entire document is skipped. | ||
* | ||
* @param match Potential span match iterator | ||
* @param fieldName Field to search | ||
* @param startPosition The start position of the range. Can be positive or negative, but never 0 | ||
* @param endPosition The end position of the range. Can be positive or negative, but never 0 | ||
*/ | ||
public SpanNegativeIndexRangeQuery(SpanQuery match, String fieldName, int startPosition, int endPosition, | ||
int positionIncrementGap) { | ||
super(match); | ||
|
||
this.fieldName = fieldName; | ||
this.startPosition = startPosition; | ||
this.endPosition = endPosition; | ||
this.positionIncrementGap = positionIncrementGap; | ||
} | ||
|
||
@Override | ||
protected FilterSpans.AcceptStatus acceptPosition(Spans spans, Document currentDocument) { | ||
int count = currentDocument.getFields(fieldName).length; | ||
int docStartPosition = startPosition; | ||
if (startPosition < 0) { | ||
docStartPosition = count + startPosition; | ||
|
||
// If the start index is still negative, clip it to the beginning of the field | ||
// This is similar to Python, where if a = [1,2,3], then a[-10:] == [1,2,3] but a[:-10] == [] | ||
if (docStartPosition < 0) { | ||
docStartPosition = 0; | ||
} | ||
} | ||
int docEndPosition = endPosition; | ||
if (endPosition < 0) { | ||
docEndPosition = count + endPosition + 1; | ||
|
||
// If the end position is still negative, there can be no matches in the document | ||
// Consider what the user is asking for: "Give me all matches that are _not_ in the last N positions" | ||
// If the user is asking for everything except the last 10 positions, and the field only has 5 positions, | ||
// then there can be no matches. | ||
if (docEndPosition < 0) { | ||
return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC; | ||
} | ||
} | ||
|
||
int spanStart = spans.startPosition(); | ||
int spanEnd = spans.endPosition(); | ||
if (positionIncrementGap > 1) { | ||
// positionIncrementGap produces sequences of positions like f(n) = n*(positionIncrementGap + 2) | ||
// e.g. 0, 102, 204, 306 | ||
// The end of each position is its start position +1 | ||
// It is NOT the next start position | ||
// e.g. (start, end): (0, 1), (102, 103), (204, 205), (306, 307) | ||
spanStart = spans.startPosition() / (positionIncrementGap + 2); | ||
spanEnd = (spans.endPosition() - 1) / (positionIncrementGap + 2); | ||
if (spanStart == spanEnd && spans.startPosition() != spans.endPosition()) | ||
spanEnd += 1; | ||
} | ||
|
||
if (endPosition != startPosition) { | ||
// Too late; beyond the end position | ||
if (spanStart > docEndPosition) | ||
return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC; | ||
|
||
if (spanEnd > docEndPosition) | ||
return FilterSpans.AcceptStatus.NO; | ||
|
||
// Too early; before the start position | ||
if (spanEnd < docStartPosition) | ||
return FilterSpans.AcceptStatus.NO; | ||
|
||
if (spanStart < docStartPosition) | ||
return FilterSpans.AcceptStatus.NO; | ||
} else { | ||
// We are doing an exact position search | ||
if (spanStart != docStartPosition) | ||
return FilterSpans.AcceptStatus.NO_MORE_IN_CURRENT_DOC; | ||
} | ||
|
||
return FilterSpans.AcceptStatus.YES; | ||
} | ||
|
||
// Override required otherwise the queries may be cached incorrectly | ||
@Override | ||
public boolean equals(Object other) { | ||
if (!sameClassAs(other)) | ||
return false; | ||
|
||
SpanNegativeIndexRangeQuery otherQuery = (SpanNegativeIndexRangeQuery) other; | ||
return super.equals(other) | ||
&& startPosition == otherQuery.startPosition | ||
&& endPosition == otherQuery.endPosition | ||
&& fieldName.equals(otherQuery.fieldName); | ||
} | ||
|
||
@Override | ||
public String toString(String field) { | ||
StringBuilder buffer = new StringBuilder(); | ||
buffer.append("spanNegativeIndexRange("); | ||
buffer.append(match.toString(field)); | ||
buffer.append(", "); | ||
buffer.append(startPosition); | ||
buffer.append(", "); | ||
buffer.append(endPosition); | ||
buffer.append(")"); | ||
return buffer.toString(); | ||
} | ||
} |
126 changes: 126 additions & 0 deletions
126
montysolr/src/main/java/org/apache/lucene/search/spans/SpanPositionAndDocumentQuery.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
package org.apache.lucene.search.spans; | ||
|
||
import java.io.IOException; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
import java.util.Set; | ||
|
||
import org.apache.lucene.document.Document; | ||
import org.apache.lucene.index.IndexReader; | ||
import org.apache.lucene.index.LeafReaderContext; | ||
import org.apache.lucene.index.Term; | ||
import org.apache.lucene.index.TermContext; | ||
import org.apache.lucene.search.IndexSearcher; | ||
import org.apache.lucene.search.Query; | ||
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus; | ||
|
||
|
||
/** | ||
* Base class for filtering a SpanQuery based on the position of a match. | ||
**/ | ||
public abstract class SpanPositionAndDocumentQuery extends SpanQuery implements Cloneable { | ||
protected SpanQuery match; | ||
|
||
public SpanPositionAndDocumentQuery(SpanQuery match) { | ||
this.match = Objects.requireNonNull(match); | ||
} | ||
|
||
/** | ||
* @return the SpanQuery whose matches are filtered. | ||
* | ||
* */ | ||
public SpanQuery getMatch() { return match; } | ||
|
||
@Override | ||
public String getField() { return match.getField(); } | ||
|
||
/** | ||
* Implementing classes are required to return whether the current position is a match for the passed in | ||
* "match" {@link SpanQuery}. | ||
* | ||
* This is only called if the underlying last {@link Spans#nextStartPosition()} for the | ||
* match indicated a valid start position. | ||
* | ||
* @param spans The {@link Spans} instance, positioned at the spot to check | ||
* | ||
* @return whether the match is accepted, rejected, or rejected and should move to the next doc. | ||
* | ||
* @see Spans#nextDoc() | ||
* | ||
*/ | ||
protected abstract AcceptStatus acceptPosition(Spans spans, Document currentDocument) throws IOException; | ||
|
||
@Override | ||
public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { | ||
SpanWeight matchWeight = match.createWeight(searcher, false, boost); | ||
return new SpanPositionCheckWeight(matchWeight, searcher, needsScores ? getTermContexts(matchWeight) : null, boost); | ||
} | ||
|
||
public class SpanPositionCheckWeight extends SpanWeight { | ||
|
||
final SpanWeight matchWeight; | ||
|
||
public SpanPositionCheckWeight(SpanWeight matchWeight, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException { | ||
super(SpanPositionAndDocumentQuery.this, searcher, terms, boost); | ||
this.matchWeight = matchWeight; | ||
} | ||
|
||
@Override | ||
public void extractTerms(Set<Term> terms) { | ||
matchWeight.extractTerms(terms); | ||
} | ||
|
||
@Override | ||
public boolean isCacheable(LeafReaderContext ctx) { | ||
return matchWeight.isCacheable(ctx); | ||
} | ||
|
||
@Override | ||
public void extractTermContexts(Map<Term, TermContext> contexts) { | ||
matchWeight.extractTermContexts(contexts); | ||
} | ||
|
||
@Override | ||
public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException { | ||
Spans matchSpans = matchWeight.getSpans(context, requiredPostings); | ||
return (matchSpans == null) ? null : new FilterSpans(matchSpans) { | ||
@Override | ||
protected AcceptStatus accept(Spans candidate) throws IOException { | ||
Document currentDocument = context.reader().document(candidate.docID()); | ||
|
||
return acceptPosition(candidate, currentDocument); | ||
} | ||
}; | ||
} | ||
|
||
} | ||
|
||
@Override | ||
public Query rewrite(IndexReader reader) throws IOException { | ||
SpanQuery rewritten = (SpanQuery) match.rewrite(reader); | ||
if (rewritten != match) { | ||
try { | ||
SpanPositionAndDocumentQuery clone = (SpanPositionAndDocumentQuery) this.clone(); | ||
clone.match = rewritten; | ||
return clone; | ||
} catch (CloneNotSupportedException e) { | ||
throw new AssertionError(e); | ||
} | ||
} | ||
|
||
return super.rewrite(reader); | ||
} | ||
|
||
/** Returns true iff <code>other</code> is equal to this. */ | ||
@Override | ||
public boolean equals(Object other) { | ||
return sameClassAs(other) && | ||
match.equals(((SpanPositionAndDocumentQuery) other).match); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return classHash() ^ match.hashCode(); | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.