Skip to content

Commit

Permalink
Modified the pick synonym algo; updated similar
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed Aug 27, 2019
1 parent a4531fc commit d8e3e05
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 65 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1210,17 +1210,22 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
final StringBuilder text = new StringBuilder();
SolrQueryRequest req = fp.getReq();
FixedBitSet toIgnore = null;
String[] fieldsToLoad;
String[] fieldsToLoad = toLoad.split(" ");

if (toLoad.equals("input")) {
if (toLoad.indexOf("input") > -1) {
text.append(input);
fieldsToLoad = new String[] {"abstract"};
if (toLoad.length() > 5) {
fieldsToLoad = toLoad.substring(toLoad.indexOf("input")+6).split(" ");
}
else {
fieldsToLoad = new String[] {"abstract"};
}
}
else {

fieldsToLoad = toLoad.split(" ");
QParser aqp = fp.subQuery(input, "aqp");
Query innerQuery = aqp.parse();
fieldsToLoad = toLoad.split(" ");

HashSet<String> docFields = new HashSet<String>();
for (String f: fieldsToLoad) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,17 +232,21 @@ private void pickSynonyms(List<QueryNode> termList, List<QueryNode> newList, Set
// positions, to guess what situation we are in

int equalLength = 0;
int fromShortToLongForm = 0;
int fromLongToShort = 0;
int userInputLen = 0;
int numTokens = 0;
int tokenLongerThanInput = 0;
int tokenShorterThanInput = 0;
int begin = 0;
int end = 0;
int len = 0;
String text;
FieldQueryNode maxTerm = null;
FieldQueryNode minTerm = null;
FieldQueryNode maxFreqTerm = null;
FieldQueryNode minFreqTerm = null;
FieldQueryNode closestLenTerm = null;
int termFreq;
int minFreqTerm = Integer.MAX_VALUE;
int maxFreqTerm = Integer.MIN_VALUE;
int minFreq = Integer.MAX_VALUE;
int maxFreq = Integer.MIN_VALUE;
Integer closestLen = null;

// first decide which of the scenarios applies: 1. xor 2.
for (QueryNode n: termList) {
Expand All @@ -256,54 +260,70 @@ private void pickSynonyms(List<QueryNode> termList, List<QueryNode> newList, Set
begin = termNode.getBegin();
end = termNode.getEnd();
text = termNode.getTextAsString();
len = text.length() - 5;
len = text.length() - (text.indexOf("::") + 2);
userInputLen += len;
numTokens++;

if (len > (end - begin)) {
fromShortToLongForm++;
// how many times the current token fits into the user input;
// anything below 1.0 means the current token is longer than
// what the user typed

float ratio = (float)(end-begin) / (float)len;

if (ratio == 1.0f) {
equalLength++;
}
else if (len < (end - begin)) {
fromLongToShort++;
else if (ratio < 1.2f) { // we give it a bit of slack
tokenLongerThanInput++;
}
else {
equalLength++;
tokenShorterThanInput++;
}

if (closestLen == null || Math.abs((end-begin)-len) < closestLen) {
closestLen = Math.abs((end-begin)-len);
closestLenTerm = termNode;
}


// careful, 0 means the term does not exist
termFreq = searcher.docFreq(new Term(termNode.getFieldAsString(), text));

// we'll ignore unknown terms
if (termFreq > 0) {
if (termFreq < minFreqTerm) {
minTerm = termNode;
minFreqTerm = termFreq;
if (termFreq < minFreq) {
minFreqTerm = termNode;
minFreq = termFreq;
}
else if (termFreq == minFreqTerm && text.length() > minTerm.getValue().length()) {
minTerm = termNode;
else if (termFreq == minFreq && text.length() > minFreqTerm.getValue().length()) {
minFreqTerm = termNode; // if same docfreq, pick longer ones
}

if (termFreq > maxFreqTerm) {
maxTerm = termNode;
maxFreqTerm = termFreq;
if (termFreq > maxFreq) {
maxFreqTerm = termNode;
maxFreq = termFreq;
}
else if (termFreq == maxFreqTerm && text.length() < minTerm.getValue().length()) {
maxTerm = termNode;
else if (termFreq == maxFreq && text.length() < minFreqTerm.getValue().length()) {
maxFreqTerm = termNode; // if same frequency, pick shorter one
}
}

}

String strategy = null;
if (fromShortToLongForm > fromLongToShort) {
strategy = "mostFrequent"; // pick the shortest
if (tokenLongerThanInput > tokenShorterThanInput) {
strategy = "mostFrequent"; // most tokens are longer than the input (i.e. the user typed an acronym)
// pick the shortest - i.e. the more frequent term
}
else if (fromLongToShort > fromShortToLongForm) {
strategy = "leastFrequent"; // pick the longest
else if (tokenShorterThanInput > tokenLongerThanInput) {
strategy = "leastFrequent"; // most tokens were equal or shorter than the user's input
// pick the longest - i.e. more specific term
}
else { // they were equal lengths
strategy = "cantDecide";
if (minTerm != null && maxTerm != null) {
int diffMax = Math.abs(len - (maxTerm.getEnd() - maxTerm.getBegin()));
int diffMin = Math.abs(len - (minTerm.getEnd() - minTerm.getBegin()));
if (minFreqTerm != null && maxFreqTerm != null) {
float diffMax = Math.abs((float)userInputLen/numTokens - ((FieldQueryNode) maxFreqTerm).getTextAsString().length());
float diffMin = Math.abs((float)userInputLen/numTokens - ((FieldQueryNode) minFreqTerm).getTextAsString().length());

if (diffMax < diffMin) { // longer term is closer to input
strategy = "leastFrequent";
Expand All @@ -316,38 +336,19 @@ else if (diffMin < diffMax) { // shorter term is closer to the user input length



int added = 0;
for (QueryNode n: termList) {
String t = (String) n.getTag(AqpAnalyzerQueryNodeProcessor.TYPE_ATTRIBUTE);
if (t != null && typesToKeep.contains(t)) {
if (strategy.equals("mostFrequent") && n.equals(maxTerm)) {
newList.add(n);
added += 1;
break;
}
else if (strategy.equals("leastFrequent") && n.equals(minTerm)) {
newList.add(n);
added += 1;
break;
}
else if (strategy.equals("cantDecide")) {
newList.add(n);
added += 1;
break;
}
}
if (strategy.equals("mostFrequent") && maxFreqTerm != null) {
newList.add(maxFreqTerm);
}
else if (strategy.equals("leastFrequent") && minFreqTerm != null) {
newList.add(minFreqTerm);
}
else if (strategy.equals("cantDecide")) {
newList.add(closestLenTerm);
}

if (added == 0) { // we didn't find any type that would satisfy the condition
if (strategy.equals("mostFrequent") && maxTerm != null) {
newList.add(maxTerm);
}
else if (strategy.equals("leastFrequent") && minTerm != null) {
newList.add(minTerm);
}
else if (strategy.equals("cantDecide")) {
newList.add(termList.get(0));
}

if (newList.size() == 0) { // we didn't find any type that would satisfy the condition
newList.add(termList.get(0));
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,28 @@ public void setUp() throws Exception {

public void testMultiTokens() throws Exception {

// make sure the correct synonym is picked in absence of docfreq info
assertQueryEquals(req("q", "title:(\"antidesitter spacetime\" application)",
"aqp.multiphrase.keep_one", "SYNONYM",
"aqp.multiphrase.fields", "title"),
"+(title:\"antidesitter spacetime\" | Synonym(title:syn::antidesitter spacetime)) +title:application",
BooleanQuery.class);

// now add some docfreq
assertU(adoc("id", "1000", "bibcode", "xxxxxxxxxx1000",
"title", "antidesitter spacetime application"));
assertU(adoc("id", "1001", "bibcode", "xxxxxxxxxx1001",
"title", "anti de sitter space application"));
assertU(adoc("id", "1002", "bibcode", "xxxxxxxxxx1002",
"title", "NASA ADS"));
assertU(commit());

assertQueryEquals(req("q", "title:(\"antidesitter spacetime\" application)",
"aqp.multiphrase.keep_one", "SYNONYM",
"aqp.multiphrase.fields", "title"),
"+(title:\"antidesitter spacetime\" | Synonym(title:syn::antidesitter spacetime)) +title:application",
BooleanQuery.class);


// for relevancy scoring we want to avoid double-counting
// so all of these below will use new aqp.multiphrase.keep parameter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -381,10 +381,10 @@ public void testSpecialCases() throws Exception {
"like:foo bar baz",
MoreLikeThisQuery.class);
// default docfreq=2, termfreq=2
assertQ(req("q", "similar(foo bar baz, input, 100, 100, 2, 2)"),
assertQ(req("q", "similar(foo bar baz, input abstract, 100, 100, 2, 2)"),
"//*[@numFound='0']");
// change defaults
assertQ(req("q", "similar(foo bar baz, input, 100, 100, 1, 1)"),
assertQ(req("q", "similar(foo bar baz, input abstract, 100, 100, 1, 1)"),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='2']");

Expand Down

0 comments on commit d8e3e05

Please sign in to comment.