Modified the pick synonym algo; updated similar

adsabs · Aug 27, 2019 · d8e3e05 · d8e3e05
1 parent a4531fc
commit d8e3e05
Show file tree

Hide file tree

Showing 4 changed files with 93 additions and 65 deletions.
diff --git a/...c/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java b/...c/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java
@@ -1210,17 +1210,22 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
         final StringBuilder text = new StringBuilder();
         SolrQueryRequest req = fp.getReq();
         FixedBitSet toIgnore = null;
-        String[] fieldsToLoad;
+        String[] fieldsToLoad = toLoad.split(" ");
 
-        if (toLoad.equals("input")) {
+        if (toLoad.indexOf("input") > -1) {
           text.append(input);
-          fieldsToLoad = new String[] {"abstract"};
+          if (toLoad.length() > 5) {
+            fieldsToLoad = toLoad.substring(toLoad.indexOf("input")+6).split(" ");
+          }
+          else {
+            fieldsToLoad = new String[] {"abstract"};
+          }
         }
         else {
 
+          fieldsToLoad = toLoad.split(" ");
           QParser aqp = fp.subQuery(input, "aqp");
           Query innerQuery = aqp.parse();
-          fieldsToLoad = toLoad.split(" ");
 
           HashSet<String> docFields = new HashSet<String>();
           for (String f: fieldsToLoad) {

diff --git a/...rg/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java b/...rg/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java
@@ -232,17 +232,21 @@ private void pickSynonyms(List<QueryNode> termList, List<QueryNode> newList, Set
       // positions, to guess what situation we are in
 
       int equalLength = 0;
-      int fromShortToLongForm = 0;
-      int fromLongToShort = 0;
+      int userInputLen = 0;
+      int numTokens = 0;
+      int tokenLongerThanInput = 0;
+      int tokenShorterThanInput = 0;
       int begin = 0;
       int end = 0;
       int len = 0;
       String text;
-      FieldQueryNode maxTerm = null;
-      FieldQueryNode minTerm = null;
+      FieldQueryNode maxFreqTerm = null;
+      FieldQueryNode minFreqTerm = null;
+      FieldQueryNode closestLenTerm = null;
       int termFreq;
-      int minFreqTerm = Integer.MAX_VALUE;
-      int maxFreqTerm = Integer.MIN_VALUE;
+      int minFreq = Integer.MAX_VALUE;
+      int maxFreq = Integer.MIN_VALUE;
+      Integer closestLen = null;
 
       // first decide one scenarios 1. xor 2.
       for (QueryNode n: termList) {
@@ -256,54 +260,70 @@ private void pickSynonyms(List<QueryNode> termList, List<QueryNode> newList, Set
         begin = termNode.getBegin();
         end = termNode.getEnd();
         text = termNode.getTextAsString();
-        len = text.length() - 5;
+        len = text.length() - (text.indexOf("::") + 2);
+        userInputLen += len;
+        numTokens++;
 
-        if (len > (end - begin)) {
-          fromShortToLongForm++;
+        // how many times the current token fits into the user input
+        // anything below 1.0 means the current token is longer than 
+        // what user typed
+
+        float ratio = (float)(end-begin) / (float)len;
+
+        if (ratio == 1.0f) {
+          equalLength++;
         }
-        else if (len < (end - begin)) {
-          fromLongToShort++;
+        else if (ratio < 1.2f) { // we give it bit of slack
+          tokenLongerThanInput++;
         }
         else {
-          equalLength++;
+          tokenShorterThanInput++;
+        }
+
+        if (closestLen == null || Math.abs((end-begin)-len) < closestLen) {
+          closestLen = Math.abs((end-begin)-len);
+          closestLenTerm = termNode;
         }
 
+
         // careful, 0 means the term does not exist
         termFreq = searcher.docFreq(new Term(termNode.getFieldAsString(), text));
 
         // we'll ignore unknown terms
         if (termFreq > 0) {
-          if (termFreq < minFreqTerm) {
-            minTerm = termNode;
-            minFreqTerm = termFreq;
+          if (termFreq < minFreq) {
+            minFreqTerm = termNode;
+            minFreq = termFreq;
           }
-          else if (termFreq == minFreqTerm && text.length() > minTerm.getValue().length()) {
-            minTerm = termNode;
+          else if (termFreq == minFreq && text.length() > minFreqTerm.getValue().length()) {
+            minFreqTerm = termNode; // if same docfreq, pick longer ones
           }
 
-          if (termFreq > maxFreqTerm) {
-            maxTerm = termNode;
-            maxFreqTerm = termFreq;
+          if (termFreq > maxFreq) {
+            maxFreqTerm = termNode;
+            maxFreq = termFreq;
           }
-          else if (termFreq == maxFreqTerm && text.length() < minTerm.getValue().length()) {
-            maxTerm = termNode;
+          else if (termFreq == maxFreq && text.length() < minFreqTerm.getValue().length()) {
+            maxFreqTerm = termNode; // if same frequency, pick shorter one
           }
         }
 
       }
 
       String strategy = null;
-      if (fromShortToLongForm > fromLongToShort) {
-        strategy = "mostFrequent"; // pick the shortest
+      if (tokenLongerThanInput > tokenShorterThanInput) {
+        strategy = "mostFrequent"; // most tokens are longer than input (i.e. user typed acronym)
+                                   // pick the shortest - i.e. more frequent term
       }
-      else if (fromLongToShort > fromShortToLongForm) {
-        strategy = "leastFrequent"; // pick the longest
+      else if (tokenShorterThanInput > tokenLongerThanInput) {
+        strategy = "leastFrequent"; // most tokens were equal or shorter than the user's input
+                                    // pick the longest - i.e. more specific term
       }
       else { // they were equal lengths
         strategy = "cantDecide"; 
-        if (minTerm != null && maxTerm != null) {
-          int diffMax = Math.abs(len - (maxTerm.getEnd() - maxTerm.getBegin()));
-          int diffMin = Math.abs(len - (minTerm.getEnd() - minTerm.getBegin()));
+        if (minFreqTerm != null && maxFreqTerm != null) {
+          float diffMax = Math.abs((float)userInputLen/numTokens - ((FieldQueryNode) maxFreqTerm).getTextAsString().length());
+          float diffMin = Math.abs((float)userInputLen/numTokens - ((FieldQueryNode) minFreqTerm).getTextAsString().length());
 
           if (diffMax < diffMin) { // longer term is closer to input
             strategy = "leastFrequent";
@@ -316,38 +336,19 @@ else if (diffMin < diffMax) { // shorter term is closer to the user input length
 
 
 
-      int added = 0;
-      for (QueryNode n: termList) {
-        String t = (String) n.getTag(AqpAnalyzerQueryNodeProcessor.TYPE_ATTRIBUTE);
-        if (t != null && typesToKeep.contains(t)) {
-          if (strategy.equals("mostFrequent") && n.equals(maxTerm)) {
-            newList.add(n);
-            added += 1;
-            break;
-          }
-          else if (strategy.equals("leastFrequent") && n.equals(minTerm)) {
-            newList.add(n);
-            added += 1;
-            break;
-          }
-          else if (strategy.equals("cantDecide")) {
-            newList.add(n);
-            added += 1;
-            break;                
-          }
-        }
+      if (strategy.equals("mostFrequent") && maxFreqTerm != null) {
+        newList.add(maxFreqTerm);
+      }
+      else if (strategy.equals("leastFrequent") && minFreqTerm != null) {
+        newList.add(minFreqTerm);
+      }
+      else if (strategy.equals("cantDecide")) {
+        newList.add(closestLenTerm);
       }
 
-      if (added == 0) { // we didn't find any type that would satisfy the condition
-        if (strategy.equals("mostFrequent") && maxTerm != null) {
-          newList.add(maxTerm);
-        }
-        else if (strategy.equals("leastFrequent") && minTerm != null) {
-          newList.add(minTerm);
-        }
-        else if (strategy.equals("cantDecide")) {
-          newList.add(termList.get(0));                
-        }
+
+      if (newList.size() == 0) { // we didn't find any type that would satisfy the condition
+        newList.add(termList.get(0));                
       }
 
     }

diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java
@@ -274,6 +274,28 @@ public void setUp() throws Exception {
 
   public void testMultiTokens() throws Exception {
 
+    // make sure the correct synonym is picked in absence of docfreq info
+    assertQueryEquals(req("q", "title:(\"antidesitter spacetime\" application)",
+        "aqp.multiphrase.keep_one", "SYNONYM",
+        "aqp.multiphrase.fields", "title"),
+        "+(title:\"antidesitter spacetime\" | Synonym(title:syn::antidesitter spacetime)) +title:application",
+        BooleanQuery.class);
+
+    // now add some docfreq
+    assertU(adoc("id", "1000", "bibcode", "xxxxxxxxxx1000",
+        "title", "antidesitter spacetime application"));
+    assertU(adoc("id", "1001", "bibcode", "xxxxxxxxxx1001",
+        "title", "anti de sitter space application"));
+    assertU(adoc("id", "1002", "bibcode", "xxxxxxxxxx1002",
+        "title", "NASA ADS"));
+    assertU(commit());
+
+    assertQueryEquals(req("q", "title:(\"antidesitter spacetime\" application)",
+        "aqp.multiphrase.keep_one", "SYNONYM",
+        "aqp.multiphrase.fields", "title"),
+        "+(title:\"antidesitter spacetime\" | Synonym(title:syn::antidesitter spacetime)) +title:application",
+        BooleanQuery.class);
+
 
     // for relevancy scoring we want to avoid double-counting
     // so all of these below will use new aqp.multiphrase.keep parameter

diff --git a/contrib/adsabs/src/test/org/apache/solr/search/TestAqpAdsabsSolrSearch.java b/contrib/adsabs/src/test/org/apache/solr/search/TestAqpAdsabsSolrSearch.java
@@ -381,10 +381,10 @@ public void testSpecialCases() throws Exception {
           "like:foo bar baz",
           MoreLikeThisQuery.class);
       // default docfreq=2, termfreq=2
-      assertQ(req("q", "similar(foo bar baz, input, 100, 100, 2, 2)"),
+      assertQ(req("q", "similar(foo bar baz, input abstract, 100, 100, 2, 2)"),
           "//*[@numFound='0']");
       // change defaults
-      assertQ(req("q", "similar(foo bar baz, input, 100, 100, 1, 1)"),
+      assertQ(req("q", "similar(foo bar baz, input abstract, 100, 100, 1, 1)"),
           "//*[@numFound='1']",
           "//doc/str[@name='id'][.='2']");