diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java b/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java
index c6e79b1cae..363459feee 100644
--- a/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java
+++ b/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java
@@ -1348,18 +1348,12 @@ public void handle(HttpExchange httpExchange) throws IOException {
return Pair.makePair("".getBytes(), null);
}
- CoreNLPProtos.SemgrexResponse.Builder responseBuilder = CoreNLPProtos.SemgrexResponse.newBuilder();
- int sentenceIdx = 0;
- for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
- SemanticGraph graph = sentence.get(dependenciesType.annotation());
- CoreNLPProtos.SemgrexResponse.GraphResult.Builder graphResultBuilder = CoreNLPProtos.SemgrexResponse.GraphResult.newBuilder();
- graphResultBuilder.addResult(ProcessSemgrexRequest.matchSentence(regex, graph, 0, sentenceIdx));
- responseBuilder.addResult(graphResultBuilder.build());
- ++sentenceIdx;
- }
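+        // Hand the single compiled pattern plus all sentences to ProcessSemgrexRequest so that
+        // batch postprocessing (such as the new uniq operator) is applied over the whole document.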
+        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
+        List<SemgrexPattern> patterns = Collections.singletonList(regex);
+        CoreNLPProtos.SemgrexResponse semgrexResponse = ProcessSemgrexRequest.processRequest(sentences, patterns);
ByteArrayOutputStream os = new ByteArrayOutputStream();
- responseBuilder.build().writeTo(os);
+ semgrexResponse.writeTo(os);
os.close();
return Pair.makePair(os.toByteArray(), doc);
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ProcessSemgrexRequest.java b/src/edu/stanford/nlp/semgraph/semgrex/ProcessSemgrexRequest.java
index 9e0cc9e801..c072e2b430 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/ProcessSemgrexRequest.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/ProcessSemgrexRequest.java
@@ -10,26 +10,31 @@
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
+import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
+import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer;
import edu.stanford.nlp.pipeline.CoreNLPProtos;
import edu.stanford.nlp.semgraph.SemanticGraph;
+import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
+import edu.stanford.nlp.util.ArrayCoreMap;
+import edu.stanford.nlp.util.CoreMap;
+import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.ProcessProtobufRequest;
public class ProcessSemgrexRequest extends ProcessProtobufRequest {
/**
* Builds a single inner SemgrexResult structure from the pair of a SemgrexPattern and a SemanticGraph
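+   * The matches for that pattern and graph are now precomputed by the caller and passed in.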
*/
- public static CoreNLPProtos.SemgrexResponse.SemgrexResult matchSentence(SemgrexPattern pattern, SemanticGraph graph, int patternIdx, int graphIdx) {
+  public static CoreNLPProtos.SemgrexResponse.SemgrexResult matchSentence(SemgrexPattern pattern, SemanticGraph graph, List<SemgrexMatch> matches, int patternIdx, int graphIdx) {
CoreNLPProtos.SemgrexResponse.SemgrexResult.Builder semgrexResultBuilder = CoreNLPProtos.SemgrexResponse.SemgrexResult.newBuilder();
- SemgrexMatcher matcher = pattern.matcher(graph);
- while (matcher.find()) {
+ for (SemgrexMatch matcher : matches) {
CoreNLPProtos.SemgrexResponse.Match.Builder matchBuilder = CoreNLPProtos.SemgrexResponse.Match.newBuilder();
matchBuilder.setMatchIndex(matcher.getMatch().index());
matchBuilder.setSemgrexIndex(patternIdx);
@@ -74,6 +79,38 @@ public static CoreNLPProtos.SemgrexResponse.SemgrexResult matchSentence(SemgrexP
return semgrexResultBuilder.build();
}
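+  /**
+   * Runs each of the supplied Semgrex patterns over each of the supplied sentences.
+   * Each sentence is expected to be a CoreMap carrying at least a
+   * BasicDependenciesAnnotation; the patterns are matched in a single batch via
+   * matchSentences so that pattern-level postprocessing (such as uniq) sees the
+   * results for all sentences at once.
+   */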
+  public static CoreNLPProtos.SemgrexResponse processRequest(List<CoreMap> sentences, List<SemgrexPattern> patterns) {
+ CoreNLPProtos.SemgrexResponse.Builder responseBuilder = CoreNLPProtos.SemgrexResponse.newBuilder();
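+    // For each sentence, collect (pattern, matches-in-that-sentence) pairs so the
+    // response can be built per sentence and per pattern below.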
+    List<Pair<CoreMap, List<Pair<SemgrexPattern, List<SemgrexMatch>>>>> allMatches = new ArrayList<>();
+ for (CoreMap sentence : sentences) {
+ allMatches.add(new Pair<>(sentence, new ArrayList<>()));
+ }
+ for (SemgrexPattern pattern : patterns) {
+      List<Pair<CoreMap, List<SemgrexMatch>>> patternMatches = pattern.matchSentences(sentences, true);
+ for (int i = 0; i < sentences.size(); ++i) {
+        Pair<CoreMap, List<SemgrexMatch>> sentenceMatches = patternMatches.get(i);
+ allMatches.get(i).second().add(new Pair<>(pattern, sentenceMatches.second()));
+ }
+ }
+
+ int graphIdx = 0;
+    for (Pair<CoreMap, List<Pair<SemgrexPattern, List<SemgrexMatch>>>> sentenceMatches : allMatches) {
+ CoreNLPProtos.SemgrexResponse.GraphResult.Builder graphResultBuilder = CoreNLPProtos.SemgrexResponse.GraphResult.newBuilder();
+
+ int patternIdx = 0;
+ SemanticGraph graph = sentenceMatches.first().get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
+      for (Pair<SemgrexPattern, List<SemgrexMatch>> patternMatches : sentenceMatches.second()) {
+ SemgrexPattern pattern = patternMatches.first();
+ graphResultBuilder.addResult(matchSentence(pattern, graph, patternMatches.second(), patternIdx, graphIdx));
+ ++patternIdx;
+ }
+
+ responseBuilder.addResult(graphResultBuilder.build());
+ ++graphIdx;
+ }
+ return responseBuilder.build();
+ }
+
/**
* For a single request, iterate through the SemanticGraphs it
* includes, and add the results of each Semgrex operation included
@@ -81,13 +118,9 @@ public static CoreNLPProtos.SemgrexResponse.SemgrexResult matchSentence(SemgrexP
*/
public static CoreNLPProtos.SemgrexResponse processRequest(CoreNLPProtos.SemgrexRequest request) {
ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer();
- CoreNLPProtos.SemgrexResponse.Builder responseBuilder = CoreNLPProtos.SemgrexResponse.newBuilder();
-    List<SemgrexPattern> patterns = request.getSemgrexList().stream().map(SemgrexPattern::compile).collect(Collectors.toList());
- int graphIdx = 0;
+    List<CoreMap> sentences = new ArrayList<>();
for (CoreNLPProtos.SemgrexRequest.Dependencies sentence : request.getQueryList()) {
- CoreNLPProtos.SemgrexResponse.GraphResult.Builder graphResultBuilder = CoreNLPProtos.SemgrexResponse.GraphResult.newBuilder();
-
      final List<CoreLabel> tokens;
if (sentence.getGraph().getTokenList().size() > 0) {
tokens = sentence.getGraph().getTokenList().stream().map(serializer::fromProto).collect(Collectors.toList());
@@ -95,16 +128,14 @@ public static CoreNLPProtos.SemgrexResponse processRequest(CoreNLPProtos.Semgrex
tokens = sentence.getTokenList().stream().map(serializer::fromProto).collect(Collectors.toList());
}
SemanticGraph graph = ProtobufAnnotationSerializer.fromProto(sentence.getGraph(), tokens, "semgrex");
- int patternIdx = 0;
- for (SemgrexPattern pattern : patterns) {
- graphResultBuilder.addResult(matchSentence(pattern, graph, patternIdx, graphIdx));
- ++patternIdx;
- }
-
- responseBuilder.addResult(graphResultBuilder.build());
- ++graphIdx;
+ CoreMap coremap = new ArrayCoreMap();
+ coremap.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
+ coremap.set(CoreAnnotations.TokensAnnotation.class, tokens);
+ sentences.add(coremap);
}
- return responseBuilder.build();
+
+    List<SemgrexPattern> patterns = request.getSemgrexList().stream().map(SemgrexPattern::compile).collect(Collectors.toList());
+ return processRequest(sentences, patterns);
}
/**
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexMatch.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexMatch.java
index f3d3236c2c..9de2c72d97 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexMatch.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexMatch.java
@@ -65,6 +65,22 @@ public Set<String> getNodeNames() {
return namesToNodes.keySet();
}
+  public Set<String> getRelationNames() {
+ return namesToRelations.keySet();
+ }
+
+ public String getRelnString(String name) {
+ return namesToRelations.get(name);
+ }
+
+  public Set<String> getEdgeNames() {
+ return namesToEdges.keySet();
+ }
+
+ public SemanticGraphEdge getEdge(String name) {
+ return namesToEdges.get(name);
+ }
+
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append(matchedPattern);
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java
index a7e346f661..32ae449640 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java
@@ -51,6 +51,9 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
Token reverse = null;
  List<SemgrexPattern> children = new ArrayList<SemgrexPattern>();
Token startToken = null;
+
+  List<String> uniqKeys = null;
+ Token nextIdentifier = null;
// a local variable
// start from 1 since we haven't parsed anything yet
@@ -59,13 +62,12 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
case ALIGNRELN:{
reverse = jj_consume_token(ALIGNRELN);
node = SubNode(GraphRelation.ALIGNED_ROOT);
- jj_consume_token(9);
break;
}
- case 11:
- case 15:
+ case 13:
case 17:
- case 26:{
+ case 19:
+ case 28:{
node = SubNode(GraphRelation.ROOT);
children.add(node);
label_1:
@@ -83,7 +85,6 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
node = SubNode(GraphRelation.ITERATOR);
children.add(node);
}
- jj_consume_token(9);
break;
}
default:
@@ -92,36 +93,73 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
throw new ParseException();
}
if (children.size() > 1)
- node = new CoordinationPattern(true, children, true, true);
- if (deprecatedAmp) {
- {if (true) throw new SemgrexParseException("Use of & in semgrex patterns is now illegal. It is equivalent to the same expression without the &. Offending expression: " + startToken);}
- }
- if (deprecatedNodeConj) {
- {if (true) throw new SemgrexParseException("Use of node conjugation (expressions such as '< [foo bar]' or '< [foo & bar]') is now illegal. The issue is that expressions such as '[foo bar] < zzz' may intuitively mean that foo < zzz, bar < zzz, zzz the same for both cases, but that is not the way the parser interpreted this expression. Changing the functionality might break existing expressions, and anyway this can be rewritten in various ways such as 'zzz > foo > bar' or 'foo < zzz=a : bar < zzz=a'. Offending expression: " + startToken);}
+ node = new CoordinationPattern(true, children, true, true);
+ if (deprecatedAmp) {
+ {if (true) throw new SemgrexParseException("Use of & in semgrex patterns is now illegal. It is equivalent to the same expression without the &. Offending expression: " + startToken);}
+ }
+ if (deprecatedNodeConj) {
+ {if (true) throw new SemgrexParseException("Use of node conjugation (expressions such as '< [foo bar]' or '< [foo & bar]') is now illegal. The issue is that expressions such as '[foo bar] < zzz' may intuitively mean that foo < zzz, bar < zzz, zzz the same for both cases, but that is not the way the parser interpreted this expression. Changing the functionality might break existing expressions, and anyway this can be rewritten in various ways such as 'zzz > foo > bar' or 'foo < zzz=a : bar < zzz=a'. Offending expression: " + startToken);}
+ }
+ switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case 11:{
+ jj_consume_token(11);
+ jj_consume_token(UNIQ);
+uniqKeys = new ArrayList<>();
+ label_2:
+ while (true) {
+ switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
+ case IDENTIFIER:{
+ ;
+ break;
+ }
+ default:
+ jj_la1[2] = jj_gen;
+ break label_2;
+ }
+ nextIdentifier = identifier();
+uniqKeys.add(nextIdentifier.image);
+ }
+for (String key : uniqKeys) {
+ if (!knownVariables.contains(key)) {
+ {if (true) throw new SemgrexParseException("Semgrex pattern asked for uniq of node " + key + " which does not exist in the pattern");}
+ }
+ }
+ // TODO: can error check that the keys are unique between node and edge names
+ // that might require keeping edge names in a known set
+ // TODO: edge names might need some upgrades anyway - shouldn't name them under negation, for example
+ node = new UniqPattern(node, uniqKeys);
+ break;
+ }
+ default:
+ jj_la1[3] = jj_gen;
+ ;
}
- {if ("" != null) return node;}
+ jj_consume_token(12);
+{if ("" != null) return node;}
throw new Error("Missing return statement in function");
}
final public SemgrexPattern SubNode(GraphRelation r) throws ParseException {SemgrexPattern result = null;
SemgrexPattern child = null;
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 11:{
- jj_consume_token(11);
+ case 13:{
+ jj_consume_token(13);
result = SubNode(r);
- jj_consume_token(12);
+ jj_consume_token(14);
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case RELATION:
case ALIGNRELN:
case IDENTIFIER:
- case 15:
- case 16:
- case 17:{
+ case 17:
+ case 18:
+ case 19:{
child = RelationDisj();
break;
}
default:
- jj_la1[2] = jj_gen;
+ jj_la1[4] = jj_gen;
;
}
if (child != null) {
@@ -133,22 +171,23 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
{if ("" != null) return result;}
break;
}
- case 15:
case 17:
- case 26:{
+ case 19:
+ case 28:{
result = ModNode(r);
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case RELATION:
case ALIGNRELN:
case IDENTIFIER:
- case 15:
- case 16:
- case 17:{
+ case 17:
+ case 18:
+ case 19:{
child = RelationDisj();
break;
}
default:
- jj_la1[3] = jj_gen;
+ jj_la1[5] = jj_gen;
;
}
if (child != null) result.setChild(child);
@@ -156,7 +195,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[4] = jj_gen;
+ jj_la1[6] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -167,18 +206,18 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
  List<SemgrexPattern> children = new ArrayList<SemgrexPattern>();
child = RelationConj();
children.add(child);
- label_2:
+ label_3:
while (true) {
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 13:{
+ case 15:{
;
break;
}
default:
- jj_la1[5] = jj_gen;
- break label_2;
+ jj_la1[7] = jj_gen;
+ break label_3;
}
- jj_consume_token(13);
+ jj_consume_token(15);
child = RelationConj();
children.add(child);
}
@@ -194,31 +233,32 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
  List<SemgrexPattern> children = new ArrayList<SemgrexPattern>();
child = ModRelation();
children.add(child);
- label_3:
+ label_4:
while (true) {
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case RELATION:
case ALIGNRELN:
case IDENTIFIER:
- case 14:
- case 15:
case 16:
- case 17:{
+ case 17:
+ case 18:
+ case 19:{
;
break;
}
default:
- jj_la1[6] = jj_gen;
- break label_3;
+ jj_la1[8] = jj_gen;
+ break label_4;
}
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 14:{
- jj_consume_token(14);
+ case 16:{
+ jj_consume_token(16);
deprecatedAmp = true;
break;
}
default:
- jj_la1[7] = jj_gen;
+ jj_la1[9] = jj_gen;
;
}
child = ModRelation();
@@ -235,15 +275,16 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
final public SemgrexPattern ModRelation() throws ParseException {SemgrexPattern child;
boolean startUnderNeg;
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case RELATION:
case ALIGNRELN:
case IDENTIFIER:
- case 17:{
+ case 19:{
child = RelChild();
break;
}
- case 15:{
- jj_consume_token(15);
+ case 17:{
+ jj_consume_token(17);
startUnderNeg = underNegation;
underNegation = true;
child = RelChild();
@@ -251,14 +292,14 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
child.negate();
break;
}
- case 16:{
- jj_consume_token(16);
+ case 18:{
+ jj_consume_token(18);
child = RelChild();
child.makeOptional();
break;
}
default:
- jj_la1[8] = jj_gen;
+ jj_la1[10] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -268,12 +309,13 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
final public SemgrexPattern RelChild() throws ParseException {SemgrexPattern child;
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 17:{
- jj_consume_token(17);
+ case 19:{
+ jj_consume_token(19);
child = RelationDisj();
- jj_consume_token(18);
+ jj_consume_token(20);
break;
}
+ case UNIQ:
case RELATION:
case ALIGNRELN:
case IDENTIFIER:{
@@ -281,7 +323,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[9] = jj_gen;
+ jj_la1[11] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -299,34 +341,38 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
SemgrexPattern node;
boolean pC = false;
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case RELATION:
case IDENTIFIER:{
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:{
- numArg = jj_consume_token(IDENTIFIER);
+ numArg = identifier();
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 19:{
- jj_consume_token(19);
- numArg2 = jj_consume_token(IDENTIFIER);
+ case 21:{
+ jj_consume_token(21);
+ numArg2 = identifier();
break;
}
default:
- jj_la1[10] = jj_gen;
+ jj_la1[12] = jj_gen;
;
}
break;
}
default:
- jj_la1[11] = jj_gen;
+ jj_la1[13] = jj_gen;
;
}
rel = jj_consume_token(RELATION);
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:
case REGEX:{
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:{
- relnType = jj_consume_token(IDENTIFIER);
+ relnType = identifier();
break;
}
case REGEX:{
@@ -334,34 +380,34 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[12] = jj_gen;
+ jj_la1[14] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
break;
}
default:
- jj_la1[13] = jj_gen;
+ jj_la1[15] = jj_gen;
;
}
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 20:{
- jj_consume_token(20);
- name = jj_consume_token(IDENTIFIER);
+ case 22:{
+ jj_consume_token(22);
+ name = identifier();
break;
}
default:
- jj_la1[14] = jj_gen;
+ jj_la1[16] = jj_gen;
;
}
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 21:{
- jj_consume_token(21);
- edgeName = jj_consume_token(IDENTIFIER);
+ case 23:{
+ jj_consume_token(23);
+ edgeName = identifier();
break;
}
default:
- jj_la1[15] = jj_gen;
+ jj_la1[17] = jj_gen;
;
}
break;
@@ -371,7 +417,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[16] = jj_gen;
+ jj_la1[18] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -395,20 +441,20 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
edgeName != null ? edgeName.image : null);
}
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 15:
case 17:
- case 26:{
+ case 19:
+ case 28:{
node = ModNode(reln);
break;
}
- case 11:{
- jj_consume_token(11);
+ case 13:{
+ jj_consume_token(13);
node = SubNode(reln);
- jj_consume_token(12);
+ jj_consume_token(14);
break;
}
default:
- jj_la1[17] = jj_gen;
+ jj_la1[19] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -418,25 +464,25 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
final public SemgrexPattern NodeDisj(GraphRelation r) throws ParseException {SemgrexPattern child;
  List<SemgrexPattern> children = new ArrayList<SemgrexPattern>();
- jj_consume_token(17);
+ jj_consume_token(19);
child = NodeConj(r);
children.add(child);
- label_4:
+ label_5:
while (true) {
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 13:{
+ case 15:{
;
break;
}
default:
- jj_la1[18] = jj_gen;
- break label_4;
+ jj_la1[20] = jj_gen;
+ break label_5;
}
- jj_consume_token(13);
+ jj_consume_token(15);
child = NodeConj(r);
children.add(child);
}
- jj_consume_token(18);
+ jj_consume_token(20);
if (children.size() == 1)
{if ("" != null) return child;}
else
@@ -448,27 +494,27 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
  List<SemgrexPattern> children = new ArrayList<SemgrexPattern>();
child = ModNode(r);
children.add(child);
- label_5:
+ label_6:
while (true) {
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 14:
- case 15:
+ case 16:
case 17:
- case 26:{
+ case 19:
+ case 28:{
;
break;
}
default:
- jj_la1[19] = jj_gen;
- break label_5;
+ jj_la1[21] = jj_gen;
+ break label_6;
}
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 14:{
- jj_consume_token(14);
+ case 16:{
+ jj_consume_token(16);
break;
}
default:
- jj_la1[20] = jj_gen;
+ jj_la1[22] = jj_gen;
;
}
child = ModNode(r);
@@ -484,21 +530,21 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
final public SemgrexPattern ModNode(GraphRelation r) throws ParseException {SemgrexPattern child;
boolean startUnderNeg;
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 17:
- case 26:{
+ case 19:
+ case 28:{
child = Child(r);
break;
}
- case 15:{
- jj_consume_token(15);
-startUnderNeg = underNodeNegation;
+ case 17:{
+ jj_consume_token(17);
+startUnderNeg = underNodeNegation; // TODO: can negations be nested? If so, should they cancel?
underNodeNegation = true;
child = Child(r);
underNodeNegation = startUnderNeg;
break;
}
default:
- jj_la1[21] = jj_gen;
+ jj_la1[23] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -508,16 +554,16 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
final public SemgrexPattern Child(GraphRelation r) throws ParseException {SemgrexPattern child;
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 17:{
+ case 19:{
child = NodeDisj(r);
break;
}
- case 26:{
+ case 28:{
child = Description(r);
break;
}
default:
- jj_la1[22] = jj_gen;
+ jj_la1[24] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -531,28 +577,30 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
Token attrType = null;
boolean negated = false;
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:{
- attr = jj_consume_token(IDENTIFIER);
+ attr = identifier();
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
case 10:
- case 22:{
+ case 24:{
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
case 10:{
attrType = jj_consume_token(10);
break;
}
- case 22:{
- attrType = jj_consume_token(22);
+ case 24:{
+ attrType = jj_consume_token(24);
break;
}
default:
- jj_la1[23] = jj_gen;
+ jj_la1[25] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:{
- value = jj_consume_token(IDENTIFIER);
+ value = identifier();
break;
}
case REGEX:{
@@ -560,7 +608,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[24] = jj_gen;
+ jj_la1[26] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -570,11 +618,12 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
}
break;
}
- case 23:{
- jj_consume_token(23);
+ case 25:{
+ jj_consume_token(25);
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:{
- key = jj_consume_token(IDENTIFIER);
+ key = identifier();
break;
}
case REGEX:{
@@ -582,7 +631,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[25] = jj_gen;
+ jj_la1[27] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -591,18 +640,19 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
attrType = jj_consume_token(10);
break;
}
- case 22:{
- attrType = jj_consume_token(22);
+ case 24:{
+ attrType = jj_consume_token(24);
break;
}
default:
- jj_la1[26] = jj_gen;
+ jj_la1[28] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:{
- value = jj_consume_token(IDENTIFIER);
+ value = identifier();
break;
}
case REGEX:{
@@ -610,7 +660,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[27] = jj_gen;
+ jj_la1[29] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -620,21 +670,22 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
}
negated = attrType.image.equals("!:");
attributes.addContains(attr.image, key.image, value.image, negated);
- label_6:
+ label_7:
while (true) {
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 24:{
+ case 26:{
;
break;
}
default:
- jj_la1[28] = jj_gen;
- break label_6;
+ jj_la1[30] = jj_gen;
+ break label_7;
}
- jj_consume_token(24);
+ jj_consume_token(26);
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:{
- key = jj_consume_token(IDENTIFIER);
+ key = identifier();
break;
}
case REGEX:{
@@ -642,7 +693,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[29] = jj_gen;
+ jj_la1[31] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -651,18 +702,19 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
attrType = jj_consume_token(10);
break;
}
- case 22:{
- attrType = jj_consume_token(22);
+ case 24:{
+ attrType = jj_consume_token(24);
break;
}
default:
- jj_la1[30] = jj_gen;
+ jj_la1[32] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:{
- value = jj_consume_token(IDENTIFIER);
+ value = identifier();
break;
}
case REGEX:{
@@ -670,7 +722,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[31] = jj_gen;
+ jj_la1[33] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -681,11 +733,11 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
negated = attrType.image.equals("!:");
attributes.addContains(attr.image, key.image, value.image, negated);
}
- jj_consume_token(25);
+ jj_consume_token(27);
break;
}
default:
- jj_la1[32] = jj_gen;
+ jj_la1[34] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -702,7 +754,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[33] = jj_gen;
+ jj_la1[35] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
@@ -712,38 +764,39 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
boolean link = false;
NodeAttributes attributes = new NodeAttributes();
NodePattern pat;
- jj_consume_token(26);
+ jj_consume_token(28);
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:
case IDENTIFIER:
case EMPTY:
case ROOT:{
AddAttribute(attributes);
- label_7:
+ label_8:
while (true) {
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 24:{
+ case 26:{
;
break;
}
default:
- jj_la1[34] = jj_gen;
- break label_7;
+ jj_la1[36] = jj_gen;
+ break label_8;
}
- jj_consume_token(24);
+ jj_consume_token(26);
AddAttribute(attributes);
}
break;
}
default:
- jj_la1[35] = jj_gen;
+ jj_la1[37] = jj_gen;
;
}
- jj_consume_token(25);
+ jj_consume_token(27);
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
- case 21:{
- jj_consume_token(21);
+ case 23:{
+ jj_consume_token(23);
link = true;
- name = jj_consume_token(IDENTIFIER);
+ name = identifier();
String nodeName = name.image;
if (underNegation) {
if (!knownVariables.contains(nodeName)) {
@@ -755,7 +808,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
break;
}
default:
- jj_la1[36] = jj_gen;
+ jj_la1[38] = jj_gen;
;
}
pat = new NodePattern(r, underNodeNegation, attributes, link, name != null ? name.image : null);
@@ -763,6 +816,25 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
throw new Error("Missing return statement in function");
}
+ final public Token identifier() throws ParseException {Token t ;
+ switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
+ case UNIQ:{
+ t = jj_consume_token(UNIQ);
+ break;
+ }
+ case IDENTIFIER:{
+ t = jj_consume_token(IDENTIFIER);
+ break;
+ }
+ default:
+ jj_la1[39] = jj_gen;
+ jj_consume_token(-1);
+ throw new ParseException();
+ }
+{if ("" != null) return t;}
+ throw new Error("Missing return statement in function");
+}
+
/** Generated Token Manager. */
public SemgrexParserTokenManager token_source;
SimpleCharStream jj_input_stream;
@@ -772,13 +844,13 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
public Token jj_nt;
private int jj_ntk;
private int jj_gen;
- final private int[] jj_la1 = new int[37];
+ final private int[] jj_la1 = new int[40];
static private int[] jj_la1_0;
static {
jj_la1_init_0();
}
private static void jj_la1_init_0() {
- jj_la1_0 = new int[] {0x400,0x4028808,0x3801c,0x3801c,0x4028800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x4028800,0x2000,0x402c000,0x4000,0x4028000,0x4020000,0x400400,0x110,0x110,0x400400,0x110,0x1000000,0x110,0x400400,0x110,0xc00400,0xd0,0x1000000,0xd0,0x200000,};
+ jj_la1_0 = new int[] {0x400,0x100a2010,0x24,0x800,0xe003c,0xe003c,0x100a2000,0x8000,0xf003c,0x10000,0xe003c,0x8003c,0x200000,0x24,0x224,0x224,0x400000,0x800000,0x3c,0x100a2000,0x8000,0x100b0000,0x10000,0x100a0000,0x10080000,0x1000400,0x224,0x224,0x1000400,0x224,0x4000000,0x224,0x1000400,0x224,0x3000400,0x1a4,0x4000000,0x1a4,0x800000,0x24,};
}
/** Constructor with InputStream. */
@@ -792,7 +864,7 @@ public SemgrexParser(java.io.InputStream stream, String encoding) {
token = new Token();
jj_ntk = -1;
jj_gen = 0;
- for (int i = 0; i < 37; i++) jj_la1[i] = -1;
+ for (int i = 0; i < 40; i++) jj_la1[i] = -1;
}
/** Reinitialise. */
@@ -806,7 +878,7 @@ public void ReInit(java.io.InputStream stream, String encoding) {
token = new Token();
jj_ntk = -1;
jj_gen = 0;
- for (int i = 0; i < 37; i++) jj_la1[i] = -1;
+ for (int i = 0; i < 40; i++) jj_la1[i] = -1;
}
/** Constructor. */
@@ -816,7 +888,7 @@ public SemgrexParser(java.io.Reader stream) {
token = new Token();
jj_ntk = -1;
jj_gen = 0;
- for (int i = 0; i < 37; i++) jj_la1[i] = -1;
+ for (int i = 0; i < 40; i++) jj_la1[i] = -1;
}
/** Reinitialise. */
@@ -834,7 +906,7 @@ public void ReInit(java.io.Reader stream) {
token = new Token();
jj_ntk = -1;
jj_gen = 0;
- for (int i = 0; i < 37; i++) jj_la1[i] = -1;
+ for (int i = 0; i < 40; i++) jj_la1[i] = -1;
}
/** Constructor with generated Token Manager. */
@@ -843,7 +915,7 @@ public SemgrexParser(SemgrexParserTokenManager tm) {
token = new Token();
jj_ntk = -1;
jj_gen = 0;
- for (int i = 0; i < 37; i++) jj_la1[i] = -1;
+ for (int i = 0; i < 40; i++) jj_la1[i] = -1;
}
/** Reinitialise. */
@@ -852,7 +924,7 @@ public void ReInit(SemgrexParserTokenManager tm) {
token = new Token();
jj_ntk = -1;
jj_gen = 0;
- for (int i = 0; i < 37; i++) jj_la1[i] = -1;
+ for (int i = 0; i < 40; i++) jj_la1[i] = -1;
}
private Token jj_consume_token(int kind) throws ParseException {
@@ -903,12 +975,12 @@ private int jj_ntk_f() {
/** Generate ParseException. */
public ParseException generateParseException() {
jj_expentries.clear();
- boolean[] la1tokens = new boolean[27];
+ boolean[] la1tokens = new boolean[29];
if (jj_kind >= 0) {
la1tokens[jj_kind] = true;
jj_kind = -1;
}
- for (int i = 0; i < 37; i++) {
+ for (int i = 0; i < 40; i++) {
if (jj_la1[i] == jj_gen) {
for (int j = 0; j < 32; j++) {
          if ((jj_la1_0[i] & (1<<j)) != 0) {
            la1tokens[j] = true;
          }
}
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj
--- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj
+++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj
+TOKEN:
+{
+ < UNIQ: "uniq" >
+}
+
TOKEN:
{
< RELATION: "<" | ">" | ">>" | "<<" | "<>" | "==" | "$+" | "$-" | "$++" | "$--" | "." | ".." | "-" | "--" | ">++" | ">--" | "<++" | "<--" >
@@ -77,6 +82,9 @@ SemgrexPattern Root() : {
Token reverse = null;
  List<SemgrexPattern> children = new ArrayList<SemgrexPattern>();
Token startToken = null;
+
+  List<String> uniqKeys = null;
+ Token nextIdentifier = null;
// a local variable
} {
{
@@ -84,22 +92,44 @@ SemgrexPattern Root() : {
startToken = getToken(1);
}
(
-    (reverse = <ALIGNRELN> node = SubNode(GraphRelation.ALIGNED_ROOT) "\n")
- |
- ( node = SubNode(GraphRelation.ROOT) { children.add(node); }
- ( ":" node = SubNode(GraphRelation.ITERATOR) { children.add(node); } )*
- "\n"
+ (
+      (reverse = <ALIGNRELN> node = SubNode(GraphRelation.ALIGNED_ROOT))
+ |
+ ( node = SubNode(GraphRelation.ROOT) { children.add(node); }
+ ( ":" node = SubNode(GraphRelation.ITERATOR) { children.add(node); } )*
+ )
)
+ {
+ if (children.size() > 1)
+ node = new CoordinationPattern(true, children, true, true);
+ if (deprecatedAmp) {
+ throw new SemgrexParseException("Use of & in semgrex patterns is now illegal. It is equivalent to the same expression without the &. Offending expression: " + startToken);
+ }
+ if (deprecatedNodeConj) {
+ throw new SemgrexParseException("Use of node conjugation (expressions such as '< [foo bar]' or '< [foo & bar]') is now illegal. The issue is that expressions such as '[foo bar] < zzz' may intuitively mean that foo < zzz, bar < zzz, zzz the same for both cases, but that is not the way the parser interpreted this expression. Changing the functionality might break existing expressions, and anyway this can be rewritten in various ways such as 'zzz > foo > bar' or 'foo < zzz=a : bar < zzz=a'. Offending expression: " + startToken);
+ }
+ }
+ )
+ (
+ (
+ "::" { uniqKeys = new ArrayList<>(); } (nextIdentifier = identifier() { uniqKeys.add(nextIdentifier.image); })*
+ {
+ for (String key : uniqKeys) {
+ if (!knownVariables.contains(key)) {
+ throw new SemgrexParseException("Semgrex pattern asked for uniq of node " + key + " which does not exist in the pattern");
+ }
+ }
+ // TODO: can error check that the keys are unique between node and edge names
+ // that might require keeping edge names in a known set
+ // TODO: edge names might need some upgrades anyway - shouldn't name them under negation, for example
+ node = new UniqPattern(node, uniqKeys);
+ }
+ )?
+ )
+ (
+ "\n"
)
{
- if (children.size() > 1)
- node = new CoordinationPattern(true, children, true, true);
- if (deprecatedAmp) {
- throw new SemgrexParseException("Use of & in semgrex patterns is now illegal. It is equivalent to the same expression without the &. Offending expression: " + startToken);
- }
- if (deprecatedNodeConj) {
- throw new SemgrexParseException("Use of node conjugation (expressions such as '< [foo bar]' or '< [foo & bar]') is now illegal. The issue is that expressions such as '[foo bar] < zzz' may intuitively mean that foo < zzz, bar < zzz, zzz the same for both cases, but that is not the way the parser interpreted this expression. Changing the functionality might break existing expressions, and anyway this can be rewritten in various ways such as 'zzz > foo > bar' or 'foo < zzz=a : bar < zzz=a'. Offending expression: " + startToken);
- }
return node;
}
@@ -185,10 +215,10 @@ SemgrexPattern Relation() : {
boolean pC = false;
} {
(
-  ( ( ( (numArg = <IDENTIFIER> ("," numArg2 = <IDENTIFIER>)?)? rel = <RELATION>
-      (relnType = <IDENTIFIER> | relnType = <REGEX>)?)
-    (( ("~") name = <IDENTIFIER> ) )?
-    (( ("=") edgeName = <IDENTIFIER> ) )? )
+  ( ( ( (numArg = identifier() ("," numArg2 = identifier())?)? rel = <RELATION>
+      (relnType = identifier() | relnType = <REGEX>)?)
+ (( ("~") name = identifier() ) )?
+ (( ("=") edgeName = identifier() ) )? )
  |  (rel = <ALIGNRELN>))
{
@@ -252,7 +282,7 @@ SemgrexPattern ModNode(GraphRelation r) : {
} {
( child = Child(r)
| ( "!"
- { startUnderNeg = underNodeNegation;
+ { startUnderNeg = underNodeNegation; // TODO: can negations be nested? If so, should they cancel?
underNodeNegation = true; } child = Child(r) { underNodeNegation = startUnderNeg; } )
)
{
@@ -276,8 +306,8 @@ void AddAttribute(NodeAttributes attributes) : {
Token attrType = null;
boolean negated = false;
} {
-  (attr = <IDENTIFIER>
-   (( (attrType = ":" | attrType = "!:") (value = <IDENTIFIER> | value = <REGEX>) {
+  (attr = identifier()
+   (( (attrType = ":" | attrType = "!:") (value = identifier() | value = <REGEX>) {
if (attr != null && value != null) {
negated = attrType.image.equals("!:");
attributes.setAttribute(attr.image, value.image, negated);
@@ -285,7 +315,7 @@ void AddAttribute(NodeAttributes attributes) : {
})
|
( ":{"
-   ((key = <IDENTIFIER> | key = <REGEX>) (attrType = ":" | attrType = "!:") (value = <IDENTIFIER> | value = <REGEX>)
+   ((key = identifier() | key = <REGEX>) (attrType = ":" | attrType = "!:") (value = identifier() | value = <REGEX>)
{
if (attr == null || key == null || value == null) {
throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr +
@@ -294,7 +324,7 @@ void AddAttribute(NodeAttributes attributes) : {
negated = attrType.image.equals("!:");
attributes.addContains(attr.image, key.image, value.image, negated);
})
-   ( ";" (key = <IDENTIFIER> | key = <REGEX>) (attrType = ":" | attrType = "!:") (value = <IDENTIFIER> | value = <REGEX>)
+   ( ";" (key = identifier() | key = <REGEX>) (attrType = ":" | attrType = "!:") (value = identifier() | value = <REGEX>)
{
if (attr == null || key == null || value == null) {
throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr +
@@ -321,7 +351,7 @@ NodePattern Description(GraphRelation r) : {
( "{" ( AddAttribute(attributes)
(";" AddAttribute(attributes))* )? "}"
-  (( ("=" { link = true; }) name = <IDENTIFIER> )
+ (( ("=" { link = true; }) name = identifier() )
{
String nodeName = name.image;
if (underNegation) {
@@ -338,3 +368,10 @@ NodePattern Description(GraphRelation r) : {
return pat;
}
}
+
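+// Now that "uniq" is a token of its own, this production accepts either UNIQ or
+// IDENTIFIER wherever a plain identifier used to be expected, so patterns that
+// use "uniq" as a node name or attribute value still parse as before.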
+Token identifier() : {
+ Token t ;
+}
+{
+ ( t = < UNIQ > | t = < IDENTIFIER > ) { return t; }
+}
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java
index 891073b9ff..a50ff802c7 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java
@@ -13,19 +13,21 @@ interface SemgrexParserConstants {
/** RegularExpression Id. */
int WHITESPACE = 1;
/** RegularExpression Id. */
- int RELATION = 2;
+ int UNIQ = 2;
/** RegularExpression Id. */
- int ALIGNRELN = 3;
+ int RELATION = 3;
/** RegularExpression Id. */
- int IDENTIFIER = 4;
+ int ALIGNRELN = 4;
/** RegularExpression Id. */
- int NUMBER = 5;
+ int IDENTIFIER = 5;
/** RegularExpression Id. */
- int EMPTY = 6;
+ int NUMBER = 6;
/** RegularExpression Id. */
- int ROOT = 7;
+ int EMPTY = 7;
/** RegularExpression Id. */
- int REGEX = 8;
+ int ROOT = 8;
+ /** RegularExpression Id. */
+ int REGEX = 9;
/** Lexical state. */
int DEFAULT = 0;
@@ -34,6 +36,7 @@ interface SemgrexParserConstants {
String[] tokenImage = {
"",
"",
+ "\"uniq\"",
"",
"\"@\"",
"",
@@ -41,8 +44,9 @@ interface SemgrexParserConstants {
"\"#\"",
"\"$\"",
"",
- "\"\\n\"",
"\":\"",
+ "\"::\"",
+ "\"\\n\"",
"\"(\"",
"\")\"",
"\"|\"",
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java
index fae081ad7a..2013aa3cc6 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java
@@ -22,11 +22,32 @@ private final int jjStopStringLiteralDfa_0(int pos, long active0){
switch (pos)
{
case 0:
- if ((active0 & 0x200000L) != 0L)
+ if ((active0 & 0x800000L) != 0L)
return 2;
- if ((active0 & 0x80L) != 0L)
+ if ((active0 & 0x4L) != 0L)
+ {
+ jjmatchedKind = 5;
+ return 8;
+ }
+ if ((active0 & 0x100L) != 0L)
return 25;
return -1;
+ case 1:
+ if ((active0 & 0x4L) != 0L)
+ {
+ jjmatchedKind = 5;
+ jjmatchedPos = 1;
+ return 8;
+ }
+ return -1;
+ case 2:
+ if ((active0 & 0x4L) != 0L)
+ {
+ jjmatchedKind = 5;
+ jjmatchedPos = 2;
+ return 8;
+ }
+ return -1;
default :
return -1;
}
@@ -44,45 +65,47 @@ private int jjMoveStringLiteralDfa0_0(){
switch(curChar)
{
case 10:
- return jjStopAtPos(0, 9);
+ return jjStopAtPos(0, 12);
case 33:
- jjmatchedKind = 15;
- return jjMoveStringLiteralDfa1_0(0x400000L);
+ jjmatchedKind = 17;
+ return jjMoveStringLiteralDfa1_0(0x1000000L);
case 35:
- return jjStopAtPos(0, 6);
+ return jjStopAtPos(0, 7);
case 36:
- return jjStartNfaWithStates_0(0, 7, 25);
+ return jjStartNfaWithStates_0(0, 8, 25);
case 38:
- return jjStopAtPos(0, 14);
+ return jjStopAtPos(0, 16);
case 40:
- return jjStopAtPos(0, 11);
+ return jjStopAtPos(0, 13);
case 41:
- return jjStopAtPos(0, 12);
+ return jjStopAtPos(0, 14);
case 44:
- return jjStopAtPos(0, 19);
+ return jjStopAtPos(0, 21);
case 58:
jjmatchedKind = 10;
- return jjMoveStringLiteralDfa1_0(0x800000L);
+ return jjMoveStringLiteralDfa1_0(0x2000800L);
case 59:
- return jjStopAtPos(0, 24);
+ return jjStopAtPos(0, 26);
case 61:
- return jjStartNfaWithStates_0(0, 21, 2);
+ return jjStartNfaWithStates_0(0, 23, 2);
case 63:
- return jjStopAtPos(0, 16);
+ return jjStopAtPos(0, 18);
case 64:
- return jjStopAtPos(0, 3);
+ return jjStopAtPos(0, 4);
case 91:
- return jjStopAtPos(0, 17);
+ return jjStopAtPos(0, 19);
case 93:
- return jjStopAtPos(0, 18);
+ return jjStopAtPos(0, 20);
+ case 117:
+ return jjMoveStringLiteralDfa1_0(0x4L);
case 123:
- return jjStopAtPos(0, 26);
+ return jjStopAtPos(0, 28);
case 124:
- return jjStopAtPos(0, 13);
+ return jjStopAtPos(0, 15);
case 125:
- return jjStopAtPos(0, 25);
+ return jjStopAtPos(0, 27);
case 126:
- return jjStopAtPos(0, 20);
+ return jjStopAtPos(0, 22);
default :
return jjMoveNfa_0(1, 0);
}
@@ -96,18 +119,58 @@ private int jjMoveStringLiteralDfa1_0(long active0){
switch(curChar)
{
case 58:
- if ((active0 & 0x400000L) != 0L)
- return jjStopAtPos(1, 22);
+ if ((active0 & 0x800L) != 0L)
+ return jjStopAtPos(1, 11);
+ else if ((active0 & 0x1000000L) != 0L)
+ return jjStopAtPos(1, 24);
break;
+ case 110:
+ return jjMoveStringLiteralDfa2_0(active0, 0x4L);
case 123:
- if ((active0 & 0x800000L) != 0L)
- return jjStopAtPos(1, 23);
+ if ((active0 & 0x2000000L) != 0L)
+ return jjStopAtPos(1, 25);
break;
default :
break;
}
return jjStartNfa_0(0, active0);
}
+private int jjMoveStringLiteralDfa2_0(long old0, long active0){
+ if (((active0 &= old0)) == 0L)
+ return jjStartNfa_0(0, old0);
+ try { curChar = input_stream.readChar(); }
+ catch(java.io.IOException e) {
+ jjStopStringLiteralDfa_0(1, active0);
+ return 2;
+ }
+ switch(curChar)
+ {
+ case 105:
+ return jjMoveStringLiteralDfa3_0(active0, 0x4L);
+ default :
+ break;
+ }
+ return jjStartNfa_0(1, active0);
+}
+private int jjMoveStringLiteralDfa3_0(long old0, long active0){
+ if (((active0 &= old0)) == 0L)
+ return jjStartNfa_0(1, old0);
+ try { curChar = input_stream.readChar(); }
+ catch(java.io.IOException e) {
+ jjStopStringLiteralDfa_0(2, active0);
+ return 3;
+ }
+ switch(curChar)
+ {
+ case 113:
+ if ((active0 & 0x4L) != 0L)
+ return jjStartNfaWithStates_0(3, 2, 8);
+ break;
+ default :
+ break;
+ }
+ return jjStartNfa_0(2, active0);
+}
private int jjStartNfaWithStates_0(int pos, int kind, int state)
{
jjmatchedKind = kind;
@@ -143,14 +206,14 @@ private int jjMoveNfa_0(int startState, int curPos)
case 1:
if ((0x3ff0484ffffdbffL & l) != 0L)
{
- if (kind > 4)
- kind = 4;
+ if (kind > 5)
+ kind = 5;
{ jjCheckNAdd(8); }
}
else if ((0x5000600000000000L & l) != 0L)
{
- if (kind > 2)
- kind = 2;
+ if (kind > 3)
+ kind = 3;
}
else if (curChar == 36)
{ jjCheckNAddStates(0, 3); }
@@ -160,8 +223,8 @@ else if (curChar == 61)
jjstateSet[jjnewStateCnt++] = 2;
if ((0x3ff000000000000L & l) != 0L)
{
- if (kind > 5)
- kind = 5;
+ if (kind > 6)
+ kind = 6;
{ jjCheckNAdd(9); }
}
else if ((0x100002200L & l) != 0L)
@@ -186,13 +249,13 @@ else if (curChar == 43)
{ jjCheckNAdd(17); }
if (curChar == 45)
{
- if (kind > 2)
- kind = 2;
+ if (kind > 3)
+ kind = 3;
}
else if (curChar == 43)
{
- if (kind > 2)
- kind = 2;
+ if (kind > 3)
+ kind = 3;
}
break;
case 0:
@@ -203,24 +266,24 @@ else if (curChar == 43)
{ jjCheckNAdd(0); }
break;
case 2:
- if (curChar == 61 && kind > 2)
- kind = 2;
+ if (curChar == 61 && kind > 3)
+ kind = 3;
break;
case 3:
if (curChar == 61)
jjstateSet[jjnewStateCnt++] = 2;
break;
case 4:
- if (curChar == 46 && kind > 2)
- kind = 2;
+ if (curChar == 46 && kind > 3)
+ kind = 3;
break;
case 5:
if (curChar == 46)
jjstateSet[jjnewStateCnt++] = 4;
break;
case 6:
- if (curChar == 45 && kind > 2)
- kind = 2;
+ if (curChar == 45 && kind > 3)
+ kind = 3;
break;
case 7:
case 19:
@@ -231,15 +294,15 @@ else if (curChar == 43)
case 8:
if ((0x3ff0484ffffdbffL & l) == 0L)
break;
- if (kind > 4)
- kind = 4;
+ if (kind > 5)
+ kind = 5;
{ jjCheckNAdd(8); }
break;
case 9:
if ((0x3ff000000000000L & l) == 0L)
break;
- if (kind > 5)
- kind = 5;
+ if (kind > 6)
+ kind = 6;
{ jjCheckNAdd(9); }
break;
case 10:
@@ -252,20 +315,20 @@ else if (curChar == 43)
{ jjCheckNAddStates(4, 6); }
break;
case 14:
- if (curChar == 47 && kind > 8)
- kind = 8;
+ if (curChar == 47 && kind > 9)
+ kind = 9;
break;
case 15:
if (curChar == 62)
{ jjCheckNAddStates(11, 13); }
break;
case 16:
- if (curChar == 62 && kind > 2)
- kind = 2;
+ if (curChar == 62 && kind > 3)
+ kind = 3;
break;
case 17:
- if (curChar == 43 && kind > 2)
- kind = 2;
+ if (curChar == 43 && kind > 3)
+ kind = 3;
break;
case 18:
case 22:
@@ -277,8 +340,8 @@ else if (curChar == 43)
{ jjCheckNAddStates(7, 10); }
break;
case 21:
- if (curChar == 60 && kind > 2)
- kind = 2;
+ if (curChar == 60 && kind > 3)
+ kind = 3;
break;
case 24:
if (curChar == 36)
@@ -303,8 +366,8 @@ else if (curChar < 128)
case 8:
if ((0x87ffffffd7fffffeL & l) == 0L)
break;
- if (kind > 4)
- kind = 4;
+ if (kind > 5)
+ kind = 5;
{ jjCheckNAdd(8); }
break;
case 12:
@@ -333,8 +396,8 @@ else if (curChar < 128)
case 8:
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
- if (kind > 4)
- kind = 4;
+ if (kind > 5)
+ kind = 5;
{ jjCheckNAdd(8); }
break;
case 13:
@@ -361,9 +424,9 @@ else if (curChar < 128)
/** Token literal values. */
public static final String[] jjstrLiteralImages = {
-"", null, null, "\100", null, null, "\43", "\44", null, "\12", "\72", "\50",
-"\51", "\174", "\46", "\41", "\77", "\133", "\135", "\54", "\176", "\75", "\41\72",
-"\72\173", "\73", "\175", "\173", };
+"", null, "\165\156\151\161", null, "\100", null, null, "\43", "\44", null,
+"\72", "\72\72", "\12", "\50", "\51", "\174", "\46", "\41", "\77", "\133", "\135",
+"\54", "\176", "\75", "\41\72", "\72\173", "\73", "\175", "\173", };
protected Token jjFillToken()
{
final Token t;
@@ -600,10 +663,10 @@ public void SwitchTo(int lexState)
/** Lex State array. */
public static final int[] jjnewLexState = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1,
+ -1, -1, -1, -1,
};
static final long[] jjtoToken = {
- 0x7fffffdL,
+ 0x1ffffffdL,
};
static final long[] jjtoSkip = {
0x2L,
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java
index 88447264a7..5fd99a1fb1 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java
@@ -193,6 +193,15 @@
* {@code Y} and there are two paths to {@code Y}, one of
* which goes through a {@code dobj} and one of which goes
* through a {@code mod}.
+ *
+ * There is also a new operation, {@code uniq}, which allows for a query to reduce to only one match:
+ *
+ * {@code {} >dobj ({} > {}=foo) >mod ({} > {}=foo) :: uniq}
+ *
+ * This operation also takes a list of nodes, which if supplied, will use the values of those nodes
+ * as keys for the uniq. In the above example, this variation will match once per observed value of {@code foo}:
+ *
+ * {@code {} >dobj ({} > {}=foo) >mod ({} > {}=foo) :: uniq foo}
*
*
 * Naming relations
*
@@ -337,19 +346,30 @@ public SemgrexMatcher matcher(SemanticGraph hypGraph, Alignment alignment, Seman
// batch processing
// -------------------------------------------------------------
+ /**
+ * Postprocess a set of results from the batch processing method
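+   * The default implementation returns the matches unchanged; subclasses
+   * such as UniqPattern override this to filter the match list.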
+ *
+ * TODO: make abstract
+ */
+  public List<Pair<CoreMap, List<SemgrexMatch>>> postprocessMatches(List<Pair<CoreMap, List<SemgrexMatch>>> matches, boolean keepEmptyMatches) {
+ return matches;
+ }
/**
* Returns a list of matching sentences and each of the matches from those sentences.
*
* Non-matching sentences are currently not returned (may change in the future to return an empty list).
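+   * If keepEmptyMatches is true, non-matching sentences are instead returned with an empty list of matches.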
*/
-  public List<Pair<CoreMap, List<SemgrexMatch>>> matchSentences(List<CoreMap> sentences) {
+  public List<Pair<CoreMap, List<SemgrexMatch>>> matchSentences(List<CoreMap> sentences, boolean keepEmptyMatches) {
    List<Pair<CoreMap, List<SemgrexMatch>>> matches = new ArrayList<>();
for (CoreMap sentence : sentences) {
SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
SemanticGraph enhanced = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
SemgrexMatcher matcher = matcher(graph);
- if ( ! matcher.find()) {
+ if (!matcher.find()) {
+ if (keepEmptyMatches) {
+ matches.add(new Pair<>(sentence, new ArrayList<>()));
+ }
continue;
}
matches.add(new Pair<>(sentence, new ArrayList<>()));
@@ -359,6 +379,12 @@ public List<Pair<CoreMap, List<SemgrexMatch>>> matchSentences(List<CoreMap> sent
found = matcher.find();
}
}
+
+ for (SemgrexPattern child : getChildren()) {
+ matches = child.postprocessMatches(matches, keepEmptyMatches);
+ }
+ matches = postprocessMatches(matches, keepEmptyMatches);
+
return matches;
}
@@ -588,7 +614,7 @@ public static void main(String[] args) throws IOException {
}
}
-    List<Pair<CoreMap, List<SemgrexMatch>>> matches = semgrex.matchSentences(sentences);
+    List<Pair<CoreMap, List<SemgrexMatch>>> matches = semgrex.matchSentences(sentences, false);
    for (Pair<CoreMap, List<SemgrexMatch>> sentenceMatches : matches) {
CoreMap sentence = sentenceMatches.first();
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/UniqPattern.java b/src/edu/stanford/nlp/semgraph/semgrex/UniqPattern.java
new file mode 100644
index 0000000000..dec9717346
--- /dev/null
+++ b/src/edu/stanford/nlp/semgraph/semgrex/UniqPattern.java
@@ -0,0 +1,133 @@
+package edu.stanford.nlp.semgraph.semgrex;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import edu.stanford.nlp.ling.IndexedWord;
+import edu.stanford.nlp.semgraph.SemanticGraph;
+import edu.stanford.nlp.semgraph.SemanticGraphEdge;
+import edu.stanford.nlp.util.CoreMap;
+import edu.stanford.nlp.util.Pair;
+import edu.stanford.nlp.util.VariableStrings;
+
+/**
+ * At semgrex creation time, this takes a list of nodes or attributes.
+ *
+ * At batch processing time, this pares a list of matches down to
+ * one match for each unique combination of the requested keys.
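+ *
+ * For example, mirroring the pattern used in ProcessSemgrexRequestTest,
+ * <pre>{@code
+ *   SemgrexPattern pattern = SemgrexPattern.compile("{}=source >dobj=foo {}=target :: uniq source");
+ *   List<Pair<CoreMap, List<SemgrexMatch>>> matches = pattern.matchSentences(sentences, false);
+ * }</pre>
+ * keeps at most one match per distinct value of the {@code source} node
+ * across the whole batch of sentences.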
+ */
+public class UniqPattern extends SemgrexPattern {
+ private static final long serialVersionUID = -38315768154569L;
+
+ private final SemgrexPattern child;
+  private final List<String> keys;
+
+  public UniqPattern(SemgrexPattern child, List<String> keys) {
+ this.child = child;
+ this.keys = new ArrayList<>(keys);
+ }
+
+ private String getKey(SemgrexMatch match, String key) {
+ // TODO: could also do edge names or variable groups (once those exist)
+ IndexedWord node = match.getNode(key);
+ if (node == null) {
+ return null;
+ }
+ return node.value();
+ }
+
+  public List<Pair<CoreMap, List<SemgrexMatch>>> postprocessMatches(List<Pair<CoreMap, List<SemgrexMatch>>> matches, boolean keepEmptyMatches) {
+ // hashing lists should be okay here since the lists will not change
+ // while the postprocessing is happening
+    Set<List<String>> seenKeys = new HashSet<>();
+
+    List<Pair<CoreMap, List<SemgrexMatch>>> newMatches = new ArrayList<>();
+    for (Pair<CoreMap, List<SemgrexMatch>> sentence : matches) {
+      List<SemgrexMatch> newSentenceMatches = new ArrayList<>();
+ for (SemgrexMatch match : sentence.second()) {
+        List<String> matchKey = new ArrayList<>();
+ for (String key : keys) {
+ matchKey.add(getKey(match, key));
+ }
+ if (seenKeys.contains(matchKey)) {
+ continue;
+ }
+ seenKeys.add(matchKey);
+ newSentenceMatches.add(match);
+ }
+ if (newSentenceMatches.size() > 0 || keepEmptyMatches) {
+ newMatches.add(new Pair<>(sentence.first(), newSentenceMatches));
+ }
+ }
+
+ return newMatches;
+ }
+
+ @Override
+ public String localString() {
+ return toString(true, false);
+ }
+
+ @Override
+ public String toString() {
+ return toString(true, true);
+ }
+
+ @Override
+ public String toString(boolean hasPrecedence) {
+ return toString(hasPrecedence, true);
+ }
+
+ @Override
+ public void setChild(SemgrexPattern n) {
+ throw new UnsupportedOperationException("Child should only be set on a UniqPattern at creation time");
+ }
+
+ @Override
+  public List<SemgrexPattern> getChildren() {
+ if (child == null) {
+ return Collections.emptyList();
+ } else {
+ return Collections.singletonList(child);
+ }
+ }
+
+ public String toString(boolean hasPrecedence, boolean addChild) {
+ StringBuilder sb = new StringBuilder();
+ if (addChild) {
+ sb.append(child.toString(true));
+ }
+ sb.append(" :: uniq");
+ for (String key : keys) {
+ sb.append(" ");
+ sb.append(key);
+ }
+ return sb.toString();
+ }
+
+ @Override
+ public SemgrexMatcher matcher(SemanticGraph sg, IndexedWord node,
+                                Map<String, IndexedWord> namesToNodes,
+                                Map<String, String> namesToRelations,
+                                Map<String, SemanticGraphEdge> namesToEdges,
+ VariableStrings variableStrings,
+ boolean ignoreCase) {
+ return child.matcher(sg, node, namesToNodes, namesToRelations, namesToEdges, variableStrings, ignoreCase);
+ }
+
+ @Override
+ public SemgrexMatcher matcher(SemanticGraph sg,
+ Alignment alignment, SemanticGraph sg_align,
+ boolean hyp, IndexedWord node,
+                                Map<String, IndexedWord> namesToNodes,
+                                Map<String, String> namesToRelations,
+                                Map<String, SemanticGraphEdge> namesToEdges,
+ VariableStrings variableStrings,
+ boolean ignoreCase) {
+ return child.matcher(sg, alignment, sg_align, hyp, node, namesToNodes, namesToRelations, namesToEdges, variableStrings, ignoreCase);
+ }
+}
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ProcessSemgrexRequestTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ProcessSemgrexRequestTest.java
index 2336c7d03e..6b8ea59c21 100644
--- a/test/src/edu/stanford/nlp/semgraph/semgrex/ProcessSemgrexRequestTest.java
+++ b/test/src/edu/stanford/nlp/semgraph/semgrex/ProcessSemgrexRequestTest.java
@@ -16,9 +16,13 @@ public class ProcessSemgrexRequestTest {
* Build a fake request. The same query will be repeated N times
*/
public static CoreNLPProtos.SemgrexRequest buildFakeRequest(int numQueries, int numSemgrex) {
+ return buildFakeRequest(numQueries, numSemgrex, "{}=source >dobj=foo {}=target");
+ }
+
+ public static CoreNLPProtos.SemgrexRequest buildFakeRequest(int numQueries, int numSemgrex, String semgrexPattern) {
CoreNLPProtos.SemgrexRequest.Builder request = CoreNLPProtos.SemgrexRequest.newBuilder();
for (int i = 0; i < numSemgrex; ++i) {
- request.addSemgrex("{}=source >dobj=foo {}=target");
+ request.addSemgrex(semgrexPattern);
}
for (int i = 0; i < numQueries; ++i) {
@@ -87,7 +91,7 @@ public void testSimpleRequest() {
CoreNLPProtos.SemgrexResponse response = ProcessSemgrexRequest.processRequest(request);
Assert.assertEquals("Expected exactly 1 reply", 1, response.getResultList().size());
- checkResult(response, 1, 0);
+ checkResult(response, 1, 0, true);
}
@Test
@@ -96,39 +100,43 @@ public void testTwoSemgrex() {
CoreNLPProtos.SemgrexResponse response = ProcessSemgrexRequest.processRequest(request);
Assert.assertEquals("Expected exactly 1 reply", 1, response.getResultList().size());
- checkResult(response, 2, 0);
+ checkResult(response, 2, 0, true);
}
- public static void checkResult(CoreNLPProtos.SemgrexResponse response, int numSemgrex, int graphIdx) {
+ public static void checkResult(CoreNLPProtos.SemgrexResponse response, int numSemgrex, int graphIdx, boolean shouldMatch) {
CoreNLPProtos.SemgrexResponse.GraphResult result = response.getResultList().get(graphIdx);
Assert.assertEquals("Expected exactly " + numSemgrex + " semgrex result(s)", numSemgrex, result.getResultList().size());
int semgrexIdx = 0;
for (CoreNLPProtos.SemgrexResponse.SemgrexResult semgrexResult : result.getResultList()) {
- Assert.assertEquals("Expected exactly 1 match", 1, semgrexResult.getMatchList().size());
- CoreNLPProtos.SemgrexResponse.Match match = semgrexResult.getMatchList().get(0);
-
- Assert.assertEquals("Match is supposed to be at the root", 1, match.getMatchIndex());
- Assert.assertEquals("Expected exactly 2 named nodes", 2, match.getNodeList().size());
- Assert.assertEquals("Expected exactly 1 named reln", 1, match.getRelnList().size());
- Assert.assertEquals("Expected exactly 1 named edge", 1, match.getEdgeList().size());
-
- Assert.assertEquals("Node 1 should be source", 1, match.getNodeList().get(0).getMatchIndex());
- Assert.assertEquals("Node 1 should be source", "source", match.getNodeList().get(0).getName());
- Assert.assertEquals("Node 2 should be target", 2, match.getNodeList().get(1).getMatchIndex());
- Assert.assertEquals("Node 2 should be target", "target", match.getNodeList().get(1).getName());
-
- Assert.assertEquals("Reln dobj should be named foo", "foo", match.getRelnList().get(0).getName());
- Assert.assertEquals("Reln dobj should be have reln dobj", "dobj", match.getRelnList().get(0).getReln());
-
- Assert.assertEquals("Edge dobj should be named foo", "foo", match.getEdgeList().get(0).getName());
- Assert.assertEquals("Edge dobj should have reln dobj", "dobj", match.getEdgeList().get(0).getReln());
- Assert.assertEquals("Edge dobj source should be 1", 1, match.getEdgeList().get(0).getSource());
- Assert.assertEquals("Edge dobj source should be 2", 2, match.getEdgeList().get(0).getTarget());
-
- Assert.assertEquals("Graph count was off", graphIdx, match.getGraphIndex());
- Assert.assertEquals("Semgrex pattern count was off", semgrexIdx, match.getSemgrexIndex());
+ if (shouldMatch) {
+ Assert.assertEquals("Expected exactly 1 match", 1, semgrexResult.getMatchList().size());
+ CoreNLPProtos.SemgrexResponse.Match match = semgrexResult.getMatchList().get(0);
+
+ Assert.assertEquals("Match is supposed to be at the root", 1, match.getMatchIndex());
+ Assert.assertEquals("Expected exactly 2 named nodes", 2, match.getNodeList().size());
+ Assert.assertEquals("Expected exactly 1 named reln", 1, match.getRelnList().size());
+ Assert.assertEquals("Expected exactly 1 named edge", 1, match.getEdgeList().size());
+
+ Assert.assertEquals("Node 1 should be source", 1, match.getNodeList().get(0).getMatchIndex());
+ Assert.assertEquals("Node 1 should be source", "source", match.getNodeList().get(0).getName());
+ Assert.assertEquals("Node 2 should be target", 2, match.getNodeList().get(1).getMatchIndex());
+ Assert.assertEquals("Node 2 should be target", "target", match.getNodeList().get(1).getName());
+
+ Assert.assertEquals("Reln dobj should be named foo", "foo", match.getRelnList().get(0).getName());
+ Assert.assertEquals("Reln dobj should be have reln dobj", "dobj", match.getRelnList().get(0).getReln());
+
+ Assert.assertEquals("Edge dobj should be named foo", "foo", match.getEdgeList().get(0).getName());
+ Assert.assertEquals("Edge dobj should have reln dobj", "dobj", match.getEdgeList().get(0).getReln());
+ Assert.assertEquals("Edge dobj source should be 1", 1, match.getEdgeList().get(0).getSource());
+ Assert.assertEquals("Edge dobj source should be 2", 2, match.getEdgeList().get(0).getTarget());
+
+ Assert.assertEquals("Graph count was off", graphIdx, match.getGraphIndex());
+ Assert.assertEquals("Semgrex pattern count was off", semgrexIdx, match.getSemgrexIndex());
+ } else {
+ Assert.assertEquals("Expected exactly 0 match", 0, semgrexResult.getMatchList().size());
+ }
++semgrexIdx;
}
}
@@ -147,8 +155,24 @@ public void testTwoGraphs() {
CoreNLPProtos.SemgrexResponse response = ProcessSemgrexRequest.processRequest(request);
Assert.assertEquals("Expected exactly 2 replies", 2, response.getResultList().size());
- checkResult(response, 1, 0);
- checkResult(response, 1, 1);
+ checkResult(response, 1, 0, true);
+ checkResult(response, 1, 1, true);
+ }
+
+ /**
+ * For this test, only the first graph should have any results for the given pattern.
+ *
+ * The uniq operator in the SemgrexPattern will remove the match from the second graph,
+ * since the second graph is identical to the first
+ */
+ @Test
+ public void testTwoGraphsUniq() {
+ CoreNLPProtos.SemgrexRequest request = buildFakeRequest(2, 1, "{}=source >dobj=foo {}=target :: uniq source");
+ CoreNLPProtos.SemgrexResponse response = ProcessSemgrexRequest.processRequest(request);
+
+ Assert.assertEquals("Expected exactly 2 replies", 2, response.getResultList().size());
+ checkResult(response, 1, 0, true);
+ checkResult(response, 1, 1, false);
}
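The same deduplication can be observed directly through the SemgrexPattern batch API exercised further down in SemgrexTest.java. A minimal sketch, illustration only and not part of this patch, assuming the imports already used in that test file:

    // Two sentences with identical graphs; the pattern is the one used in testTwoGraphsUniq.
    List<CoreMap> sentences = new ArrayList<>();
    for (int i = 0; i < 2; ++i) {
      CoreMap sentence = new ArrayCoreMap();
      sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class,
                   SemanticGraph.valueOf("[ate-1 dobj> muffins-2]"));
      sentences.add(sentence);
    }
    SemgrexPattern pattern = SemgrexPattern.compile("{}=source >dobj=foo {}=target :: uniq source");
    List<Pair<CoreMap, List<SemgrexMatch>>> matches = pattern.matchSentences(sentences, false);
    // The uniq clause drops the duplicate match from the second (identical) graph,
    // so only one sentence is expected to come back here.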
public byte[] buildRepeatedRequest(int count, boolean closingLength) throws IOException {
@@ -179,7 +203,7 @@ public void checkRepeatedResults(byte[] arr, int count) throws IOException {
byte[] responseBytes = new byte[len];
din.read(responseBytes, 0, len);
CoreNLPProtos.SemgrexResponse response = CoreNLPProtos.SemgrexResponse.parseFrom(responseBytes);
- checkResult(response, 1, 0);
+ checkResult(response, 1, 0, true);
}
int len = din.readInt();
Assert.assertEquals("Repeated results should be over", 0, len);
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java
index 8e7da2f867..e02714a69f 100644
--- a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java
+++ b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java
@@ -589,6 +589,22 @@ public void testMultipleDepths() {
runTest("{} 6,6<< {word:A}", graph, "I");
}
+ /** After making UNIQ a separate token in the parser, we should verify that "uniq" can be treated as an identifier as well */
+ public void testUniqNamedNode() {
+ SemanticGraph graph = makeComplicatedGraph();
+
+ runTest("{} >obj ({} >expl {})", graph, "A");
+
+ SemgrexPattern pattern =
+ SemgrexPattern.compile("{} >obj ({} >expl {}=uniq)");
+ SemgrexMatcher matcher = pattern.matcher(graph);
+ assertTrue(matcher.find());
+ assertEquals(1, matcher.getNodeNames().size());
+ assertEquals("E", matcher.getNode("uniq").toString());
+ assertEquals("A", matcher.getMatch().toString());
+ assertFalse(matcher.find());
+ }
+
public void testNamedNode() {
SemanticGraph graph = makeComplicatedGraph();
@@ -1448,31 +1464,39 @@ public void testBrackets() {
"[ate/VBD subj>Billz/NNP obj>[muffins compound>strawberry]]");
}
+ static final String[] BATCH_PARSES = {
+ "[foo-1 nmod> bar-2]",
+ "[foo-1 obj> bar-2]",
+ "[bar-1 compound> baz-2]",
+ "[foo-1 nmod> baz-2 obj> bar-3]",
+ };
+
/**
- * A simple test of the batch search - should return 3 of the 4 sentences
+ * Build a list of sentences with BasicDependenciesAnnotation
*/
- public void testBatchSearch() {
- String[] parses = {
- "[foo-1 nmod> bar-2]",
- "[foo-1 obj> bar-2]",
- "[bar-1 compound> baz-2]",
- "[foo-1 nmod> baz-2 obj> bar-3]",
- };
+ public List<CoreMap> buildSmallBatch() {
List<CoreMap> sentences = new ArrayList<>();
- for (String parse : parses) {
+ for (String parse : BATCH_PARSES) {
SemanticGraph graph = SemanticGraph.valueOf(parse);
CoreMap sentence = new ArrayCoreMap();
sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
sentence.set(CoreAnnotations.TextAnnotation.class, parse);
sentences.add(sentence);
}
+ return sentences;
+ }
+ /**
+ * A simple test of the batch search - should return 3 of the 4 sentences
+ */
+ public void testBatchSearch() {
+ List<CoreMap> sentences = buildSmallBatch();
SemgrexPattern semgrex = SemgrexPattern.compile("{word:foo}=x > {}=y");
- List<Pair<CoreMap, List<SemgrexMatch>>> matches = semgrex.matchSentences(sentences);
+ List<Pair<CoreMap, List<SemgrexMatch>>> matches = semgrex.matchSentences(sentences, false);
String[] expectedMatches = {
- parses[0],
- parses[1],
- parses[3],
+ BATCH_PARSES[0],
+ BATCH_PARSES[1],
+ BATCH_PARSES[3],
};
int[] expectedCount = {1, 1, 2};
assertEquals(expectedMatches.length, matches.size());
@@ -1482,6 +1506,80 @@ public void testBatchSearch() {
}
}
+ /**
+ * Test that an illegal uniq expression throws an exception
+ *
+ * Specifically, the expectation is for a SemgrexParseException
+ */
+ public void testBrokenUniq() {
+ try {
+ String pattern = "{word:foo}=foo :: uniq bar";
+ SemgrexPattern semgrex = SemgrexPattern.compile(pattern);
+ throw new RuntimeException("This expression should fail because the node name is unknown");
+ } catch (SemgrexParseException e) {
+ // yay
+ }
+ }
+
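A hedged sketch of how a caller compiling user-supplied patterns might guard against this failure mode; compileOrNull is a hypothetical helper for illustration, not part of CoreNLP:

    // Hypothetical helper: returns null instead of propagating the parse failure, e.g. when
    // the uniq clause names a node that was never declared ("{word:foo}=foo :: uniq bar").
    static SemgrexPattern compileOrNull(String pattern) {
      try {
        return SemgrexPattern.compile(pattern);
      } catch (SemgrexParseException e) {
        return null;
      }
    }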
+ /**
+ * Test that a simple uniq expression is correctly parsed
+ */
+ public void testParsesUniq() {
+ String pattern = "{word:foo}=foo :: uniq foo";
+ SemgrexPattern semgrex = SemgrexPattern.compile(pattern);
+ }
+
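For quick reference, the uniq clause forms exercised in this file, collected here as an illustration (the patterns are taken verbatim from the tests below):

    SemgrexPattern.compile("{word:foo}=foo :: uniq foo");   // dedup on a single named node
    SemgrexPattern.compile("{word:foo}=x > {}=y :: uniq");  // dedup with no node names listed
    SemgrexPattern.compile("{}=x > {}=y :: uniq x y");      // dedup on several named nodes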
+ /**
+ * Test the uniq functionality on a few simple parses
+ */
+ public void testBatchUniq() {
+ List<CoreMap> sentences = buildSmallBatch();
+ SemgrexPattern semgrex = SemgrexPattern.compile("{word:foo}=x > {}=y :: uniq x");
+ List<Pair<CoreMap, List<SemgrexMatch>>> matches = semgrex.matchSentences(sentences, false);
+ // only the first foo sentence should match when using "uniq x"
+ assertEquals(1, matches.size());
+ assertEquals(BATCH_PARSES[0], matches.get(0).first().get(CoreAnnotations.TextAnnotation.class));
+ assertEquals(1, matches.get(0).second().size());
+
+ semgrex = SemgrexPattern.compile("{word:foo}=x > {}=y :: uniq");
+ matches = semgrex.matchSentences(sentences, false);
+ // same thing happens when using "uniq" and no nodes - only one match will occur
+ assertEquals(1, matches.size());
+ assertEquals(BATCH_PARSES[0], matches.get(0).first().get(CoreAnnotations.TextAnnotation.class));
+ assertEquals(1, matches.get(0).second().size());
+
+ semgrex = SemgrexPattern.compile("{word:foo}=x > {}=y :: uniq y");
+ matches = semgrex.matchSentences(sentences, false);
+ // now it should match both foo>bar and foo>baz
+ assertEquals(2, matches.size());
+ assertEquals(BATCH_PARSES[0], matches.get(0).first().get(CoreAnnotations.TextAnnotation.class));
+ assertEquals(1, matches.get(0).second().size());
+ assertEquals(BATCH_PARSES[3], matches.get(1).first().get(CoreAnnotations.TextAnnotation.class));
+ assertEquals(1, matches.get(1).second().size());
+
+ semgrex = SemgrexPattern.compile("{}=x > {}=y :: uniq x y");
+ matches = semgrex.matchSentences(sentences, false);
+ // now it should match each of foo>bar, bar>baz, foo>baz
+ assertEquals(3, matches.size());
+ assertEquals(BATCH_PARSES[0], matches.get(0).first().get(CoreAnnotations.TextAnnotation.class));
+ assertEquals(1, matches.get(0).second().size());
+ assertEquals(BATCH_PARSES[2], matches.get(1).first().get(CoreAnnotations.TextAnnotation.class));
+ assertEquals(1, matches.get(1).second().size());
+ assertEquals(BATCH_PARSES[3], matches.get(2).first().get(CoreAnnotations.TextAnnotation.class));
+ assertEquals(1, matches.get(2).second().size());
+ }
+
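A rough mental model of what testBatchUniq observes, assuming (illustration only, not the actual implementation) that each match is keyed by the words bound to the node names listed after uniq, and only the first match per key survives in batch order; allMatches and uniqNodeNames are hypothetical inputs, and java.util.Set/HashSet are assumed imported:

    // Sketch only: approximates the uniq filtering behavior seen in the tests above.
    Set<List<String>> seen = new HashSet<>();
    List<Pair<CoreMap, List<SemgrexMatch>>> kept = new ArrayList<>();
    for (Pair<CoreMap, List<SemgrexMatch>> sentenceMatch : allMatches) {
      List<SemgrexMatch> unique = new ArrayList<>();
      for (SemgrexMatch match : sentenceMatch.second()) {
        List<String> key = new ArrayList<>();
        for (String name : uniqNodeNames) {       // e.g. ["x", "y"] for ":: uniq x y"
          key.add(match.getNode(name).word());
        }
        if (seen.add(key)) {                      // first time this combination of words is seen
          unique.add(match);
        }
      }
      if (!unique.isEmpty()) {
        kept.add(Pair.makePair(sentenceMatch.first(), unique));
      }
    }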
+ public static void outputBatchResults(SemgrexPattern pattern, List<CoreMap> sentences) {
+ List<Pair<CoreMap, List<SemgrexMatch>>> matches = pattern.matchSentences(sentences, false);
+ for (Pair<CoreMap, List<SemgrexMatch>> sentenceMatch : matches) {
+ System.out.println("Pattern matched at:");
+ System.out.println(sentenceMatch.first());
+ for (SemgrexMatch match : sentenceMatch.second()) {
+ System.out.println(match);
+ }
+ }
+ }
+
public static void outputResults(String pattern, String graph,
String ... ignored) {
outputResults(SemgrexPattern.compile(pattern),