import numpy as np
import tensorflow as tf
import networkx as nx
import matplotlib.pyplot as plt


def createAdjacencyMatrix(edgeList, numNodes):
    """
    Creates a numNodes x numNodes adjacency matrix from the edgeList.
    Args:
        edgeList - (list) The list of edges from getNodeEdgeLists
        numNodes - (int) The number of nodes in the graph
    Returns:
        newAdjMatrix - (np.ndarray) The adjacency matrix
    """
    newAdjMatrix = np.zeros((numNodes, numNodes))
    for edge in edgeList:
        # Token ids from stanza are 1-based, so shift them to 0-based indices.
        newAdjMatrix[edge['edgePair'][0] - 1, edge['edgePair'][1] - 1] = 1
    return newAdjMatrix
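
# A minimal usage sketch for createAdjacencyMatrix. The tiny edge list below is
# hypothetical, but shaped like the dictionaries getNodeEdgeLists returns:
#
#   edges = [{'edgePair': (2, 1), 'edgeLabel': 'nsubj'},
#            {'edgePair': (2, 3), 'edgeLabel': 'obj'}]
#   adj = createAdjacencyMatrix(edges, 3)
#   # adj[1, 0] == 1 and adj[1, 2] == 1; every other entry is 0.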


def convertToEmbedding(words, preprocessor, encoder):
    """
    Takes a list of words and converts it to an array of pooled BERT embeddings.
    Args:
        words - (list) The list of words to convert
        preprocessor - (tensorflow_hub.keras_layer.KerasLayer) The preprocessor that turns strings into tokens
        encoder - (tensorflow_hub.keras_layer.KerasLayer) The encoder that turns tokens into embeddings
    Returns:
        convertedWords - (np.ndarray) An array of embeddings, one row per word
    """
    convertedWords = np.array(
        encoder(preprocessor(tf.constant(words)))['pooled_output'])
    return convertedWords
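
# Usage sketch for convertToEmbedding, assuming a matching BERT preprocessor and
# encoder pair from TF Hub (the exact handles below are an assumption; any
# compatible pair that exposes a 'pooled_output' head should work):
#
#   import tensorflow_hub as hub
#   preprocessor = hub.KerasLayer(
#       "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
#   encoder = hub.KerasLayer(
#       "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2")
#   embeddings = convertToEmbedding(["hello", "world"], preprocessor, encoder)
#   # embeddings.shape == (2, 128) for this encoder size.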


def convertToCustomEmbedding(words, preprocessor, encoder):
    """
    Deprecated: use the version of this function in DependencyParsing.ipynb instead.
    Takes a list of words and converts it to a list of custom embeddings by
    slicing out the encoding at sequence position 1 (the token right after [CLS]).
    Args:
        words - (list) The list of words to convert
        preprocessor - (tensorflow_hub.keras_layer.KerasLayer) The preprocessor that turns strings into tokens
        encoder - (tensorflow_hub.keras_layer.KerasLayer) The encoder that turns tokens into embeddings
    Returns:
        convertedWords - (np.ndarray) An array of embeddings, one row per word
    """
    convertedWords = np.array(
        encoder(preprocessor(words)['input_word_ids'])[:, 1:2, :])
    return convertedWords.reshape((convertedWords.shape[0], convertedWords.shape[2]))


def tokenRelationHead(sent_dict):
    """
    Prints a token - relation - head table for one sentence.
    Args:
        sent_dict - (list) The list of word dictionaries from sentence.to_dict()
    Returns:
        None
    """
    print("{:<15} | {:<10} | {:<15} ".format('Token', 'Relation', 'Head'))
    print("-" * 50)
    # iterate to print each token, its dependency relation, and its head word
    for word in sent_dict:
        print("{:<15} | {:<10} | {:<15} ".format(
            str(word['text']),
            str(word['deprel']),
            str(sent_dict[word['head'] - 1]['text'] if word['head'] > 0 else 'ROOT')))
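
# Usage sketch for tokenRelationHead, assuming a standard stanza pipeline with
# the depparse processor enabled:
#
#   import stanza
#   nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse')
#   doc = nlp("The quick brown fox jumps over the lazy dog.")
#   tokenRelationHead(doc.sentences[0].to_dict())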


def drawDepGraph(nodeList, edgeList):
    """
    Draws the dependency graph for a sentence. The words are nodes and the edges are the relations.
    Args:
        nodeList - (list) The list of node dictionaries from getNodeEdgeLists
        edgeList - (list) The list of edge dictionaries from getNodeEdgeLists
    Returns:
        None
    """
    G = nx.DiGraph()
    G.add_nodes_from(range(1, len(nodeList) + 1))
    # Label each node with its 1-based id and its surface text.
    nodeLabels = dict((node['id'], str(node['id']) + " : " + node['text']) for node in nodeList)
    edgeLabels = []
    for edge in edgeList:
        G.add_edge(*edge['edgePair'])
        edgeLabels.append((edge['edgePair'], edge['edgeLabel']))
    edgeLabels = dict(edgeLabels)
    plt.figure(3, figsize=(12, 12))
    pos = nx.spring_layout(G)
    nx.draw(G, pos, with_labels=True, labels=nodeLabels, node_size=2000, node_color='#B5EAD7', font_size=10)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edgeLabels, font_size=8)
    plt.show()
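
# Usage sketch for drawDepGraph with a hand-built graph. The node and edge
# dictionaries are hypothetical, but follow the getNodeEdgeLists format:
#
#   nodes = [{'id': 1, 'text': 'dogs'}, {'id': 2, 'text': 'bark'}]
#   edges = [{'edgePair': (2, 1), 'edgeLabel': 'nsubj'}]
#   drawDepGraph(nodes, edges)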


def getNodeEdgeLists(doc, sentiment=True, limit=True):
    """
    Walks the sentences in doc and extracts the nodes and labeled dependency edges.
    Args:
        doc - (stanza.models.common.doc.Document) The parsed document
        sentiment - (bool) If True, keep only non-neutral sentences (stanza sentiment != 1) when any exist
        limit - (bool) If True, cap the output at 3 sentences of at most 50 words each
    Returns:
        nodeList - (list) A list of dictionaries, the keys are the same as the items inside a sentence object.
        edgeList - (list) A list of dictionaries, the keys are "edgePair", "edgeLabel"
    """
    edgeList = []
    nodeList = []
    # Offset added to word ids so ids stay unique across sentences.
    modifier = 0
    if limit:
        wordLimit = 50
        maxSentences = 3
    else:
        wordLimit = 1000
        maxSentences = 100
    sentences = []
    # Use the sentence sentiment information: stanza labels sentences
    # 0 (negative), 1 (neutral), or 2 (positive), so this keeps the non-neutral ones.
    if sentiment:
        for sentence in doc.sentences:
            if sentence.sentiment != 1:
                sentences.append(sentence)
    if len(sentences) > 0:
        sentences = sentences[0:maxSentences]
    else:
        sentences = doc.sentences[0:maxSentences]
    for sentence in sentences:
        for node in sentence.to_dict()[0:wordLimit]:
            node['id'] += modifier
            node['head'] += modifier
            nodeList.append(node)
            # if modifier and node['id'] == modifier + 1:
            #     edgePair = (node['id'] - 1, node['id'])
            #     edgeLabel = 'nextSentence'
            #     edgeList.append(
            #         {
            #             "edgePair": edgePair,
            #             "edgeLabel": edgeLabel
            #         }
            #     )
            # Skip root edges (head == modifier) and heads cut off by the word limit.
            if (node['head'] != modifier and node['head'] <= modifier + wordLimit):
                # the first element is the head and the second is the dependent
                edgePair = (node['head'], node['id'])
                edgeLabel = node['deprel']
                edgeList.append(
                    {
                        "edgePair": edgePair,
                        "edgeLabel": edgeLabel,
                    }
                )
        modifier += len(sentence.to_dict()[0:wordLimit])
    return nodeList, edgeList
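
# End-to-end sketch: parse a document with stanza (the sentiment processor must
# be enabled when sentiment=True), extract the lists, then reuse the helpers above:
#
#   import stanza
#   nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse,sentiment')
#   doc = nlp("The movie was terrible. I loved the soundtrack.")
#   nodeList, edgeList = getNodeEdgeLists(doc, sentiment=True, limit=True)
#   drawDepGraph(nodeList, edgeList)
#   adj = createAdjacencyMatrix(edgeList, len(nodeList))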


def printSpareTensor(st):
    """
    Builds a string showing the contents of a tf.sparse.SparseTensor.
    Args:
        st - (tf.sparse.SparseTensor) the sparse tensor you want to display
    Returns:
        (s, i) - (tuple) The string containing the contents of the sparse tensor,
                 and the number of stored entries
    """
    i = 0
    s = "<SparseTensor shape=%s \n values={" % (st.dense_shape.numpy().tolist(),)
    for (index, value) in zip(st.indices, st.values):
        s += "\n  %s: %s" % (index.numpy().tolist(), value.numpy().tolist())
        i += 1
    return (s + "}>", i)
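
# Usage sketch for printSpareTensor on a small hand-made sparse tensor (eager
# mode is assumed, since the function calls .numpy() on the components):
#
#   st = tf.sparse.SparseTensor(indices=[[0, 0], [1, 2]],
#                               values=[1.0, 2.0],
#                               dense_shape=[3, 4])
#   s, n = printSpareTensor(st)
#   print(s)  # formatted contents
#   print(n)  # 2, the number of stored values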