-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmain.py
483 lines (470 loc) · 22.8 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
"""
description: 运行主程序,不含训练代码,运行时间约一分钟
"""
import os
import json
import tensorflow as tf
from tkinter import *
import qaData
from qaLSTMNet import QaLSTMNet
from tkinter.scrolledtext import ScrolledText
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
if __name__ == '__main__':
    # ----- Hyper-parameters and file paths -----
    trainingFile = "./data/train_data_sample.json"
    testingFile = "./data/my_data_sample.json"   # knowledge base: passages searched at query time
    resultFile = "predictRst.score"              # every per-passage score is appended here
    saveFile = "newModel/savedModel"
    trainedModel = "./newModel/savedModel"
    embeddingFile = "./zhwiki/zhwiki_2017_03.sg_50d.word2vec"
    embeddingSize = 50           # dimensionality of the word embeddings
    dropout = 1.0                # keep-probability: 1.0 keeps everything (inference mode)
    learningRate = 0.4           # initial learning rate (training only; unused here)
    lrDownRate = 0.5             # learning-rate decay factor (training only)
    lrDownCount = 4              # number of learning-rate decay steps (training only)
    epochs = 20                  # epochs per learning-rate step (training only)
    batchSize = 8                # questions processed per batch
    rnnSize = 100                # hidden units in the LSTM cell
    margin = 0.1                 # cosine margin separating positive/negative similarities
    unrollSteps = 100            # maximum number of tokens per sentence
    max_grad_norm = 5            # gradient-clipping threshold (training only)
    allow_soft_placement = True  # let TF fall back to an available device automatically
    cpuDevice = "/cpu:0"         # CPUs share a single device id

    # Load all word vectors and the word -> index mapping.
    print("正在加载词向量")
    embedding, word2idx = qaData.loadEmbedding(embeddingFile)
    print("词向量加载完成")

    with tf.Graph().as_default(), tf.device(cpuDevice):
        session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement)
        with tf.Session(config=session_conf).as_default() as sess:
            # NOTE: the misspelled variable name "globle_step" is kept on
            # purpose -- the saved checkpoint stores the variable under this
            # exact name, and renaming it would break saver.restore().
            globalStep = tf.Variable(0, name="globle_step", trainable=False)
            print("实例化网络")
            lstm = QaLSTMNet(batchSize, unrollSteps, embedding, embeddingSize, rnnSize, margin)
            print("实例化结束")
            # Restore the already-trained weights; this script never trains.
            saver = tf.train.Saver()
            saver.restore(sess, trainedModel)

            # Load the knowledge base (candidate answer passages).
            print("正在加载知识库")
            aTest, aIdTest = qaData.loadtestjsonData(testingFile, word2idx, unrollSteps)
            print("知识库加载完成")

            # ----- Build the Tk GUI -----
            root = Tk()
            root.title("校园问答系统")
            root.geometry('800x800')
            l1 = Label(root, text="输入你的问题:")
            l1.pack()
            xls_text = StringVar()
            xls = Entry(root, textvariable=xls_text, width=50)
            xls_text.set("")
            xls.pack()

            def on_click():
                """Score the typed question against every passage; show the answer.

                Reads the question from ``xls_text``, runs the LSTM similarity
                model over all candidate passages, writes every score to
                ``resultFile``, and inserts the best (and, when confident
                enough, the second-best) passage into ``EditText``.
                """
                q = xls_text.get()
                print(q)
                qTest = qaData.loadquestion(q, testingFile, word2idx, unrollSteps)
                # Track the two highest-scoring passages across ALL batches.
                # (The original code reset these per batch, so only the final
                # batch could ever win, and a score that beat the runner-up
                # but not the best was silently dropped.)
                best = second = 0.0
                bestId = secondId = 0  # 1-based positions, matched against passage_id
                idx = 0                # global 1-based score counter across batches
                with open(resultFile, 'w') as file:
                    for question, answer in qaData.testingBatchIter(qTest, aTest, batchSize):
                        feed_dict = {
                            lstm.inputTestQuestions: question,
                            lstm.inputTestAnswers: answer,
                            lstm.keep_prob: dropout
                        }
                        _, scores = sess.run([globalStep, lstm.result], feed_dict)
                        for score in scores:
                            file.write("%.9f" % score + '\n')
                            idx += 1
                            if score > best:
                                # New maximum: previous best becomes runner-up.
                                second, secondId = best, bestId
                                best, bestId = score, idx
                            elif score > second:
                                # Beats the runner-up but not the best.
                                second, secondId = score, idx
                print(best)
                print(bestId)
                print(second)
                print(secondId)
                # Map the winning positions back to passage texts.
                # NOTE(review): assumes passage_id values equal the 1-based
                # order in which testingBatchIter emits scores -- confirm in
                # qaData (batch padding could shift indices).
                with open(testingFile, mode="r", encoding="utf-8") as rf:
                    json_d = json.load(rf)
                if best <= 0.6:
                    # Nothing similar enough: apologise instead of guessing.
                    EditText.insert('end', "您的问题:" + q)
                    EditText.insert(INSERT, '\n')
                    EditText.insert('end', "暂无您所询问的相关信息,请换个问题吧。")
                    EditText.insert(INSERT, '\n')
                    EditText.insert(INSERT, '\n')
                elif second > 0.6:
                    # Both top passages are confident: show best + related info.
                    for block in json_d:
                        for ans in block['passages']:
                            if bestId == int(ans['passage_id']):
                                EditText.insert('end', "您的问题:" + q)
                                EditText.insert(INSERT, '\n')
                                EditText.insert('end', "最佳答案:" + ans['content'])
                                EditText.insert(INSERT, '\n')
                        for ans in block['passages']:
                            if secondId == int(ans['passage_id']):
                                EditText.insert('end', "相关信息:" + ans['content'])
                                EditText.insert(INSERT, '\n')
                                EditText.insert(INSERT, '\n')
                else:
                    # Only the single best passage is confident.
                    for block in json_d:
                        for ans in block['passages']:
                            if bestId == int(ans['passage_id']):
                                EditText.insert('end', "您的问题:" + q)
                                EditText.insert(INSERT, '\n')
                                EditText.insert('end', "最佳答案:" + ans['content'])
                                EditText.insert(INSERT, '\n')
                                EditText.insert(INSERT, '\n')

            Button(root, text="查询", command=on_click).pack()
            # Created after on_click is defined but before mainloop starts,
            # so the callback can safely reference it at click time.
            EditText = ScrolledText(root, width=80, height=50)
            EditText.pack()
            root.mainloop()
    print("程序结束")
# """
# description: 运行主程序,含训练代码(CPU版),运行时间约三分钟,答案不含相关信息
# """
# import os
# import time
# import json
# import tensorflow as tf
# from tkinter import *
# import qaData
# from qaLSTMNet import QaLSTMNet
# from tkinter.scrolledtext import ScrolledText
#
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#
#
# def restore():
# try:
# print("正在加载模型")
# saver.restore(sess, trainedModel)
# except Exception as e:
# print(e)
# print("加载模型失败,重新开始训练")
# train()
#
#
# def train():
# print("重新训练,请保证计算机拥有至少8G空闲内存与2G空闲显存")
# # 准备训练数据
# print("正在准备训练数据,大约需要五分钟...")
# qTrain, aTrain, lTrain, qIdTrain, aIdTrain = qaData.loadjsonData(trainingFile, word2idx, unrollSteps, True)
#
# print("训练数据准备完毕")
# tqs, tta, tfa = [], [], []
# for question, trueAnswer, falseAnswer in qaData.trainingBatchIter(qTrain , aTrain ,lTrain , qIdTrain ,batchSize):
# tqs.append(question), tta.append(trueAnswer), tfa.append(falseAnswer)
#
# print("训练数据加载完成!")
# # 开始训练
# print("开始训练,全部训练过程大约需要12小时")
# sess.run(tf.global_variables_initializer())
# lr = learningRate # 引入局部变量,防止shadow name
# for i in range(lrDownCount):
# optimizer = tf.train.GradientDescentOptimizer(lr)
# optimizer.apply_gradients(zip(grads, tvars))
# trainOp = optimizer.apply_gradients(zip(grads, tvars), global_step=globalStep)
# for epoch in range(epochs):
# print("epoch",epoch)
# for question, trueAnswer, falseAnswer in zip(tqs, tta, tfa):
# # print("question.shape = ", question.shape)
# # print("trueAnswer.shape = ", trueAnswer.shape)
# # print("falseAnswer.shape = ", falseAnswer.shape)
# startTime = time.time()
# feed_dict = {
# lstm.inputQuestions: question,
# lstm.inputTrueAnswers: trueAnswer,
# lstm.inputFalseAnswers: falseAnswer,
# lstm.keep_prob: dropout
# }
# # summary_val = sess.run(lstm.dev_summary_op, feed_dict)
# sess.run(trainOp, feed_dict)
# step = sess.run(globalStep, feed_dict)
# sess.run(lstm.trueCosSim, feed_dict)
# sess.run(lstm.falseCosSim, feed_dict)
# loss = sess.run(lstm.loss, feed_dict)
# timeUsed = time.time() - startTime
# print("step:", step, "loss:", loss, "time:", timeUsed)
# saver.save(sess, saveFile)
# lr *= lrDownRate
#
#
# if __name__ == '__main__':
# # 定义参数
# trainingFile = "./data/train_data_sample.json"
# testingFile = "./data/my_data_sample.json"
# resultFile = "predictRst.score"
# saveFile = "newModel/savedModel"
# trainedModel = "./newModel/savedModel"
# embeddingFile = "./zhwiki/zhwiki_2017_03.sg_50d.word2vec"
# embeddingSize = 50 # 词向量的维度
#
# dropout = 1.0
# learningRate = 0.4 # 学习速度
# lrDownRate = 0.5 # 学习速度下降速度
# lrDownCount = 4 # 学习速度下降次数
# epochs = 20 # 每次学习速度指数下降之前执行的完整epoch次数
# batchSize = 8 # 每一批次处理的问题个数
#
# rnnSize = 100 # LSTM cell中隐藏层神经元的个数
# margin = 0.1 # 余弦边界值 来对计算出的正负样本的语义相似度进行评判。
#
# unrollSteps = 100 # 句子中的最大词汇数目
# max_grad_norm = 5 # 用于控制梯度膨胀,如果梯度向量的L2模超过max_grad_norm,则等比例缩小
#
# allow_soft_placement = True # Allow device soft device placement 有时候不同的设备的cpu和gpu是不同的,那么当运行设备不满足要求时,会自动分配GPU或者CPU。
# # gpuMemUsage = 0.75 # 显存最大使用率
# # gpuDevice = "/gpu:0" # GPU设备名
# cpuDevice = "/cpu:0" # CPU不区分设备号,统一使用 /cpu:0
#
# # 读取所有词向量和对应索引
# print("正在加载词向量")
# embedding, word2idx = qaData.loadEmbedding(embeddingFile)
#
# # 配置TensorFlow
# with tf.Graph().as_default(), tf.device(cpuDevice):
# # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpuMemUsage)
# # session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement, gpu_options=gpu_options)
# session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement)
# with tf.Session(config=session_conf).as_default() as sess:
# # 加载LSTM网络
# print("正在加载BiLSTM网络,大约需要三分钟...")
# globalStep = tf.Variable(0, name="globle_step", trainable=False)
# # tf.Variable为tensorflow变量声明函数 trainable默认为True,可以后期被算法优化的。如果不想该变量被优化,改为False。
# lstm = QaLSTMNet(batchSize, unrollSteps, embedding, embeddingSize, rnnSize, margin)
# # 实例化一个网络结构对象
# tvars = tf.trainable_variables()
# # trainable_variables()函数可以也仅可以查看可训练的变量
# grads, _ = tf.clip_by_global_norm(tf.gradients(lstm.loss, tvars), max_grad_norm)
# # 通过权重梯度的总和的比率来截取多个张量的值 第一个参数为梯度张量,第二个梯度为截取的比率 返回截取过的梯度张量和一个所有张量的全局范数。
# saver = tf.train.Saver()
# # 保存模型
# print("加载完成!")
#
# # 加载模型或训练模型
# if os.path.exists(trainedModel + '.index'):
# while True:
# choice = input("找到已经训练好的模型,是否载入(y/n)")
# if choice.strip().lower() == 'y':
# restore()
# break
# elif choice.strip().lower() == 'n':
# train()
# break
# else:
# print("无效的输入!\n")
# else:
# train()
#
# print("正在加载知识库")
# aTest, aIdTest = qaData.loadtestjsonData(testingFile, word2idx, unrollSteps)
#
# root = Tk()
# root.title("校园问答系统")
# root.geometry('800x800')
#
# l1 = Label(root, text="输入你的问题:")
# l1.pack()
# xls_text = StringVar()
# xls = Entry(root, textvariable=xls_text, width=50)
# xls_text.set("")
# xls.pack()
#
# def on_click():
# q = xls_text.get()
# print(q)
# qTest = qaData.loadquestion(q, testingFile, word2idx, unrollSteps)
# # 返回的是答案个问句对应的词向量索引
# with open(resultFile, 'w') as file:
# # 返回的元祖数组,然后依次遍历里面的每一对值
# for question, answer in qaData.testingBatchIter(qTest, aTest, batchSize):
# # 来赋值的,格式为字典型
# feed_dict = {
# lstm.inputTestQuestions: question,
# lstm.inputTestAnswers: answer,
# lstm.keep_prob: dropout
# }
# _, scores = sess.run([globalStep, lstm.result], feed_dict)
# best = 0.0
# i = 0
# n = 0
# for score in scores:
# file.write("%.9f" % score + '\n')
# i += 1
# if score > best:
# best = score
# n = i
#
# with open(testingFile, mode="r", encoding="utf-8") as rf:
# json_d = json.load(rf)
# for block in json_d:
# for ans in block['passages']:
# if n == int(ans['passage_id']):
# print("答案:" + ans['content'])
# EditText.insert('end', "您的问题:" + q)
# EditText.insert(INSERT, '\n')
# EditText.insert('end', "可能答案:"+ans['content'])
# EditText.insert(INSERT, '\n')
# EditText.insert(INSERT, '\n')
# # print(best)
#
# Button(root, text="查询", command=on_click).pack()
# EditText = ScrolledText(root, width=80, height=50)
# EditText.pack()
# root.mainloop()
# print("程序结束")
#
#
#
# """
# description: 原版主程序,需配合原版qaData.py使用,只计算问答句匹配度,结果存在predictRst中,无显示页面,优先使用GPU进行训练,如果是CPU版环境则用CPU训练
# """
# import os
# import time
# import tensorflow as tf
# import qaData
# from qaLSTMNet import QaLSTMNet
#
#
# def restore():
# try:
# print("正在加载模型,大约需要一分钟...")
# saver.restore(sess, trainedModel)
# except Exception as e:
# print(e)
# print("加载模型失败,重新开始训练")
# train()
#
#
# def train():
# print("重新训练,请保证计算机拥有至少8G空闲内存与2G空闲显存")
# # 准备训练数据
# print("正在准备训练数据,大约需要五分钟...")
# qTrain, aTrain, lTrain, qIdTrain, aIdTrain = qaData.loadjsonData(trainingFile, word2idx, unrollSteps, True)
#
# print("训练数据准备完毕")
# tqs, tta, tfa = [], [], []
# for question, trueAnswer, falseAnswer in qaData.trainingBatchIter(qTrain , aTrain ,lTrain , qIdTrain ,batchSize):
# tqs.append(question), tta.append(trueAnswer), tfa.append(falseAnswer)
#
# print("训练数据加载完成!")
# # 开始训练
# print("开始训练,全部训练过程大约需要12小时")
# sess.run(tf.global_variables_initializer())
# lr = learningRate # 引入局部变量,防止shadow name
# for i in range(lrDownCount):
# optimizer = tf.train.GradientDescentOptimizer(lr)
# optimizer.apply_gradients(zip(grads, tvars))
# trainOp = optimizer.apply_gradients(zip(grads, tvars), global_step=globalStep)
# for epoch in range(epochs):
# print("epoch",epoch)
# for question, trueAnswer, falseAnswer in zip(tqs, tta, tfa):
# # print("question.shape = ", question.shape)
# # print("trueAnswer.shape = ", trueAnswer.shape)
# # print("falseAnswer.shape = ", falseAnswer.shape)
# startTime = time.time()
# feed_dict = {
# lstm.inputQuestions: question,
# lstm.inputTrueAnswers: trueAnswer,
# lstm.inputFalseAnswers: falseAnswer,
# lstm.keep_prob: dropout
# }
# summary_val = sess.run(lstm.dev_summary_op,feed_dict)
# sess.run(trainOp,feed_dict)
# step = sess.run(globalStep,feed_dict)
# sess.run(lstm.trueCosSim,feed_dict)
# sess.run(lstm.falseCosSim,feed_dict)
# loss = sess.run(lstm.loss,feed_dict)
# timeUsed = time.time() - startTime
# print("step:", step, "loss:", loss, "time:", timeUsed)
# saver.save(sess, saveFile)
# lr *= lrDownRate
#
#
# if __name__ == '__main__':
# # 定义参数
# trainingFile = "./data/train_data_sample.json"
# testingFile = "./data/test_data_sample.json"
# resultFile = "predictRst.score"
# saveFile = "newModel/savedModel"
# trainedModel = "./newModel/savedModel"
#
# embeddingFile = "./zhwiki/zhwiki_2017_03.sg_50d.word2vec"
# embeddingSize = 50 # 词向量的维度
#
# dropout = 1.0
# learningRate = 0.4 # 学习速度
# lrDownRate = 0.5 # 学习速度下降速度
# lrDownCount = 4 # 学习速度下降次数
# epochs = 20 # 每次学习速度指数下降之前执行的完整epoch次数
# batchSize = 8 # 每一批次处理的问题个数
#
# rnnSize = 100 # LSTM cell中隐藏层神经元的个数
# margin = 0.1 # M is constant margin 余弦边界值 来对计算出的正负样本的语义相似度进行评判。
#
# unrollSteps = 100 # 句子中的最大词汇数目
# max_grad_norm = 5 # 用于控制梯度膨胀,如果梯度向量的L2模超过max_grad_norm,则等比例缩小
#
# allow_soft_placement = True # Allow device soft device placement 有时候不同的设备的cpu和gpu是不同的,那么当运行设备不满足要求时,会自动分配GPU或者CPU。
# gpuMemUsage = 0.50 # 显存最大使用率
# gpuDevice = "/gpu:0" # GPU设备名
#
# # 读取测试数据
# print("正在载入测试数据,大约需要一分钟...")
# embedding, word2idx = qaData.loadEmbedding(embeddingFile)
# qTest, aTest, _, qIdTest,aIdTest = qaData.loadjsonData(testingFile, word2idx, unrollSteps)
# print("测试数据加载完成")
# # 配置TensorFlow
# with tf.Graph().as_default(), tf.device(gpuDevice):
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpuMemUsage)
# session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement, gpu_options=gpu_options)
# with tf.Session(config=session_conf).as_default() as sess:
# # 加载LSTM网络
# print("正在加载LSTM网络,大约需要三分钟...")
# globalStep = tf.Variable(0, name="globle_step", trainable=False)
# # tf.Variable为tensorflow变量声明函数 trainable默认为True,可以后期被算法优化的。如果不想该变量被优化,改为False。
# print("1")
# lstm = QaLSTMNet(batchSize, unrollSteps, embedding, embeddingSize, rnnSize, margin)
# # 实例化一个网络结构对象
# print("2")
# tvars = tf.trainable_variables()
# # trainable_variables()函数可以也仅可以查看可训练的变量
# print("3")
# grads, _ = tf.clip_by_global_norm(tf.gradients(lstm.loss, tvars), max_grad_norm)
# # 通过权重梯度的总和的比率来截取多个张量的值 第一个参数为梯度张量,第二个梯度为截取的比率 返回截取过的梯度张量和一个所有张量的全局范数。
# print("4")
# saver = tf.train.Saver()
# # 保存模型
# print("加载完成!")
# # input()
#
# # 加载模型或训练模型
# if os.path.exists(trainedModel + '.index'):
# while True:
# choice = input("找到已经训练好的模型,是否载入(y/n)")
# if choice.strip().lower() == 'y':
# restore()
# break
# elif choice.strip().lower() == 'n':
# train()
# break
# else:
# print("无效的输入!\n")
# else:
# train()
#
# # 进行测试,输出结果
# print("正在进行测试,大约需要三分钟...")
# with open(resultFile, 'w') as file:
# for question, answer in qaData.testingBatchIter(qTest, aTest, batchSize):
# feed_dict = {
# lstm.inputTestQuestions: question,
# lstm.inputTestAnswers: answer,
# lstm.keep_prob: dropout
# }
# _, scores = sess.run([globalStep, lstm.result], feed_dict)
# for score in scores:
# file.write("%.9f" % score + '\n')
# print("所有步骤完成!程序结束")