magic282
diff --git a/‎.gitattributes
+63 b/‎.gitattributes
+63
diff --git a/‎.gitignore
+4 b/‎.gitignore
+4
diff --git a/‎LICENSE
+674 b/‎LICENSE
+674
diff --git a/‎README.md
+98 b/‎README.md
+98
diff --git a/‎seq2seq_pt/CollectVocab.py
+61 b/‎seq2seq_pt/CollectVocab.py
+61
diff --git a/‎seq2seq_pt/PyBLEU/__init__.py
+4 b/‎seq2seq_pt/PyBLEU/__init__.py
+4
@@ -0,0 +1,63 @@
+###############################################################################
+# Set default behavior to automatically normalize line endings.
+###############################################################################
+* text=auto
+
+###############################################################################
+# Set default behavior for command prompt diff.
+#
+# This is needed for earlier builds of msysgit that do not have it on by
+# default for csharp files.
+# Note: This is only used by command line
+###############################################################################
+#*.cs     diff=csharp
+
+###############################################################################
+# Set the merge driver for project and solution files
+#
+# Merging from the command prompt will add diff markers to the files if there
+# are conflicts (Merging from VS is not affected by the settings below, in VS
+# the diff markers are never inserted). Diff markers may cause the following 
+# file extensions to fail to load in VS. An alternative would be to treat
+# these files as binary and thus will always conflict and require user
+# intervention with every merge. To do so, just uncomment the entries below
+###############################################################################
+#*.sln       merge=binary
+#*.csproj    merge=binary
+#*.vbproj    merge=binary
+#*.vcxproj   merge=binary
+#*.vcproj    merge=binary
+#*.dbproj    merge=binary
+#*.fsproj    merge=binary
+#*.lsproj    merge=binary
+#*.wixproj   merge=binary
+#*.modelproj merge=binary
+#*.sqlproj   merge=binary
+#*.wwaproj   merge=binary
+
+###############################################################################
+# behavior for image files
+#
+# image files are treated as binary by default.
+###############################################################################
+#*.jpg   binary
+#*.png   binary
+#*.gif   binary
+
+###############################################################################
+# diff behavior for common document formats
+# 
+# Convert binary document formats to text before diffing them. This feature
+# is only available from the command line. Turn it on by uncommenting the 
+# entries below.
+###############################################################################
+#*.doc   diff=astextplain
+#*.DOC   diff=astextplain
+#*.docx  diff=astextplain
+#*.DOCX  diff=astextplain
+#*.dot   diff=astextplain
+#*.DOT   diff=astextplain
+#*.pdf   diff=astextplain
+#*.PDF   diff=astextplain
+#*.rtf   diff=astextplain
+#*.RTF   diff=astextplain
@@ -0,0 +1,4 @@
+pred.txt
+multi-bleu.perl
+*.pt
+*.pyc
@@ -0,0 +1,98 @@
+# NQG
+This repository contains code for the  paper "[Neural Question Generation from Text: A Preliminary Study](https://arxiv.org/abs/1704.01792)"
+
+## About this code
+
+The experiments in the paper were done with an in-house deep learning tool. Therefore, we re-implement this with PyTorch as a reference.
+
+This code only implements the setting `NQG+` in the paper.
+Within 1 hour's training on Tesla P100, the `NQG+` model achieves 12.35 BLEU-4 score on the dev set as reported in our paper.
+
+If you find this code useful in your research, please consider citing:
+
+    @article{zhou2017neural,
+      title={Neural Question Generation from Text: A Preliminary Study},
+      author={Zhou, Qingyu and Yang, Nan and Wei, Furu and Tan, Chuanqi and Bao, Hangbo and Zhou, Ming},
+      journal={arXiv preprint arXiv:1704.01792},
+      year={2017}
+    }
+
+
+
+## How to run
+
+### Prepare the dataset and code
+
+Make an experiment home folder for NQG data and code:
+```bash
+NQG_HOME=~/workspace/nqg
+mkdir -p $NQG_HOME/code
+mkdir -p $NQG_HOME/data
+cd $NQG_HOME/code
+git clone https://github.com/magic282/NQG.git
+cd $NQG_HOME/data
+wget https://res.qyzhou.me/redistribute.zip
+unzip redistribute.zip
+```
+Put the data in the folder `$NQG_HOME/data` and organize it as:
+```
+nqg
+├── code
+│   └── NQG
+│       └── seq2seq_pt
+└── data
+    └── redistribute
+        ├── QG
+        │   ├── dev
+        │   ├── test
+        │   ├── test_sample
+        │   └── train
+        └── raw
+```
+Then collect vocabularies:
+```bash
+python $NQG_HOME/code/NQG/seq2seq_pt/CollectVocab.py \
+       $NQG_HOME/data/redistribute/QG/train/train.txt.source.txt \
+       $NQG_HOME/data/redistribute/QG/train/train.txt.target.txt \
+       $NQG_HOME/data/redistribute/QG/train/vocab.txt
+python $NQG_HOME/code/NQG/seq2seq_pt/CollectVocab.py \
+       $NQG_HOME/data/redistribute/QG/train/train.txt.bio \
+       $NQG_HOME/data/redistribute/QG/train/bio.vocab.txt
+python $NQG_HOME/code/NQG/seq2seq_pt/CollectVocab.py \
+       $NQG_HOME/data/redistribute/QG/train/train.txt.pos \
+       $NQG_HOME/data/redistribute/QG/train/train.txt.ner \
+       $NQG_HOME/data/redistribute/QG/train/train.txt.case \
+       $NQG_HOME/data/redistribute/QG/train/feat.vocab.txt
+head -n 20000 $NQG_HOME/data/redistribute/QG/train/vocab.txt > $NQG_HOME/data/redistribute/QG/train/vocab.txt.20k
+```
+
+### Set up the environment
+#### Package Requirements:
+```
+nltk scipy numpy pytorch
+```
+**PyTorch version**: This code requires PyTorch v0.4.0.
+
+**Python version**: This code requires Python3.
+
+**Warning**: Older versions of NLTK have a bug in the PorterStemmer. Therefore, a fresh installation or update of NLTK is recommended.
+
+A Docker image is also provided.
+#### Docker image
+```bash
+docker pull magic282/pytorch:0.4.0
+```
+### Run training
+The file `run_squad_qg.sh` is an example. Modify it according to your configuration.
+#### Without Docker
+```bash
+bash $NQG_HOME/code/NQG/seq2seq_pt/run_squad_qg.sh $NQG_HOME/data/redistribute/QG $NQG_HOME/code/NQG/seq2seq_pt
+```
+#### With Docker
+```bash
+nvidia-docker run --rm -ti -v $NQG_HOME:/workspace magic282/pytorch:0.4.0
+```
+Then inside the docker:
+```bash
+bash code/NQG/seq2seq_pt/run_squad_qg.sh /workspace/data/redistribute/QG /workspace/code/NQG/seq2seq_pt
+```
@@ -0,0 +1,61 @@
+from __future__ import division
+import sys
+import operator
+
+DefaultSpecialWords = ["<blank>", "<unk>", "<s>", "</s>"]
+
+
+def Collect(inputFiles, vocabPath, toLower=False, userDefineSpecial=None):
+    """Build a frequency-ranked vocabulary file from a set of text files.
+
+    The special words are written first, one per line as ``word id``.
+    Every other token follows in descending frequency order as
+    ``word id count cumulative_ratio``, where the ratio is the running
+    sum of written counts divided by the total count of all tokens
+    (occurrences of special words are included in that total).
+
+    Args:
+        inputFiles: Paths of the text files to count tokens in.
+        vocabPath: Path of the vocabulary file to write (UTF-8).
+        toLower: If true, lowercase the text before counting.
+        userDefineSpecial: Optional list of special words; duplicates are
+            dropped, keeping the first occurrence. When empty or None,
+            ``DefaultSpecialWords`` is used instead.
+    """
+    if userDefineSpecial:
+        special_tokens = []
+        for candidate in userDefineSpecial:
+            if candidate not in special_tokens:
+                special_tokens.append(candidate)
+    else:
+        special_tokens = DefaultSpecialWords
+
+    frequencies = CollectVocab(inputFiles, toLower)
+    total_count = sum(frequencies.values())
+    ranked = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)
+
+    with open(vocabPath, 'w', encoding='utf-8') as out:
+        next_id = 0
+        # Special words take the lowest ids and carry no statistics.
+        for token in special_tokens:
+            out.write("{0} {1}\n".format(token, next_id))
+            next_id += 1
+
+        running = 0
+        for token, freq in ranked:
+            # Special words were already written above; do not repeat them.
+            if token in special_tokens:
+                continue
+            running += freq
+            out.write("{0} {1} {2} {3}\n".format(token, next_id, freq, 1.0 * running / total_count))
+            next_id += 1
+
+
+def CollectVocab(files, toLower):
+    """Count whitespace-separated tokens across several UTF-8 text files.
+
+    Args:
+        files: Iterable of paths to text files, read with UTF-8 encoding.
+        toLower: If true, lowercase each line before tokenizing.
+
+    Returns:
+        A plain dict mapping each token to its number of occurrences over
+        all files (keys are in first-seen order on Python 3.7+).
+    """
+    # Local import keeps this function self-contained.
+    from collections import Counter
+
+    counts = Counter()
+    for path in files:
+        with open(path, encoding='utf-8') as reader:
+            for line in reader:
+                if toLower:
+                    line = line.lower()
+                # str.split() with no separator ignores leading/trailing
+                # whitespace and never yields empty tokens, so no extra
+                # strip() or filtering is needed.
+                counts.update(line.split())
+    # Return a plain dict so lookups of unseen tokens still raise KeyError.
+    return dict(counts)
+
+
+if __name__ == "__main__":
+    # Every argument except the last is an input file and the last one is
+    # the output vocabulary path, so at least two arguments are required.
+    if len(sys.argv) < 3:
+        print('CollectVocab.py: Collect vocabulary from multiple files.')
+        print('Usage:')
+        print('python CollectVocab.py file_1 file_2 ... file_n out.vocab.txt')
+    else:
+        input_paths, output_path = sys.argv[1:-1], sys.argv[-1]
+        Collect(input_paths, output_path, False, ["<blank>", "<unk>", "<s>", "</s>"])
@@ -0,0 +1,4 @@
+from __future__ import absolute_import
+import nltk_bleu_score
+
+__version__ = "0.0.1"