Commit 70d9a55

Add ROUGE score calculation and Python150k reproduction.
1 parent be7b040 commit 70d9a55

7 files changed: +356 −3 lines


Python150kExtractor/README.md (+70 lines)

@@ -0,0 +1,70 @@
# Python150k dataset

## Steps to reproduce

1. Download the parsed Python dataset from [here](https://www.sri.inf.ethz.ch/py150), unarchive it, and place it under `PYTHON150K_DIR`:

```bash
# Replace with desired path.
>>> PYTHON150K_DIR=/path/to/data/dir
>>> mkdir -p $PYTHON150K_DIR
>>> cd $PYTHON150K_DIR
>>> wget http://files.srl.inf.ethz.ch/data/py150.tar.gz
...
>>> tar -xzvf py150.tar.gz
...
```

2. Extract samples to `DATA_DIR`:

```bash
# Replace with desired path.
>>> DATA_DIR=$(pwd)/data/default
>>> SEED=239
>>> python extract.py \
  --data_dir=$PYTHON150K_DIR \
  --output_dir=$DATA_DIR \
  --seed=$SEED
...
```

3. Preprocess for training:

```bash
>>> ./preprocess.sh $DATA_DIR
...
```

4. Train:

```bash
>>> cd ..
>>> DESC=default
>>> CUDA=0
>>> ./train_python150k.sh $DATA_DIR $DESC $CUDA $SEED
...
```

## Test results (seed=239)

### Best scores

**setup#2**: `batch_size=64`
**setup#3**: `embedding_size=256,use_momentum=False`
**setup#4**: `batch_size=32,embedding_size=256,embeddings_dropout_keep_prob=0.5,use_momentum=False`

| params | Precision | Recall | F1 | ROUGE-2 | ROUGE-L |
|---|---|---|---|---|---|
| default | 0.37 | 0.27 | 0.31 | 0.06 | 0.38 |
| setup#2 | 0.40 | 0.31 | 0.34 | 0.08 | 0.41 |
| setup#3 | 0.36 | 0.31 | 0.33 | 0.09 | 0.38 |
| setup#4 | 0.33 | 0.25 | 0.28 | 0.05 | 0.34 |

### Ablation studies

| params | Precision | Recall | F1 | ROUGE-2 | ROUGE-L |
|---|---|---|---|---|---|
| default | 0.37 | 0.27 | 0.31 | 0.06 | 0.38 |
| no ast nodes (5th epoch) | 0.27 | 0.16 | 0.20 | 0.02 | 0.28 |
| no token split (4th epoch) | 0.60 | 0.09 | 0.15 | 0.00 | 0.60 |
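For orientation, the Precision/Recall/F1 columns above follow code2seq's subtoken-level scoring of predicted method names against references, while the ROUGE-2/ROUGE-L columns use the `rouge` package this commit adds as a dependency (see the sketch after the code2seq.py diff below). A minimal sketch of the subtoken-style metric, using made-up names; this is not the repository's exact evaluation code:

```python
# Hedged sketch of subtoken-level precision/recall/F1 over predicted vs.
# reference method names; the example data is hypothetical.
def subtoken_prf(predicted, reference):
    # predicted/reference: lists of subtoken lists, e.g. [['get', 'file', 'name'], ...]
    tp = fp = fn = 0
    for pred, ref in zip(predicted, reference):
        pred_set, ref_set = set(pred), set(ref)
        tp += len(pred_set & ref_set)
        fp += len(pred_set - ref_set)
        fn += len(ref_set - pred_set)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

# One prediction 'get|file|name' against reference 'get|file|path'.
print(subtoken_prf([['get', 'file', 'name']], [['get', 'file', 'path']]))
```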

Python150kExtractor/extract.py (+193 lines)

@@ -0,0 +1,193 @@
import argparse
import re
import json
import multiprocessing
import itertools
import tqdm
import joblib
import numpy as np

from pathlib import Path
from sklearn import model_selection as sklearn_model_selection

METHOD_NAME, NUM = 'METHODNAME', 'NUM'

parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', required=True, type=str)
parser.add_argument('--valid_p', type=float, default=0.2)
parser.add_argument('--max_path_length', type=int, default=8)
parser.add_argument('--max_path_width', type=int, default=2)
parser.add_argument('--use_method_name', type=bool, default=True)
parser.add_argument('--use_nums', type=bool, default=True)
parser.add_argument('--output_dir', required=True, type=str)
parser.add_argument('--n_jobs', type=int, default=multiprocessing.cpu_count())
parser.add_argument('--seed', type=int, default=239)


def __collect_asts(json_file):
    asts = []
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            ast = json.loads(line.strip())
            asts.append(ast)

    return asts


def __terminals(ast, node_index, args):
    stack, paths = [], []

    def dfs(v):
        stack.append(v)

        v_node = ast[v]

        if 'value' in v_node:
            if v == node_index:  # Top-level func def node.
                if args.use_method_name:
                    paths.append((stack.copy(), METHOD_NAME))
            else:
                v_type = v_node['type']

                if v_type.startswith('Name'):
                    paths.append((stack.copy(), v_node['value']))
                elif args.use_nums and v_type == 'Num':
                    paths.append((stack.copy(), NUM))
                else:
                    pass

        if 'children' in v_node:
            for child in v_node['children']:
                dfs(child)

        stack.pop()

    dfs(node_index)

    return paths


def __merge_terminals2_paths(v_path, u_path):
    s, n, m = 0, len(v_path), len(u_path)
    while s < min(n, m) and v_path[s] == u_path[s]:
        s += 1

    prefix = list(reversed(v_path[s:]))
    lca = v_path[s - 1]
    suffix = u_path[s:]

    return prefix, lca, suffix


def __raw_tree_paths(ast, node_index, args):
    tnodes = __terminals(ast, node_index, args)

    tree_paths = []
    for (v_path, v_value), (u_path, u_value) in itertools.combinations(
            iterable=tnodes,
            r=2,
    ):
        prefix, lca, suffix = __merge_terminals2_paths(v_path, u_path)
        if (len(prefix) + 1 + len(suffix) <= args.max_path_length) \
                and (abs(len(prefix) - len(suffix)) <= args.max_path_width):
            path = prefix + [lca] + suffix
            tree_path = v_value, path, u_value
            tree_paths.append(tree_path)

    return tree_paths


def __delim_name(name):
    if name in {METHOD_NAME, NUM}:
        return name

    def camel_case_split(identifier):
        matches = re.finditer(
            '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
            identifier,
        )
        return [m.group(0) for m in matches]

    blocks = []
    for underscore_block in name.split('_'):
        blocks.extend(camel_case_split(underscore_block))

    return '|'.join(block.lower() for block in blocks)


def __collect_sample(ast, fd_index, args):
    root = ast[fd_index]
    if root['type'] != 'FunctionDef':
        raise ValueError('Wrong node type.')

    target = root['value']

    tree_paths = __raw_tree_paths(ast, fd_index, args)
    contexts = []
    for tree_path in tree_paths:
        start, connector, finish = tree_path

        start, finish = __delim_name(start), __delim_name(finish)
        connector = '|'.join(ast[v]['type'] for v in connector)

        context = f'{start},{connector},{finish}'
        contexts.append(context)

    if len(contexts) == 0:
        return None

    target = __delim_name(target)
    context = ' '.join(contexts)

    return f'{target} {context}'


def __collect_samples(ast, args):
    samples = []
    for node_index, node in enumerate(ast):
        if node['type'] == 'FunctionDef':
            sample = __collect_sample(ast, node_index, args)
            if sample is not None:
                samples.append(sample)

    return samples


def __collect_all_and_save(asts, args, output_file):
    parallel = joblib.Parallel(n_jobs=args.n_jobs)
    func = joblib.delayed(__collect_samples)

    samples = parallel(func(ast, args) for ast in tqdm.tqdm(asts))
    samples = list(itertools.chain.from_iterable(samples))

    with open(output_file, 'w') as f:
        for line_index, line in enumerate(samples):
            f.write(line + ('' if line_index == len(samples) - 1 else '\n'))


def main():
    args = parser.parse_args()
    np.random.seed(args.seed)

    data_dir = Path(args.data_dir)
    trains = __collect_asts(data_dir / 'python100k_train.json')
    evals = __collect_asts(data_dir / 'python50k_eval.json')

    train, valid = sklearn_model_selection.train_test_split(
        trains,
        test_size=args.valid_p,
    )
    test = evals

    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True)
    for split_name, split in zip(
            ('train', 'valid', 'test'),
            (train, valid, test),
    ):
        output_file = output_dir / f'{split_name}_output_file.txt'
        __collect_all_and_save(split, args, output_file)


if __name__ == '__main__':
    main()
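To make the extracted sample format concrete: each line written by `__collect_all_and_save` is the `|`-joined target subtokens followed by space-separated contexts of the form `start,Type|Type|...,finish`. The snippet below mirrors the snake_case/camelCase splitting logic of `__delim_name` in isolation; the helper name `split_identifier` is invented for illustration:

```python
# Standalone sketch of the identifier splitting used in __delim_name;
# the function name here is hypothetical.
import re


def split_identifier(name):
    def camel_case_split(identifier):
        matches = re.finditer(
            '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
            identifier,
        )
        return [m.group(0) for m in matches]

    blocks = []
    for underscore_block in name.split('_'):
        blocks.extend(camel_case_split(underscore_block))
    return '|'.join(block.lower() for block in blocks)


print(split_identifier('getFileName'))   # get|file|name
print(split_identifier('read_config'))   # read|config
```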

Python150kExtractor/preprocess.sh (+38 lines)

@@ -0,0 +1,38 @@
#!/usr/bin/env bash

MAX_CONTEXTS=200
MAX_DATA_CONTEXTS=1000
SUBTOKEN_VOCAB_SIZE=186277
TARGET_VOCAB_SIZE=26347

data_dir=${1:-data}
mkdir -p "${data_dir}"
train_data_file=$data_dir/train_output_file.txt
valid_data_file=$data_dir/valid_output_file.txt
test_data_file=$data_dir/test_output_file.txt

echo "Creating histograms from the training data..."
target_histogram_file=$data_dir/histo.tgt.c2s
source_subtoken_histogram=$data_dir/histo.ori.c2s
node_histogram_file=$data_dir/histo.node.c2s
cut <"${train_data_file}" -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' >"${target_histogram_file}"
cut <"${train_data_file}" -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' >"${source_subtoken_histogram}"
cut <"${train_data_file}" -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' >"${node_histogram_file}"

echo "Preprocessing..."
python ../preprocess.py \
  --train_data "${train_data_file}" \
  --val_data "${valid_data_file}" \
  --test_data "${test_data_file}" \
  --max_contexts ${MAX_CONTEXTS} \
  --max_data_contexts ${MAX_DATA_CONTEXTS} \
  --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \
  --target_vocab_size ${TARGET_VOCAB_SIZE} \
  --target_histogram "${target_histogram_file}" \
  --subtoken_histogram "${source_subtoken_histogram}" \
  --node_histogram "${node_histogram_file}" \
  --output_name "${data_dir}"/"$(basename "${data_dir}")"
rm \
  "${target_histogram_file}" \
  "${source_subtoken_histogram}" \
  "${node_histogram_file}"

README.md (+1 line)

@@ -38,6 +38,7 @@ Table of Contents
 > python3 -c 'import tensorflow as tf; print(tf.\_\_version\_\_)'
 * For [creating a new Java dataset](#creating-and-preprocessing-a-new-java-dataset) or [manually examining a trained model](#step-4-manual-examination-of-a-trained-model) (any operation that requires parsing of a new code example): [JDK](https://openjdk.java.net/install/)
 * For creating a C# dataset: [dotnet-core](https://dotnet.microsoft.com/download) version 2.2 or newer.
+* `pip install rouge` for computing ROUGE scores.
 
 ## Quickstart
 ### Step 0: Cloning this repository

code2seq.py (+8 −1 lines)

@@ -1,4 +1,6 @@
 from argparse import ArgumentParser
+import numpy as np
+import tensorflow as tf
 
 from config import Config
 from interactive_predict import InteractivePredictor
@@ -20,8 +22,12 @@
                              'size.')
     parser.add_argument('--predict', action='store_true')
     parser.add_argument('--debug', action='store_true')
+    parser.add_argument('--seed', type=int, default=239)
     args = parser.parse_args()
 
+    np.random.seed(args.seed)
+    tf.set_random_seed(args.seed)
+
     if args.debug:
         config = Config.get_debug_config(args)
     else:
@@ -32,9 +38,10 @@
     if config.TRAIN_PATH:
         model.train()
     if config.TEST_PATH and not args.data_path:
-        results, precision, recall, f1 = model.evaluate()
+        results, precision, recall, f1, rouge = model.evaluate()
         print('Accuracy: ' + str(results))
         print('Precision: ' + str(precision) + ', recall: ' + str(recall) + ', F1: ' + str(f1))
+        print('Rouge: ', rouge)
     if args.predict:
         predictor = InteractivePredictor(config, model)
         predictor.predict()
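The extra `rouge` value unpacked from `model.evaluate()` is produced by model code not shown in the files above; the root README change adds `pip install rouge` for it. A plausible sketch of how such a score could be computed with that package, using made-up predictions and references:

```python
# Hypothetical sketch of computing ROUGE with the `rouge` pip package;
# the actual model.evaluate() implementation is not shown in this excerpt.
from rouge import Rouge

predicted_names = ['get|file|name', 'save|model']   # example model outputs
reference_names = ['get|file|path', 'save|model']   # example ground truth

# ROUGE works on whitespace-separated tokens, so join subtokens with spaces.
hyps = [name.replace('|', ' ') for name in predicted_names]
refs = [name.replace('|', ' ') for name in reference_names]

scores = Rouge().get_scores(hyps, refs, avg=True)
print('Rouge: ', scores)   # averaged rouge-1, rouge-2 and rouge-l f/p/r values
```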
