A bunch of random stuff. Add a hacky way of showing some other top results for the fullret model. Add some examples. Add a risk field to some examples. Adjust the stackexchange download scripts to include multiple networks.
DNGros committed Mar 30, 2019
1 parent e1c0c27 commit 1860efe
Showing 16 changed files with 179 additions and 67 deletions.
20 changes: 18 additions & 2 deletions ainix_kernel/models/Fullretrieval/fullretmodel.py
@@ -99,17 +99,33 @@ def predict(
example_ref = self.example_refs[top_inds[max_score_ind]]
ref_ast = example_ref.reference_ast


valid_for_copy_mask = get_valid_for_copy_mask(tokens)
ast_with_new_copies = self._apply_copy_changes(
ref_ast, example_ref.copy_refs, memory, valid_for_copy_mask, tokens[0])

other_options = []
for ind, sim in zip(top_inds, top_sims):
try:
new_example_ref = self.example_refs[ind]
new_ref_ast = new_example_ref.reference_ast

new_valid_for_copy_mask = get_valid_for_copy_mask(tokens)
new_ast_with_new_copies = self._apply_copy_changes(
new_ref_ast, new_example_ref.copy_refs, memory, new_valid_for_copy_mask,
tokens[0])
other_options.append((math.log(float(sim)), new_ast_with_new_copies))
except Exception:
pass


metad = TypeTranslatePredictMetadata(
    (math.log(float(top_sims[0])), ),
    (ExampleRetrieveExplanation(
        tuple([self.example_refs[int(ind)].x_val_id for ind in top_inds]),
        tuple([math.log(float(sim)) for sim in top_sims]),
        None),
    ),
    other_options=other_options
)
return ast_with_new_copies, metad

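Each runner-up in `other_options` is stored as a `(log similarity, AST)` pair. As a minimal sketch of how a consumer might turn those pairs back into ranked, de-duplicated command strings — the `unparse` callable is an assumption standing in for the repo's unparser, and note that `predict()` appends candidates in `top_inds` order, which is not guaranteed sorted (the code above takes an argmax over it):

```
import math
from typing import Callable, List, Tuple

def rank_other_options(
    other_options: List[Tuple[float, object]],  # (log similarity, AST) pairs
    unparse: Callable[[object], str],           # assumed AST -> command string
    top_k: int = 3,
) -> List[Tuple[float, str]]:
    # Sort by log similarity (order-preserving under exp), then de-duplicate
    # on the unparsed string, keeping at most top_k distinct candidates.
    ranked = sorted(other_options, key=lambda pair: pair[0], reverse=True)
    seen, out = set(), []
    for log_sim, ast in ranked:
        s = unparse(ast).strip()
        if s in seen:
            continue
        seen.add(s)
        out.append((math.exp(log_sim), s))
        if len(out) >= top_k:
            break
    return out
```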
1 change: 1 addition & 0 deletions ainix_kernel/models/model_types.py
@@ -207,6 +207,7 @@ class TypeTranslatePredictMetadata:
"""
log_confidences: Tuple[float, ...]
example_retrieve_explanations: Tuple['ExampleRetrieveExplanation', ...]
other_options: List[Tuple[float, ObjectChoiceNode]] = None

@property
def total_confidence(self) -> float:
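For context, a self-contained sketch of the amended dataclass. Two details are assumptions rather than facts from the diff: the `Optional[...]` annotation (the committed line types a `None` default as a bare `List`), and the body of `total_confidence`, which falls outside this hunk — summing per-step log confidences is one plausible reading:

```
from dataclasses import dataclass
from typing import List, Optional, Tuple

# Stand-ins for classes defined elsewhere in the codebase.
class ObjectChoiceNode: ...
class ExampleRetrieveExplanation: ...

@dataclass
class TypeTranslatePredictMetadata:
    log_confidences: Tuple[float, ...]
    example_retrieve_explanations: Tuple[ExampleRetrieveExplanation, ...]
    # New in this commit: (log confidence, candidate AST) pairs for runner-ups.
    other_options: Optional[List[Tuple[float, ObjectChoiceNode]]] = None

    @property
    def total_confidence(self) -> float:
        # Assumed behavior: independent log confidences combine by summing.
        return sum(self.log_confidences)
```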
1 change: 1 addition & 0 deletions ainix_kernel/specialtypes/generic_strings.py
@@ -19,6 +19,7 @@
WORD_PART_MODIFIER_ARG_NAME = "modifier"
WORD_PART_NEXT_ARG_NAME = "next_part"


def create_generic_strings(type_context: TypeContext):
"""Main public interface for creating the appropriate types inside context"""
_create_root_types(type_context)
2 changes: 2 additions & 0 deletions ainix_kernel/training/fullret_try.py
@@ -1,6 +1,7 @@
import math

from ainix_common.parsing.stringparser import AstUnparser
from ainix_kernel.indexing.examplestore import DataSplits
from ainix_kernel.models.Fullretrieval.fullretmodel import full_ret_from_example_store
from ainix_kernel.training.evaluate import EvaluateLogger, print_ast_eval_log
from ainix_kernel.training.trainer import TypeTranslateCFTrainer, get_examples
@@ -13,6 +14,7 @@ def train_the_thing():
pretrained_checkpoint_path = f"{dir_path}/../../checkpoints/" \
"lmchkp_30epoch2rnn_merge_toks_total_2.922_ns0.424_lm2.4973.pt"
type_context, index, replacers, loader = get_examples()
print(f"count {len(list(index.get_all_examples((DataSplits.VALIDATION,))))}")
model = full_ret_from_example_store(index, replacers, pretrained_checkpoint_path)
return model, index, replacers, type_context, loader

98 changes: 49 additions & 49 deletions ainix_kernel/training/opennmt/expir3.sh
@@ -8,56 +8,56 @@ REPLACE_SAMPLES=10
WORD_VEC_SIZE=300
TRAIN_STEPS=7500

#echo "Exporting latest data"
#cd ../../..
#python3 -m ainix_kernel.training.export_data \
# --replace_samples ${REPLACE_SAMPLES} \
# || exit 1
#mv data_train* ./ainix_kernel/training/opennmt
#mv data_val* ./ainix_kernel/training/opennmt
#cd ./ainix_kernel/training/opennmt
#
#echo "Preproc data"
#rm expirs/exp1*
#python3 ./OpenNMT-py/preprocess.py \
# -train_src data_train_x.txt \
# -train_tgt data_train_y.txt \
# -valid_src data_val_x.txt \
# -valid_tgt data_val_y.txt \
# --save_data expirs/exp1 \
# --src_words_min_frequency 3 \
# --tgt_words_min_frequency 3 \
# || exit 1
echo "Exporting latest data"
cd ../../..
python3 -m ainix_kernel.training.export_data \
--replace_samples ${REPLACE_SAMPLES} \
|| exit 1
mv data_train* ./ainix_kernel/training/opennmt
mv data_val* ./ainix_kernel/training/opennmt
cd ./ainix_kernel/training/opennmt

echo "Preproc data"
rm expirs/exp1*
python3 ./OpenNMT-py/preprocess.py \
-train_src data_train_x.txt \
-train_tgt data_train_y.txt \
-valid_src data_val_x.txt \
-valid_tgt data_val_y.txt \
--save_data expirs/exp1 \
--src_words_min_frequency 3 \
--tgt_words_min_frequency 3 \
|| exit 1

#echo "prepare glove"
#cd ./OpenNMT-py/
#python3 -m tools.embeddings_to_torch \
# -emb_file_both "../glove_dir/glove.840B.${WORD_VEC_SIZE}d.txt" \
# -dict_file "../expirs/exp1.vocab.pt" \
# -output_file "../data/embeddings" \
# || exit 1
#cd ..
#
#echo "Train"
#data_size=$(wc -l < data_train_x.txt)
##steps_to_do=$[(TRAIN_EPOCHS*BATCH_SIZE)/REPLACE_SAMPLES/BATCH_SIZE]
#echo ${steps_to_do}
#CUDA_VISIBLE_DEVICES=0 python3 ./OpenNMT-py/train.py \
# -data expirs/exp1 \
# -save_model data/demo-model \
# --src_word_vec_size 64 \
# --tgt_word_vec_size 64 \
# --rnn_size 128 \
# --batch_size ${BATCH_SIZE} \
# --train_steps ${TRAIN_STEPS} \
# --report_every 50 \
# --start_decay_steps 4000 \
# --decay_steps 2000 \
# --gpu_rank 0 \
# --word_vec_size ${WORD_VEC_SIZE} \
# --pre_word_vecs_enc "data/embeddings.enc.pt" \
# --pre_word_vecs_dec "data/embeddings.dec.pt" \
# || exit 1
echo "prepare glove"
cd ./OpenNMT-py/
python3 -m tools.embeddings_to_torch \
-emb_file_both "../glove_dir/glove.840B.${WORD_VEC_SIZE}d.txt" \
-dict_file "../expirs/exp1.vocab.pt" \
-output_file "../data/embeddings" \
|| exit 1
cd ..

echo "Train"
data_size=$(wc -l < data_train_x.txt)
#steps_to_do=$[(TRAIN_EPOCHS*BATCH_SIZE)/REPLACE_SAMPLES/BATCH_SIZE]
echo ${steps_to_do}
CUDA_VISIBLE_DEVICES=0 python3 ./OpenNMT-py/train.py \
-data expirs/exp1 \
-save_model data/demo-model \
--src_word_vec_size 64 \
--tgt_word_vec_size 64 \
--rnn_size 128 \
--batch_size ${BATCH_SIZE} \
--train_steps ${TRAIN_STEPS} \
--report_every 50 \
--start_decay_steps 4000 \
--decay_steps 2000 \
--gpu_rank 0 \
--word_vec_size ${WORD_VEC_SIZE} \
--pre_word_vecs_enc "data/embeddings.enc.pt" \
--pre_word_vecs_dec "data/embeddings.dec.pt" \
|| exit 1
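
An aside on the `steps_to_do` lines above: as committed, the assignment stays commented out, so the `echo` prints an empty string, and the `BATCH_SIZE` terms in the formula cancel each other. A sketch of the usual epochs-to-steps conversion, assuming that is what was intended (`TRAIN_EPOCHS` is not set anywhere in the shown hunk, and the example values below are invented):

```
def steps_for_epochs(train_epochs: int, data_size: int, batch_size: int) -> int:
    # One optimizer step consumes one batch, so an epoch is
    # data_size / batch_size steps. Hypothetical replacement for the
    # commented-out shell arithmetic; names mirror the script's variables.
    return (train_epochs * data_size) // batch_size

# e.g. with data_size taken from `wc -l < data_train_x.txt`:
steps_for_epochs(10, 6000, 8)  # -> 7500, the same order as TRAIN_STEPS=7500
```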

echo "Predict"
python3 ./OpenNMT-py/translate.py \
2 changes: 1 addition & 1 deletion ainix_kernel/training/trainer.py
@@ -16,7 +16,7 @@
from ainix_kernel.specialtypes import allspecials
from ainix_kernel.training.model_specific_training import update_latent_store_from_examples
from ainix_kernel.training.train_contexts import ALL_EXAMPLE_NAMES, load_all_examples, \
load_tellia_examples
load_tellia_examples, load_all_and_tellina
from ainix_kernel.util.sampling import WeightedRandomChooser
from ainix_kernel.util.serialization import serialize
from tqdm import tqdm
31 changes: 31 additions & 0 deletions aish/shell.py
@@ -19,6 +19,7 @@
from aish.parser import BashParser
from terminaltables import SingleTable
import colorama
import math

builtin_dict = {}

@@ -86,9 +87,39 @@ def do_predict(self, in_x: str) -> Tuple[Optional[str], float]:
f"{colorama.Fore.MAGENTA}{pred_result.unparse.total_string.strip()}"
f"{colorama.Fore.RESET} "
f"(confidence score {pred_result.metad.total_confidence:.2f} {conf_emoji} )")
# actual method
#self.do_example_retrieve_explanation(
# pred_result.metad.example_retrieve_explanations, pred_result.ast,
# pred_result.unparse)

# hacky for fullret

# Hackily just grab things out of the interface. This should be improved
#index = self.kernel_interface.example_store
#retr_explan = pred_result.metad.example_retrieve_explanations[0]
#for sim, example_id in zip(
# retr_explan.reference_confidence,
# retr_explan.reference_example_ids
#):
# example = index.get_example_by_id(example_id)
# yvs = index.get_y_values_for_y_set(example.y_set_id)
# print(math.exp(sim), yvs[0].y_text)
print("Other possible options:")
printed = set([pred_result.unparse.total_string.strip()])
for sim, other_ast in pred_result.metad.other_options[1:]:
try:
s = self.kernel_interface.unparser.to_string(other_ast).total_string.strip()
if s in printed:
continue
print(math.exp(sim), s)
printed.add(s)
except Exception:
# panic! time
pass
if len(printed) >= 3:
break
print("-")

if total_confidence > PROMPT_CONF_THRESHOLD:
pass
return pred_result.unparse.total_string.strip(), total_confidence
31 changes: 31 additions & 0 deletions builtin_types/find_examples.ainix.yaml
@@ -8,99 +8,118 @@ defines:
- recursively search for files and dirs named [-[ENGWORD]-].txt
y:
- 'find . -name "[-[ENGWORD]-].txt"'
risk: 1
- x:
- recursively print all directories here
- what are all the directories anywhere on this path?
y:
- 'find . -type d'
risk: 1
- x:
- recursively find all files that start with "[-[ENGWORD]-]"
- Starting here, find all files which start with "[-[ENGWORD]-]"
y:
- 'find . -name "[-[ENGWORD]-]*" -type f'
risk: 1
- x:
- recursively find all files that end with "[-[ENGWORD]-]"
- Starting here, find all files which end with "[-[ENGWORD]-]"
- find files that end with "[-[ENGWORD]-]" starting here
y:
- 'find . -name "*[-[ENGWORD]-]" -type f'
risk: 1
- x:
- find all files that start with the letter "[-[LETTER]-]"
- recursively find files that start with "[-[LETTER]-]"
y:
# TODO: once user queries are supported, ask if care about case?
- 'find . -name "[-[LETTER]-]*" -type f'
risk: 1
- x:
- recursively find files that contain "[-[ENGWORD]-]" in their name
y:
- 'find . -name "*[-[ENGWORD]-]*" -type f'
risk: 1
- x:
- recursively find files with "[-[LETTER]-][-[ENGWORD]-]" in name
y:
- 'find . -name "*[-[LETTER]-][-[ENGWORD]-]*" -type f'
risk: 1
- x:
- recursively find all directories that end with "[-[ENGWORD]-]" starting here
y:
- 'find . -name "*[-[ENGWORD]-]" -type d'
risk: 1
- x:
- recursively find all python files
y:
- 'find . -name "*.py" -type f'
risk: 1
- x:
- recursively find all javascript files
- find all javascript files starting here
y:
- 'find . -name "*.js" -type f'
risk: 1
- x:
- recursively find all [-[EXTENSION]-] files starting here
- recursively find all [-[EXTENSION]-] files
- find all [-[EXTENSION]-] files
- find all "[-[EXTENSION]-]" files
y:
- 'find . -name "*.[-[EXTENSION]-]" -type f'
risk: 1
- x:
- recursively find all .[-[EXTENSION]-] files starting here
- recursively find all ".[-[EXTENSION]-]" files starting here
- find all .[-[EXTENSION]-] files
y:
- 'find . -name "*.[-[EXTENSION]-]" -type f'
risk: 1
- x:
- find all hidden files starting here
- recursively list all hidden files here
- recursively find all hidden files
y:
- 'find . -name ".*" -type f'
risk: 1
- x:
- find all hidden files starting in [-[DIRNAME]-]
y:
- 'find [-[DIRNAME]-] -name ".*" -type f'
risk: 1
- x:
- find all hidden files starting in my tmp directory
y:
- 'find /tmp -name ".*" -type f'
risk: 1
- x:
- find all files starting here which belong to [-[USERNAME]-]
- recursively find files that belong to [-[USERNAME]-]
- what files here belong to [-[USERNAME]-]?
y:
- 'find . -type f -user [-[USERNAME]-]'
risk: 1
- x:
- search for all files in the whole system that belong to [-[USERNAME]-]
- starting from root find all files that belong to [-[USERNAME]-]
y:
- 'find / -type f -user [-[USERNAME]-]'
risk: 1
- x:
- find all files here which belong to group [-[GROUPNAME]-]
- look for files with group name [-[GROUPNAME]-]
y:
- 'find . -type f -group [-[GROUPNAME]-]'
risk: 1
- x:
- how many [-[EXTENSION]-] files are here
- count the number of [-[EXTENSION]-] files here
- how many .[-[EXTENSION]-] files are here
- how many *.[-[EXTENSION]-] files are in this dir
y:
- 'find . -name "*.[-[EXTENSION]-]" -type f -maxdepth 1 | wc -l'
risk: 0
# We don't actually do anything with this warn parameter, but it seems useful
warn: 'If any files contain new lines in their name, this might give incorrect results.'
- x:
@@ -110,12 +129,24 @@ defines:
- how many files in this directory
y:
- 'find . -type f -maxdepth 1 | wc -l'
risk: 0
warn: 'If any files contain new lines in their name, this might give incorrect results.'
- x:
- how many files are in this directory and all subdirectories
- how many files are here recursive
- recursively count number files here
- how many files in this folder or all subfolders
y:
- 'find . -type f | wc -l'
warn: 'If any files contain new lines in their name, this might give incorrect results.'
risk: 1
- x:
- how many empty files are here nonrecursive
y:
- 'find . -type f -empty -maxdepth 1 | wc -l'
risk: 0
- x:
- how many empty files are here recursive
y:
- 'find . -type f -empty | wc -l'
risk: 1
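
Nothing in this commit consumes the new `risk` field (the inline comment above admits the same for `warn`), though the pattern in the data is visible: non-recursive counting commands get `risk: 0`, recursive ones get `risk: 1`. Purely as a hypothetical illustration of where this could go, a policy function a shell might apply, with the threshold and names invented here:

```
def should_prompt_before_running(risk: int, confidence: float,
                                 conf_threshold: float = 0.8) -> bool:
    # Hypothetical policy, not in this commit: auto-run only commands the
    # example data marks risk 0 when the model is confident; prompt the
    # user for everything else.
    return risk > 0 or confidence < conf_threshold
```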
4 changes: 2 additions & 2 deletions builtin_types/otherdata/stackexchange/README.md
@@ -18,7 +18,7 @@ Depending on what you are doing you might not need all these.
$ ./download_data.sh
# Split it into a txt file which has a sentence per line
# and an empty line between documents (this is like what BERT wants)
$ python3 split_stacke_data.py -s unix-stackexchange/Posts.xml -o unix-stackexchange/sentences.txt
$ ./split_all_stacke.sh

# Train a word tokenizer

@@ -44,4 +44,4 @@ $ cat sentences_no_blank_lines.txt | spm_encode --model=testmod_upper_2000.model
$ $PATH_TO_FASTTEXT skipgram -input sentences_tokenized_with_upper_2000.txt -output fasttext/m3 -thread 8 -maxn 0


```
```
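
`split_all_stacke.sh` itself is not shown in this diff. A sketch of the loop it plausibly wraps, built from the single-network command this README previously documented; the network list is an assumption based only on the commit message's "multiple networks":

```
import subprocess

# Assumed network list; the commit only says "multiple networks".
NETWORKS = ["unix-stackexchange", "superuser", "askubuntu"]

for network in NETWORKS:
    subprocess.run(
        ["python3", "split_stacke_data.py",
         "-s", f"{network}/Posts.xml",
         "-o", f"{network}/sentences.txt"],
        check=True,  # mirror the repo scripts' `|| exit 1` convention
    )
```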