A bunch of random stuff. Add a hacky way of showing some other top results for the fullret model. Add some examples. Add a risk field to some examples. Adjust the stackexchange download scripts to include multiple networks.
DNGros committed Mar 30, 2019
1 parent e1c0c27 commit 1860efe
Showing 16 changed files with 179 additions and 67 deletions.
20 changes: 18 additions & 2 deletions ainix_kernel/models/Fullretrieval/fullretmodel.py
@@ -99,17 +99,33 @@ def predict(
example_ref = self.example_refs[top_inds[max_score_ind]]
ref_ast = example_ref.reference_ast


valid_for_copy_mask = get_valid_for_copy_mask(tokens)
ast_with_new_copies = self._apply_copy_changes(
ref_ast, example_ref.copy_refs, memory, valid_for_copy_mask, tokens[0])

other_options = []
for ind, sim in zip(top_inds, top_sims):
try:
new_example_ref = self.example_refs[ind]
new_ref_ast = new_example_ref.reference_ast

new_valid_for_copy_mask = get_valid_for_copy_mask(tokens)
new_ast_with_new_copies = self._apply_copy_changes(
new_ref_ast, new_example_ref.copy_refs, memory, new_valid_for_copy_mask,
tokens[0])
other_options.append((math.log(float(sim)), new_ast_with_new_copies))
except Exception:
pass


metad = TypeTranslatePredictMetadata(
    (math.log(float(top_sims[0])), ),
    (ExampleRetrieveExplanation(
        tuple([self.example_refs[int(ind)].x_val_id for ind in top_inds]),
        tuple([math.log(float(sim)) for sim in top_sims]),
        None),
    ),
    other_options=other_options
)
return ast_with_new_copies, metad

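Each runner-up in `other_options` is stored as a `(log similarity, AST)` pair. As a minimal sketch of how a consumer might turn those pairs back into ranked, de-duplicated command strings — the `unparse` callable is an assumption standing in for the repo's unparser, and note that `predict()` appends candidates in `top_inds` order, which is not guaranteed sorted (the code above takes an argmax over it):

```
import math
from typing import Callable, List, Tuple

def rank_other_options(
    other_options: List[Tuple[float, object]],  # (log similarity, AST) pairs
    unparse: Callable[[object], str],           # assumed AST -> command string
    top_k: int = 3,
) -> List[Tuple[float, str]]:
    # Sort by log similarity (order-preserving under exp), then de-duplicate
    # on the unparsed string, keeping at most top_k distinct candidates.
    ranked = sorted(other_options, key=lambda pair: pair[0], reverse=True)
    seen, out = set(), []
    for log_sim, ast in ranked:
        s = unparse(ast).strip()
        if s in seen:
            continue
        seen.add(s)
        out.append((math.exp(log_sim), s))
        if len(out) >= top_k:
            break
    return out
```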
1 change: 1 addition & 0 deletions ainix_kernel/models/model_types.py
@@ -207,6 +207,7 @@ class TypeTranslatePredictMetadata:
"""
log_confidences: Tuple[float, ...]
example_retrieve_explanations: Tuple['ExampleRetrieveExplanation', ...]
other_options: List[Tuple[float, ObjectChoiceNode]] = None

@property
def total_confidence(self) -> float:
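For context, a self-contained sketch of the amended dataclass. Two details are assumptions rather than facts from the diff: the `Optional[...]` annotation (the committed line types a `None` default as a bare `List`), and the body of `total_confidence`, which falls outside this hunk — summing per-step log confidences is one plausible reading:

```
from dataclasses import dataclass
from typing import List, Optional, Tuple

# Stand-ins for classes defined elsewhere in the codebase.
class ObjectChoiceNode: ...
class ExampleRetrieveExplanation: ...

@dataclass
class TypeTranslatePredictMetadata:
    log_confidences: Tuple[float, ...]
    example_retrieve_explanations: Tuple[ExampleRetrieveExplanation, ...]
    # New in this commit: (log confidence, candidate AST) pairs for runner-ups.
    other_options: Optional[List[Tuple[float, ObjectChoiceNode]]] = None

    @property
    def total_confidence(self) -> float:
        # Assumed behavior: independent log confidences combine by summing.
        return sum(self.log_confidences)
```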
1 change: 1 addition & 0 deletions ainix_kernel/specialtypes/generic_strings.py
@@ -19,6 +19,7 @@
WORD_PART_MODIFIER_ARG_NAME = "modifier"
WORD_PART_NEXT_ARG_NAME = "next_part"


def create_generic_strings(type_context: TypeContext):
"""Main public interface for creating the appropriate types inside context"""
_create_root_types(type_context)
2 changes: 2 additions & 0 deletions ainix_kernel/training/fullret_try.py
@@ -1,6 +1,7 @@
import math

from ainix_common.parsing.stringparser import AstUnparser
from ainix_kernel.indexing.examplestore import DataSplits
from ainix_kernel.models.Fullretrieval.fullretmodel import full_ret_from_example_store
from ainix_kernel.training.evaluate import EvaluateLogger, print_ast_eval_log
from ainix_kernel.training.trainer import TypeTranslateCFTrainer, get_examples
@@ -13,6 +14,7 @@ def train_the_thing():
pretrained_checkpoint_path = f"{dir_path}/../../checkpoints/" \
"lmchkp_30epoch2rnn_merge_toks_total_2.922_ns0.424_lm2.4973.pt"
type_context, index, replacers, loader = get_examples()
print(f"count {len(list(index.get_all_examples((DataSplits.VALIDATION,))))}")
model = full_ret_from_example_store(index, replacers, pretrained_checkpoint_path)
return model, index, replacers, type_context, loader

98 changes: 49 additions & 49 deletions ainix_kernel/training/opennmt/expir3.sh
@@ -8,56 +8,56 @@ REPLACE_SAMPLES=10
WORD_VEC_SIZE=300
TRAIN_STEPS=7500

#echo "Exporting latest data"
#cd ../../..
#python3 -m ainix_kernel.training.export_data \
# --replace_samples ${REPLACE_SAMPLES} \
# || exit 1
#mv data_train* ./ainix_kernel/training/opennmt
#mv data_val* ./ainix_kernel/training/opennmt
#cd ./ainix_kernel/training/opennmt
#
#echo "Preproc data"
#rm expirs/exp1*
#python3 ./OpenNMT-py/preprocess.py \
# -train_src data_train_x.txt \
# -train_tgt data_train_y.txt \
# -valid_src data_val_x.txt \
# -valid_tgt data_val_y.txt \
# --save_data expirs/exp1 \
# --src_words_min_frequency 3 \
# --tgt_words_min_frequency 3 \
# || exit 1
echo "Exporting latest data"
cd ../../..
python3 -m ainix_kernel.training.export_data \
--replace_samples ${REPLACE_SAMPLES} \
|| exit 1
mv data_train* ./ainix_kernel/training/opennmt
mv data_val* ./ainix_kernel/training/opennmt
cd ./ainix_kernel/training/opennmt

echo "Preproc data"
rm expirs/exp1*
python3 ./OpenNMT-py/preprocess.py \
-train_src data_train_x.txt \
-train_tgt data_train_y.txt \
-valid_src data_val_x.txt \
-valid_tgt data_val_y.txt \
--save_data expirs/exp1 \
--src_words_min_frequency 3 \
--tgt_words_min_frequency 3 \
|| exit 1

#echo "prepare glove"
#cd ./OpenNMT-py/
#python3 -m tools.embeddings_to_torch \
# -emb_file_both "../glove_dir/glove.840B.${WORD_VEC_SIZE}d.txt" \
# -dict_file "../expirs/exp1.vocab.pt" \
# -output_file "../data/embeddings" \
# || exit 1
#cd ..
#
#echo "Train"
#data_size=$(wc -l < data_train_x.txt)
##steps_to_do=$[(TRAIN_EPOCHS*BATCH_SIZE)/REPLACE_SAMPLES/BATCH_SIZE]
#echo ${steps_to_do}
#CUDA_VISIBLE_DEVICES=0 python3 ./OpenNMT-py/train.py \
# -data expirs/exp1 \
# -save_model data/demo-model \
# --src_word_vec_size 64 \
# --tgt_word_vec_size 64 \
# --rnn_size 128 \
# --batch_size ${BATCH_SIZE} \
# --train_steps ${TRAIN_STEPS} \
# --report_every 50 \
# --start_decay_steps 4000 \
# --decay_steps 2000 \
# --gpu_rank 0 \
# --word_vec_size ${WORD_VEC_SIZE} \
# --pre_word_vecs_enc "data/embeddings.enc.pt" \
# --pre_word_vecs_dec "data/embeddings.dec.pt" \
# || exit 1
echo "prepare glove"
cd ./OpenNMT-py/
python3 -m tools.embeddings_to_torch \
-emb_file_both "../glove_dir/glove.840B.${WORD_VEC_SIZE}d.txt" \
-dict_file "../expirs/exp1.vocab.pt" \
-output_file "../data/embeddings" \
|| exit 1
cd ..

echo "Train"
data_size=$(wc -l < data_train_x.txt)
#steps_to_do=$[(TRAIN_EPOCHS*BATCH_SIZE)/REPLACE_SAMPLES/BATCH_SIZE]
echo ${steps_to_do}
CUDA_VISIBLE_DEVICES=0 python3 ./OpenNMT-py/train.py \
-data expirs/exp1 \
-save_model data/demo-model \
--src_word_vec_size 64 \
--tgt_word_vec_size 64 \
--rnn_size 128 \
--batch_size ${BATCH_SIZE} \
--train_steps ${TRAIN_STEPS} \
--report_every 50 \
--start_decay_steps 4000 \
--decay_steps 2000 \
--gpu_rank 0 \
--word_vec_size ${WORD_VEC_SIZE} \
--pre_word_vecs_enc "data/embeddings.enc.pt" \
--pre_word_vecs_dec "data/embeddings.dec.pt" \
|| exit 1
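
An aside on the `steps_to_do` lines above: as committed, the assignment stays commented out, so the `echo` prints an empty string, and the `BATCH_SIZE` terms in the formula cancel each other. A sketch of the usual epochs-to-steps conversion, assuming that is what was intended (`TRAIN_EPOCHS` is not set anywhere in the shown hunk, and the example values below are invented):

```
def steps_for_epochs(train_epochs: int, data_size: int, batch_size: int) -> int:
    # One optimizer step consumes one batch, so an epoch is
    # data_size / batch_size steps. Hypothetical replacement for the
    # commented-out shell arithmetic; names mirror the script's variables.
    return (train_epochs * data_size) // batch_size

# e.g. with data_size taken from `wc -l < data_train_x.txt`:
steps_for_epochs(10, 6000, 8)  # -> 7500, the same order as TRAIN_STEPS=7500
```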

echo "Predict"
python3 ./OpenNMT-py/translate.py \
2 changes: 1 addition & 1 deletion ainix_kernel/training/trainer.py
@@ -16,7 +16,7 @@
from ainix_kernel.specialtypes import allspecials
from ainix_kernel.training.model_specific_training import update_latent_store_from_examples
from ainix_kernel.training.train_contexts import ALL_EXAMPLE_NAMES, load_all_examples, \
load_tellia_examples
load_tellia_examples, load_all_and_tellina
from ainix_kernel.util.sampling import WeightedRandomChooser
from ainix_kernel.util.serialization import serialize
from tqdm import tqdm
31 changes: 31 additions & 0 deletions aish/shell.py
@@ -19,6 +19,7 @@
from aish.parser import BashParser
from terminaltables import SingleTable
import colorama
import math

builtin_dict = {}

@@ -86,9 +87,39 @@ def do_predict(self, in_x: str) -> Tuple[Optional[str], float]:
f"{colorama.Fore.MAGENTA}{pred_result.unparse.total_string.strip()}"
f"{colorama.Fore.RESET} "
f"(confidence score {pred_result.metad.total_confidence:.2f} {conf_emoji} )")
# actual method
#self.do_example_retrieve_explanation(
# pred_result.metad.example_retrieve_explanations, pred_result.ast,
# pred_result.unparse)

# hacky for fullret

# Hackily just grab things out of the interface. This should be improved
#index = self.kernel_interface.example_store
#retr_explan = pred_result.metad.example_retrieve_explanations[0]
#for sim, example_id in zip(
# retr_explan.reference_confidence,
# retr_explan.reference_example_ids
#):
# example = index.get_example_by_id(example_id)
# yvs = index.get_y_values_for_y_set(example.y_set_id)
# print(math.exp(sim), yvs[0].y_text)
print("Other possible options:")
printed = set([pred_result.unparse.total_string.strip()])
for sim, other_ast in pred_result.metad.other_options[1:]:
try:
s = self.kernel_interface.unparser.to_string(other_ast).total_string.strip()
if s in printed:
continue
print(math.exp(sim), s)
printed.add(s)
except Exception:
# panic! time
pass
if len(printed) >= 3:
break
print("-")

if total_confidence > PROMPT_CONF_THRESHOLD:
pass
return pred_result.unparse.total_string.strip(), total_confidence
31 changes: 31 additions & 0 deletions builtin_types/find_examples.ainix.yaml
@@ -8,99 +8,118 @@ defines:
- recursively search for files and dirs named [-[ENGWORD]-].txt
y:
- 'find . -name "[-[ENGWORD]-].txt"'
risk: 1
- x:
- recursively print all directories here
- what are all the directories anywhere on this path?
y:
- 'find . -type d'
risk: 1
- x:
- recursively find all files that start with "[-[ENGWORD]-]"
- Starting here, find all files which start with "[-[ENGWORD]-]"
y:
- 'find . -name "[-[ENGWORD]-]*" -type f'
risk: 1
- x:
- recursively find all files that end with "[-[ENGWORD]-]"
- Starting here, find all files which end with "[-[ENGWORD]-]"
- find files that end with "[-[ENGWORD]-]" starting here
y:
- 'find . -name "*[-[ENGWORD]-]" -type f'
risk: 1
- x:
- find all files that start with the letter "[-[LETTER]-]"
- recursively find files that start with "[-[LETTER]-]"
y:
# TODO: once user queries are supported, ask if care about case?
- 'find . -name "[-[LETTER]-]*" -type f'
risk: 1
- x:
- recursively find files that contain "[-[ENGWORD]-]" in their name
y:
- 'find . -name "*[-[ENGWORD]-]*" -type f'
risk: 1
- x:
- recursively find files with "[-[LETTER]-][-[ENGWORD]-]" in name
y:
- 'find . -name "*[-[LETTER]-][-[ENGWORD]-]*" -type f'
risk: 1
- x:
- recursively find all directories that end with "[-[ENGWORD]-]" starting here
y:
- 'find . -name "*[-[ENGWORD]-]" -type d'
risk: 1
- x:
- recursively find all python files
y:
- 'find . -name "*.py" -type f'
risk: 1
- x:
- recursively find all javascript files
- find all javascript files starting here
y:
- 'find . -name "*.js" -type f'
risk: 1
- x:
- recursively find all [-[EXTENSION]-] files starting here
- recursively find all [-[EXTENSION]-] files
- find all [-[EXTENSION]-] files
- find all "[-[EXTENSION]-]" files
y:
- 'find . -name "*.[-[EXTENSION]-]" -type f'
risk: 1
- x:
- recursively find all .[-[EXTENSION]-] files starting here
- recursively find all ".[-[EXTENSION]-]" files starting here
- find all .[-[EXTENSION]-] files
y:
- 'find . -name "*.[-[EXTENSION]-]" -type f'
risk: 1
- x:
- find all hidden files starting here
- recursively list all hidden files here
- recursively find all hidden files
y:
- 'find . -name ".*" -type f'
risk: 1
- x:
- find all hidden files starting in [-[DIRNAME]-]
y:
- 'find [-[DIRNAME]-] -name ".*" -type f'
risk: 1
- x:
- find all hidden files starting in my tmp directory
y:
- 'find /tmp -name ".*" -type f'
risk: 1
- x:
- find all files starting here which belong to [-[USERNAME]-]
- recursively find files that belong to [-[USERNAME]-]
- what files here belong to [-[USERNAME]-]?
y:
- 'find . -type f -user [-[USERNAME]-]'
risk: 1
- x:
- search for all files in the whole system that belong to [-[USERNAME]-]
- starting from root find all files that belong to [-[USERNAME]-]
y:
- 'find / -type f -user [-[USERNAME]-]'
risk: 1
- x:
- find all files here which belong to group [-[GROUPNAME]-]
- look for files with group name [-[GROUPNAME]-]
y:
- 'find . -type f -group [-[GROUPNAME]-]'
risk: 1
- x:
- how many [-[EXTENSION]-] files are here
- count the number of [-[EXTENSION]-] files here
- how many .[-[EXTENSION]-] files are here
- how many *.[-[EXTENSION]-] files are in this dir
y:
- 'find . -name "*.[-[EXTENSION]-]" -type f -maxdepth 1 | wc -l'
risk: 0
# We don't actually do anything with this warn parameter, but it seems useful
warn: 'If any files contain new lines in their name, this might give incorrect results.'
- x:
@@ -110,12 +129,24 @@ defines:
- how many files in this directory
y:
- 'find . -type f -maxdepth 1 | wc -l'
risk: 0
warn: 'If any files contain new lines in their name, this might give incorrect results.'
- x:
- how many files are in this directory and all subdirectories
- how many files are here recursive
- recursively count number files here
- how many files in this folder or all subfolders
y:
- 'find . -type f | wc -l'
warn: 'If any files contain new lines in their name, this might give incorrect results.'
risk: 1
- x:
- how many empty files are here nonrecursive
y:
- 'find . -type f -empty -maxdepth 1 | wc -l'
risk: 0
- x:
- how many empty files are here recursive
y:
- 'find . -type f -empty | wc -l'
risk: 1
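
Nothing in this commit consumes the new `risk` field (the inline comment above admits the same for `warn`), though the pattern in the data is visible: non-recursive counting commands get `risk: 0`, recursive ones get `risk: 1`. Purely as a hypothetical illustration of where this could go, a policy function a shell might apply, with the threshold and names invented here:

```
def should_prompt_before_running(risk: int, confidence: float,
                                 conf_threshold: float = 0.8) -> bool:
    # Hypothetical policy, not in this commit: auto-run only commands the
    # example data marks risk 0 when the model is confident; prompt the
    # user for everything else.
    return risk > 0 or confidence < conf_threshold
```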
4 changes: 2 additions & 2 deletions builtin_types/otherdata/stackexchange/README.md
@@ -18,7 +18,7 @@ Depending on what you are doing you might not need all these.
$ ./download_data.sh
# Split it into a txt file which has a sentence per line
# and an empty line between documents (this is like what BERT wants)
$ python3 split_stacke_data.py -s unix-stackexchange/Posts.xml -o unix-stackexchange/sentences.txt
$ ./split_all_stacke.sh

# Train a word tokenizer

@@ -44,4 +44,4 @@ $ cat sentences_no_blank_lines.txt | spm_encode --model=testmod_upper_2000.model
$ $PATH_TO_FASTTEXT skipgram -input sentences_tokenized_with_upper_2000.txt -output fasttext/m3 -thread 8 -maxn 0


```
```
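
`split_all_stacke.sh` itself is not shown in this diff. A sketch of the loop it plausibly wraps, built from the single-network command this README previously documented; the network list is an assumption based only on the commit message's "multiple networks":

```
import subprocess

# Assumed network list; the commit only says "multiple networks".
NETWORKS = ["unix-stackexchange", "superuser", "askubuntu"]

for network in NETWORKS:
    subprocess.run(
        ["python3", "split_stacke_data.py",
         "-s", f"{network}/Posts.xml",
         "-o", f"{network}/sentences.txt"],
        check=True,  # mirror the repo scripts' `|| exit 1` convention
    )
```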