diff --git a/ainix_kernel/models/Fullretrieval/fullretmodel.py b/ainix_kernel/models/Fullretrieval/fullretmodel.py
index 9197530..bb4e62f 100644
--- a/ainix_kernel/models/Fullretrieval/fullretmodel.py
+++ b/ainix_kernel/models/Fullretrieval/fullretmodel.py
@@ -99,17 +99,33 @@ def predict(
         example_ref = self.example_refs[top_inds[max_score_ind]]
         ref_ast = example_ref.reference_ast
-        valid_for_copy_mask = get_valid_for_copy_mask(tokens)
         ast_with_new_copies = self._apply_copy_changes(
             ref_ast, example_ref.copy_refs, memory, valid_for_copy_mask, tokens[0])
+
+        other_options = []
+        for ind, sim in zip(top_inds, top_sims):
+            try:
+                new_example_ref = self.example_refs[ind]
+                new_ref_ast = new_example_ref.reference_ast
+
+                new_valid_for_copy_mask = get_valid_for_copy_mask(tokens)
+                new_ast_with_new_copies = self._apply_copy_changes(
+                    new_ref_ast, new_example_ref.copy_refs, memory, new_valid_for_copy_mask,
+                    tokens[0])
+                other_options.append((math.log(float(sim)), new_ast_with_new_copies))
+            except Exception:
+                pass
+
+
         metad = TypeTranslatePredictMetadata(
             (math.log(float(top_sims[0])), ),
             (ExampleRetrieveExplanation(
                 tuple([self.example_refs[int(ind)].x_val_id for ind in top_inds]),
                 tuple([math.log(float(sim)) for sim in top_sims]),
                 None),
-            )
+            ),
+            other_options=other_options
         )
         return ast_with_new_copies, metad
diff --git a/ainix_kernel/models/model_types.py b/ainix_kernel/models/model_types.py
index 146ad42..178e8b1 100644
--- a/ainix_kernel/models/model_types.py
+++ b/ainix_kernel/models/model_types.py
@@ -207,6 +207,7 @@ class TypeTranslatePredictMetadata:
     """
     log_confidences: Tuple[float, ...]
    example_retrieve_explanations: Tuple['ExampleRetrieveExplanation', ...]
+    other_options: List[Tuple[float, ObjectChoiceNode]] = None
 
     @property
     def total_confidence(self) -> float:
diff --git a/ainix_kernel/specialtypes/generic_strings.py b/ainix_kernel/specialtypes/generic_strings.py
index 809b1e3..8b9950a 100644
--- a/ainix_kernel/specialtypes/generic_strings.py
+++ b/ainix_kernel/specialtypes/generic_strings.py
@@ -19,6 +19,7 @@ WORD_PART_MODIFIER_ARG_NAME = "modifier"
 WORD_PART_NEXT_ARG_NAME = "next_part"
 
+
 def create_generic_strings(type_context: TypeContext):
     """Main public interface for creating the appropriate types inside context"""
     _create_root_types(type_context)
diff --git a/ainix_kernel/training/fullret_try.py b/ainix_kernel/training/fullret_try.py
index 7797ad2..558a482 100644
--- a/ainix_kernel/training/fullret_try.py
+++ b/ainix_kernel/training/fullret_try.py
@@ -1,6 +1,7 @@
 import math
 
 from ainix_common.parsing.stringparser import AstUnparser
+from ainix_kernel.indexing.examplestore import DataSplits
 from ainix_kernel.models.Fullretrieval.fullretmodel import full_ret_from_example_store
 from ainix_kernel.training.evaluate import EvaluateLogger, print_ast_eval_log
 from ainix_kernel.training.trainer import TypeTranslateCFTrainer, get_examples
@@ -13,6 +14,7 @@ def train_the_thing():
     pretrained_checkpoint_path = f"{dir_path}/../../checkpoints/" \
                                  "lmchkp_30epoch2rnn_merge_toks_total_2.922_ns0.424_lm2.4973.pt"
     type_context, index, replacers, loader = get_examples()
+    print(f"count {len(list(index.get_all_examples((DataSplits.VALIDATION,))))}")
     model = full_ret_from_example_store(index, replacers, pretrained_checkpoint_path)
     return model, index, replacers, type_context, loader
diff --git a/ainix_kernel/training/opennmt/expir3.sh b/ainix_kernel/training/opennmt/expir3.sh
index 5f0dcce..70beb5f 100755
--- a/ainix_kernel/training/opennmt/expir3.sh
+++ b/ainix_kernel/training/opennmt/expir3.sh
@@ -8,56 +8,56 @@ REPLACE_SAMPLES=10
 WORD_VEC_SIZE=300
 TRAIN_STEPS=7500
 
-#echo "Exporting latest data"
-#cd ../../..
-#python3 -m ainix_kernel.training.export_data \
-#    --replace_samples ${REPLACE_SAMPLES} \
-#    || exit 1
-#mv data_train* ./ainix_kernel/training/opennmt
-#mv data_val* ./ainix_kernel/training/opennmt
-#cd ./ainix_kernel/training/opennmt
-#
-#echo "Preproc data"
-#rm expirs/exp1*
-#python3 ./OpenNMT-py/preprocess.py \
-#    -train_src data_train_x.txt \
-#    -train_tgt data_train_y.txt \
-#    -valid_src data_val_x.txt \
-#    -valid_tgt data_val_y.txt \
-#    --save_data expirs/exp1 \
-#    --src_words_min_frequency 3 \
-#    --tgt_words_min_frequency 3 \
-#    || exit 1
+echo "Exporting latest data"
+cd ../../..
+python3 -m ainix_kernel.training.export_data \
+    --replace_samples ${REPLACE_SAMPLES} \
+    || exit 1
+mv data_train* ./ainix_kernel/training/opennmt
+mv data_val* ./ainix_kernel/training/opennmt
+cd ./ainix_kernel/training/opennmt
+
+echo "Preproc data"
+rm expirs/exp1*
+python3 ./OpenNMT-py/preprocess.py \
+    -train_src data_train_x.txt \
+    -train_tgt data_train_y.txt \
+    -valid_src data_val_x.txt \
+    -valid_tgt data_val_y.txt \
+    --save_data expirs/exp1 \
+    --src_words_min_frequency 3 \
+    --tgt_words_min_frequency 3 \
+    || exit 1
 
-#echo "prepare glove"
-#cd ./OpenNMT-py/
-#python3 -m tools.embeddings_to_torch \
-#    -emb_file_both "../glove_dir/glove.840B.${WORD_VEC_SIZE}d.txt" \
-#    -dict_file "../expirs/exp1.vocab.pt" \
-#    -output_file "../data/embeddings" \
-#    || exit 1
-#cd ..
-#
-#echo "Train"
-#data_size=$(wc -l < data_train_x.txt)
-##steps_to_do=$[(TRAIN_EPOCHS*BATCH_SIZE)/REPLACE_SAMPLES/BATCH_SIZE]
-#echo ${steps_to_do}
-#CUDA_VISIBLE_DEVICES=0 python3 ./OpenNMT-py/train.py \
-#    -data expirs/exp1 \
-#    -save_model data/demo-model \
-#    --src_word_vec_size 64 \
-#    --tgt_word_vec_size 64 \
-#    --rnn_size 128 \
-#    --batch_size ${BATCH_SIZE} \
-#    --train_steps ${TRAIN_STEPS} \
-#    --report_every 50 \
-#    --start_decay_steps 4000 \
-#    --decay_steps 2000 \
-#    --gpu_rank 0 \
-#    --word_vec_size ${WORD_VEC_SIZE} \
-#    --pre_word_vecs_enc "data/embeddings.enc.pt" \
-#    --pre_word_vecs_dec "data/embeddings.dec.pt" \
-#    || exit 1
+echo "prepare glove"
+cd ./OpenNMT-py/
+python3 -m tools.embeddings_to_torch \
+    -emb_file_both "../glove_dir/glove.840B.${WORD_VEC_SIZE}d.txt" \
+    -dict_file "../expirs/exp1.vocab.pt" \
+    -output_file "../data/embeddings" \
+    || exit 1
+cd ..
+
+echo "Train"
+data_size=$(wc -l < data_train_x.txt)
+#steps_to_do=$[(TRAIN_EPOCHS*BATCH_SIZE)/REPLACE_SAMPLES/BATCH_SIZE]
+echo ${steps_to_do}
+CUDA_VISIBLE_DEVICES=0 python3 ./OpenNMT-py/train.py \
+    -data expirs/exp1 \
+    -save_model data/demo-model \
+    --src_word_vec_size 64 \
+    --tgt_word_vec_size 64 \
+    --rnn_size 128 \
+    --batch_size ${BATCH_SIZE} \
+    --train_steps ${TRAIN_STEPS} \
+    --report_every 50 \
+    --start_decay_steps 4000 \
+    --decay_steps 2000 \
+    --gpu_rank 0 \
+    --word_vec_size ${WORD_VEC_SIZE} \
+    --pre_word_vecs_enc "data/embeddings.enc.pt" \
+    --pre_word_vecs_dec "data/embeddings.dec.pt" \
+    || exit 1
 
 echo "Predict"
 python3 ./OpenNMT-py/translate.py \
diff --git a/ainix_kernel/training/trainer.py b/ainix_kernel/training/trainer.py
index 8601c8b..552f53e 100644
--- a/ainix_kernel/training/trainer.py
+++ b/ainix_kernel/training/trainer.py
@@ -16,7 +16,7 @@ from ainix_kernel.specialtypes import allspecials
 from ainix_kernel.training.model_specific_training import update_latent_store_from_examples
 from ainix_kernel.training.train_contexts import ALL_EXAMPLE_NAMES, load_all_examples, \
-    load_tellia_examples
+    load_tellia_examples, load_all_and_tellina
 from ainix_kernel.util.sampling import WeightedRandomChooser
 from ainix_kernel.util.serialization import serialize
 from tqdm import tqdm
diff --git a/aish/shell.py b/aish/shell.py
index 0a9cc18..43b31e7 100644
--- a/aish/shell.py
+++ b/aish/shell.py
@@ -19,6 +19,7 @@ from aish.parser import BashParser
 from terminaltables import SingleTable
 
 import colorama
+import math
 
 
 builtin_dict = {}
@@ -86,9 +87,39 @@ def do_predict(self, in_x: str) -> Tuple[Optional[str], float]:
                   f"{colorama.Fore.MAGENTA}{pred_result.unparse.total_string.strip()}"
                   f"{colorama.Fore.RESET} "
                   f"(confidence score {pred_result.metad.total_confidence:.2f} {conf_emoji} )")
+            # actual method
             #self.do_example_retrieve_explanation(
             #    pred_result.metad.example_retrieve_explanations, pred_result.ast,
             #    pred_result.unparse)
+
+            # hacky for fullrett
+
+            # Hackily just grab things out of the interface. This should be improved
+            #index = self.kernel_interface.example_store
+            #retr_explan = pred_result.metad.example_retrieve_explanations[0]
+            #for sim, example_id in zip(
+            #    retr_explan.reference_confidence,
+            #    retr_explan.reference_example_ids
+            #):
+            #    example = index.get_example_by_id(example_id)
+            #    yvs = index.get_y_values_for_y_set(example.y_set_id)
+            #    print(math.exp(sim), yvs[0].y_text)
+            print("Other possible options:")
+            printed = set([pred_result.unparse.total_string.strip()])
+            for sim, other_ast in pred_result.metad.other_options[1:]:
+                try:
+                    s = self.kernel_interface.unparser.to_string(other_ast).total_string.strip()
+                    if s in printed:
+                        continue
+                    print(math.exp(sim), s)
+                    printed.add(s)
+                except Exception:
+                    # panic! time
+                    pass
+                if len(printed) >= 3:
+                    break
+            print("-")
+
         if total_confidence > PROMPT_CONF_THRESHOLD:
             pass
         return pred_result.unparse.total_string.strip(), total_confidence
diff --git a/builtin_types/find_examples.ainix.yaml b/builtin_types/find_examples.ainix.yaml
index add3919..09fd7b0 100644
--- a/builtin_types/find_examples.ainix.yaml
+++ b/builtin_types/find_examples.ainix.yaml
@@ -8,49 +8,59 @@ defines:
       - recursively search for files and dirs named [-[ENGWORD]-].txt
     y:
       - 'find . -name "[-[ENGWORD]-].txt"'
+    risk: 1
   - x:
       - recursively print all directories here
       - what are all the directories anywhere on this path?
     y:
       - 'find . -type d'
+    risk: 1
   - x:
       - recursively find all files that start with "[-[ENGWORD]-]"
       - Starting here, find all files which start with "[-[ENGWORD]-]"
     y:
       - 'find . -name "[-[ENGWORD]-]*" -type f'
+    risk: 1
   - x:
       - recursively find all files that end with "[-[ENGWORD]-]"
       - Starting here, find all files which end with "[-[ENGWORD]-]"
       - find files that end with "[-[ENGWORD]-]" starting here
     y:
       - 'find . -name "*[-[ENGWORD]-]" -type f'
+    risk: 1
   - x:
       - find all files that start with the letter "[-[LETTER]-]"
       - recursively find files that start with "[-[LETTER]-]"
     y:
       # TODO: once user queries are supported, ask if care about case?
       - 'find . -name "[-[LETTER]-]*" -type f'
+    risk: 1
   - x:
       - recursively find files that contain "[-[ENGWORD]-]" in their name
     y:
       - 'find . -name "*[-[ENGWORD]-]*" -type f'
+    risk: 1
   - x:
       - recursively find files with "[-[LETTER]-][-[ENGWORD]-]" in name
     y:
       - 'find . -name "*[-[LETTER]-][-[ENGWORD]-]*" -type f'
+    risk: 1
   - x:
       - recursively find all directories that end with "[-[ENGWORD]-]" starting here
     y:
       - 'find . -name "*[-[ENGWORD]-]" -type d'
+    risk: 1
   - x:
       - recursively find all python files
     y:
       - 'find . -name "*.py" -type f'
+    risk: 1
   - x:
       - recursively find all javascript files
       - find all javascript files starting here
     y:
       - 'find . -name "*.js" -type f'
+    risk: 1
   - x:
       - recursively find all [-[EXTENSION]-] files starting here
       - recursively find all [-[EXTENSION]-] files
@@ -58,42 +68,50 @@ defines:
       - find all "[-[EXTENSION]-]" files
     y:
       - 'find . -name "*.[-[EXTENSION]-]" -type f'
+    risk: 1
   - x:
       - recursively find all .[-[EXTENSION]-] files starting here
       - recursively find all ".[-[EXTENSION]-]" files starting here
       - find all .[-[EXTENSION]-] files
     y:
       - 'find . -name "*.[-[EXTENSION]-]" -type f'
+    risk: 1
   - x:
       - find all hidden files starting here
       - recursively list all hidden files here
       - recursively find all hidden files
     y:
       - 'find . -name ".*" -type f'
+    risk: 1
   - x:
       - find all hidden files starting in [-[DIRNAME]-]
     y:
       - 'find [-[DIRNAME]-] -name ".*" -type f'
+    risk: 1
   - x:
       - find all hidden files starting in my tmp directory
     y:
       - 'find \tmp -name ".*" -type f'
+    risk: 1
   - x:
       - find all files starting here which belong to [-[USERNAME]-]
       - recursively find files that belong to [-[USERNAME]-]
       - what files here belong to [-[USERNAME]-]?
     y:
       - 'find . -type f -user [-[USERNAME]-]'
+    risk: 1
   - x:
       - search for all files in the whole system that belong to [-[USERNAME]-]
       - starting from root find all files that belong to [-[USERNAME]-]
     y:
       - 'find / -type f -user [-[USERNAME]-]'
+    risk: 1
   - x:
       - find all files here which belong to group [-[GROUPNAME]-]
       - look for files with group name [-[GROUPNAME]-]
     y:
       - 'find . -type f -user [-[GROUPNAME]-]'
+    risk: 1
   - x:
       - how many [-[EXTENSION]-] files are here
       - count the number of [-[EXTENSION]-] files are here
@@ -101,6 +119,7 @@
       - how many *.[-[EXTENSION]-] files are in this dir
     y:
       - 'find . -name "*.[-[EXTENSION]-]" -type f -maxdepth 1 | wc -l'
+    risk: 0
     # We don't actually do anything with this warn parameter, but it seems useful
     warn: 'If any files contain new lines in their name, this might give incorrect results.'
   - x:
@@ -110,12 +129,24 @@
       - how many files are here nonrecursively
       - how many files are in this dir not recursive
       - how many files in this directory
     y:
       - 'find . -type f -maxdepth 1 | wc -l'
+    risk: 0
     warn: 'If any files contain new lines in their name, this might give incorrect results.'
+  - x:
+      - how many files are in this directory and all subdirectories
+      - how many files are here recursive
+      - recursively count number files here
+      - how many files in this folder or all subfolders
+    y:
+      - 'find . -type f | wc -l'
+    warn: 'If any files contain new lines in their name, this might give incorrect results.'
+    risk: 1
   - x:
       - how many empty files are here nonrecursive
     y:
       - 'find . -type f -empty -maxdepth 1 | wc -l'
+    risk: 0
   - x:
       - how many empty files are here recursive
     y:
       - 'find . -type f -empty | wc -l'
+    risk: 1
diff --git a/builtin_types/otherdata/stackexchange/README.md b/builtin_types/otherdata/stackexchange/README.md
index 17c4da8..0eb41f1 100644
--- a/builtin_types/otherdata/stackexchange/README.md
+++ b/builtin_types/otherdata/stackexchange/README.md
@@ -18,7 +18,7 @@ Depending on what you are doing you might not need all these.
 $ ./download_data.sh
 
 # Split it into a txt file which has a sentence per line
 # and an empty line between documents (this is like what bert wants)
-$ python3 split_stacke_data.py -s unix-stackexchange/Posts.xml -o unix-stackexchange/sentences.txt
+$ ./split_all_stacke.sh
 
 # Train a word tokenizer
@@ -44,4 +44,4 @@
 $ cat sentences_no_blank_lines.txt | spm_encode --model=testmod_upper_2000.model --output_format=piece > sentences_tokenized_with_upper_2000.txt
 
 $ $PATH_TO_FASTTEXT skipgram -input sentences_tokenized_with_upper_2000.txt -output fasttext/m3 -thread 8 -maxn 0
-```
\ No newline at end of file
+```
diff --git a/builtin_types/otherdata/stackexchange/download_data.sh b/builtin_types/otherdata/stackexchange/download_data.sh
index f5790b4..084b48b 100755
--- a/builtin_types/otherdata/stackexchange/download_data.sh
+++ b/builtin_types/otherdata/stackexchange/download_data.sh
@@ -1,8 +1,16 @@
 #!/usr/bin/env bash
-# Downloads and extracts unix stack exchange
-mkdir unix-stackexchange
-cd unix-stackexchange
-wget -O unix.stackexchange.7z https://archive.org/download/stackexchange/unix.stackexchange.com.7z
-7z x unix.stackexchange.7z
-ls | grep -v Posts.xml | xargs rm
-cd ..
+
+# Downloads and extracts different stack exchanges
+for network in $(cat networks.txt); do
+    # Don't redownload things we have already done.
+    if [ -f ${network}-stackexchange/Posts.xml ]; then
+        echo "Already found files ${network}. Delete them to redownload"
+        continue
+    fi
+    mkdir ${network}-stackexchange
+    cd ${network}-stackexchange
+    wget -O ${network}.7z https://archive.org/download/stackexchange/${network}.com.7z
+    7z x ${network}.7z
+    ls | grep -v Posts.xml | xargs rm
+    cd ..
+done
diff --git a/builtin_types/otherdata/stackexchange/networks.txt b/builtin_types/otherdata/stackexchange/networks.txt
new file mode 100644
index 0000000..136c97e
--- /dev/null
+++ b/builtin_types/otherdata/stackexchange/networks.txt
@@ -0,0 +1,3 @@
+unix.stackexchange
+serverfault
+askubuntu
diff --git a/builtin_types/otherdata/stackexchange/split_all_stacke.sh b/builtin_types/otherdata/stackexchange/split_all_stacke.sh
new file mode 100755
index 0000000..f3d663b
--- /dev/null
+++ b/builtin_types/otherdata/stackexchange/split_all_stacke.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+for network in $(cat networks.txt); do
+    python3 split_stacke_data.py \
+        -s ${network}-stackexchange/Posts.xml \
+        -o ${network}-stackexchange/sentences.txt
+done
diff --git a/builtin_types/otherdata/stackexchange/split_stacke_data.py b/builtin_types/otherdata/stackexchange/split_stacke_data.py
index 30925de..a9a2174 100644
--- a/builtin_types/otherdata/stackexchange/split_stacke_data.py
+++ b/builtin_types/otherdata/stackexchange/split_stacke_data.py
@@ -9,8 +9,10 @@ import attr
 from tqdm import tqdm
 
 DEFAULT_NAME = "./unix-stackexchange/small_posts.xml"
-MIN_NUM_OF_WORDS_IN_SENTENCE = 4
-MIN_NUM_CHARS_IN_SENTENCE = 16
+MIN_NUM_OF_WORDS_IN_SENTENCE = 4  # 4
+MIN_NUM_CHARS_IN_SENTENCE = 16  # 16
+MAX_SENTENCE_CHARACTERS = 320
+MAX_CODE_PRE_LEN = 160  # Exclude posts with really long code blocks
 
 
 def get_post_body(row_element, min_score=-9e9) -> Optional[str]:
@@ -38,10 +40,10 @@ def longest_code_pre_len(string: str) -> Optional[int]:
     return max(map(len, matches)) - len("")
 
 
-def clean_post_body(body: str, max_code_pre_len=60) -> Optional[str]:
+def clean_post_body(body: str) -> Optional[str]:
     longest_quoted_out_code_block = longest_code_pre_len(body)
     if longest_quoted_out_code_block is not None and \
-            longest_quoted_out_code_block > max_code_pre_len:
+            longest_quoted_out_code_block > MAX_CODE_PRE_LEN:
         return None
     body = remove_html_tags(body)
     body = unscape_html_entities(body)
@@ -89,6 +91,9 @@ def unscape_html_entities(string: str):
         shortest_char_count_sentence = min(map(len, sentences))
         if shortest_char_count_sentence < MIN_NUM_CHARS_IN_SENTENCE:
             continue
+        longest_char_count_sentence = max(map(len, sentences))
+        if longest_char_count_sentence > MAX_SENTENCE_CHARACTERS:
+            continue
         shortest_word_count_sentence = min(len(s.split()) for s in sentences)
         if shortest_word_count_sentence < MIN_NUM_OF_WORDS_IN_SENTENCE:
             continue
diff --git a/builtin_types/otherdata/tellina/convert_data.py b/builtin_types/otherdata/tellina/convert_data.py
index 5141993..ac6e49e 100644
--- a/builtin_types/otherdata/tellina/convert_data.py
+++ b/builtin_types/otherdata/tellina/convert_data.py
@@ -44,6 +44,8 @@ def get_parsable_commands(data: Tuple[str, str]) -> List[Tuple[str, str]]:
     parsable_data = []
     for nl, cm in data:
         try:
+            if cm.strip().startswith("tar"):
+                continue
             ast = parser.create_parse_tree(cm, "CommandSequence")
             parsable_data.append((nl, cm))
             print(f"PASS {cm}")
diff --git a/builtin_types/tar_examples.ainix.yaml b/builtin_types/tar_examples.ainix.yaml
index 65c0727..ad02422 100644
--- a/builtin_types/tar_examples.ainix.yaml
+++ b/builtin_types/tar_examples.ainix.yaml
@@ -26,6 +26,10 @@ defines:
       - make a tarball for [-[DIRNAME]-]
      - make a compressed archive for [-[DIRNAME]-]
       - move [-[DIRNAME]-] to a tarball
+      - make [-[DIRNAME]-] a tar file
+      - move [-[DIRNAME]-] to a tarfile
+      - compress [-[DIRNAME]-] into a tarfile
+      - compress [-[DIRNAME]-] into a tarball
     y:
       - tar -c -z -f [-[DIRNAME]-].tar.gz [-[DIRNAME]-]
     # TODO Replacers of certain extension
diff --git a/builtin_types/wc_examples.ainix.yaml b/builtin_types/wc_examples.ainix.yaml
index cc31f15..e297911 100644
--- a/builtin_types/wc_examples.ainix.yaml
+++ b/builtin_types/wc_examples.ainix.yaml
@@ -45,9 +45,10 @@ defines:
       - ls but only one per line | wc -l
       - print each file or directory here but only one per line | wc -l
       - ls and count the number of lines of the output
-      - get a list of files here and count the number lines returned.
     y:
-      - ls -1 | wc -l
+      - get a list of files here and count the number lines returned.
+    x:
+      -
   - x:
       - how many letters are in "foolumboo"
     y: