-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_experiment.py
83 lines (65 loc) · 3.61 KB
/
run_experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import json
import subprocess
import os
import sys
# Path of the interpreter running this script; it is interpolated into the
# command lines built in this file so child scripts use the same interpreter.
PYTHON = sys.executable
def run_command(command, args):
    """Fill in a shell command template, run it, and fail loudly on error.

    Args:
        command: Command template with ``str.format`` placeholders such as
            ``{src_model_file}``.
        args: Mapping that supplies the placeholder values. Keys that the
            template does not mention are ignored.

    Raises:
        KeyError: If the template names a placeholder missing from ``args``.
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The error records the exit status and the command line.
    """
    formatted_command = command.format(**args)
    # Flush so this banner is not emitted after the child's own output when
    # stdout is redirected to a file or pipe.
    print(f"Running command:\n{formatted_command}\n", flush=True)
    # NOTE: the command goes through the shell, so substituted values are
    # subject to shell word-splitting and expansion.
    subprocess.run(formatted_command, shell=True, check=True)
def _merge_train_dicos(dico_paths, output_dir):
    """Concatenate dictionary files into ``<output_dir>/train.dico``.

    Each input file is read whole, stripped of leading/trailing whitespace
    and written followed by a newline. Returns the path of the merged file.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, 'train.dico')
    # NOTE(review): files are opened with the locale's default encoding, as
    # before - confirm that matches what the downstream scripts expect.
    with open(merged_path, 'w') as fout:
        for dico in dico_paths:
            with open(dico) as fin:
                print(fin.read().strip(), file=fout)
    return merged_path


def run_experiment(experiment, experiment_index):
    """Run the pipeline steps for one experiment configuration.

    The scripts are looked up in ``<current working directory>/src``. When
    the configuration has a ``train_dico`` entry, the identical-word search
    is skipped and that dictionary is used instead; a list of dictionaries
    is first merged into ``<anchor_output_dir>/train.dico``.

    Args:
        experiment: Dict of experiment parameters. It is modified in place:
            defaults are filled in for ``refine_it``, ``refine_top_n``,
            ``epochs``, ``min_sim``, ``max_sim`` and ``anchor_output_dir``;
            a list-valued ``train_dico`` is replaced by the merged file
            path; and ``output_identical_word_pair_file`` is set to
            ``train_dico`` when that key is present.
        experiment_index: Zero-based index, used only in progress messages
            (printed one-based).

    Raises:
        KeyError: If a key read directly here or named by a command
            template is missing from ``experiment``.
        Exception: Whatever ``run_command`` raises when a step fails.
    """
    src_dir = os.path.join(os.getcwd(), 'src')

    # The name passed to the training and concat steps depends on whether
    # the target embeddings are saved without concatenation.
    if experiment['save_tgt_embeddings_without_concat']:
        out_name = experiment['tgt_emb_name']
    else:
        out_name = experiment['tgt_model_new_name']

    experiment.setdefault('refine_it', 0)
    experiment.setdefault('refine_top_n', 3)
    experiment.setdefault('epochs', 5)
    experiment.setdefault('min_sim', -100)
    experiment.setdefault('max_sim', 100)
    experiment.setdefault('anchor_output_dir', f"{experiment['tgt_model_new_name']}.aux")

    # Single-brace fields are filled in now; the double-braced placeholders
    # survive as ``{name}`` and are filled from ``experiment`` by run_command.
    commands = {
        "find_identical_words": f"{PYTHON} {src_dir}/find_identical_words.py {{src_model_file}} {{tgt_model_file}} {{output_identical_word_pair_file}} {{top_vocab}}",
        "anchor_embedding_training": f"{PYTHON} {src_dir}/anchor_embedding_training.py {{output_identical_word_pair_file}} {{tgt_model_training_data}} {out_name} 0 {{src_model_file}} {{embedding_type_cbow_or_sg}} {{vector_dim}} {{vector_count}} {{fixed}} {{refine_it}} {{refine_top_n}} --output_dir {{anchor_output_dir}} --epochs {{epochs}} --min_similarity {{min_sim}} --max_similarity {{max_sim}}",
        "concat_kv": f"{PYTHON} {src_dir}/concat_kv.py {out_name} {{src_model_file}} {{tgt_model_new_name}}"
    }

    print(
        f"\nExperiment {experiment_index + 1} parameters:\n{json.dumps(experiment, indent=4)}\n")

    if 'train_dico' in experiment:
        if isinstance(experiment['train_dico'], list):
            experiment['train_dico'] = _merge_train_dicos(
                experiment['train_dico'], experiment['anchor_output_dir'])
        steps = ["anchor_embedding_training", "concat_kv"]
        experiment['output_identical_word_pair_file'] = experiment['train_dico']
    else:
        steps = ["find_identical_words", "anchor_embedding_training", "concat_kv"]

    for step in steps:
        print(
            f"\nRunning step '{step}' for experiment {experiment_index + 1}\n")
        run_command(commands[step], experiment)
def main():
    """Run every experiment in a JSON config, then the optional evaluation.

    Expects exactly one command-line argument: the path of a JSON file with
    an ``experiments`` list. If the file also has an ``evaluate`` object, its
    values are used to fill in and run the ``MUSE/evaluate.py`` command
    located under the current working directory.

    Exits with status 1 after printing a usage message when the argument
    count is wrong.
    """
    if len(sys.argv) != 2:
        # Usage errors belong on stderr so they do not pollute piped output.
        print(f"Usage: {PYTHON} script.py <experiment_config.json>", file=sys.stderr)
        sys.exit(1)

    config_file = sys.argv[1]
    # Binary mode lets json.load detect UTF-8/16/32 itself instead of
    # decoding with the platform's locale encoding.
    with open(config_file, 'rb') as f:
        data = json.load(f)

    for i, experiment in enumerate(data['experiments']):
        print(f"\nStarting experiment {i + 1}...\n")
        run_experiment(experiment, i)

    if 'evaluate' in data:
        project_dir = os.getcwd()
        evaluate_command = f"{PYTHON} {project_dir}/MUSE/evaluate.py --tgt_emb {{tgt_model_new_name}} --src_emb {{src_model_file}} --tgt_lang {{tgt_lang}} --src_lang {{src_lang}} --dico_eval {{path_to_evaluation_file}} --exp_name {{experiment_name}} --cuda {{cuda}}"
        print(
            f"\nRunning final evaluation with the following parameters:\n{json.dumps(data['evaluate'], indent=4)}\n")
        run_command(evaluate_command, data['evaluate'])
# Run the experiments only when executed as a script, not when imported.
if __name__ == "__main__":
    main()