forked from ManchesterBioinference/mRNA_LLM
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdvc.lock
More file actions
5959 lines (5959 loc) · 206 KB
/
dvc.lock
File metadata and controls
5959 lines (5959 loc) · 206 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
schema: '2.0'
stages:
chunkData:
cmd: python scripts/chunkData.py --params params.yaml
deps:
- path: data/3UTRs/
hash: md5
md5: 00248ad7e5c0c1d9b66e329d45d0cacb.dir
size: 17954073
nfiles: 3
- path: scripts/chunkData.py
hash: md5
md5: ff3df7e2c3bbeb23993b45e019767b72
size: 2159
params:
params.yaml:
chunkData:
input_dir: data/3UTRs/
output_dir: output/data/3utrFlyChunked
chunk_size: 510
outs:
- path: output/data/3utrFlyChunked
hash: md5
md5: a1943614b2957fd848e911443cc9e281.dir
size: 10139130
nfiles: 3
preprocessFlyUTRs:
cmd: conda run -n rayTune python scripts/preprocessFlyUTRs.py --params params.yaml
--data_dir data/3UTRs
deps:
- path: data/3UTRs
hash: md5
md5: 3e8ac59f542d3f8584bebf3130087f01.dir
size: 91250877
nfiles: 3
- path: scripts/preprocessFlyUTRs.py
hash: md5
md5: 444320e09c571b55fbcf531ec5e8f713
size: 9966
params:
params.yaml:
preprocessFlyUTRs:
output_dir: output/data/flyUTRs
kmer: 4
oma_groups: data/oma-groups.txt.gz
oma_ncbi_map: data/oma-ncbi.txt.gz
ncbi_geneID_map: data/gene2refseq_Dmel.txt
geneID_flybase_map: data/GeneID_to_FlybaseID.txt
outs:
- path: output/data/flyUTRs
hash: md5
md5: d799de301c01f205bcdcf0cb8155a820.dir
size: 30196403
nfiles: 3
mergeUTRsAndDecayRates:
cmd: conda run -n rayTune python scripts/mergeUTRsAndDecayRates.py --params params.yaml
--fasta output/data/flyUTRs
deps:
- path: data/decay/parameters_estimates_extended_zygotic_tr_18022025_filtered.csv
hash: md5
md5: f457204cec911d26cf261291e0d5eb3d
size: 218026
- path: output/data/flyUTRs
hash: md5
md5: d799de301c01f205bcdcf0cb8155a820.dir
size: 30196403
nfiles: 3
- path: scripts/mergeUTRsAndDecayRates.py
hash: md5
md5: 1b05f9edadd9d669ccc7ab9ffe73a6d5
size: 2065
params:
params.yaml:
mergeUTRsAndDecayRates:
decay_rates: data/decay/parameters_estimates_extended_zygotic_tr_18022025_filtered.csv
output_file: output/data/utrDecayRates.fasta
outs:
- path: output/data/utrDecayRates.fasta
hash: md5
md5: 598e8e9d16f74d3608117f77311acf8a
size: 891903
preprocessData:
cmd: conda run -n rayTune python scripts/preprocess.py --params params.yaml --data_dir
output/data/utrDecayRates.fasta --ncbi_geneID_map data/gene2refseq_Dmel.txt
--geneID_flybase_map data/GeneID_to_FlybaseID.txt
deps:
- path: output/data/utrDecayRates.fasta
hash: md5
md5: 598e8e9d16f74d3608117f77311acf8a
size: 891903
- path: scripts/preprocess.py
hash: md5
md5: d743e017cd26a9041672d7260cc08d51
size: 7604
params:
params.yaml:
preprocessData:
output_dir: output/data/decay
kmer: 4
preprocessFlyUTRs:
output_dir: output/data/flyUTRs
kmer: 4
oma_groups: data/oma-groups.txt.gz
oma_ncbi_map: data/oma-ncbi.txt.gz
ncbi_geneID_map: data/gene2refseq_Dmel.txt
geneID_flybase_map: data/GeneID_to_FlybaseID.txt
outs:
- path: output/data/decay
hash: md5
md5: 33921ede90f63aad12fddbb798193ea7.dir
size: 900608
nfiles: 3
trainToFlyUTRs:
cmd: conda run -n rayTune python scripts/trainToFlyUTRs.py --params params.yaml
--data_dir output/data/flyUTRs
deps:
- path: output/data/flyUTRs
hash: md5
md5: d799de301c01f205bcdcf0cb8155a820.dir
size: 30196403
nfiles: 3
- path: scripts/trainToFlyUTRs.py
hash: md5
md5: a69d221aac0285f3a49ef00c41e4a27c
size: 39093
params:
params.yaml:
trainToFlyUTRs:
test_run: false
do_train: true
do_eval: false
do_predict: false
do_visualize: false
output_dir: output/flyTrained
kmer: 4
model_type: 3utrfly
should_continue: false
model_name_or_path: data/4-new-12w-0
tokenizer_name: rna4
evaluate_during_training: true
do_visualize_during_training: false
numEpochsBeforeEarlyStopping: 20
patience: 10
per_gpu_train_batch_size: 48
num_train_epochs: 100
learning_rate: 5e-06
beta1: 0.9
beta2: 0.98
adam_epsilon: 1e-06
weight_decay: 0.01
warmup_percent: 0.1
seed: 6
eval_all_checkpoints: false
no_cuda: false
logging_steps: 100
save_steps: 1000
save_total_limit: 2
overwrite_output_dir: true
neptune_token:
neptune_project:
overwrite_cache: false
do_lower_case: false
curriculumLearning: false
outs:
- path: output/flyTrained/best_checkpoint
hash: md5
md5: 310c844e079a23d02f04d3597227df06.dir
size: 445221454
nfiles: 7
- path: output/flyTrained/checkpoints
hash: md5
md5: 101dbe1321cfaa78fbf306a12bd4b680.dir
size: 2661123936
nfiles: 18
- path: output/flyTrained/pytorch_model.bin
hash: md5
md5: 0a188a05e2bb1b9188ec3258f1e3142c
size: 442812226
fineTuneModel:
cmd: export RAY_PICKLE_VERBOSE_DEBUG=1 && python scripts/train.py --params params.yaml
--model_name_or_path output/flyTrained/best_checkpoint/ --max_seq_length 512
--data_dir output/data/decay --extraFeatures output/data/codons/extraFeatures.csv
--mfe output/data/codons/vienna_features.csv --use_ray_tune --train_final_model
deps:
- path: output/data/codons/extraFeatures.csv
hash: md5
md5: 553d9c1081f048fe66c15dcb6ffc1f3e
size: 7894810
- path: output/data/codons/vienna_features.csv
hash: md5
md5: af600d3e3adf97355b18c76bfee40ef4
size: 15605730
- path: output/data/decay
hash: md5
md5: b20ee60bfc13a9c94438f227943ad9b9.dir
size: 4345349
nfiles: 3
- path: output/flyTrained/best_checkpoint
hash: md5
md5: 310c844e079a23d02f04d3597227df06.dir
size: 445221454
nfiles: 7
- path: scripts/GenaLMWithExtraFeatures.py
hash: md5
md5: 9b3306db7d220495b22061ba1f6b9ea9
size: 6625
- path: scripts/train.py
hash: md5
md5: 2058bda2929297045afa47c2a2ed44ee
size: 87259
params:
params.yaml:
fineTuneModel:
output_dir: output/ftModel
do_train: true
do_eval: false
do_predict: false
do_visualize: false
modelParams:
model_type: 3utrfly
task_name: rnadecay
should_continue: false
evaluate_during_training: true
do_visualize_during_training: false
hidden_dropout_prob: 0.45255999948991715
attention_probs_dropout_prob: 0.45255999948991715
projector_dropout: 0.24955501667701577
classifier_dropout_prob: 0.2836668357639133
learning_rate: 2.7197500975257185e-06
classifier_lr: 0.0003087317652943953
patience: 5
per_gpu_train_batch_size: 48
per_gpu_eval_batch_size: 48
num_train_epochs: 50
weight_decay: 0.06577969639864487
classifier_decay: 0.02952629653430042
warmup_percent: 0.1
seed: 6
eval_all_checkpoints: false
no_cuda: false
logging_steps: 100
save_steps: 100000
save_total_limit: 10
overwrite_output_dir: true
neptune_token:
neptune_project:
overwrite_cache: false
do_lower_case: false
curriculumLearning: false
rayTune:
ray_tune_samples: 100
ray_tune_max_epochs: 50
ray_tune_initial_points: 5
ray_tune_grace_period: 4
ray_tune_reduction_factor: 2
ray_tune_cpu_per_trial: 2
ray_tune_gpu_per_trial: 1.0
ray_tune_local_dir: output/ray_results
outs:
- path: output/ray_results/best_ray_tune_config.json
hash: md5
md5: de709c9a81dc072babbabe9bc3d923db
size: 351
- path: output/ray_results/ray_tune_results.csv
hash: md5
md5: d485463a4002ac072067676757032b10
size: 43031
predict:
cmd: python scripts/predict.py --params params.yaml --model_name_or_path output/ftModel/best_spearmanr/
--data_dir output/data/decay
deps:
- path: output/data/decay
hash: md5
md5: 33921ede90f63aad12fddbb798193ea7.dir
size: 900608
nfiles: 3
- path: output/ftModel/best_spearmanr
hash: md5
md5: 02aa042cb06870ed97a15a7e3f9226e9.dir
size: 443470923
nfiles: 7
- path: scripts/predict.py
hash: md5
md5: 460c440e487f96eebda8163d44029644
size: 17491
params:
params.yaml:
modelParams:
model_type: 3utrfly
task_name: rnadecay
should_continue: false
tokenizer_name: rna4
evaluate_during_training: true
do_visualize_during_training: false
learning_rate: 4e-05
patience: 20
per_gpu_train_batch_size: 48
per_gpu_eval_batch_size: 48
num_train_epochs: 35
weight_decay: 0.01
warmup_percent: 0.07
seed: 6
eval_all_checkpoints: false
no_cuda: false
logging_steps: 100
save_steps: 100000
save_total_limit: 10
overwrite_output_dir: true
neptune_token:
neptune_project:
overwrite_cache: false
do_lower_case: false
curriculumLearning: false
predict:
output_dir: output/predict
do_train: false
do_eval: false
do_predict: true
do_visualize: false
outs:
- path: output/predict
hash: md5
md5: ad1add7b76f54d43451e43659e9c41a5.dir
size: 25
nfiles: 1
getDecayResiduals:
cmd: conda run -n rayTune python scripts/getDecayResiduals.py --params params.yaml
--decay_file data/decay/parameters_estimates_extended_zygotic_tr_18022025_filtered.csv
--utr_fasta output/data/utrDecayRates.fasta --CDS_fasta data/3UTRs/Drosophila_melanogaster-cds.fa
deps:
- path: data/3UTRs/Drosophila_melanogaster-cds.fa
hash: md5
md5: 99d74cc84b9079be0abfd3a0a29af10d
size: 61914081
- path: output/data/utrDecayRates.fasta
hash: md5
md5: 598e8e9d16f74d3608117f77311acf8a
size: 891903
- path: scripts/getDecayResiduals.py
hash: md5
md5: f9604d471ca9364289889e5ef84921eb
size: 10913
params:
params.yaml:
getDecayResiduals:
codon_output: output/data/codons/codon_frequencies.pkl
residuals: output/data/codons/decayResiduals.fasta
extraFeatures: output/data/codons/extraFeatures.csv
outs:
- path: output/data/codons/decayResiduals.fasta
hash: md5
md5: c8b02645a4bcd81a6b372fa3a9a0804d
size: 904614
- path: output/data/codons/extraFeatures.csv
hash: md5
md5: 5037dd8852180cd853e0ec6c0fb5b8e1
size: 859672
downloadSpeciesData:
cmd: conda run -n rayTune python scripts/downloadSpeciesData.py data/downloaded
deps:
- path: scripts/downloadSpeciesData.py
hash: md5
md5: bc5c83bf18fb7b2fc233af185a02ea9b
size: 4557
- path: setupEnv.log
hash: md5
md5: df1d3e1167aed27dd6167d342c9e621f
size: 64
params:
params.yaml:
downloadSpeciesData:
output_dir: data/downloaded
outs:
- path: data/downloaded
hash: md5
md5: edd2ffb5bae5b9df825ca55bf7409d09.dir
size: 50408678
nfiles: 2
isolate3UTRs:
cmd: conda run -n rayTune python scripts/isolate3UTRs.py --params params.yaml
--data_dir data/downloaded
deps:
- path: data/downloaded
hash: md5
md5: edd2ffb5bae5b9df825ca55bf7409d09.dir
size: 50408678
nfiles: 2
- path: scripts/isolate3UTRs.py
hash: md5
md5: 65a80c6b3d3dd231d3a36a3c73e05aa1
size: 4420
params:
params.yaml:
isolate3UTRs:
output_dir: data/3UTRs
transcriptSection: three_prime_utr,CDS,five_prime_utr
outs:
- path: data/3UTRs
hash: md5
md5: 3e8ac59f542d3f8584bebf3130087f01.dir
size: 91250877
nfiles: 3
importanceAnalysis:
cmd: python scripts/importanceAnalysis.py --params params.yaml --model_name_or_path
output/ftModel/best_spearmanr/ --save_path output/importance/shap.pkl --extraFeatures
output/data/codons/extraFeatures.csv --sequence_file output/data/decay/train.fasta
--mfe output/data/codons/vienna_features.csv
deps:
- path: output/ftModel
hash: md5
md5: c27f29ba4e4f5e14addfa383d1f814f9.dir
size: 2216501807
nfiles: 26
- path: scripts/importanceAnalysis.py
hash: md5
md5: 676e920dc203087653ddbbe84562ed62
size: 19221
params:
params.yaml:
importanceAnalysis:
output_dir: output/importance
pickle_file: shap.pkl
debug: false
outs:
- path: output/importance/shap.pkl
hash: md5
md5: 2cc07ffb80d128722c718b382a339b32
size: 7723160
visualizeImportance:
cmd: python scripts/visualizeImportance.py --params params.yaml --kmer 4 --SHAP
output/importance/shap.pkl --scoresOnly output/importance/shapScores.npy --save_path
output/importance/visualizeImportance.html --save_tokenized_path output/importance/visualizeImportance_tokenized.html
deps:
- path: output/importance/shap.pkl
hash: md5
md5: 9b86e560625db45b402993f1d8cb9f58
size: 7723181
- path: scripts/visualizeImportance.py
hash: md5
md5: a0c0c1aff1fe371ac073281f7763d4ad
size: 5889
params:
params.yaml:
visualizeImportance:
scoresOnly: shapScores.npy
save_path: visualizeImportance.html
save_tokenized_path: visualizeImportance_tokenized.html
outs:
- path: output/importance/visualizeImportance.html
hash: md5
md5: a5eaef7a003d4528bd1a8de1a1be593c
size: 33548214
findMotifs:
cmd: python scripts/find_motifs.py --SHAP output/importance/shap.pkl --save_file_dir
output/motifs
deps:
- path: output/importance/visualizeImportance.html
hash: md5
md5: f64aafd85c8f3da9575e496e189759a8
size: 33517613
- path: scripts/find_motifs.py
hash: md5
md5: ab8c8dd90d38d1aaf842c6f28cc249f7
size: 40136
params:
params.yaml:
findMotifs:
output_dir: output/motifs
control_file: control_seqs.fasta
motif_file: motif_seqs.fasta
control_positions: control_positions.txt
motif_positions: motif_positions.txt
interest_file: interest_seqs.fasta
outs:
- path: output/motifs/negative/control_positions.txt
hash: md5
md5: 396d5772bb595c7883e5e0bd927a5e6e
size: 255653
- path: output/motifs/negative/control_seqs.fasta
hash: md5
md5: 2ab4458eec0842fb509fc5700c84ecca
size: 6471742
- path: output/motifs/negative/motif_positions.txt
hash: md5
md5: 2b1e89c798fde1f71fc6d40f06e60d33
size: 55160
- path: output/motifs/negative/motif_seqs.fasta
hash: md5
md5: 2cdef669c28f609cb3613614f8618b9b
size: 61726
- path: output/motifs/positive/control_positions.txt
hash: md5
md5: 5a0e0715bb78caf532b67b768d78c0e7
size: 118037
- path: output/motifs/positive/control_seqs.fasta
hash: md5
md5: 5f93df882d1e172745b6540f4eaa795b
size: 363826
- path: output/motifs/positive/motif_positions.txt
hash: md5
md5: 31d71377344c8b5ce6dd0b464ee793dd
size: 326850
- path: output/motifs/positive/motif_seqs.fasta
hash: md5
md5: 1f0fb36b4c369e92f3cbc1a64b650fa4
size: 548422
runAME@positive:
cmd: singularity exec -B $(readlink output/motifs/positive/control_seqs.fasta):/tmp/control_seqs.fasta
-B $(readlink output/motifs/positive/motif_seqs.fasta):/tmp/motif_seqs.fasta
docker://memesuite/memesuite:5.5.7 ame $( cat output/mastFilter_out/motifsToKeep.txt)
--oc output/motifs/positive/ame_output --control /tmp/control_seqs.fasta --evalue-report-threshold
100 --method fisher --rna /tmp/motif_seqs.fasta
/opt/meme/share/meme-5.5.7/db/motif_databases/RNA/Ray2013_rbp_Drosophila_melanogaster.meme
deps:
- path: output/mastFilter_out/motifsToKeep.txt
hash: md5
md5: 41d9031308f0ebaa466e2aab73833b49
size: 540
- path: output/motifs/positive/control_seqs.fasta
hash: md5
md5: 5f93df882d1e172745b6540f4eaa795b
size: 363826
- path: output/motifs/positive/motif_seqs.fasta
hash: md5
md5: 1f0fb36b4c369e92f3cbc1a64b650fa4
size: 548422
params:
params.yaml:
runAME:
output_dir: ame_output
outs:
- path: output/motifs/positive/ame_output
hash: md5
md5: 49412c6e43b2a58b309638dc9bb31ab1.dir
size: 3559005
nfiles: 3
runAME@negative:
cmd: singularity exec -B $(readlink output/motifs/negative/control_seqs.fasta):/tmp/control_seqs.fasta
-B $(readlink output/motifs/negative/motif_seqs.fasta):/tmp/motif_seqs.fasta
docker://memesuite/memesuite:5.5.7 ame $( cat output/mastFilter_out/motifsToKeep.txt)
--oc output/motifs/negative/ame_output --control /tmp/control_seqs.fasta --evalue-report-threshold
100 --method fisher --rna /tmp/motif_seqs.fasta
/opt/meme/share/meme-5.5.7/db/motif_databases/RNA/Ray2013_rbp_Drosophila_melanogaster.meme
deps:
- path: output/mastFilter_out/motifsToKeep.txt
hash: md5
md5: 41d9031308f0ebaa466e2aab73833b49
size: 540
- path: output/motifs/negative/control_seqs.fasta
hash: md5
md5: 2ab4458eec0842fb509fc5700c84ecca
size: 6471742
- path: output/motifs/negative/motif_seqs.fasta
hash: md5
md5: 2cdef669c28f609cb3613614f8618b9b
size: 61726
params:
params.yaml:
runAME:
output_dir: ame_output
outs:
- path: output/motifs/negative/ame_output
hash: md5
md5: 366a50c4f2e809ba163e7bfc3062f78b.dir
size: 10048795
nfiles: 3
correctAME_pvalues@positive:
cmd: python scripts/correctAME_pvalues.py --params params.yaml --ame_input output/motifs/positive/ame_output/ame.tsv
--motif_seqs output/motifs/positive/motif_seqs.fasta --control_seqs output/motifs/positive/control_seqs.fasta
--ame_corrected output/motifs/positive/ame_corrected.tsv
deps:
- path: output/motifs/positive/ame_output/ame.tsv
hash: md5
md5: 3e3f70eba558b9ad0c1029a9452f06d5
size: 12309
- path: scripts/correctAME_pvalues.py
hash: md5
md5: 9ddf3d2361ed66eb52109b1a3f11a66b
size: 1876
params:
params.yaml:
correctAME_pvalues:
output_file: ame_corrected.tsv
outs:
- path: output/motifs/positive/ame_corrected.tsv
hash: md5
md5: b4b178fe64e8d465ac0f5fdcca80d979
size: 15367
correctAME_pvalues@negative:
cmd: python scripts/correctAME_pvalues.py --params params.yaml --ame_input output/motifs/negative/ame_output/ame.tsv
--motif_seqs output/motifs/negative/motif_seqs.fasta --control_seqs output/motifs/negative/control_seqs.fasta
--ame_corrected output/motifs/negative/ame_corrected.tsv
deps:
- path: output/motifs/negative/ame_output/ame.tsv
hash: md5
md5: c8dc9af026da8d621505b5fbb2a0a743
size: 12229
- path: scripts/correctAME_pvalues.py
hash: md5
md5: 9ddf3d2361ed66eb52109b1a3f11a66b
size: 1876
params:
params.yaml:
correctAME_pvalues:
output_file: ame_corrected.tsv
outs:
- path: output/motifs/negative/ame_corrected.tsv
hash: md5
md5: 526d18e5907fc9a4bdabc939b6c39fc4
size: 14750
splitHighLowDecay:
cmd: python scripts/splitHighLowDecay.py --params params.yaml --input output/data/decay/test.fasta
deps:
- path: output/data/decay/test.fasta
hash: md5
md5: 439c60179bd0997df7875cbf7e50f252
size: 122015
- path: scripts/splitHighLowDecay.py
hash: md5
md5: 7abc6beff3b46ded2220b95a903ab88d
size: 4713
params:
params.yaml:
splitHighLowDecay:
output: output/splitDecay
high: highDecay.fasta
low: lowDecay.fasta
outs:
- path: output/splitDecay/highDecay.fasta
hash: md5
md5: 73fd089090c28aef0c402820f663d333
size: 35325
- path: output/splitDecay/lowDecay.fasta
hash: md5
md5: 9d320a60f0ca2bae90e038db03170f86
size: 52516
runAME_highLowDecay@0:
cmd: mkdir -p output/splitDecay/high/ame_output && singularity exec -B $(readlink
output/splitDecay/test_lowDecay.fasta):/tmp/control.fasta -B $(readlink output/splitDecay/test_highDecay.fasta):/tmp/rna.fasta
docker://memesuite/memesuite:5.5.7 ame $( cat output/mastFilter_out/motifsToKeep.txt)
--oc output/splitDecay/high/test_ame_output --control /tmp/control.fasta --evalue-report-threshold
100 --method fisher --rna /tmp/rna.fasta
/opt/meme/share/meme-5.5.7/db/motif_databases/RNA/Ray2013_rbp_Drosophila_melanogaster.meme
deps:
- path: output/splitDecay/test_highDecay.fasta
hash: md5
md5: a19201a8859d8ca1791495ebe8cb9477
size: 232469
- path: output/splitDecay/test_lowDecay.fasta
hash: md5
md5: a1aa8f945a28c0bdb1914b5d484f0a6c
size: 280591
params:
params.yaml:
runAME_highLowDecay:
output_dir: ame_output
outs:
- path: output/splitDecay/high/test_ame_output
hash: md5
md5: 7f85baa87009a0842c38d9f638b88763.dir
size: 488335
nfiles: 3
runAME_highLowDecay@1:
cmd: mkdir -p output/splitDecay/low/ame_output && singularity exec -B $(readlink
output/splitDecay/test_highDecay.fasta):/tmp/control.fasta -B $(readlink output/splitDecay/test_lowDecay.fasta):/tmp/rna.fasta
docker://memesuite/memesuite:5.5.7 ame $( cat output/mastFilter_out/motifsToKeep.txt)
--oc output/splitDecay/low/test_ame_output --control /tmp/control.fasta --evalue-report-threshold
100 --method fisher --rna /tmp/rna.fasta
/opt/meme/share/meme-5.5.7/db/motif_databases/RNA/Ray2013_rbp_Drosophila_melanogaster.meme
deps:
- path: output/splitDecay/test_highDecay.fasta
hash: md5
md5: a19201a8859d8ca1791495ebe8cb9477
size: 232469
- path: output/splitDecay/test_lowDecay.fasta
hash: md5
md5: a1aa8f945a28c0bdb1914b5d484f0a6c
size: 280591
params:
params.yaml:
runAME_highLowDecay:
output_dir: ame_output
outs:
- path: output/splitDecay/low/test_ame_output
hash: md5
md5: 108821ae2118c8a5249e8d3354688491.dir
size: 820426
nfiles: 3
splitHighLowDecay@0:
cmd: python scripts/splitHighLowDecay.py --params params.yaml --input output/data/utrTE.fasta
--high all_highDecay.fasta --low all_lowDecay.fasta
deps:
- path: output/data/utrTE.fasta
hash: md5
md5: 0d9f0b6f3bd89c0956c719f42505bcd4
size: 4287694
- path: scripts/splitHighLowDecay.py
hash: md5
md5: 7abc6beff3b46ded2220b95a903ab88d
size: 4713
params:
params.yaml:
splitHighLowDecay:
output: output/splitDecay
high: highDecay.fasta
low: lowDecay.fasta
outs:
- path: output/splitDecay/all_highDecay.fasta
hash: md5
md5: 409a3ec71995d79612a086fe0ab80432
size: 1487749
- path: output/splitDecay/all_lowDecay.fasta
hash: md5
md5: cffc242e8e45681a05cf8cb4858bda8d
size: 1905816
splitHighLowDecay@1:
cmd: python scripts/splitHighLowDecay.py --params params.yaml --input output/data/decay/train.fasta
--high train_highDecay.fasta --low train_lowDecay.fasta
deps:
- path: output/data/decay/train.fasta
hash: md5
md5: 4c7808bd05acff6e4808f9d5997c57ab
size: 3014517
- path: scripts/splitHighLowDecay.py
hash: md5
md5: 7abc6beff3b46ded2220b95a903ab88d
size: 4713
params:
params.yaml:
splitHighLowDecay:
output: output/splitDecay
high: highDecay.fasta
low: lowDecay.fasta
outs:
- path: output/splitDecay/train_highDecay.fasta
hash: md5
md5: aa53f369db8a11a7bddf03ab59d5c71a
size: 1043574
- path: output/splitDecay/train_lowDecay.fasta
hash: md5
md5: 566e9e5ce15cf87030062474d15b43df
size: 1356428
prepRandomizeSeqsAndExtraFeatures@same3UTR:
cmd: python scripts/prepRandomizeSeqsAndExtraFeatures.py --params params.yaml
--sequence_file output/data/decay/test.fasta --process same3UTR --output_file
output/data/sanityCheck/same3UTR.fasta
deps:
- path: output/data/decay/test.fasta
hash: md5
md5: b261c024f57872ab67fa4ac555380895
size: 662914
- path: scripts/prepRandomizeSeqsAndExtraFeatures.py
hash: md5
md5: 87ad429f9d1f642a3febdefded03731f
size: 3000
params:
params.yaml:
prepRandomizeSeqsAndExtraFeatures:
sequence_dir: output/data/sanityCheck
outs:
- path: output/data/sanityCheck/same3UTR.fasta
hash: md5
md5: b398b883d5b7f3e6958f4a6fa6606938
size: 919108
prepRandomizeSeqsAndExtraFeatures@originalSeqs:
cmd: python scripts/prepRandomizeSeqsAndExtraFeatures.py --params params.yaml
--sequence_file output/data/decay/test.fasta --process originalSeqs --output_file
output/data/sanityCheck/originalSeqs.fasta
deps:
- path: output/data/decay/test.fasta
hash: md5
md5: b261c024f57872ab67fa4ac555380895
size: 662914
- path: scripts/prepRandomizeSeqsAndExtraFeatures.py
hash: md5
md5: 87ad429f9d1f642a3febdefded03731f
size: 3000
params:
params.yaml:
prepRandomizeSeqsAndExtraFeatures:
sequence_dir: output/data/sanityCheck
outs:
- path: output/data/sanityCheck/originalSeqs.fasta
hash: md5
md5: c890b6da675896a7317e5786284e8c1f
size: 660472
prepRandomizeSeqsAndExtraFeatures@same5UTR:
cmd: python scripts/prepRandomizeSeqsAndExtraFeatures.py --params params.yaml
--sequence_file output/data/decay/test.fasta --process same5UTR --output_file
output/data/sanityCheck/same5UTR.fasta
deps:
- path: output/data/decay/test.fasta
hash: md5
md5: b261c024f57872ab67fa4ac555380895
size: 662914
- path: scripts/prepRandomizeSeqsAndExtraFeatures.py
hash: md5
md5: 87ad429f9d1f642a3febdefded03731f
size: 3000
params:
params.yaml:
prepRandomizeSeqsAndExtraFeatures:
sequence_dir: output/data/sanityCheck
outs:
- path: output/data/sanityCheck/same5UTR.fasta
hash: md5
md5: ea61567c618fe6b25230682d8748d726
size: 661048
prepRandomizeSeqsAndExtraFeatures@sameSeq:
cmd: python scripts/prepRandomizeSeqsAndExtraFeatures.py --params params.yaml
--sequence_file output/data/decay/test.fasta --process sameSeq --output_file
output/data/sanityCheck/sameSeq.fasta
deps:
- path: output/data/decay/test.fasta
hash: md5
md5: b261c024f57872ab67fa4ac555380895
size: 662914
- path: scripts/prepRandomizeSeqsAndExtraFeatures.py
hash: md5
md5: 87ad429f9d1f642a3febdefded03731f
size: 3000
params:
params.yaml:
prepRandomizeSeqsAndExtraFeatures:
sequence_dir: output/data/sanityCheck
outs:
- path: output/data/sanityCheck/sameSeq.fasta
hash: md5
md5: c8cada44e6c3a2308d85f13412a8e895
size: 931505
runAME_highLowDecay@2:
cmd: mkdir -p output/splitDecay/high/ame_output && singularity exec -B $(readlink
output/splitDecay/dev_lowDecay.fasta):/tmp/control.fasta -B $(readlink output/splitDecay/dev_highDecay.fasta):/tmp/rna.fasta
docker://memesuite/memesuite:5.5.7 ame $( cat output/mastFilter_out/motifsToKeep.txt)
--oc output/splitDecay/high/dev_ame_output --control /tmp/control.fasta --evalue-report-threshold
100 --method fisher --rna /tmp/rna.fasta
/opt/meme/share/meme-5.5.7/db/motif_databases/RNA/Ray2013_rbp_Drosophila_melanogaster.meme
deps:
- path: output/splitDecay/dev_highDecay.fasta
hash: md5
md5: 65324f647429226558af11adb83d8fa9
size: 231371
- path: output/splitDecay/dev_lowDecay.fasta
hash: md5
md5: e6d8a7f7e609c0229c27bd90fd1a0a49
size: 289153
params:
params.yaml:
runAME_highLowDecay:
output_dir: ame_output
outs:
- path: output/splitDecay/high/dev_ame_output
hash: md5
md5: ba047424741264aa297459cc2a4b54e7.dir
size: 425214
nfiles: 3
runAME_highLowDecay@3:
cmd: mkdir -p output/splitDecay/low/ame_output && singularity exec -B $(readlink
output/splitDecay/dev_highDecay.fasta):/tmp/control.fasta -B $(readlink output/splitDecay/dev_lowDecay.fasta):/tmp/rna.fasta
docker://memesuite/memesuite:5.5.7 ame $( cat output/mastFilter_out/motifsToKeep.txt)
--oc output/splitDecay/low/dev_ame_output --control /tmp/control.fasta --evalue-report-threshold
100 --method fisher --rna /tmp/rna.fasta
/opt/meme/share/meme-5.5.7/db/motif_databases/RNA/Ray2013_rbp_Drosophila_melanogaster.meme
deps:
- path: output/splitDecay/dev_highDecay.fasta
hash: md5
md5: 65324f647429226558af11adb83d8fa9
size: 231371
- path: output/splitDecay/dev_lowDecay.fasta
hash: md5
md5: e6d8a7f7e609c0229c27bd90fd1a0a49
size: 289153
params:
params.yaml:
runAME_highLowDecay:
output_dir: ame_output
outs:
- path: output/splitDecay/low/dev_ame_output
hash: md5
md5: 5943b500c084359afe28edb820b0098e.dir
size: 864645
nfiles: 3
randomizeSeqsAndExtraFeatures@same5UTR:
cmd: python scripts/sanityCheck_predictions.py --params params.yaml --sequence_file
output/data/sanityCheck/same5UTR.fasta --save_path output/data/sanityCheck/same5UTR_predictions.csv
--extraFeatures output/data/codons/extraFeatures.csv --mfe output/data/codons/vienna_features.csv
deps:
- path: output/data/sanityCheck/same5UTR.fasta
hash: md5
md5: ea61567c618fe6b25230682d8748d726
size: 661048
- path: scripts/sanityCheck_predictions.py
hash: md5
md5: 5f485cadac7c483fb1b9359501eb7437
size: 14042
params:
params.yaml:
prepRandomizeSeqsAndExtraFeatures:
sequence_dir: output/data/sanityCheck
outs:
- path: output/data/sanityCheck/same5UTR_predictions.csv
hash: md5
md5: 5abe8df0e78d607ad52fcdf5ea40d0a6
size: 12547
randomizeSeqsAndExtraFeatures@same3UTR:
cmd: python scripts/sanityCheck_predictions.py --params params.yaml --sequence_file
output/data/sanityCheck/same3UTR.fasta --save_path output/data/sanityCheck/same3UTR_predictions.csv
--extraFeatures output/data/codons/extraFeatures.csv --mfe output/data/codons/vienna_features.csv
deps:
- path: output/data/sanityCheck/same3UTR.fasta
hash: md5
md5: b398b883d5b7f3e6958f4a6fa6606938
size: 919108
- path: scripts/sanityCheck_predictions.py
hash: md5
md5: 5f485cadac7c483fb1b9359501eb7437
size: 14042
params:
params.yaml:
prepRandomizeSeqsAndExtraFeatures:
sequence_dir: output/data/sanityCheck
outs:
- path: output/data/sanityCheck/same3UTR_predictions.csv
hash: md5
md5: 3d05f3f4fa162e6f80d1da7ad7bb51fb
size: 12549
randomizeSeqsAndExtraFeatures@originalSeqs:
cmd: python scripts/sanityCheck_predictions.py --params params.yaml --sequence_file
output/data/sanityCheck/originalSeqs.fasta --save_path output/data/sanityCheck/originalSeqs_predictions.csv
--extraFeatures output/data/codons/extraFeatures.csv --mfe output/data/codons/vienna_features.csv
deps:
- path: output/data/sanityCheck/originalSeqs.fasta
hash: md5
md5: c890b6da675896a7317e5786284e8c1f
size: 660472
- path: scripts/sanityCheck_predictions.py
hash: md5
md5: 5f485cadac7c483fb1b9359501eb7437
size: 14042
params:
params.yaml:
prepRandomizeSeqsAndExtraFeatures:
sequence_dir: output/data/sanityCheck
outs:
- path: output/data/sanityCheck/originalSeqs_predictions.csv
hash: md5
md5: e75ca98c77a7186cb8bc4d76ef201847
size: 12540
randomizeSeqsAndExtraFeatures@sameSeq:
cmd: python scripts/sanityCheck_predictions.py --params params.yaml --sequence_file
output/data/sanityCheck/sameSeq.fasta --save_path output/data/sanityCheck/sameSeq_predictions.csv
--extraFeatures output/data/codons/extraFeatures.csv --mfe output/data/codons/vienna_features.csv
deps:
- path: output/data/sanityCheck/sameSeq.fasta
hash: md5
md5: c8cada44e6c3a2308d85f13412a8e895
size: 931505
- path: scripts/sanityCheck_predictions.py
hash: md5
md5: 5f485cadac7c483fb1b9359501eb7437
size: 14042
params:
params.yaml:
prepRandomizeSeqsAndExtraFeatures:
sequence_dir: output/data/sanityCheck
outs:
- path: output/data/sanityCheck/sameSeq_predictions.csv
hash: md5
md5: 611eeb586547a9720d897fd1870eecef
size: 12505
isolateFullTranscripts:
cmd: conda run -n rayTune python scripts/isolateFullTranscripts.py --params params.yaml
--data_dir data/3UTRs --target_ids_fasta output/data/utrDecayRates.fasta
deps:
- path: data/downloaded
hash: md5
md5: edd2ffb5bae5b9df825ca55bf7409d09.dir
size: 50408678
nfiles: 2
- path: output/data/utrDecayRates.fasta
hash: md5
md5: 598e8e9d16f74d3608117f77311acf8a
size: 891903
- path: scripts/isolateFullTranscripts.py
hash: md5
md5: 7aa6be3829f80630148a9699b5030fe4
size: 3799
params:
params.yaml:
isolateFullTranscripts:
output_file: output/data/full_transcripts/Dmela-full_transcripts.fa
outs:
- path: output/data/full_transcripts/Dmela-full_transcripts.fa
hash: md5
md5: 7c19d64b6a5afe32c055fd0944612505
size: 2444865
runViennaRNA:
cmd: conda run -n rayTune python scripts/runViennaRNA.py --input_fasta output/data/full_transcripts/Dmela-full_transcripts.fa
--output_csv output/data/codons/vienna_features.csv --condaPath /mnt/mr01-home01/m65338lb/.local/bin/micromamba
--condaEnv inseq --RNAfold_path linearfold