# ---------------------------------------------------------------------------
# Diff context that preceded tools/semibin/convert.py in this patch, retained
# verbatim (the XML hunks arrived with their tags already stripped):
#
#   diff --git a/tools/semibin/bin.xml b/tools/semibin/bin.xml
#   index 3f7886b8c94..65ca5d06aa4 100644
#   --- a/tools/semibin/bin.xml
#   +++ b/tools/semibin/bin.xml
#   @@ -11,11 +11,15 @@ @@ -44,7 +85,7 @@ SemiBin2 bin - + @@ -78,7 +119,7 @@ SemiBin2 bin - +
#   @@ -141,7 +182,7 @@ SemiBin2 bin - + @@ -172,4 +213,4 @@ Outputs ]]> - +
#   \ No newline at end of file
#   diff --git a/tools/semibin/concatenate_fasta.xml b/tools/semibin/concatenate_fasta.xml
#   index a4624f4b60a..d09f70081d5 100644
#   --- a/tools/semibin/concatenate_fasta.xml
#   +++ b/tools/semibin/concatenate_fasta.xml
#   @@ -63,4 +63,4 @@ Outputs ]]> - +
#   \ No newline at end of file
#   diff --git a/tools/semibin/convert.py b/tools/semibin/convert.py
#   new file mode 100644
#   index 00000000000..071783a888f
#   --- /dev/null
#   +++ b/tools/semibin/convert.py
#   @@ -0,0 +1,94 @@
# ---------------------------------------------------------------------------
"""Convert a PyTorch checkpoint to a SafeTensors file and back.

Usage::

    python convert.py path/to/model.pt            # writes <dir>/model.safetensors
    python convert.py path/to/model.safetensors   # writes <dir>/model.pt

SafeTensors stores a flat ``{str: Tensor}`` mapping, so the checkpoint is
first *encoded* (every non-tensor leaf is pickled into a 1-D uint8 tensor,
lists/tuples become dicts keyed by index) and then *flattened* (nested dict
keys are joined with ``/``).  Restoring applies the inverse steps.

Known limitations of this encoding:

* non-empty tuples are restored as lists;
* a dict whose keys are all decimal strings is restored as a list;
* dict keys that contain the separator ``/`` do not round-trip.
"""
import os
import pickle
import sys

import torch

# Every pickle written with protocol >= 2 (all Python 3 defaults) starts with
# the PROTO opcode 0x80 and ends with the STOP opcode ".".
_PICKLE_PROTO = 0x80
_PICKLE_STOP = 0x2E

# -------------------------------
# Metadata encoding/decoding
# -------------------------------


def _pickle_to_tensor(obj):
    """Pickle *obj* and return the bytes as a 1-D uint8 tensor."""
    payload = bytearray(pickle.dumps(obj))
    # frombuffer avoids building one Python int per byte; clone() gives the
    # tensor its own storage, independent of the temporary bytearray.
    return torch.frombuffer(payload, dtype=torch.uint8).clone()


def _looks_like_pickle(tensor):
    """Return True if *tensor* has the shape and framing of a pickled blob."""
    return (
        tensor.dtype == torch.uint8
        and tensor.dim() == 1
        and tensor.numel() >= 2
        and int(tensor[0]) == _PICKLE_PROTO
        and int(tensor[-1]) == _PICKLE_STOP
    )


def _encode_child(value):
    """Encode a value nested inside a container.

    Empty containers are pickled as a whole: encoded as an empty dict they
    would vanish when the structure is flattened, losing the key entirely.
    """
    if isinstance(value, (dict, list, tuple)) and not value:
        return _pickle_to_tensor(value)
    return encode_metadata(value)


def encode_metadata(obj):
    """
    Recursively encode Python objects into tensors:
    - torch.Tensor → leave as-is
    - dict → recursively encode the values
    - list/tuple → convert to dict {"0": v0, "1": v1, ...} and encode recursively
    - nested empty dict/list/tuple → pickle into uint8 tensor (kept intact)
    - other → pickle into uint8 tensor
    """
    if isinstance(obj, torch.Tensor):
        return obj
    if isinstance(obj, dict):
        return {k: _encode_child(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return {str(i): _encode_child(v) for i, v in enumerate(obj)}
    return _pickle_to_tensor(obj)


def decode_metadata(obj):
    """
    Recursively decode tensors back into Python objects.

    - uint8 tensors that look like a pickle (1-D, PROTO ... STOP) are
      unpickled; every other tensor, including ordinary uint8 data, is
      returned unchanged
    - non-empty dicts whose keys are all decimal strings become lists
      ordered by the integer value of the key
    - other dicts are decoded value by value
    """
    if isinstance(obj, torch.Tensor):
        if _looks_like_pickle(obj):
            # SECURITY: unpickling can execute arbitrary code. Only convert
            # SafeTensors files that come from a trusted source.
            return pickle.loads(bytes(obj.tolist()))
        return obj
    if isinstance(obj, dict):
        # `obj and` keeps an empty dict a dict (all() of nothing is True).
        # isdecimal() only accepts characters that int() can parse.
        if obj and all(isinstance(k, str) and k.isdecimal() for k in obj):
            return [decode_metadata(obj[k]) for k in sorted(obj, key=int)]
        return {k: decode_metadata(v) for k, v in obj.items()}
    return obj

# -------------------------------
# Flatten/unflatten for SafeTensors
# -------------------------------


def flatten_dict(d, parent_key='', sep='/'):
    """Flatten nested dicts into one level, joining keys with *sep*.

    Keys are always returned as strings.  Empty nested dicts contribute no
    entries.
    """
    items = {}
    for k, v in d.items():
        # str(k) keeps top-level keys consistent with nested ones, which the
        # f-string already stringifies.
        new_key = f"{parent_key}{sep}{k}" if parent_key else str(k)
        if isinstance(v, dict):
            items.update(flatten_dict(v, new_key, sep=sep))
        else:
            items[new_key] = v
    return items


def unflatten_dict(d, sep='/'):
    """Rebuild nested dicts from a flat mapping whose keys are joined by *sep*."""
    result = {}
    for flat_key, value in d.items():
        *parents, leaf = flat_key.split(sep)
        node = result
        for name in parents:
            node = node.setdefault(name, {})
        node[leaf] = value
    return result


# -------------------------------
# Save .pt as SafeTensors (or restore it)
# -------------------------------


def main(argv=None):
    """Convert the file named by the first argument.

    A path ending in ``.pt`` is written as ``model.safetensors`` next to the
    input; any other path is read as SafeTensors and written as ``model.pt``
    next to the input.

    Parameters:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Raises:
        SystemExit: if no path is given, or the checkpoint does not encode
            to a dict.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit("usage: convert.py <model.pt | model.safetensors>")
    file_path = args[0]
    out_dir = os.path.dirname(file_path)

    # Imported here so the helper functions above stay importable when
    # safetensors is not installed.
    from safetensors.torch import load_file, save_file

    if file_path.endswith('.pt'):
        # Read the file that was actually passed in, not a fixed name in the
        # current working directory.
        checkpoint = torch.load(file_path, map_location="cpu")
        encoded = encode_metadata(checkpoint)
        if not isinstance(encoded, dict):
            sys.exit(
                f"{file_path}: expected the checkpoint to be a dict, list or "
                f"tuple, got {type(checkpoint).__name__}"
            )
        out_path = os.path.join(out_dir, "model.safetensors")
        save_file(flatten_dict(encoded), out_path)
        print(f"Saved restorable SafeTensors file as {out_path}!")
    else:
        loaded_nested = unflatten_dict(load_file(file_path))
        restored_checkpoint = decode_metadata(loaded_nested)
        out_path = os.path.join(out_dir, "model.pt")
        torch.save(restored_checkpoint, out_path)
        print(f"Saved restored checkpoint as {out_path}!")


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Diff context that followed convert.py on this line, retained verbatim (the
# header continues on the next line of the patch):
#
#   diff --git a/tools/semibin/generate_cannot_links.xml
#   index 209ed24013f..3d1ff31d9f5 100644
#   --- a/tools/semibin/generate_cannot_links.xml
#   +++
# ---------------------------------------------------------------------------
b/tools/semibin/generate_cannot_links.xml @@ -30,7 +30,6 @@ SemiBin2 #if $ml_threshold: --ml-threshold $ml_threshold #end if - --cannot-name 'cannot' --threads \${GALAXY_SLOTS:-1} --processes \${GALAXY_SLOTS:-1} ]]> @@ -136,4 +135,4 @@ Outputs ]]> - + \ No newline at end of file diff --git a/tools/semibin/generate_sequence_features.xml b/tools/semibin/generate_sequence_features.xml index 2600ddf0f91..b5c4005c875 100644 --- a/tools/semibin/generate_sequence_features.xml +++ b/tools/semibin/generate_sequence_features.xml @@ -433,4 +433,4 @@ Outputs ]]> - + \ No newline at end of file diff --git a/tools/semibin/macros.xml b/tools/semibin/macros.xml index 17f22410405..657fa95340a 100644 --- a/tools/semibin/macros.xml +++ b/tools/semibin/macros.xml @@ -1,8 +1,8 @@ - 2.1.0 - 1 - 21.01 + 2.2.0 + 0 + 25.0 semibin @@ -11,6 +11,7 @@ semibin + safetensors @@ -462,7 +463,7 @@ ln -s '$e' '${identifier}.bam' && - + diff --git a/tools/semibin/semibin.xml b/tools/semibin/semibin.xml index 1c36ea5ff71..9cbb559a8cd 100644 --- a/tools/semibin/semibin.xml +++ b/tools/semibin/semibin.xml @@ -75,8 +75,27 @@ SemiBin2 --compression none --threads \${GALAXY_SLOTS:-1} --processes \${GALAXY_SLOTS:-1} + 2> 'stderr.txt' || true && -echo "output" && + +if grep -q "is empty or misformatted" 'stderr.txt'; then + mkdir 'output/output_bins' 'output/output_recluster_bins' 'output/output_prerecluster_bins'; + touch 'output/output_bins/empty_bin.fa' ; + touch 'output/output_recluster_bins/empty_bin.fa' ; + touch 'output/output_prerecluster_bins/empty_bin.fa' ; +fi +&& + +if grep -q "Edge weights must not be NaN values" 'stderr.txt'; then + mkdir 'output/output_bins' 'output/output_recluster_bins' 'output/output_prerecluster_bins'; + touch 'output/output_bins/empty_bin.fa' ; + touch 'output/output_recluster_bins/empty_bin.fa' ; + touch 'output/output_prerecluster_bins/empty_bin.fa' ; +fi + +&& cat 'stderr.txt' >&2 +&& rm 'stderr.txt' +&& echo "output" && ls output ]]> diff --git 
a/tools/semibin/test-data/model.h5 b/tools/semibin/test-data/model.h5 deleted file mode 100644 index e2b0c4d7da2..00000000000 Binary files a/tools/semibin/test-data/model.h5 and /dev/null differ diff --git a/tools/semibin/train.xml b/tools/semibin/train.xml index c392442322f..86b5628f76a 100644 --- a/tools/semibin/train.xml +++ b/tools/semibin/train.xml @@ -55,6 +55,9 @@ SemiBin2 train_semi --ratio $min_len.ratio #end if --orf-finder '$orf_finder' + +&& python '$__tool_directory__/convert.py' 'output/model.pt' +&& rm 'output/model.pt' ]]> @@ -101,9 +104,53 @@ SemiBin2 train_semi - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -167,9 +214,9 @@ SemiBin2 train_semi - + - + @@ -192,4 +239,4 @@ Outputs @HELP_MODEL@ ]]> - + \ No newline at end of file