Merge pull request #127 from KoslickiLab/dev-patch-NEW-algorithm
Patch to use `.sig` instead of `*.sig.gz`
mahmudhera authored Nov 6, 2024
2 parents 5628ee0 + f316419 commit 15ccba9
Showing 12 changed files with 110 additions and 138 deletions.
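In short, the patch moves gzip handling out of the C++ code path: after a sourmash signature zip is extracted, every `signatures/*.sig.gz` file is decompressed once into a plain `.sig` file, and all downstream readers switch their suffix from `.sig.gz` to `.sig`. A minimal Python sketch of the idea (the temp-directory path is hypothetical):

import glob
import gzip
import os

# hypothetical temp dir produced by extracting a sourmash signature zip
for gz_path in glob.glob("temp_dir/signatures/*.sig.gz"):
    sig_path = gz_path[:-3]  # "foo.sig.gz" -> "foo.sig"
    with gzip.open(gz_path, "rb") as f_in, open(sig_path, "wb") as f_out:
        f_out.write(f_in.read())  # mirrors the helper added in utils.py below
    os.remove(gz_path)  # keep only the decompressed .sig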
6 changes: 5 additions & 1 deletion .gitignore
@@ -167,4 +167,8 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
-gtdb-rs214-reps.k31_0.9995_pretrained/
+gtdb-rs214-reps.k31_0.9995_pretrained/
+
+# added by mahmudhera
+src/cpp/main.o
+.gitignore
11 changes: 11 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,11 @@
+# Include C++ source files
+include src/cpp/*.cpp
+include src/cpp/*.hpp
+
+# Include other necessary files
+include LICENSE.txt
+include README.md
+include Makefile
+include build.sh
+include build_unix.sh
+include build_windows.bat
6 changes: 3 additions & 3 deletions Makefile
@@ -1,5 +1,5 @@
# Compiler and flags
-CXX = g++
+CXX ?= g++
CXXFLAGS = -std=c++17 -Wall -w -O3 -Wsign-compare

# Directories
@@ -26,12 +26,12 @@ $(BIN_DIR):
# build the object files
$(OBJ_FILES): %.o: %.cpp
echo "Compiling: $<"
-$(CXX) $(CXXFLAGS) -c $< -o $@ -lz
+$(CXX) $(CXXFLAGS) -c $< -o $@

# build the target executable
$(TARGET): $(OBJ_FILES) | $(BIN_DIR)
echo "Linking to create executable: $(TARGET)"
-$(CXX) $(CXXFLAGS) $(OBJ_FILES) -o $(TARGET) -lz -lpthread
+$(CXX) $(CXXFLAGS) $(OBJ_FILES) -o $(TARGET) -lpthread

# clean up
clean:
24 changes: 17 additions & 7 deletions conda_recipe/meta.yaml
@@ -6,19 +6,19 @@ package:

source:
url: https://github.com/KoslickiLab/YACHT/releases/download/v{{ version }}/yacht-{{ version }}.tar.gz
-sha256: 3365731592e37b5708fb6bdedc817debe813f7050814a68e07d1a4836337762e
+sha256: 68d272daeb70ed7390aa2d468934dc4bf0aa9a021f99fe99847b8a664e8ac8cf

build:
number: 0
+skip: True # [osx]
script: "{{ PYTHON }} -m pip install . --no-deps --no-build-isolation --no-cache-dir -vvv"
+run_exports:
+- {{ pin_subpackage('yacht') }}

requirements:
build:
-- {{ compiler('cxx') }} # Adds platform-specific C++ compiler (g++, clang, MSVC)
+- {{ compiler('cxx') }}
- make # Ensures that Make is available (for Unix)
-- python >3.6,<3.12
-- pip
-- setuptools

host:
- python >3.6,<3.12
Expand Down Expand Up @@ -54,7 +54,6 @@ requirements:
- ruff
- sourmash_plugin_branchwater

-
test:
commands:
- yacht --help
@@ -71,8 +70,19 @@
extra:
skip-lints:
- should_use_compilers
+- should_be_noarch_generic
identifiers:
- doi:10.1093/bioinformatics/btae047
recipe-maintainers:
- chunyuma
-- dkoslicki
+- dkoslicki
+authors:
+- dkoslicki
+- chunyuma
+- mahmudhera
+- sew347
+- mlupei
+- mfl15
+- ShaopengLiu1
+- raquellewei
+- mohsenht
33 changes: 0 additions & 33 deletions src/cpp/main.cpp
@@ -27,9 +27,6 @@
#include <chrono>
#include <random>

-
-#include <zlib.h>
-
using namespace std;
using json = nlohmann::json;

@@ -62,38 +59,8 @@ vector<vector<int>> similars;


-
-string decompressGzip(const std::string& filename) {
-    // Open file
-    gzFile file = gzopen(filename.c_str(), "rb");
-    if (!file) {
-        throw runtime_error("Failed to open gzip file.");
-    }
-
-    // Buffer for decompressed data
-    const size_t bufferSize = 8192;
-    vector<char> buffer(bufferSize);
-    string decompressedData;
-
-    int bytesRead;
-    while ((bytesRead = gzread(file, buffer.data(), bufferSize)) > 0) {
-        decompressedData.append(buffer.data(), bytesRead);
-    }
-
-    gzclose(file);
-    return decompressedData;
-}
-
-
-
vector<hash_t> read_min_hashes(const string& json_filename) {

-    // check if gz file
-    if (json_filename.find(".gz") != std::string::npos) {
-        auto jsonData = json::parse(decompressGzip(json_filename));
-        std::vector<hash_t> min_hashes = jsonData[0]["signatures"][0]["mins"];
-        return min_hashes;
-    }

    // Open the JSON file
    ifstream inputFile(json_filename);

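With the gzip branch removed, `read_min_hashes` only ever parses plain `.sig` JSON. For orientation, a rough Python equivalent of the surviving read path; the JSON layout `[0]["signatures"][0]["mins"]` comes straight from the deleted C++ lines above, while the standalone function itself is illustrative:

import json
from typing import List

def read_min_hashes(sig_path: str) -> List[int]:
    # a .sig file is a JSON array; the min-hash list lives at
    # [0]["signatures"][0]["mins"], as in the C++ reader above
    with open(sig_path) as f:
        data = json.load(f)
    return data[0]["signatures"][0]["mins"]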
13 changes: 9 additions & 4 deletions src/yacht/hypothesis_recovery_src.py
@@ -9,8 +9,9 @@
from tqdm import tqdm
from multiprocessing import Pool
import sourmash
+import glob
from typing import List, Set, Tuple
-from .utils import load_signature_with_ksize
+from .utils import load_signature_with_ksize, decompress_all_sig_files
# Configure Loguru logger
from loguru import logger

@@ -23,7 +24,7 @@
sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO"
)

SIG_SUFFIX = ".sig.gz"
SIG_SUFFIX = ".sig"


def get_organisms_with_nonzero_overlap(
@@ -61,6 +62,10 @@ def get_organisms_with_nonzero_overlap(
logger.info("Unzipping the sample signature zip file")
with zipfile.ZipFile(sample_file, "r") as sample_zip_file:
sample_zip_file.extractall(path_to_sample_temp_dir)
all_gz_files = glob.glob(f"{path_to_sample_temp_dir}/signatures/*.sig.gz")
# decompress all signature files
logger.info(f"Decompressing {len(all_gz_files)} .sig.gz files using {num_threads} threads.")
decompress_all_sig_files(all_gz_files, num_threads)

sample_sig_file = pd.DataFrame(
[
@@ -141,7 +146,7 @@ def __find_exclusive_hashes(
) -> Set[int]:
    # load genome signature
    sig = load_signature_with_ksize(
-        os.path.join(path_to_temp_dir, "signatures", md5sum + ".sig.gz"), ksize
+        os.path.join(path_to_temp_dir, "signatures", md5sum + SIG_SUFFIX), ksize
    )
    return {hash for hash in sig.minhash.hashes if hash in single_occurrence_hashes}

@@ -155,7 +160,7 @@ def __find_exclusive_hashes(
    multiple_occurrence_hashes: Set[int] = set()
    for md5sum in tqdm(organism_md5sum_list, desc="Processing organism signatures"):
        sig = load_signature_with_ksize(
-            os.path.join(path_to_genome_temp_dir, "signatures", md5sum + ".sig.gz"),
+            os.path.join(path_to_genome_temp_dir, "signatures", md5sum + SIG_SUFFIX),
            ksize,
        )
        for hash in sig.minhash.hashes:
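Taken together, the patched sample flow is: extract the zip, decompress every `.sig.gz` once, then load signatures by `md5sum + SIG_SUFFIX`. A condensed sketch, with hypothetical archive/directory names and a made-up md5sum:

import glob
import zipfile

from yacht.utils import decompress_all_sig_files
from yacht.utils import load_signature_with_ksize

with zipfile.ZipFile("sample.zip", "r") as zf:  # hypothetical sample archive
    zf.extractall("sample_temp")

gz_files = glob.glob("sample_temp/signatures/*.sig.gz")
decompress_all_sig_files(gz_files, num_threads=8)

# downstream lookups now use the plain ".sig" suffix
md5sum = "0123abcd"  # made-up signature checksum
sig = load_signature_with_ksize(f"sample_temp/signatures/{md5sum}.sig", 31)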
6 changes: 6 additions & 0 deletions src/yacht/make_training_data_from_sketches.py
@@ -7,6 +7,7 @@
from loguru import logger
import json
import shutil
+import glob
from . import utils

# Configure Loguru logger
@@ -107,6 +108,11 @@ def main(args):
logger.info("Unzipping the sourmash signature file to the temporary directory")
with zipfile.ZipFile(ref_file, "r") as sourmash_db:
sourmash_db.extractall(path_to_temp_dir)
all_gz_files = glob.glob(f"{path_to_temp_dir}/signatures/*.sig.gz")

# decompress all signature files
logger.info(f"Decompressing {len(all_gz_files)} .sig.gz files using {num_threads} threads.")
utils.decompress_all_sig_files(all_gz_files, num_threads)

# Extract signature information
logger.info("Extracting signature information")
11 changes: 10 additions & 1 deletion src/yacht/run_YACHT.py
@@ -9,8 +9,9 @@
import json
import warnings
import zipfile
+import glob
from loguru import logger
-
+from .utils import decompress_all_sig_files
warnings.filterwarnings("ignore")

# Configure Loguru logger
@@ -173,6 +174,14 @@ def main(args):
    else:
        has_raw = True

+    # a patch to check if the genome signature files have been decompressed
+    training_sig_file = glob.glob(f"{path_to_genome_temp_dir}/training_sig_files.*")[0]
+    df = pd.read_csv(training_sig_file, sep="\t", header=None)
+    if 'sig.gz' in df[0].values[0]:
+        pd.DataFrame([x.replace('sig.gz','sig') for x in df[0]]).to_csv(training_sig_file, header=False, index=False)
+        all_gz_files = glob.glob(f"{path_to_genome_temp_dir}/signatures/*.sig.gz")
+        decompress_all_sig_files(all_gz_files, num_threads)
+
manifest_list = hr.hypothesis_recovery(
manifest,
sample_info_set,
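The compatibility patch above targets reference databases built before this change: if the first path listed in `training_sig_files.*` still ends in `.sig.gz`, the list is rewritten and any leftover compressed signatures are decompressed. An equivalent, slightly more explicit sketch of the same rewrite (directory name hypothetical):

import glob
import pandas as pd

from yacht.utils import decompress_all_sig_files

sig_list_file = glob.glob("genome_temp/training_sig_files.*")[0]
df = pd.read_csv(sig_list_file, sep="\t", header=None)
if "sig.gz" in df[0].values[0]:
    # point the manifest at the decompressed .sig files
    df[0] = df[0].str.replace("sig.gz", "sig", regex=False)
    df.to_csv(sig_list_file, header=False, index=False)
    decompress_all_sig_files(glob.glob("genome_temp/signatures/*.sig.gz"), num_threads=8)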
92 changes: 25 additions & 67 deletions src/yacht/utils.py
@@ -8,6 +8,7 @@
from loguru import logger
from typing import Optional, List, Set, Dict, Tuple
import shutil
+import gzip
from glob import glob

# Configure Loguru logger
@@ -477,75 +478,32 @@ def check_download_args(args, db_type):
logger.error("We now haven't supported for virus database.")
sys.exit(1)

-def _temp_get_genome_name(sig_file_path, ksize):
-
-    res = get_info_from_single_sig(sig_file_path, ksize)
-    if res:
-        return res[0]
-    else:
-        return None
-
-def temp_generate_inputs(
-    selected_genomes_file_path: str,
-    sig_info_dict: Dict[str, Tuple[str, float, int, int]],
-    ksize: int,
-    num_threads: int = 16,
-) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """
-    Temporary helper function that generates the required input for `yacht run`.
-    :param selected_genomes_file_path: Path to a file containing all the genome file paths.
-    :param num_threads: Number of threads to use for multiprocessing when reading the comparison files. Default is 16.
-    :param sig_info_dict:
-        A dictionary mapping each genome signature name to a tuple containing metadata:
-        (md5sum, minhash mean abundance, minhash hashes length, minhash scaled).
-        - md5sum: Checksum for data integrity.
-        - minhash mean abundance: The mean abundance for the genome's minhash.
-        - minhash hashes length: The length of minhash hashes.
-        - minhash scaled: The scaling factor for the minhash.
-    :return:
-        manifest_df: a dataframe containing the processed reference signature information
-    """
-    # get info from the signature files of selected genomes
-    selected_sig_files = pd.read_csv(selected_genomes_file_path, sep="\t", header=None)
-    selected_sig_files = selected_sig_files[0].to_list()
-
-    # get the genome names from the signature files using multiprocessing
-    with Pool(num_threads) as p:
-        result_list = p.starmap(_temp_get_genome_name, [(sig_file_path, ksize) for sig_file_path in selected_sig_files])
-    selected_genome_names_set = set([x for x in result_list if x])
-
-    # remove the closely related organisms from the reference genome list
-    manifest_df = []
-    for sig_name, (
-        md5sum,
-        minhash_mean_abundance,
-        minhash_hashes_len,
-        minhash_scaled,
-    ) in tqdm(sig_info_dict.items(), desc="Removing closely related organisms from the reference genome list"):
-        if sig_name in selected_genome_names_set:
-            manifest_df.append(
-                (
-                    sig_name,
-                    md5sum,
-                    minhash_hashes_len,
-                    get_num_kmers(
-                        minhash_mean_abundance,
-                        minhash_hashes_len,
-                        minhash_scaled,
-                        False,
-                    ),
-                    minhash_scaled,
-                )
-            )
-    manifest_df = pd.DataFrame(
-        manifest_df,
-        columns=[
-            "organism_name",
-            "md5sum",
-            "num_unique_kmers_in_genome_sketch",
-            "num_total_kmers_in_genome_sketch",
-            "genome_scale_factor",
-        ],
-    )
-
-    return manifest_df
+def _decompress_and_remove(file_path: str) -> None:
+    """
+    Decompresses a GZIP-compressed file and removes the original compressed file.
+    :param file_path: The path to the .sig.gz file that needs to be decompressed and deleted.
+    :return: None
+    """
+    try:
+        output_filename = os.path.splitext(file_path)[0]
+        with gzip.open(file_path, 'rb') as f_in:
+            with open(output_filename, 'wb') as f_out:
+                f_out.write(f_in.read())
+
+        os.remove(file_path)
+
+    except Exception as e:
+        logger.info(f"Failed to process {file_path}: {e}")
+
+def decompress_all_sig_files(sig_files: List[str], num_threads: int) -> None:
+    """
+    Decompresses all .sig.gz files in the list using multiple threads.
+    :param sig_files: List of .sig.gz files that need to be decompressed.
+    :param num_threads: Number of threads to use for decompression.
+    :return: None
+    """
+    with Pool(num_threads) as p:
+        p.map(_decompress_and_remove, sig_files)
+
+    logger.info("All .sig.gz files have been decompressed.")
4 changes: 3 additions & 1 deletion tests/test_hypothesis_recovery_src.py
@@ -46,8 +46,10 @@ def test_hypothesis_recovery(self, mock_get_exclusive_hashes, mock_get_organisms
    @patch('yacht.hypothesis_recovery_src.os.listdir')
    @patch('builtins.open', new_callable=mock_open, create=True)
    @patch('yacht.hypothesis_recovery_src.zipfile.ZipFile')
-    def test_get_organisms_with_nonzero_overlap(self, mock_zipfile, _, mock_os_listdir,
+    @patch('glob.glob')
+    def test_get_organisms_with_nonzero_overlap(self, mock_glob, mock_zipfile, _, mock_os_listdir,
                                                 mock_os_path_join, mock_os_system, mock_read_csv):
+        mock_glob.return_value = ['training_sig_file_1.sig']
        mock_os_listdir.return_value = ['sig_file']
        mock_os_path_join.return_value = 'joined_path'
        mock_os_system.return_value = 0