Merge pull request #127 from KoslickiLab/dev-patch-NEW-algorithm
Patch to use `.sig` instead of `*.sig.gz`
mahmudhera authored Nov 6, 2024
2 parents 5628ee0 + f316419 commit 15ccba9
Showing 12 changed files with 110 additions and 138 deletions.
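In short, the patch moves gzip handling out of the C++ code path: after a sourmash signature zip is extracted, every `signatures/*.sig.gz` file is decompressed once into a plain `.sig` file, and all downstream readers switch their suffix from `.sig.gz` to `.sig`. A minimal Python sketch of the idea (the temp-directory path is hypothetical):

import glob
import gzip
import os

# hypothetical temp dir produced by extracting a sourmash signature zip
for gz_path in glob.glob("temp_dir/signatures/*.sig.gz"):
    sig_path = gz_path[:-3]  # "foo.sig.gz" -> "foo.sig"
    with gzip.open(gz_path, "rb") as f_in, open(sig_path, "wb") as f_out:
        f_out.write(f_in.read())  # mirrors the helper added in utils.py below
    os.remove(gz_path)  # keep only the decompressed .sig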
6 changes: 5 additions & 1 deletion .gitignore
@@ -167,4 +167,8 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
-gtdb-rs214-reps.k31_0.9995_pretrained/
+gtdb-rs214-reps.k31_0.9995_pretrained/
+
+# added by mahmudhera
+src/cpp/main.o
+.gitignore
11 changes: 11 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,11 @@
+# Include C++ source files
+include src/cpp/*.cpp
+include src/cpp/*.hpp
+
+# Include other necessary files
+include LICENSE.txt
+include README.md
+include Makefile
+include build.sh
+include build_unix.sh
+include build_windows.bat
6 changes: 3 additions & 3 deletions Makefile
@@ -1,5 +1,5 @@
# Compiler and flags
-CXX = g++
+CXX ?= g++
CXXFLAGS = -std=c++17 -Wall -w -O3 -Wsign-compare

# Directories
@@ -26,12 +26,12 @@ $(BIN_DIR):
# build the object files
$(OBJ_FILES): %.o: %.cpp
echo "Compiling: $<"
-$(CXX) $(CXXFLAGS) -c $< -o $@ -lz
+$(CXX) $(CXXFLAGS) -c $< -o $@

# build the target executable
$(TARGET): $(OBJ_FILES) | $(BIN_DIR)
echo "Linking to create executable: $(TARGET)"
-$(CXX) $(CXXFLAGS) $(OBJ_FILES) -o $(TARGET) -lz -lpthread
+$(CXX) $(CXXFLAGS) $(OBJ_FILES) -o $(TARGET) -lpthread

# clean up
clean:
24 changes: 17 additions & 7 deletions conda_recipe/meta.yaml
@@ -6,19 +6,19 @@ package:

source:
url: https://github.com/KoslickiLab/YACHT/releases/download/v{{ version }}/yacht-{{ version }}.tar.gz
-sha256: 3365731592e37b5708fb6bdedc817debe813f7050814a68e07d1a4836337762e
+sha256: 68d272daeb70ed7390aa2d468934dc4bf0aa9a021f99fe99847b8a664e8ac8cf

build:
number: 0
+skip: True # [osx]
script: "{{ PYTHON }} -m pip install . --no-deps --no-build-isolation --no-cache-dir -vvv"
+run_exports:
+- {{ pin_subpackage('yacht') }}

requirements:
build:
-- {{ compiler('cxx') }} # Adds platform-specific C++ compiler (g++, clang, MSVC)
+- {{ compiler('cxx') }}
- make # Ensures that Make is available (for Unix)
-- python >3.6,<3.12
-- pip
-- setuptools

host:
- python >3.6,<3.12
Expand Down Expand Up @@ -54,7 +54,6 @@ requirements:
- ruff
- sourmash_plugin_branchwater

-
test:
commands:
- yacht --help
@@ -71,8 +70,19 @@
extra:
skip-lints:
- should_use_compilers
+- should_be_noarch_generic
identifiers:
- doi:10.1093/bioinformatics/btae047
recipe-maintainers:
- chunyuma
-- dkoslicki
+- dkoslicki
+authors:
+- dkoslicki
+- chunyuma
+- mahmudhera
+- sew347
+- mlupei
+- mfl15
+- ShaopengLiu1
+- raquellewei
+- mohsenht
33 changes: 0 additions & 33 deletions src/cpp/main.cpp
@@ -27,9 +27,6 @@
#include <chrono>
#include <random>

-
-#include <zlib.h>
-
using namespace std;
using json = nlohmann::json;

@@ -62,38 +59,8 @@ vector<vector<int>> similars;


-
-string decompressGzip(const std::string& filename) {
-    // Open file
-    gzFile file = gzopen(filename.c_str(), "rb");
-    if (!file) {
-        throw runtime_error("Failed to open gzip file.");
-    }
-
-    // Buffer for decompressed data
-    const size_t bufferSize = 8192;
-    vector<char> buffer(bufferSize);
-    string decompressedData;
-
-    int bytesRead;
-    while ((bytesRead = gzread(file, buffer.data(), bufferSize)) > 0) {
-        decompressedData.append(buffer.data(), bytesRead);
-    }
-
-    gzclose(file);
-    return decompressedData;
-}
-
-
-
vector<hash_t> read_min_hashes(const string& json_filename) {

-    // check if gz file
-    if (json_filename.find(".gz") != std::string::npos) {
-        auto jsonData = json::parse(decompressGzip(json_filename));
-        std::vector<hash_t> min_hashes = jsonData[0]["signatures"][0]["mins"];
-        return min_hashes;
-    }

    // Open the JSON file
    ifstream inputFile(json_filename);

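With the gzip branch removed, `read_min_hashes` only ever parses plain `.sig` JSON. For orientation, a rough Python equivalent of the surviving read path; the JSON layout `[0]["signatures"][0]["mins"]` comes straight from the deleted C++ lines above, while the standalone function itself is illustrative:

import json
from typing import List

def read_min_hashes(sig_path: str) -> List[int]:
    # a .sig file is a JSON array; the min-hash list lives at
    # [0]["signatures"][0]["mins"], as in the C++ reader above
    with open(sig_path) as f:
        data = json.load(f)
    return data[0]["signatures"][0]["mins"]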
13 changes: 9 additions & 4 deletions src/yacht/hypothesis_recovery_src.py
@@ -9,8 +9,9 @@
from tqdm import tqdm
from multiprocessing import Pool
import sourmash
+import glob
from typing import List, Set, Tuple
-from .utils import load_signature_with_ksize
+from .utils import load_signature_with_ksize, decompress_all_sig_files
# Configure Loguru logger
from loguru import logger

@@ -23,7 +24,7 @@
sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO"
)

SIG_SUFFIX = ".sig.gz"
SIG_SUFFIX = ".sig"


def get_organisms_with_nonzero_overlap(
@@ -61,6 +62,10 @@ def get_organisms_with_nonzero_overlap(
logger.info("Unzipping the sample signature zip file")
with zipfile.ZipFile(sample_file, "r") as sample_zip_file:
sample_zip_file.extractall(path_to_sample_temp_dir)
all_gz_files = glob.glob(f"{path_to_sample_temp_dir}/signatures/*.sig.gz")
# decompress all signature files
logger.info(f"Decompressing {len(all_gz_files)} .sig.gz files using {num_threads} threads.")
decompress_all_sig_files(all_gz_files, num_threads)

sample_sig_file = pd.DataFrame(
[
@@ -141,7 +146,7 @@ def __find_exclusive_hashes(
) -> Set[int]:
    # load genome signature
    sig = load_signature_with_ksize(
-        os.path.join(path_to_temp_dir, "signatures", md5sum + ".sig.gz"), ksize
+        os.path.join(path_to_temp_dir, "signatures", md5sum + SIG_SUFFIX), ksize
    )
    return {hash for hash in sig.minhash.hashes if hash in single_occurrence_hashes}

@@ -155,7 +160,7 @@ def __find_exclusive_hashes(
    multiple_occurrence_hashes: Set[int] = set()
    for md5sum in tqdm(organism_md5sum_list, desc="Processing organism signatures"):
        sig = load_signature_with_ksize(
-            os.path.join(path_to_genome_temp_dir, "signatures", md5sum + ".sig.gz"),
+            os.path.join(path_to_genome_temp_dir, "signatures", md5sum + SIG_SUFFIX),
            ksize,
        )
        for hash in sig.minhash.hashes:
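Taken together, the patched sample flow is: extract the zip, decompress every `.sig.gz` once, then load signatures by `md5sum + SIG_SUFFIX`. A condensed sketch, with hypothetical archive/directory names and a made-up md5sum:

import glob
import zipfile

from yacht.utils import decompress_all_sig_files
from yacht.utils import load_signature_with_ksize

with zipfile.ZipFile("sample.zip", "r") as zf:  # hypothetical sample archive
    zf.extractall("sample_temp")

gz_files = glob.glob("sample_temp/signatures/*.sig.gz")
decompress_all_sig_files(gz_files, num_threads=8)

# downstream lookups now use the plain ".sig" suffix
md5sum = "0123abcd"  # made-up signature checksum
sig = load_signature_with_ksize(f"sample_temp/signatures/{md5sum}.sig", 31)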
6 changes: 6 additions & 0 deletions src/yacht/make_training_data_from_sketches.py
@@ -7,6 +7,7 @@
from loguru import logger
import json
import shutil
+import glob
from . import utils

# Configure Loguru logger
@@ -107,6 +108,11 @@ def main(args):
logger.info("Unzipping the sourmash signature file to the temporary directory")
with zipfile.ZipFile(ref_file, "r") as sourmash_db:
sourmash_db.extractall(path_to_temp_dir)
all_gz_files = glob.glob(f"{path_to_temp_dir}/signatures/*.sig.gz")

# decompress all signature files
logger.info(f"Decompressing {len(all_gz_files)} .sig.gz files using {num_threads} threads.")
utils.decompress_all_sig_files(all_gz_files, num_threads)

# Extract signature information
logger.info("Extracting signature information")
11 changes: 10 additions & 1 deletion src/yacht/run_YACHT.py
@@ -9,8 +9,9 @@
import json
import warnings
import zipfile
+import glob
from loguru import logger
-
+from .utils import decompress_all_sig_files
warnings.filterwarnings("ignore")

# Configure Loguru logger
@@ -173,6 +174,14 @@ def main(args):
    else:
        has_raw = True

+    # a patch to check if the genome signature files have been decompressed
+    training_sig_file = glob.glob(f"{path_to_genome_temp_dir}/training_sig_files.*")[0]
+    df = pd.read_csv(training_sig_file, sep="\t", header=None)
+    if 'sig.gz' in df[0].values[0]:
+        pd.DataFrame([x.replace('sig.gz','sig') for x in df[0]]).to_csv(training_sig_file, header=False, index=False)
+        all_gz_files = glob.glob(f"{path_to_genome_temp_dir}/signatures/*.sig.gz")
+        decompress_all_sig_files(all_gz_files, num_threads)
+
manifest_list = hr.hypothesis_recovery(
manifest,
sample_info_set,
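The compatibility patch above targets reference databases built before this change: if the first path listed in `training_sig_files.*` still ends in `.sig.gz`, the list is rewritten and any leftover compressed signatures are decompressed. An equivalent, slightly more explicit sketch of the same rewrite (directory name hypothetical):

import glob
import pandas as pd

from yacht.utils import decompress_all_sig_files

sig_list_file = glob.glob("genome_temp/training_sig_files.*")[0]
df = pd.read_csv(sig_list_file, sep="\t", header=None)
if "sig.gz" in df[0].values[0]:
    # point the manifest at the decompressed .sig files
    df[0] = df[0].str.replace("sig.gz", "sig", regex=False)
    df.to_csv(sig_list_file, header=False, index=False)
    decompress_all_sig_files(glob.glob("genome_temp/signatures/*.sig.gz"), num_threads=8)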
92 changes: 25 additions & 67 deletions src/yacht/utils.py
@@ -8,6 +8,7 @@
from loguru import logger
from typing import Optional, List, Set, Dict, Tuple
import shutil
+import gzip
from glob import glob

# Configure Loguru logger
@@ -477,75 +478,32 @@ def check_download_args(args, db_type):
logger.error("We now haven't supported for virus database.")
sys.exit(1)

-def _temp_get_genome_name(sig_file_path, ksize):
-
-    res = get_info_from_single_sig(sig_file_path, ksize)
-    if res:
-        return res[0]
-    else:
-        return None
-
-def temp_generate_inputs(
-    selected_genomes_file_path: str,
-    sig_info_dict: Dict[str, Tuple[str, float, int, int]],
-    ksize: int,
-    num_threads: int = 16,
-) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """
-    Temporary helper function that generates the required input for `yacht run`.
-    :param selected_genomes_file_path: Path to a file containing all the genome file paths.
-    :param num_threads: Number of threads to use for multiprocessing when reading the comparison files. Default is 16.
-    :param sig_info_dict:
-        A dictionary mapping each genome signature name to a tuple containing metadata:
-        (md5sum, minhash mean abundance, minhash hashes length, minhash scaled).
-        - md5sum: Checksum for data integrity.
-        - minhash mean abundance: The mean abundance for the genome's minhash.
-        - minhash hashes length: The length of minhash hashes.
-        - minhash scaled: The scaling factor for the minhash.
-    :return:
-        manifest_df: a dataframe containing the processed reference signature information
-    """
-    # get info from the signature files of selected genomes
-    selected_sig_files = pd.read_csv(selected_genomes_file_path, sep="\t", header=None)
-    selected_sig_files = selected_sig_files[0].to_list()
-
-    # get the genome names from the signature files using multiprocessing
-    with Pool(num_threads) as p:
-        result_list = p.starmap(_temp_get_genome_name, [(sig_file_path, ksize) for sig_file_path in selected_sig_files])
-    selected_genome_names_set = set([x for x in result_list if x])
-
-    # remove the closely related organisms from the reference genome list
-    manifest_df = []
-    for sig_name, (
-        md5sum,
-        minhash_mean_abundance,
-        minhash_hashes_len,
-        minhash_scaled,
-    ) in tqdm(sig_info_dict.items(), desc="Removing closely related organisms from the reference genome list"):
-        if sig_name in selected_genome_names_set:
-            manifest_df.append(
-                (
-                    sig_name,
-                    md5sum,
-                    minhash_hashes_len,
-                    get_num_kmers(
-                        minhash_mean_abundance,
-                        minhash_hashes_len,
-                        minhash_scaled,
-                        False,
-                    ),
-                    minhash_scaled,
-                )
-            )
-    manifest_df = pd.DataFrame(
-        manifest_df,
-        columns=[
-            "organism_name",
-            "md5sum",
-            "num_unique_kmers_in_genome_sketch",
-            "num_total_kmers_in_genome_sketch",
-            "genome_scale_factor",
-        ],
-    )
-
-    return manifest_df
+def _decompress_and_remove(file_path: str) -> None:
+    """
+    Decompresses a GZIP-compressed file and removes the original compressed file.
+    :param file_path: The path to the .sig.gz file that needs to be decompressed and deleted.
+    :return: None
+    """
+    try:
+        output_filename = os.path.splitext(file_path)[0]
+        with gzip.open(file_path, 'rb') as f_in:
+            with open(output_filename, 'wb') as f_out:
+                f_out.write(f_in.read())
+
+        os.remove(file_path)
+
+    except Exception as e:
+        logger.info(f"Failed to process {file_path}: {e}")
+
+def decompress_all_sig_files(sig_files: List[str], num_threads: int) -> None:
+    """
+    Decompresses all .sig.gz files in the list using multiple threads.
+    :param sig_files: List of .sig.gz files that need to be decompressed.
+    :param num_threads: Number of threads to use for decompression.
+    :return: None
+    """
+    with Pool(num_threads) as p:
+        p.map(_decompress_and_remove, sig_files)
+
+    logger.info("All .sig.gz files have been decompressed.")
4 changes: 3 additions & 1 deletion tests/test_hypothesis_recovery_src.py
@@ -46,8 +46,10 @@ def test_hypothesis_recovery(self, mock_get_exclusive_hashes, mock_get_organisms
    @patch('yacht.hypothesis_recovery_src.os.listdir')
    @patch('builtins.open', new_callable=mock_open, create=True)
    @patch('yacht.hypothesis_recovery_src.zipfile.ZipFile')
-    def test_get_organisms_with_nonzero_overlap(self, mock_zipfile, _, mock_os_listdir,
+    @patch('glob.glob')
+    def test_get_organisms_with_nonzero_overlap(self, mock_glob, mock_zipfile, _, mock_os_listdir,
                                                 mock_os_path_join, mock_os_system, mock_read_csv):
+        mock_glob.return_value = ['training_sig_file_1.sig']
        mock_os_listdir.return_value = ['sig_file']
        mock_os_path_join.return_value = 'joined_path'
        mock_os_system.return_value = 0