From 44db9d437c4fa53f6ab44c5f2f9c2db317ae99ef Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Fri, 8 Nov 2024 11:40:26 -0500 Subject: [PATCH 1/5] Add function to return top level directory Added a helper function that recursively searches the directory structure and returns the path to the top level directory in an lcdb-wf repo --- lib/helpers.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/helpers.py b/lib/helpers.py index 053bca2b..651a9c13 100644 --- a/lib/helpers.py +++ b/lib/helpers.py @@ -5,6 +5,7 @@ from snakemake.shell import shell from snakemake.io import expand, regex from lib import common +import os class ConfigurationError(Exception): @@ -203,3 +204,22 @@ def strand_arg_lookup(config, lookup): keys = list(lookup.keys()) raise KeyError(f"'{config.stranded}' not one of {keys}") return lookup[config.stranded] + +def get_top_level_dir(start_dir=None): + # Start from the specified directory or current working directory if none is given + current_dir = os.path.abspath(start_dir or os.getcwd()) + # Search current directory and above for targets + while True: + # Check if the target directories exists in the current directory + if (os.path.isdir(os.path.join(current_dir, ".git")) and os.path.isdir(os.path.join(current_dir, "workflows"))): + return current_dir + # Move up one level + parent_dir = os.path.dirname(current_dir) + # Stop if we've reached the root directory + if current_dir == parent_dir: + break + current_dir = parent_dir + #TODO: Check for other edge cases? + + return None + From 2751abf87610d951d705d43cd39e8f061dd7a1b0 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Fri, 8 Nov 2024 11:42:48 -0500 Subject: [PATCH 2/5] Account for different 'layout's in sampletable -Implemented changes to account for 'LibraryLayout', 'Layout' and 'layout' formats in sampletable. -Refactored the test suite and added pytest tests to check for correctness --- lib/common.py | 15 +++++++++++---- {lib => test/tests}/test_suite.py | 23 ++++++++++++++++++++--- 2 files changed, 31 insertions(+), 7 deletions(-) rename {lib => test/tests}/test_suite.py (68%) diff --git a/lib/common.py b/lib/common.py index 829cc129..822f5640 100644 --- a/lib/common.py +++ b/lib/common.py @@ -15,7 +15,7 @@ from lib import utils from snakemake.shell import shell from snakemake.io import expand - +from . import helpers # List of possible keys in config that are to be interpreted as paths PATH_KEYS = [ 'references_dir', @@ -616,13 +616,16 @@ def get_techreps(sampletable, label): return result -def load_config(config, missing_references_ok=False): +def load_config(config, missing_references_ok=False, test=False): """ Loads the config. Resolves any included references directories/files and runs the deprecation handler. """ + # If this is a test, then change the current working directory to the rnaseq workflow level + if test: + os.chdir(os.path.join(helpers.get_top_level_dir(), "workflows","rnaseq")) if isinstance(config, str): config = yaml.load(open(config), Loader=yaml.FullLoader) @@ -728,7 +731,7 @@ def is_paired_end(sampletable, sample): if "Run" in sampletable.columns: if all(sampletable["Run"].str.startswith("SRR")): - if "Layout" not in sampletable.columns and "layout" not in sampletable.columns: + if "Layout" not in sampletable.columns and "layout" not in sampletable.columns and "LibraryLayout" not in sampletable.columns: raise ValueError( "Sampletable appears to be SRA, but no 'Layout' column " "found. This is required to specify single- or paired-end " @@ -737,7 +740,7 @@ def is_paired_end(sampletable, sample): row = sampletable.set_index(sampletable.columns[0]).loc[sample] if 'orig_filename_R2' in row: return True - if 'layout' in row and 'LibraryLayout' in row: + if 'layout' in row and 'LibraryLayout' in row or 'Layout' in row and 'LibraryLayout' in row: raise ValueError("Expecting column 'layout' or 'LibraryLayout', " "not both") try: @@ -748,6 +751,10 @@ def is_paired_end(sampletable, sample): return row['LibraryLayout'].lower() in ['pe', 'paired'] except KeyError: pass + try: + return row['Layout'].lower() in ['pe', 'paired'] + except KeyError: + pass return False diff --git a/lib/test_suite.py b/test/tests/test_suite.py similarity index 68% rename from lib/test_suite.py rename to test/tests/test_suite.py index 21b9c052..b1389f03 100644 --- a/lib/test_suite.py +++ b/test/tests/test_suite.py @@ -1,8 +1,25 @@ -import os -import pprint +import sys +import subprocess +top_level_dir = subprocess.run(["dirname $(dirname $(pwd))"], shell=True, capture_output=True, text=True).stdout.strip() +print("top level dir: ", top_level_dir) +sys.path.insert(0, top_level_dir) +import pytest from textwrap import dedent -from . import common +from lib import common, helpers, patterns_targets +# Make config object that can be re-used for any test +@pytest.fixture +def config(request): + config_path = request.param + config = common.load_config(config_path, test=True) + return patterns_targets.RNASeqConfig(config, config.get('patterns', '../workflows/rnaseq/config/rnaseq_patterns.yaml')) + +# Call helpers.detect_layout(), which implicitly tests common.is_paired_end() +# TODO: Make assertion condition NOT hard coded in to work with current example table +@pytest.mark.parametrize("config", ['../../workflows/rnaseq/config/config.yaml'], indirect=True) +def test_is_paired_end(config): + is_paired = helpers.detect_layout(config.sampletable) == 'PE' + assert not is_paired, f"Test failed, is_paired = {is_paired}" def test_config_loading(tmpdir): f0 = tmpdir.mkdir('subdir').join('file0.yaml') From 385ac92bbb5da309fd70cf072e05386e42ad194a Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Fri, 8 Nov 2024 12:20:33 -0500 Subject: [PATCH 3/5] Fix import bug Changed import statement for helpers package that was causing an error --- lib/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/common.py b/lib/common.py index 822f5640..a625086b 100644 --- a/lib/common.py +++ b/lib/common.py @@ -15,7 +15,7 @@ from lib import utils from snakemake.shell import shell from snakemake.io import expand -from . import helpers +from lib import helpers # List of possible keys in config that are to be interpreted as paths PATH_KEYS = [ 'references_dir', From c88cb6dd8c2fb4acb419ab81aa4234c4fce9149c Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Wed, 13 Nov 2024 10:36:10 -0500 Subject: [PATCH 4/5] Improve sampletable layout column detection Improve the way the layout column is checked and remove the test parameter and functionality in load_config as it has become clear that implementing unit tests will require its own issues and resolutions --- lib/common.py | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/lib/common.py b/lib/common.py index a625086b..447321be 100644 --- a/lib/common.py +++ b/lib/common.py @@ -16,6 +16,8 @@ from snakemake.shell import shell from snakemake.io import expand from lib import helpers +from pathlib import Path + # List of possible keys in config that are to be interpreted as paths PATH_KEYS = [ 'references_dir', @@ -616,18 +618,13 @@ def get_techreps(sampletable, label): return result -def load_config(config, missing_references_ok=False, test=False): +def load_config(config, missing_references_ok=False): """ Loads the config. Resolves any included references directories/files and runs the deprecation handler. """ - # If this is a test, then change the current working directory to the rnaseq workflow level - if test: - os.chdir(os.path.join(helpers.get_top_level_dir(), "workflows","rnaseq")) - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) # Here we populate a list of reference sections. Items later on the list # will have higher priority @@ -725,34 +722,18 @@ def is_paired_end(sampletable, sample): # We can't fall back to detecting PE based on two fastq files provided for # each sample when it's an SRA sampletable (which only has SRR accessions). # - # So detect first detect if SRA sampletable based on presence of "Run" - # column and all values of that column starting with "SRR", and then raise - # an error if the Layout column does not exist. - - if "Run" in sampletable.columns: - if all(sampletable["Run"].str.startswith("SRR")): - if "Layout" not in sampletable.columns and "layout" not in sampletable.columns and "LibraryLayout" not in sampletable.columns: - raise ValueError( - "Sampletable appears to be SRA, but no 'Layout' column " - "found. This is required to specify single- or paired-end " - "libraries.") + # So instead first detect if there is in fact a second fastq file listed, + # and if not then check if the layout of the library is listed row = sampletable.set_index(sampletable.columns[0]).loc[sample] if 'orig_filename_R2' in row: return True - if 'layout' in row and 'LibraryLayout' in row or 'Layout' in row and 'LibraryLayout' in row: - raise ValueError("Expecting column 'layout' or 'LibraryLayout', " - "not both") - try: - return row['layout'].lower() in ['pe', 'paired'] - except KeyError: - pass - try: - return row['LibraryLayout'].lower() in ['pe', 'paired'] - except KeyError: - pass + layout_columns = set(sampletable.columns).intersection(['layout', 'LibraryLayout', 'Layout']) + if len(layout_columns) != 1: + raise ValueError("Expected exactly one of ['layout', 'LibraryLayout', 'Layout'] in sample table") + layout_column = list(layout_columns)[0] try: - return row['Layout'].lower() in ['pe', 'paired'] + return row[layout_column].lower() in ['pe', 'paired'] except KeyError: pass return False From 6c81cc15f9ee304122e34102d54765827ef31c64 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Wed, 13 Nov 2024 10:51:23 -0500 Subject: [PATCH 5/5] Improve SRA table checking Make sure the table is in fact an SRA sampletable before checking the layout --- lib/common.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/lib/common.py b/lib/common.py index 447321be..08959dc1 100644 --- a/lib/common.py +++ b/lib/common.py @@ -728,14 +728,16 @@ def is_paired_end(sampletable, sample): row = sampletable.set_index(sampletable.columns[0]).loc[sample] if 'orig_filename_R2' in row: return True - layout_columns = set(sampletable.columns).intersection(['layout', 'LibraryLayout', 'Layout']) - if len(layout_columns) != 1: - raise ValueError("Expected exactly one of ['layout', 'LibraryLayout', 'Layout'] in sample table") - layout_column = list(layout_columns)[0] - try: - return row[layout_column].lower() in ['pe', 'paired'] - except KeyError: - pass + if "Run" in sampletable.columns: + if all(sampletable["Run"].str.startswith("SRR")): + layout_columns = set(sampletable.columns).intersection(['layout', 'LibraryLayout', 'Layout']) + if len(layout_columns) != 1: + raise ValueError("Expected exactly one of ['layout', 'LibraryLayout', 'Layout'] in sample table") + layout_column = list(layout_columns)[0] + try: + return row[layout_column].lower() in ['pe', 'paired'] + except KeyError: + pass return False