Merge pull request #44 from pepkit/dev

rafalstepien · web-flow · commit e65b2357ce1b · 2022-08-29T11:41:00.000-04:00
Changes that will be shipped with release 0.1.8
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -2,6 +2,18 @@
 
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.
 
+## [0.1.8] - 2022-08-29
+### Changed
+- how the sample and subsample tables are merged into the multiline output format produced by `eido convert`
+### Added
+- a more extensible architecture for output formatters that follows the **open-closed principle**
+- mocking (via `pytest-mock`) in some test cases
+- test data in the format that previously caused errors
+### Fixed
+- passing plugin keyword arguments to the `run_filter` function
+- saving the output file now works for a bare file name such as `file.txt`; a full path is no longer required
+
+
 ## [0.1.7] - 2022-08-11
 ### Changed
 - When a validation fails, `eido` will now return all errors instead of just the first one it finds.
diff --git a/eido/_version.py b/eido/_version.py
@@ -1 +1 @@
-__version__ = "0.1.7"
+__version__ = "0.1.8"
diff --git a/eido/conversion.py b/eido/conversion.py
@@ -6,6 +6,7 @@
 from pkg_resources import iter_entry_points
 
 from .exceptions import *
+from typing import NoReturn
 
 _LOGGER = getLogger(__name__)
 
@@ -40,7 +41,7 @@ def convert_project(prj, target_format, plugin_kwargs=None):
     :param str target_format: the format to convert the Project object to
     :raise EidoFilterError: if the requested filter is not defined
     """
-    return run_filter(prj, target_format, plugin_kwargs or dict())
+    return run_filter(prj, target_format, plugin_kwargs=plugin_kwargs or dict())
 
 
 def run_filter(prj, filter_name, verbose=True, plugin_kwargs=None):
@@ -90,23 +91,24 @@ def run_filter(prj, filter_name, verbose=True, plugin_kwargs=None):
                 )
             else:
                 # create path if it doesn't exist
-                if not os.path.exists(result_path):
+                if not os.path.exists(result_path) and os.path.isdir(
+                    os.path.dirname(result_path)
+                ):
                     os.makedirs(os.path.dirname(result_path), exist_ok=True)
-                # write to path
-                with open(result_path, "w") as f:
-                    f.write(conv_result[result_key])
+                save_result(result_path, conv_result[result_key])
 
     if verbose:
         for result_key in conv_result:
             sys.stdout.write(conv_result[result_key])
-    else:
-        # simply return from the function with
-        # conversion results
-        pass
 
     return conv_result
 
 
+def save_result(result_path: str, content: str) -> NoReturn:
+    with open(result_path, "w") as f:
+        f.write(content)
+
+
 def get_available_pep_filters():
     """
     Get a list of available target formats
diff --git a/eido/conversion_plugins.py b/eido/conversion_plugins.py
@@ -1,5 +1,6 @@
 """ built-in PEP filters """
 from typing import Dict
+from .output_formatters import MultilineOutputFormatter
 
 
 def basic_pep_filter(p, **kwargs) -> Dict[str, str]:
@@ -53,21 +54,7 @@ def csv_pep_filter(p, **kwargs) -> Dict[str, str]:
 
     :param peppy.Project p: a Project to run filter on
     """
-    sample_table_path = kwargs.get("sample_table_path")
-    subsample_table_path = kwargs.get("subsample_table_path")
-    sample_table_repr = p.sample_table.to_csv(path_or_buf=sample_table_path)
-
-    s = ""
-    if sample_table_repr is not None:
-        s += sample_table_repr
-    if p.subsample_table is not None:
-        subsample_table_repr = p.subsample_table.to_csv(
-            path_or_buf=subsample_table_path
-        )
-        if subsample_table_repr is not None:
-            s += subsample_table_repr
-
-    return {"samples": s}
+    return {"samples": MultilineOutputFormatter.format(p.samples)}
 
 
 def processed_pep_filter(p, **kwargs) -> Dict[str, str]:
diff --git a/eido/output_formatters.py b/eido/output_formatters.py
@@ -0,0 +1,125 @@
+from abc import ABC, abstractmethod
+from typing import Iterable, List, Union
+
+from peppy.sample import Sample
+
+
+class BaseOutputFormatter(ABC):
+    @staticmethod
+    @abstractmethod
+    def format(samples: List[Sample]):
+        """
+        Convert the samples to correct format.
+        """
+        pass
+
+
+class MultilineOutputFormatter(BaseOutputFormatter):
+    @staticmethod
+    def format(samples: List[Sample]) -> str:
+        output_rows = []
+        sample_attributes = [
+            attribute
+            for attribute in samples[0].keys()
+            if not attribute.startswith("_") and not attribute == "subsample_name"
+        ]
+        header = MultilineOutputFormatter._get_header(sample_attributes)
+
+        for sample in samples:
+
+            attribute_with_multiple_properties = MultilineOutputFormatter._get_the_name_of_the_first_attribute_with_multiple_properties(
+                sample, sample_attributes
+            )
+            if attribute_with_multiple_properties:
+                sample_rows = MultilineOutputFormatter._split_sample_to_multiple_rows(
+                    sample, sample_attributes, attribute_with_multiple_properties
+                )
+                output_rows.extend(sample_rows)
+            else:
+                one_sample_row = MultilineOutputFormatter._convert_sample_to_row(
+                    sample, sample_attributes
+                )
+                output_rows.append(one_sample_row)
+
+        return "\n".join(header + output_rows) + "\n"
+
+    @staticmethod
+    def _get_header(header_column_names: List[str]):
+        return [",".join(header_column_names)]
+
+    @staticmethod
+    def _get_the_name_of_the_first_attribute_with_multiple_properties(
+        sample: Sample, sample_attributes: List[str]
+    ) -> Union[str, None]:
+        for attribute in sample_attributes:
+            if MultilineOutputFormatter._sample_attribute_is_list(sample, attribute):
+                return attribute
+
+    @staticmethod
+    def _split_sample_to_multiple_rows(
+        sample: Sample, sample_attributes: List, attribute_with_multiple_properties: str
+    ) -> Iterable[str]:
+        """
+        If one sample object contains array properties instead of single value, then it will be converted
+        to multiple rows.
+
+        Args:
+            sample: Sample from project.
+            sample_attributes: List of all sample properties names (name of columns from sample_table).
+
+        Returns:
+            List of rows created from given sample object.
+        """
+        number_of_samples_after_split = len(
+            getattr(sample, attribute_with_multiple_properties)
+        )
+        sample_rows_after_split = []
+
+        for sample_index in range(number_of_samples_after_split):
+            sample_row = MultilineOutputFormatter._convert_sample_to_row(
+                sample, sample_attributes, sample_index
+            )
+            sample_rows_after_split.append(sample_row)
+
+        return sample_rows_after_split
+
+    @staticmethod
+    def _convert_sample_to_row(
+        sample: Sample, sample_attributes: List, sample_index: int = 0
+    ) -> str:
+        """
+        Converts single sample object to CSV row.
+
+        Some samples have a list of values instead of a single value for a given attribute (column), and
+        sample_index indicates the index of the value that will be used to create the row. For samples that
+        don't have any list-valued attributes this will always be zero.
+
+        Args:
+            sample: Single sample object.
+            sample_attributes: Array of all attributes names (column names) for given sample.
+            sample_index: Index of the value used to create the row when an attribute holds a list of values.
+
+        Returns:
+            Representation of sample as a CSV row.
+        """
+        sample_row = []
+
+        for attribute in sample_attributes:
+
+            if MultilineOutputFormatter._sample_attribute_is_list(sample, attribute):
+                value = getattr(sample, attribute)[sample_index]
+            else:
+                value = getattr(sample, attribute, "")
+
+            sample_row.append(value)
+
+        return ",".join(sample_row)
+
+    @staticmethod
+    def _sample_attribute_is_list(sample: Sample, attribute: str) -> bool:
+        return isinstance(getattr(sample, attribute, ""), list)
+
+
+class SampleSubsampleOutputFormatter(BaseOutputFormatter):
+    def format(self, samples: List[Sample]):
+        pass
diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
@@ -2,3 +2,4 @@ coveralls
 mock>=2.0.0
 pytest>=4.6.9
 pytest-cov>=2.8.1
+pytest-mock==3.6.1
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,5 +1,6 @@
 import os
 
+import pandas as pd
 import pytest
 from peppy import Project
 
@@ -21,7 +22,7 @@ def peps_path(data_path):
 
 @pytest.fixture
 def project_file_path(peps_path):
-    return os.path.join(peps_path, "test_cfg.yaml")
+    return os.path.join(peps_path, "test_pep", "test_cfg.yaml")
 
 
 @pytest.fixture
@@ -52,3 +53,30 @@ def schema_sample_invalid_file_path(schemas_path):
 @pytest.fixture
 def schema_imports_file_path(schemas_path):
     return os.path.join(schemas_path, "test_schema_imports.yaml")
+
+
+@pytest.fixture
+def taxprofiler_project_path(peps_path):
+    return os.path.join(peps_path, "taxprofiler_pep", "config.yaml")
+
+
+@pytest.fixture
+def taxprofiler_project(taxprofiler_project_path):
+    return Project(taxprofiler_project_path)
+
+
+@pytest.fixture
+def path_to_taxprofiler_csv_multiline_output(peps_path):
+    return os.path.join(peps_path, "taxprofiler_pep", "multiline_output.csv")
+
+
+@pytest.fixture
+def taxprofiler_csv_multiline_output(path_to_taxprofiler_csv_multiline_output):
+    return pd.read_csv(path_to_taxprofiler_csv_multiline_output).to_csv(
+        path_or_buf=None, index=None
+    )
+
+
+@pytest.fixture
+def save_result_mock(mocker):
+    return mocker.patch("eido.conversion.save_result")
diff --git a/tests/data/peps/taxprofiler_pep/config.yaml b/tests/data/peps/taxprofiler_pep/config.yaml
@@ -0,0 +1,5 @@
+pep_version: "2.0.0"
+sample_table: "samplesheet.csv"
+subsample_table: "subsamplesheet.csv"
+sample_table_index: "sample"
+subsample_table_index: "sample"
diff --git a/tests/data/peps/taxprofiler_pep/multiline_output.csv b/tests/data/peps/taxprofiler_pep/multiline_output.csv
@@ -0,0 +1,8 @@
+sample,strandedness,instrument_platform,run_accession,fastq_1,fastq_2
+WT_REP1,reverse,ABI_SOLID,runaccession1,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357070_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357070_2.fastq.gz
+WT_REP1,reverse,BGISEQ,runaccession2,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357071_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357071_2.fastq.gz
+WT_REP2,reverse,CAPILLARY,123123123,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357072_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357072_2.fastq.gz
+RAP1_UNINDUCED_REP1,reverse,COMPLETE_GENOMICS,somerunaccesion,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357073_1.fastq.gz,
+RAP1_UNINDUCED_REP2,reverse,DNBSEQ,ERR2412421,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357074_1.fastq.gz,
+RAP1_UNINDUCED_REP2,reverse,HELICOS,xxxxxxxxxx,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357075_1.fastq.gz,
+RAP1_IAA_30M_REP1,reverse,ILLUMINA,None,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357076_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357076_2.fastq.gz
diff --git a/tests/data/peps/taxprofiler_pep/samplesheet.csv b/tests/data/peps/taxprofiler_pep/samplesheet.csv
@@ -0,0 +1,6 @@
+sample,strandedness
+WT_REP1,reverse
+WT_REP2,reverse
+RAP1_UNINDUCED_REP1,reverse
+RAP1_UNINDUCED_REP2,reverse
+RAP1_IAA_30M_REP1,reverse
diff --git a/tests/data/peps/taxprofiler_pep/subsamplesheet.csv b/tests/data/peps/taxprofiler_pep/subsamplesheet.csv
@@ -0,0 +1,8 @@
+sample,instrument_platform,run_accession,fastq_1,fastq_2
+WT_REP1,ABI_SOLID,runaccession1,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357070_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357070_2.fastq.gz
+WT_REP1,BGISEQ,runaccession2,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357071_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357071_2.fastq.gz
+WT_REP2,CAPILLARY,123123123,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357072_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357072_2.fastq.gz
+RAP1_UNINDUCED_REP1,COMPLETE_GENOMICS,somerunaccesion,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357073_1.fastq.gz,
+RAP1_UNINDUCED_REP2,DNBSEQ,ERR2412421,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357074_1.fastq.gz,
+RAP1_UNINDUCED_REP2,HELICOS,xxxxxxxxxx,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357075_1.fastq.gz,
+RAP1_IAA_30M_REP1,ILLUMINA,None,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357076_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/testdata/GSE110004/SRR6357076_2.fastq.gz
diff --git a/tests/data/peps/test_pep/test_cfg.yaml b/tests/data/peps/test_pep/test_cfg.yaml
diff --git a/tests/data/peps/test_pep/test_sample_table.csv b/tests/data/peps/test_pep/test_sample_table.csv
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -1,5 +1,4 @@
 from eido.conversion import *
-import peppy
 
 
 class TestConversionInfrastructure:
@@ -20,12 +19,27 @@ def test_plugins_are_callable(self):
             [callable(plugin_fun) for plugin_name, plugin_fun in avail_plugins.items()]
         )
 
-    def test_basic_filter(self, project_object):
+    def test_basic_filter(self, save_result_mock, project_object):
         conv_result = run_filter(
             project_object,
             "basic",
             verbose=False,
             plugin_kwargs={"paths": {"project": "out/basic_prj.txt"}},
         )
-        # the basic filter just converts to a string
+
+        assert save_result_mock.called
         assert conv_result["project"] == str(project_object)
+
+    def test_csv_filter(
+        self, save_result_mock, taxprofiler_project, taxprofiler_csv_multiline_output
+    ):
+
+        conv_result = run_filter(
+            taxprofiler_project,
+            "csv",
+            verbose=False,
+            plugin_kwargs={"paths": {"samples": "out/basic_prj.txt"}},
+        )
+
+        assert save_result_mock.called
+        assert conv_result["samples"] == taxprofiler_csv_multiline_output
diff --git a/tests/test_validations.py b/tests/test_validations.py
@@ -136,6 +136,5 @@ def test_validate_detects_invalid(self, schema_invalid_file_path, remote_pep_cfg
         _check_remote_file_accessible(remote_pep_cfg)
         with pytest.raises(EidoValidationError):
             validate_project(
-                project=Project(remote_pep_cfg),
-                schema=schema_invalid_file_path,
+                project=Project(remote_pep_cfg), schema=schema_invalid_file_path
             )

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.1.7"`
	`1`	`+__version__ = "0.1.8"`
Original file line number	Diff line number	Diff line change
`@@ -136,6 +136,5 @@ def test_validate_detects_invalid(self, schema_invalid_file_path, remote_pep_cfg`
`136`	`136`	`_check_remote_file_accessible(remote_pep_cfg)`
`137`	`137`	`with pytest.raises(EidoValidationError):`
`138`	`138`	`validate_project(`
`139`		`- project=Project(remote_pep_cfg),`
`140`		`- schema=schema_invalid_file_path,`
	`139`	`+ project=Project(remote_pep_cfg), schema=schema_invalid_file_path`
`141`	`140`	`)`