Skip to content

Commit 0b4659d

Browse files
wasadeElDeveloper
authored andcommitted
Variance adjusted UniFrac (#20)
* Test for NULL on malloc... * Test for file existence * Cleaning up warning messages * ENH: generalized unifrac * Expose via q2 * TST: d^0 and d^0.5 * Missed import * Refactoring * regression: task param boundaries were set wrong * Refactor tasks to separate file * VAW methods * Citations * Expose VAW * travis c++ needs another include * syntax... * wrong param name * Missing a merge conflict * Fix param * Merge conflict error * Merge hell * Not sure * Addressing @ElDeveloper's comments * Missed close of comment
1 parent 6966503 commit 0b4659d

9 files changed

+714
-93
lines changed

q2_state_unifrac/_methods.py

+21-9
Original file line numberDiff line numberDiff line change
@@ -23,61 +23,73 @@ def _sanity():
2323
raise ValueError("ssu could not be located!")
2424

2525

26-
def _run(table_fp, tree_fp, output_fp, threads, method):
26+
def _run(table_fp, tree_fp, output_fp, threads, method,
27+
variance_adjusted=False, alpha=None):
2728
cmd = [resource_filename(*ARGS),
2829
'-i', table_fp,
2930
'-t', tree_fp,
3031
'-o', output_fp,
3132
'-n', threads,
3233
'-m', method]
3334

35+
if variance_adjusted:
36+
cmd.append('--vaw')
37+
38+
if alpha is not None:
39+
cmd.append('-a')
40+
cmd.append(str(alpha))
41+
3442
subprocess.run(cmd, check=True)
3543

3644

3745
def unweighted(table: BIOMV210Format,
3846
phylogeny: NewickFormat,
39-
threads: int=1)-> skbio.DistanceMatrix:
47+
threads: int=1,
48+
variance_adjusted: bool=False)-> skbio.DistanceMatrix:
4049
_sanity()
4150

4251
with tempfile.TemporaryDirectory() as tmp:
4352
output_fp = os.path.join(tmp, 'foo.dm')
4453
_run(str(table), str(phylogeny), output_fp, str(threads),
45-
'unweighted')
54+
'unweighted', variance_adjusted)
4655
return skbio.DistanceMatrix.read(output_fp)
4756

4857

4958
def weighted_normalized(table: BIOMV210Format,
5059
phylogeny: NewickFormat,
51-
threads: int=1)-> skbio.DistanceMatrix:
60+
threads: int=1,
61+
variance_adjusted: bool=False)-> skbio.DistanceMatrix:
5262
_sanity()
5363

5464
with tempfile.TemporaryDirectory() as tmp:
5565
output_fp = os.path.join(tmp, 'foo.dm')
5666
_run(str(table), str(phylogeny), output_fp, str(threads),
57-
'weighted_normalized')
67+
'weighted_normalized', variance_adjusted)
5868
return skbio.DistanceMatrix.read(output_fp)
5969

6070

6171
def weighted_unnormalized(table: BIOMV210Format,
6272
phylogeny: NewickFormat,
63-
threads: int=1)-> skbio.DistanceMatrix:
73+
threads: int=1,
74+
variance_adjusted: bool=False) -> skbio.DistanceMatrix: # noqa
6475
_sanity()
6576

6677
with tempfile.TemporaryDirectory() as tmp:
6778
output_fp = os.path.join(tmp, 'foo.dm')
6879
_run(str(table), str(phylogeny), output_fp, str(threads),
69-
'weighted_unnormalized')
80+
'weighted_unnormalized', variance_adjusted)
7081
return skbio.DistanceMatrix.read(output_fp)
7182

7283

7384
def generalized(table: BIOMV210Format,
7485
phylogeny: NewickFormat,
7586
threads: int=1,
76-
alpha: float=1.0)-> skbio.DistanceMatrix:
87+
alpha: float=1.0,
88+
variance_adjusted: bool=False)-> skbio.DistanceMatrix:
7789
_sanity()
7890

7991
with tempfile.TemporaryDirectory() as tmp:
8092
output_fp = os.path.join(tmp, 'foo.dm')
8193
_run(str(table), str(phylogeny), output_fp, str(threads),
82-
'generalized')
94+
'generalized', variance_adjusted, alpha)
8395
return skbio.DistanceMatrix.read(output_fp)

q2_state_unifrac/plugin_setup.py

+23-7
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
88

9-
from qiime2.plugin import (Plugin, Properties, Int, Float)
9+
from qiime2.plugin import (Plugin, Properties, Int, Float, Bool)
1010
from q2_types.feature_table import FeatureTable, Frequency
1111
from q2_types.distance_matrix import DistanceMatrix
1212
from q2_types.tree import Phylogeny, Rooted
@@ -28,8 +28,12 @@
2828
function=q2_state_unifrac.unweighted,
2929
inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
3030
'phylogeny': Phylogeny[Rooted]},
31-
parameters={'threads': Int},
32-
parameter_descriptions={'threads': 'The number of threads to use.'},
31+
parameters={'threads': Int,
32+
'variance_adjusted': Bool},
33+
parameter_descriptions={'threads': 'The number of threads to use.',
34+
'variance_adjusted':
35+
('Perform variance adjustment based on '
36+
'Chang et al. BMC Bioinformatics 2011')},
3337
input_descriptions={
3438
'table': 'A rarefied FeatureTable',
3539
'phylogeny': ('A rooted phylogeny which relates the observations in '
@@ -47,8 +51,12 @@
4751
function=q2_state_unifrac.weighted_unnormalized,
4852
inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
4953
'phylogeny': Phylogeny[Rooted]},
50-
parameters={'threads': Int},
51-
parameter_descriptions={'threads': 'The number of threads to use.'},
54+
parameters={'threads': Int,
55+
'variance_adjusted': Bool},
56+
parameter_descriptions={'threads': 'The number of threads to use.',
57+
'variance_adjusted':
58+
('Perform variance adjustment based on '
59+
'Chang et al. BMC Bioinformatics 2011')},
5260
input_descriptions={
5361
'table': 'A rarefied FeatureTable',
5462
'phylogeny': ('A rooted phylogeny which relates the observations in '
@@ -66,8 +74,12 @@
6674
function=q2_state_unifrac.weighted_normalized,
6775
inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
6876
'phylogeny': Phylogeny[Rooted]},
69-
parameters={'threads': Int},
70-
parameter_descriptions={'threads': 'The number of threads to use.'},
77+
parameters={'threads': Int,
78+
'variance_adjusted': Bool},
79+
parameter_descriptions={'threads': 'The number of threads to use.',
80+
'variance_adjusted':
81+
('Perform variance adjustment based on '
82+
'Chang et al. BMC Bioinformatics 2011')},
7183
input_descriptions={
7284
'table': 'A rarefied FeatureTable',
7385
'phylogeny': ('A rooted phylogeny which relates the observations in '
@@ -86,8 +98,12 @@
8698
inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
8799
'phylogeny': Phylogeny[Rooted]},
88100
parameters={'threads': Int,
101+
'variance_adjusted': Bool,
89102
'alpha': Float},
90103
parameter_descriptions={'threads': 'The number of threads to use.',
104+
'variance_adjusted':
105+
('Perform variance adjustment based on '
106+
'Chang et al. BMC Bioinformatics 2011'),
91107
'alpha': ('The value of alpha controls importance '
92108
'of sample proportions. 1.0 is '
93109
'weighted normalized UniFrac. 0.0 is '

sucpp/Makefile

+4-4
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@ else
2929
endif
3030

3131

32-
test: tree.o test_su.cpp biom.o unifrac.o
33-
$(CXX) $(CPPFLAGS) -Wno-unused-parameter test_su.cpp -o test_su tree.o biom.o unifrac.o
32+
test: tree.o test_su.cpp biom.o unifrac.o unifrac_task.o
33+
$(CXX) $(CPPFLAGS) -Wno-unused-parameter test_su.cpp -o test_su tree.o biom.o unifrac.o unifrac_task.o
3434

35-
main: tree.o biom.o unifrac.o cmd.o
36-
$(CXX) $(CPPFLAGS) su.cpp -o ssu tree.o biom.o unifrac.o cmd.o
35+
main: tree.o biom.o unifrac.o cmd.o unifrac_task.o
36+
$(CXX) $(CPPFLAGS) su.cpp -o ssu tree.o biom.o unifrac.o cmd.o unifrac_task.o
3737

3838
ocltest: opencl.o
3939
clang++ -framework OpenCL opencl.cpp -c

sucpp/su.cpp

+36-14
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,26 @@
99
#include <thread>
1010

1111
void usage() {
12-
std::cout << "usage: ssu -i <biom> -o <out.dm> -m [METHOD] -t <newick> [-n threads] [-a alpha]" << std::endl;
12+
std::cout << "usage: ssu -i <biom> -o <out.dm> -m [METHOD] -t <newick> [-n threads] [-a alpha] [--vaw]" << std::endl;
1313
std::cout << std::endl;
14-
std::cout << " -i\tThe input BIOM table." << std::endl;
15-
std::cout << " -t\tThe input phylogeny in newick." << std::endl;
16-
std::cout << " -m\tThe method, [unweighted | weighted_normalized | weighted_unnormalized | generalized]." << std::endl;
17-
std::cout << " -o\tThe output distance matrix." << std::endl;
18-
std::cout << " -n\t[OPTIONAL] The number of threads, default is 1." << std::endl;
19-
std::cout << " -a\t[OPTIONAL] Generalized UniFrac alpha, default is 1." << std::endl;
14+
std::cout << " -i\t\tThe input BIOM table." << std::endl;
15+
std::cout << " -t\t\tThe input phylogeny in newick." << std::endl;
16+
std::cout << " -m\t\tThe method, [unweighted | weighted_normalized | weighted_unnormalized | generalized]." << std::endl;
17+
std::cout << " -o\t\tThe output distance matrix." << std::endl;
18+
std::cout << " -n\t\t[OPTIONAL] The number of threads, default is 1." << std::endl;
19+
std::cout << " -a\t\t[OPTIONAL] Generalized UniFrac alpha, default is 1." << std::endl;
20+
std::cout << " --vaw\t[OPTIONAL] Variance adjusted, default is to not adjust for variance." << std::endl;
21+
std::cout << std::endl;
22+
std::cout << "Citations: " << std::endl;
23+
std::cout << " For UniFrac, please see:" << std::endl;
24+
std::cout << " Lozupone and Knight Appl Environ Microbiol 2005; DOI: 10.1128/AEM.71.12.8228-8235.2005" << std::endl;
25+
std::cout << " Lozupone et al. Appl Environ Microbiol 2007; DOI: 10.1128/AEM.01996-06" << std::endl;
26+
std::cout << " Hamady et al. ISME 2010; DOI: 10.1038/ismej.2009.97" << std::endl;
27+
std::cout << " Lozupone et al. ISME 2011; DOI: 10.1038/ismej.2010.133" << std::endl;
28+
std::cout << " For Generalized UniFrac, please see: " << std::endl;
29+
std::cout << " Chen et al. Bioinformatics 2012; DOI: 10.1093/bioinformatics/bts342" << std::endl;
30+
std::cout << " For Variance Adjusted UniFrac, please see: " << std::endl;
31+
std::cout << " Chang et al. BMC Bioinformatics 2011; DOI: 10.1186/1471-2105-12-118" << std::endl;
2032
std::cout << std::endl;
2133
}
2234

@@ -97,6 +109,7 @@ int main(int argc, char **argv){
97109
nthreads = atoi(nthreads_arg.c_str());
98110
}
99111

112+
bool vaw = input.cmdOptionExists("--vaw");
100113
double g_unifrac_alpha;
101114
if(gunifrac_arg.empty()) {
102115
g_unifrac_alpha = 1.0;
@@ -150,13 +163,22 @@ int main(int argc, char **argv){
150163
tasks[threads.size() - 1].stop = end;
151164

152165
for(unsigned int tid = 0; tid < threads.size(); tid++) {
153-
threads[tid] = std::thread(su::unifrac,
154-
std::ref(table),
155-
std::ref(tree_sheared),
156-
method,
157-
std::ref(dm_stripes),
158-
std::ref(dm_stripes_total),
159-
&tasks[tid]);
166+
if(vaw)
167+
threads[tid] = std::thread(su::unifrac_vaw,
168+
std::ref(table),
169+
std::ref(tree_sheared),
170+
method,
171+
std::ref(dm_stripes),
172+
std::ref(dm_stripes_total),
173+
&tasks[tid]);
174+
else
175+
threads[tid] = std::thread(su::unifrac,
176+
std::ref(table),
177+
std::ref(tree_sheared),
178+
method,
179+
std::ref(dm_stripes),
180+
std::ref(dm_stripes_total),
181+
&tasks[tid]);
160182
}
161183

162184
for(unsigned int tid = 0; tid < threads.size(); tid++) {

sucpp/test_su.cpp

+46-3
Original file line numberDiff line numberDiff line change
@@ -743,6 +743,48 @@ void test_generalized_unifrac() {
743743
SUITE_END();
744744
}
745745

746+
void test_vaw_unifrac_weighted_normalized() {
747+
SUITE_START("test vaw weighted normalized unifrac");
748+
749+
std::vector<std::thread> threads(1);
750+
su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);");
751+
su::biom table = su::biom("test.biom");
752+
753+
// as computed by GUniFrac, the original implementation of VAW-UniFrac
754+
// could not be found.
755+
// Sample1 Sample2 Sample3 Sample4 Sample5 Sample6
756+
//Sample1 0.0000000 0.4086040 0.6240185 0.4639481 0.2857143 0.2766318
757+
//Sample2 0.4086040 0.0000000 0.3798594 0.6884992 0.6807616 0.4735781
758+
//Sample3 0.6240185 0.3798594 0.0000000 0.7713254 0.8812897 0.5047114
759+
//Sample4 0.4639481 0.6884992 0.7713254 0.0000000 0.6666667 0.2709298
760+
//Sample5 0.2857143 0.6807616 0.8812897 0.6666667 0.0000000 0.4735991
761+
//Sample6 0.2766318 0.4735781 0.5047114 0.2709298 0.4735991 0.0000000
762+
// weighted normalized unifrac as computed above
763+
764+
std::vector<double*> w_exp;
765+
double w_stride1[] = {0.4086040, 0.3798594, 0.7713254, 0.6666667, 0.4735991, 0.2766318};
766+
double w_stride2[] = {0.6240185, 0.6884992, 0.8812897, 0.2709298, 0.2857143, 0.4735781};
767+
double w_stride3[] = {0.4639481, 0.6807616, 0.5047114, 0.4639481, 0.6807616, 0.5047114};
768+
w_exp.push_back(w_stride1);
769+
w_exp.push_back(w_stride2);
770+
w_exp.push_back(w_stride3);
771+
std::vector<double*> w_strides = su::make_strides(6);
772+
std::vector<double*> w_strides_total = su::make_strides(6);
773+
su::task_parameters w_task_p;
774+
w_task_p.start = 0; w_task_p.stop = 3; w_task_p.tid = 0; w_task_p.n_samples = 6;
775+
w_task_p.g_unifrac_alpha = 1.0;
776+
su::unifrac_vaw(table, tree, su::weighted_normalized, w_strides, w_strides_total, &w_task_p);
777+
778+
for(unsigned int i = 0; i < 3; i++) {
779+
for(unsigned int j = 0; j < 6; j++) {
780+
ASSERT(fabs(w_strides[i][j] - w_exp[i][j]) < 0.000001);
781+
}
782+
free(w_strides[i]);
783+
}
784+
SUITE_END();
785+
}
786+
787+
746788
void test_make_strides() {
747789
SUITE_START("test make stripes");
748790
std::vector<double*> exp;
@@ -826,7 +868,7 @@ void test_bptree_shear_simple() {
826868
su::BPTree tree = su::BPTree("((3:2,4:3,(6:5)5:4)2:1,7:6,((10:9,11:10)9:8)8:7)r");
827869

828870
// simple
829-
std::unordered_set<std::string> to_keep = {"4", "6", "7", "10", "11"};
871+
std::unordered_set<std::string> to_keep = {"4", "6", "7", "10", "11"};
830872

831873
uint32_t exp_nparens = 20;
832874
std::vector<bool> exp_structure = {true, true, true, false, true, true, false, false, false, true,
@@ -848,7 +890,7 @@ void test_bptree_shear_deep() {
848890
su::BPTree tree = su::BPTree("((3:2,4:3,(6:5)5:4)2:1,7:6,((10:9,11:10)9:8)8:7)r");
849891

850892
// deep
851-
std::unordered_set<std::string> to_keep = {"10", "11"};
893+
std::unordered_set<std::string> to_keep = {"10", "11"};
852894

853895
uint32_t exp_nparens = 10;
854896
std::vector<bool> exp_structure = {true, true, true, true, false, true, false, false, false, false};
@@ -893,7 +935,7 @@ void test_bptree_collapse_edge() {
893935
ASSERT(obs.names == exp.names);
894936
ASSERT(vec_almost_equal(obs.lengths, exp.lengths));
895937

896-
SUITE_END();
938+
SUITE_END();
897939
}
898940

899941
void test_unifrac_sample_counts() {
@@ -945,6 +987,7 @@ int main(int argc, char** argv) {
945987
test_unnormalized_weighted_unifrac();
946988
test_normalized_weighted_unifrac();
947989
test_generalized_unifrac();
990+
test_vaw_unifrac_weighted_normalized();
948991
test_unifrac_sample_counts();
949992

950993
printf("\n");

0 commit comments

Comments
 (0)