Skip to content

Commit 6966503

Browse files
wasadeElDeveloper
authored andcommitted
G unifrac (#19)
* Test for NULL on malloc... * Test for file existence * Cleaning up warning messages * ENH: generalized unifrac * Expose via q2 * TST: d^0 and d^0.5 * Missed import * regression: task param boundaries were set wrong * Addressing @ElDeveloper's comments
1 parent c227f46 commit 6966503

File tree

8 files changed

+248
-88
lines changed

8 files changed

+248
-88
lines changed

.travis.yml

+2
Original file line numberDiff line numberDiff line change
@@ -49,5 +49,7 @@ script:
4949
- for i in $(find ./ -maxdepth 1 -name 'core*' -print); do gdb $(pwd)/sucpp/ssu core* -ex "thread apply all bt" -ex "set pagination 0" -batch; done;
5050
- ls -lrt
5151
- qiime state-unifrac unweighted --i-table table.qza --i-phylogeny tree.qza --o-distance-matrix ssu_result.qza --verbose
52+
- qiime state-unifrac unweighted --i-table table.qza --i-phylogeny tree.qza --o-distance-matrix ssu_result_thread.qza --verbose --p-threads 2
5253
- qiime diversity beta-phylogenetic --p-metric unweighted_unifrac --i-table table.qza --i-phylogeny tree.qza --o-distance-matrix sk_result.qza
5354
- python compare_dms.py ssu_result.qza sk_result.qza
55+
- python compare_dms.py ssu_result_thread.qza sk_result.qza

q2_state_unifrac/__init__.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010

1111
from q2_state_unifrac._methods import (unweighted,
1212
weighted_normalized,
13-
weighted_unnormalized)
13+
weighted_unnormalized,
14+
generalized)
1415

1516

1617
__version__ = pkg_resources.get_distribution('q2-state-unifrac').version
17-
__all__ = ['unweighted', 'weighted_normalized', 'weighted_unnormalized']
18+
__all__ = ['unweighted', 'weighted_normalized', 'weighted_unnormalized',
19+
'generalized']

q2_state_unifrac/_methods.py

+13
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,16 @@ def weighted_unnormalized(table: BIOMV210Format,
6868
_run(str(table), str(phylogeny), output_fp, str(threads),
6969
'weighted_unnormalized')
7070
return skbio.DistanceMatrix.read(output_fp)
71+
72+
73+
def generalized(table: BIOMV210Format,
74+
phylogeny: NewickFormat,
75+
threads: int=1,
76+
alpha: float=1.0)-> skbio.DistanceMatrix:
77+
_sanity()
78+
79+
with tempfile.TemporaryDirectory() as tmp:
80+
output_fp = os.path.join(tmp, 'foo.dm')
81+
_run(str(table), str(phylogeny), output_fp, str(threads),
82+
'generalized')
83+
return skbio.DistanceMatrix.read(output_fp)

q2_state_unifrac/plugin_setup.py

+37-5
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
88

9-
from qiime2.plugin import (Plugin, Properties, Int)
9+
from qiime2.plugin import (Plugin, Properties, Int, Float)
1010
from q2_types.feature_table import FeatureTable, Frequency
1111
from q2_types.distance_matrix import DistanceMatrix
1212
from q2_types.tree import Phylogeny, Rooted
@@ -37,7 +37,9 @@
3737
},
3838
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))],
3939
name='Unweighted UniFrac',
40-
description=('This method computes Unweighted UniFrac')
40+
description=('This method computes unweighted UniFrac as described in '
41+
'Lozupone and Knight 2005 Appl Environ Microbiol; '
42+
'DOI: 10.1128/AEM.71.12.8228-8235.2005')
4143
)
4244

4345

@@ -54,7 +56,9 @@
5456
},
5557
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))],
5658
name='Weighted unnormalizd UniFrac',
57-
description=('This method computes weighted unnormalized UniFrac')
59+
description=('This method computes weighted unnormalized UniFrac as '
60+
'described in Lozupone et al. 2007 Appl Environ Microbiol; '
61+
'DOI: 10.1128/AEM.01996-06')
5862
)
5963

6064

@@ -70,6 +74,34 @@
7074
'the table.')
7175
},
7276
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))],
73-
name='Wweighted normalized UniFrac',
74-
description=('This method computes weighted normalized UniFrac')
77+
name='Weighted normalized UniFrac',
78+
description=('This method computes weighted normalized UniFrac as '
79+
'described in Lozupone et al. 2007 Appl Environ Microbiol; '
80+
'DOI: 10.1128/AEM.01996-06')
81+
)
82+
83+
84+
plugin.methods.register_function(
85+
function=q2_state_unifrac.generalized,
86+
inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
87+
'phylogeny': Phylogeny[Rooted]},
88+
parameters={'threads': Int,
89+
'alpha': Float},
90+
parameter_descriptions={'threads': 'The number of threads to use.',
91+
'alpha': ('The value of alpha controls importance '
92+
'of sample proportions. 1.0 is '
93+
'weighted normalized UniFrac. 0.0 is '
94+
'close to unweighted UniFrac, but only '
95+
'if the sample proportions are '
96+
'dichotomized.')},
97+
input_descriptions={
98+
'table': 'A rarefied FeatureTable',
99+
'phylogeny': ('A rooted phylogeny which relates the observations in '
100+
'the table.')
101+
},
102+
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))],
103+
name='Generalized UniFrac',
104+
description=('This method computes Generalized UniFrac as described in '
105+
'Chen et al. 2012 Bioinformatics; '
106+
'DOI: 10.1093/bioinformatics/bts342')
75107
)

sucpp/su.cpp

+24-25
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,14 @@
99
#include <thread>
1010

1111
void usage() {
12-
std::cout << "usage: ssu -i <biom> -o <out.dm> -m [METHOD] -t <newick> [-n threads]" << std::endl;
12+
std::cout << "usage: ssu -i <biom> -o <out.dm> -m [METHOD] -t <newick> [-n threads] [-a alpha]" << std::endl;
1313
std::cout << std::endl;
1414
std::cout << " -i\tThe input BIOM table." << std::endl;
1515
std::cout << " -t\tThe input phylogeny in newick." << std::endl;
16-
std::cout << " -m\tThe method, [unweighted | weighted_normalized | weighted_unnormalized]." << std::endl;
16+
std::cout << " -m\tThe method, [unweighted | weighted_normalized | weighted_unnormalized | generalized]." << std::endl;
1717
std::cout << " -o\tThe output distance matrix." << std::endl;
1818
std::cout << " -n\t[OPTIONAL] The number of threads, default is 1." << std::endl;
19+
std::cout << " -a\t[OPTIONAL] Generalized UniFrac alpha, default is 1." << std::endl;
1920
std::cout << std::endl;
2021
}
2122

@@ -60,6 +61,7 @@ int main(int argc, char **argv){
6061
const std::string &output_filename = input.getCmdOption("-o");
6162
const std::string &method_string = input.getCmdOption("-m");
6263
const std::string &nthreads_arg = input.getCmdOption("-n");
64+
const std::string &gunifrac_arg = input.getCmdOption("-a");
6365

6466
su::Method method;
6567

@@ -95,12 +97,21 @@ int main(int argc, char **argv){
9597
nthreads = atoi(nthreads_arg.c_str());
9698
}
9799

100+
double g_unifrac_alpha;
101+
if(gunifrac_arg.empty()) {
102+
g_unifrac_alpha = 1.0;
103+
} else {
104+
g_unifrac_alpha = atof(gunifrac_arg.c_str());
105+
}
106+
98107
if(method_string == "unweighted")
99108
method = su::unweighted;
100109
else if(method_string == "weighted_normalized")
101110
method = su::weighted_normalized;
102111
else if(method_string == "weighted_unnormalized")
103112
method = su::weighted_unnormalized;
113+
else if(method_string == "generalized")
114+
method = su::generalized;
104115
else {
105116
err("Unknown method");
106117
return EXIT_FAILURE;
@@ -114,36 +125,29 @@ int main(int argc, char **argv){
114125
std::unordered_set<std::string> to_keep(table.obs_ids.begin(), table.obs_ids.end());
115126
su::BPTree tree_sheared = tree.shear(to_keep).collapse();
116127

117-
std::vector<double*> dm_stripes; // = su::make_strides(table.n_samples);
118-
std::vector<double*> dm_stripes_total; // = su::make_strides(table.n_samples);
128+
std::vector<double*> dm_stripes;
129+
std::vector<double*> dm_stripes_total;
119130
dm_stripes.resize((table.n_samples + 1) / 2);
120131
dm_stripes_total.resize((table.n_samples + 1) / 2);
121132

122133
unsigned int chunksize = dm_stripes.size() / nthreads;
123134
unsigned int start = 0;
124135
unsigned int end = dm_stripes.size();
125136

126-
unsigned int *starts = (unsigned int*)malloc(sizeof(unsigned int) * nthreads);
127-
if(starts == NULL) {
128-
fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n",
129-
sizeof(unsigned int) * nthreads, __FILE__, __LINE__);
130-
exit(EXIT_FAILURE);
131-
}
132-
unsigned int *ends = (unsigned int*)malloc(sizeof(unsigned int) * nthreads);
133-
if(ends == NULL) {
134-
fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n",
135-
sizeof(unsigned int) * nthreads, __FILE__, __LINE__);
136-
exit(EXIT_FAILURE);
137-
}
137+
std::vector<su::task_parameters> tasks;
138+
tasks.resize(nthreads);
138139

139140
std::vector<std::thread> threads(nthreads);
140141
for(unsigned int tid = 0; tid < threads.size(); tid++) {
141-
starts[tid] = start;
142-
ends[tid] = start + chunksize;
142+
tasks[tid].tid = tid;
143+
tasks[tid].start = start; // stripe start
144+
tasks[tid].stop = start + chunksize; // stripe end
145+
tasks[tid].n_samples = table.n_samples;
146+
tasks[tid].g_unifrac_alpha = g_unifrac_alpha;
143147
start = start + chunksize;
144148
}
145149
// the last thread gets any trailing bits
146-
ends[threads.size() - 1] = end;
150+
tasks[threads.size() - 1].stop = end;
147151

148152
for(unsigned int tid = 0; tid < threads.size(); tid++) {
149153
threads[tid] = std::thread(su::unifrac,
@@ -152,18 +156,13 @@ int main(int argc, char **argv){
152156
method,
153157
std::ref(dm_stripes),
154158
std::ref(dm_stripes_total),
155-
starts[tid],
156-
ends[tid],
157-
tid);
159+
&tasks[tid]);
158160
}
159161

160162
for(unsigned int tid = 0; tid < threads.size(); tid++) {
161163
threads[tid].join();
162164
}
163165

164-
free(starts);
165-
free(ends);
166-
167166
double **dm = su::deconvolute_stripes(dm_stripes, table.n_samples);
168167

169168
std::ofstream output;

sucpp/test_su.cpp

+93-4
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,10 @@ void test_unnormalized_weighted_unifrac() {
651651
std::vector<double*> strides = su::make_strides(6);
652652
std::vector<double*> strides_total = su::make_strides(6);
653653

654-
su::unifrac(table, tree, su::weighted_unnormalized, strides, strides_total, 0, 3, 0);
654+
su::task_parameters task_p;
655+
task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6;
656+
657+
su::unifrac(table, tree, su::weighted_unnormalized, strides, strides_total, &task_p);
655658
for(unsigned int i = 0; i < 3; i++) {
656659
for(unsigned int j = 0; j < 6; j++) {
657660
ASSERT(fabs(strides[i][j] - exp[i][j]) < 0.000001);
@@ -661,6 +664,85 @@ void test_unnormalized_weighted_unifrac() {
661664
SUITE_END();
662665
}
663666

667+
void test_generalized_unifrac() {
668+
SUITE_START("test generalized unifrac");
669+
670+
std::vector<std::thread> threads(1);
671+
su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);");
672+
su::biom table = su::biom("test.biom");
673+
674+
// weighted normalized unifrac as computed above
675+
std::vector<double*> w_exp;
676+
double w_stride1[] = {0.38095238, 0.33333333, 0.73333333, 0.33333333, 0.5, 0.26785714};
677+
double w_stride2[] = {0.58095238, 0.66666667, 0.86666667, 0.25, 0.28571429, 0.45833333};
678+
double w_stride3[] = {0.47619048, 0.66666667, 0.46666667, 0.47619048, 0.66666667, 0.46666667};
679+
w_exp.push_back(w_stride1);
680+
w_exp.push_back(w_stride2);
681+
w_exp.push_back(w_stride3);
682+
std::vector<double*> w_strides = su::make_strides(6);
683+
std::vector<double*> w_strides_total = su::make_strides(6);
684+
su::task_parameters w_task_p;
685+
w_task_p.start = 0; w_task_p.stop = 3; w_task_p.tid = 0; w_task_p.n_samples = 6;
686+
w_task_p.g_unifrac_alpha = 1.0;
687+
su::unifrac(table, tree, su::generalized, w_strides, w_strides_total, &w_task_p);
688+
689+
// as computed by GUniFrac v1.0
690+
// Sample1 Sample2 Sample3 Sample4 Sample5 Sample6
691+
//Sample1 0.0000000 0.4408392 0.6886965 0.7060606 0.5833333 0.3278410
692+
//Sample2 0.4408392 0.0000000 0.5102041 0.7500000 0.8000000 0.5208125
693+
//Sample3 0.6886965 0.5102041 0.0000000 0.8649351 0.9428571 0.5952381
694+
//Sample4 0.7060606 0.7500000 0.8649351 0.0000000 0.5000000 0.4857143
695+
//Sample5 0.5833333 0.8000000 0.9428571 0.5000000 0.0000000 0.7485714
696+
//Sample6 0.3278410 0.5208125 0.5952381 0.4857143 0.7485714 0.0000000
697+
std::vector<double*> d0_exp;
698+
double d0_stride1[] = {0.4408392, 0.5102041, 0.8649351, 0.5000000, 0.7485714, 0.3278410};
699+
double d0_stride2[] = {0.6886965, 0.7500000, 0.9428571, 0.4857143, 0.5833333, 0.5208125};
700+
double d0_stride3[] = {0.7060606, 0.8000000, 0.5952381, 0.7060606, 0.8000000, 0.5952381};
701+
d0_exp.push_back(d0_stride1);
702+
d0_exp.push_back(d0_stride2);
703+
d0_exp.push_back(d0_stride3);
704+
std::vector<double*> d0_strides = su::make_strides(6);
705+
std::vector<double*> d0_strides_total = su::make_strides(6);
706+
su::task_parameters d0_task_p;
707+
d0_task_p.start = 0; d0_task_p.stop = 3; d0_task_p.tid = 0; d0_task_p.n_samples = 6;
708+
d0_task_p.g_unifrac_alpha = 0.0;
709+
su::unifrac(table, tree, su::generalized, d0_strides, d0_strides_total, &d0_task_p);
710+
711+
// as computed by GUniFrac v1.0
712+
// Sample1 Sample2 Sample3 Sample4 Sample5 Sample6
713+
//Sample1 0.0000000 0.4040518 0.6285560 0.5869439 0.4082483 0.2995673
714+
//Sample2 0.4040518 0.0000000 0.4160597 0.7071068 0.7302479 0.4860856
715+
//Sample3 0.6285560 0.4160597 0.0000000 0.8005220 0.9073159 0.5218198
716+
//Sample4 0.5869439 0.7071068 0.8005220 0.0000000 0.4117216 0.3485667
717+
//Sample5 0.4082483 0.7302479 0.9073159 0.4117216 0.0000000 0.6188282
718+
//Sample6 0.2995673 0.4860856 0.5218198 0.3485667 0.6188282 0.0000000
719+
std::vector<double*> d05_exp;
720+
double d05_stride1[] = {0.4040518, 0.4160597, 0.8005220, 0.4117216, 0.6188282, 0.2995673};
721+
double d05_stride2[] = {0.6285560, 0.7071068, 0.9073159, 0.3485667, 0.4082483, 0.4860856};
722+
double d05_stride3[] = {0.5869439, 0.7302479, 0.5218198, 0.5869439, 0.7302479, 0.5218198};
723+
d05_exp.push_back(d05_stride1);
724+
d05_exp.push_back(d05_stride2);
725+
d05_exp.push_back(d05_stride3);
726+
std::vector<double*> d05_strides = su::make_strides(6);
727+
std::vector<double*> d05_strides_total = su::make_strides(6);
728+
su::task_parameters d05_task_p;
729+
d05_task_p.start = 0; d05_task_p.stop = 3; d05_task_p.tid = 0; d05_task_p.n_samples = 6;
730+
d05_task_p.g_unifrac_alpha = 0.5;
731+
su::unifrac(table, tree, su::generalized, d05_strides, d05_strides_total, &d05_task_p);
732+
733+
for(unsigned int i = 0; i < 3; i++) {
734+
for(unsigned int j = 0; j < 6; j++) {
735+
ASSERT(fabs(w_strides[i][j] - w_exp[i][j]) < 0.000001);
736+
ASSERT(fabs(d0_strides[i][j] - d0_exp[i][j]) < 0.000001);
737+
ASSERT(fabs(d05_strides[i][j] - d05_exp[i][j]) < 0.000001);
738+
}
739+
free(w_strides[i]);
740+
free(d0_strides[i]);
741+
free(d05_strides[i]);
742+
}
743+
SUITE_END();
744+
}
745+
664746
void test_make_strides() {
665747
SUITE_START("test make stripes");
666748
std::vector<double*> exp;
@@ -695,7 +777,10 @@ void test_unweighted_unifrac() {
695777
std::vector<double*> strides = su::make_strides(6);
696778
std::vector<double*> strides_total = su::make_strides(6);
697779

698-
su::unifrac(table, tree, su::unweighted, strides, strides_total, 0, 3, 0);
780+
su::task_parameters task_p;
781+
task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6;
782+
783+
su::unifrac(table, tree, su::unweighted, strides, strides_total, &task_p);
699784

700785
for(unsigned int i = 0; i < 3; i++) {
701786
for(unsigned int j = 0; j < 6; j++) {
@@ -722,8 +807,11 @@ void test_normalized_weighted_unifrac() {
722807
exp.push_back(stride3);
723808
std::vector<double*> strides = su::make_strides(6);
724809
std::vector<double*> strides_total = su::make_strides(6);
725-
726-
su::unifrac(table, tree, su::weighted_normalized, strides, strides_total, 0, 3, 0);
810+
811+
su::task_parameters task_p;
812+
task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6;
813+
814+
su::unifrac(table, tree, su::weighted_normalized, strides, strides_total, &task_p);
727815
for(unsigned int i = 0; i < 3; i++) {
728816
for(unsigned int j = 0; j < 6; j++) {
729817
ASSERT(fabs(strides[i][j] - exp[i][j]) < 0.000001);
@@ -856,6 +944,7 @@ int main(int argc, char** argv) {
856944
test_unweighted_unifrac();
857945
test_unnormalized_weighted_unifrac();
858946
test_normalized_weighted_unifrac();
947+
test_generalized_unifrac();
859948
test_unifrac_sample_counts();
860949

861950
printf("\n");

0 commit comments

Comments
 (0)