Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

G unifrac #19

Merged
merged 13 commits into from
Aug 18, 2017
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,7 @@ script:
- for i in $(find ./ -maxdepth 1 -name 'core*' -print); do gdb $(pwd)/sucpp/ssu core* -ex "thread apply all bt" -ex "set pagination 0" -batch; done;
- ls -lrt
- qiime state-unifrac unweighted --i-table table.qza --i-phylogeny tree.qza --o-distance-matrix ssu_result.qza --verbose
- qiime state-unifrac unweighted --i-table table.qza --i-phylogeny tree.qza --o-distance-matrix ssu_result_thread.qza --verbose --p-threads 2
- qiime diversity beta-phylogenetic --p-metric unweighted_unifrac --i-table table.qza --i-phylogeny tree.qza --o-distance-matrix sk_result.qza
- python compare_dms.py ssu_result.qza sk_result.qza
- python compare_dms.py ssu_result_thread.qza sk_result.qza
6 changes: 4 additions & 2 deletions q2_state_unifrac/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@

from q2_state_unifrac._methods import (unweighted,
weighted_normalized,
weighted_unnormalized)
weighted_unnormalized,
generalized)


__version__ = pkg_resources.get_distribution('q2-state-unifrac').version
__all__ = ['unweighted', 'weighted_normalized', 'weighted_unnormalized']
__all__ = ['unweighted', 'weighted_normalized', 'weighted_unnormalized',
'generalized']
13 changes: 13 additions & 0 deletions q2_state_unifrac/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,16 @@ def weighted_unnormalized(table: BIOMV210Format,
_run(str(table), str(phylogeny), output_fp, str(threads),
'weighted_unnormalized')
return skbio.DistanceMatrix.read(output_fp)


def generalized(table: BIOMV210Format,
phylogeny: NewickFormat,
threads: int=1,
alpha: float=1.0)-> skbio.DistanceMatrix:
_sanity()

with tempfile.TemporaryDirectory() as tmp:
output_fp = os.path.join(tmp, 'foo.dm')
_run(str(table), str(phylogeny), output_fp, str(threads),
'generalized')
return skbio.DistanceMatrix.read(output_fp)
42 changes: 37 additions & 5 deletions q2_state_unifrac/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from qiime2.plugin import (Plugin, Properties, Int)
from qiime2.plugin import (Plugin, Properties, Int, Float)
from q2_types.feature_table import FeatureTable, Frequency
from q2_types.distance_matrix import DistanceMatrix
from q2_types.tree import Phylogeny, Rooted
Expand Down Expand Up @@ -37,7 +37,9 @@
},
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))],
name='Unweighted UniFrac',
description=('This method computes Unweighted UniFrac')
description=('This method computes unweighted UniFrac as described in '
'Lozupone and Knight 2005 Appl Environ Microbiol; '
'DOI: 10.1128/AEM.71.12.8228-8235.2005')
)


Expand All @@ -54,7 +56,9 @@
},
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))],
name='Weighted unnormalizd UniFrac',
description=('This method computes weighted unnormalized UniFrac')
description=('This method computes weighted unnormalized UniFrac as '
'described in Lozupone et al. 2007 Appl Environ Microbiol; '
'DOI: 10.1128/AEM.01996-06')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice to add the citation_text for all these papers, see for example:

https://github.com/qiime2/q2-emperor/blob/3acf6b7b643b2b64c4fb31862ec56f8a0363f91c/q2_emperor/plugin_setup.py#L22-L25

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Isn't that at the plugin level though, and not at the method level? These citations are method specific

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are exactly right!

)


Expand All @@ -70,6 +74,34 @@
'the table.')
},
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))],
name='Wweighted normalized UniFrac',
description=('This method computes weighted normalized UniFrac')
name='Weighted normalized UniFrac',
description=('This method computes weighted normalized UniFrac as '
'described in Lozupone et al. 2007 Appl Environ Microbiol; '
'DOI: 10.1128/AEM.01996-06')
)


plugin.methods.register_function(
function=q2_state_unifrac.generalized,
inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
'phylogeny': Phylogeny[Rooted]},
parameters={'threads': Int,
'alpha': Float},
parameter_descriptions={'threads': 'The number of threads to use.',
'alpha': ('The value of alpha controls importance '
'of sample proportions. 1.0 is '
'weighted normalized UniFrac. 0.0 is '
'close to unweighted UniFrac, but only '
'if the sample proportions are '
'dichotomized.')},
input_descriptions={
'table': 'A rarefied FeatureTable',
'phylogeny': ('A rooted phylogeny which relates the observations in '
'the table.')
},
outputs=[('distance_matrix', DistanceMatrix % Properties('phylogenetic'))],
name='Generalized UniFrac',
description=('This method computes Generalized UniFrac as described in '
'Chen et al. 2012 Bioinformatics; '
'DOI: 10.1093/bioinformatics/bts342')
)
49 changes: 24 additions & 25 deletions sucpp/su.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
#include <thread>

void usage() {
std::cout << "usage: ssu -i <biom> -o <out.dm> -m [METHOD] -t <newick> [-n threads]" << std::endl;
std::cout << "usage: ssu -i <biom> -o <out.dm> -m [METHOD] -t <newick> [-n threads] [-a alpha]" << std::endl;
std::cout << std::endl;
std::cout << " -i\tThe input BIOM table." << std::endl;
std::cout << " -t\tThe input phylogeny in newick." << std::endl;
std::cout << " -m\tThe method, [unweighted | weighted_normalized | weighted_unnormalized]." << std::endl;
std::cout << " -m\tThe method, [unweighted | weighted_normalized | weighted_unnormalized | generalized]." << std::endl;
std::cout << " -o\tThe output distance matrix." << std::endl;
std::cout << " -n\t[OPTIONAL] The number of threads, default is 1." << std::endl;
std::cout << " -a\t[OPTIONAL] Generalized UniFrac alpha, default is 1." << std::endl;
std::cout << std::endl;
}

Expand Down Expand Up @@ -60,6 +61,7 @@ int main(int argc, char **argv){
const std::string &output_filename = input.getCmdOption("-o");
const std::string &method_string = input.getCmdOption("-m");
const std::string &nthreads_arg = input.getCmdOption("-n");
const std::string &gunifrac_arg = input.getCmdOption("-a");

su::Method method;

Expand Down Expand Up @@ -95,12 +97,21 @@ int main(int argc, char **argv){
nthreads = atoi(nthreads_arg.c_str());
}

double g_unifrac_alpha;
if(gunifrac_arg.empty()) {
g_unifrac_alpha = 1.0;
} else {
g_unifrac_alpha = atof(gunifrac_arg.c_str());
}

if(method_string == "unweighted")
method = su::unweighted;
else if(method_string == "weighted_normalized")
method = su::weighted_normalized;
else if(method_string == "weighted_unnormalized")
method = su::weighted_unnormalized;
else if(method_string == "generalized")
method = su::generalized;
else {
err("Unknown method");
return EXIT_FAILURE;
Expand All @@ -114,36 +125,29 @@ int main(int argc, char **argv){
std::unordered_set<std::string> to_keep(table.obs_ids.begin(), table.obs_ids.end());
su::BPTree tree_sheared = tree.shear(to_keep).collapse();

std::vector<double*> dm_stripes; // = su::make_strides(table.n_samples);
std::vector<double*> dm_stripes_total; // = su::make_strides(table.n_samples);
std::vector<double*> dm_stripes;
std::vector<double*> dm_stripes_total;
dm_stripes.resize((table.n_samples + 1) / 2);
dm_stripes_total.resize((table.n_samples + 1) / 2);

unsigned int chunksize = dm_stripes.size() / nthreads;
unsigned int start = 0;
unsigned int end = dm_stripes.size();

unsigned int *starts = (unsigned int*)malloc(sizeof(unsigned int) * nthreads);
if(starts == NULL) {
fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n",
sizeof(unsigned int) * nthreads, __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
unsigned int *ends = (unsigned int*)malloc(sizeof(unsigned int) * nthreads);
if(ends == NULL) {
fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n",
sizeof(unsigned int) * nthreads, __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
std::vector<su::task_parameters> tasks;
tasks.resize(nthreads);

std::vector<std::thread> threads(nthreads);
for(unsigned int tid = 0; tid < threads.size(); tid++) {
starts[tid] = start;
ends[tid] = start + chunksize;
tasks[tid].tid = tid;
tasks[tid].start = start; // stripe start
tasks[tid].stop = start + chunksize; // stripe end
tasks[tid].n_samples = table.n_samples;
tasks[tid].g_unifrac_alpha = g_unifrac_alpha;
start = start + chunksize;
}
// the last thread gets any trailing bits
ends[threads.size() - 1] = end;
tasks[threads.size() - 1].stop = end;

for(unsigned int tid = 0; tid < threads.size(); tid++) {
threads[tid] = std::thread(su::unifrac,
Expand All @@ -152,18 +156,13 @@ int main(int argc, char **argv){
method,
std::ref(dm_stripes),
std::ref(dm_stripes_total),
starts[tid],
ends[tid],
tid);
&tasks[tid]);
}

for(unsigned int tid = 0; tid < threads.size(); tid++) {
threads[tid].join();
}

free(starts);
free(ends);

double **dm = su::deconvolute_stripes(dm_stripes, table.n_samples);

std::ofstream output;
Expand Down
97 changes: 93 additions & 4 deletions sucpp/test_su.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,10 @@ void test_unnormalized_weighted_unifrac() {
std::vector<double*> strides = su::make_strides(6);
std::vector<double*> strides_total = su::make_strides(6);

su::unifrac(table, tree, su::weighted_unnormalized, strides, strides_total, 0, 3, 0);
su::task_parameters task_p;
task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6;

su::unifrac(table, tree, su::weighted_unnormalized, strides, strides_total, &task_p);
for(unsigned int i = 0; i < 3; i++) {
for(unsigned int j = 0; j < 6; j++) {
ASSERT(fabs(strides[i][j] - exp[i][j]) < 0.000001);
Expand All @@ -661,6 +664,85 @@ void test_unnormalized_weighted_unifrac() {
SUITE_END();
}

void test_generalized_unifrac() {
SUITE_START("test generalized unifrac");

std::vector<std::thread> threads(1);
su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);");
su::biom table = su::biom("test.biom");

// weighted normalized unifrac as computed above
std::vector<double*> w_exp;
double w_stride1[] = {0.38095238, 0.33333333, 0.73333333, 0.33333333, 0.5, 0.26785714};
double w_stride2[] = {0.58095238, 0.66666667, 0.86666667, 0.25, 0.28571429, 0.45833333};
double w_stride3[] = {0.47619048, 0.66666667, 0.46666667, 0.47619048, 0.66666667, 0.46666667};
w_exp.push_back(w_stride1);
w_exp.push_back(w_stride2);
w_exp.push_back(w_stride3);
std::vector<double*> w_strides = su::make_strides(6);
std::vector<double*> w_strides_total = su::make_strides(6);
su::task_parameters w_task_p;
w_task_p.start = 0; w_task_p.stop = 3; w_task_p.tid = 0; w_task_p.n_samples = 6;
w_task_p.g_unifrac_alpha = 1.0;
su::unifrac(table, tree, su::generalized, w_strides, w_strides_total, &w_task_p);

// as computed by GUniFrac v1.0
// Sample1 Sample2 Sample3 Sample4 Sample5 Sample6
//Sample1 0.0000000 0.4408392 0.6886965 0.7060606 0.5833333 0.3278410
//Sample2 0.4408392 0.0000000 0.5102041 0.7500000 0.8000000 0.5208125
//Sample3 0.6886965 0.5102041 0.0000000 0.8649351 0.9428571 0.5952381
//Sample4 0.7060606 0.7500000 0.8649351 0.0000000 0.5000000 0.4857143
//Sample5 0.5833333 0.8000000 0.9428571 0.5000000 0.0000000 0.7485714
//Sample6 0.3278410 0.5208125 0.5952381 0.4857143 0.7485714 0.0000000
std::vector<double*> d0_exp;
double d0_stride1[] = {0.4408392, 0.5102041, 0.8649351, 0.5000000, 0.7485714, 0.3278410};
double d0_stride2[] = {0.6886965, 0.7500000, 0.9428571, 0.4857143, 0.5833333, 0.5208125};
double d0_stride3[] = {0.7060606, 0.8000000, 0.5952381, 0.7060606, 0.8000000, 0.5952381};
d0_exp.push_back(d0_stride1);
d0_exp.push_back(d0_stride2);
d0_exp.push_back(d0_stride3);
std::vector<double*> d0_strides = su::make_strides(6);
std::vector<double*> d0_strides_total = su::make_strides(6);
su::task_parameters d0_task_p;
d0_task_p.start = 0; d0_task_p.stop = 3; d0_task_p.tid = 0; d0_task_p.n_samples = 6;
d0_task_p.g_unifrac_alpha = 0.0;
su::unifrac(table, tree, su::generalized, d0_strides, d0_strides_total, &d0_task_p);

// as computed by GUniFrac v1.0
// Sample1 Sample2 Sample3 Sample4 Sample5 Sample6
//Sample1 0.0000000 0.4040518 0.6285560 0.5869439 0.4082483 0.2995673
//Sample2 0.4040518 0.0000000 0.4160597 0.7071068 0.7302479 0.4860856
//Sample3 0.6285560 0.4160597 0.0000000 0.8005220 0.9073159 0.5218198
//Sample4 0.5869439 0.7071068 0.8005220 0.0000000 0.4117216 0.3485667
//Sample5 0.4082483 0.7302479 0.9073159 0.4117216 0.0000000 0.6188282
//Sample6 0.2995673 0.4860856 0.5218198 0.3485667 0.6188282 0.0000000
std::vector<double*> d05_exp;
double d05_stride1[] = {0.4040518, 0.4160597, 0.8005220, 0.4117216, 0.6188282, 0.2995673};
double d05_stride2[] = {0.6285560, 0.7071068, 0.9073159, 0.3485667, 0.4082483, 0.4860856};
double d05_stride3[] = {0.5869439, 0.7302479, 0.5218198, 0.5869439, 0.7302479, 0.5218198};
d05_exp.push_back(d05_stride1);
d05_exp.push_back(d05_stride2);
d05_exp.push_back(d05_stride3);
std::vector<double*> d05_strides = su::make_strides(6);
std::vector<double*> d05_strides_total = su::make_strides(6);
su::task_parameters d05_task_p;
d05_task_p.start = 0; d05_task_p.stop = 3; d05_task_p.tid = 0; d05_task_p.n_samples = 6;
d05_task_p.g_unifrac_alpha = 0.5;
su::unifrac(table, tree, su::generalized, d05_strides, d05_strides_total, &d05_task_p);

for(unsigned int i = 0; i < 3; i++) {
for(unsigned int j = 0; j < 6; j++) {
ASSERT(fabs(w_strides[i][j] - w_exp[i][j]) < 0.000001);
ASSERT(fabs(d0_strides[i][j] - d0_exp[i][j]) < 0.000001);
ASSERT(fabs(d05_strides[i][j] - d05_exp[i][j]) < 0.000001);
}
free(w_strides[i]);
free(d0_strides[i]);
free(d05_strides[i]);
}
SUITE_END();
}

void test_make_strides() {
SUITE_START("test make stripes");
std::vector<double*> exp;
Expand Down Expand Up @@ -695,7 +777,10 @@ void test_unweighted_unifrac() {
std::vector<double*> strides = su::make_strides(6);
std::vector<double*> strides_total = su::make_strides(6);

su::unifrac(table, tree, su::unweighted, strides, strides_total, 0, 3, 0);
su::task_parameters task_p;
task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6;

su::unifrac(table, tree, su::unweighted, strides, strides_total, &task_p);

for(unsigned int i = 0; i < 3; i++) {
for(unsigned int j = 0; j < 6; j++) {
Expand All @@ -722,8 +807,11 @@ void test_normalized_weighted_unifrac() {
exp.push_back(stride3);
std::vector<double*> strides = su::make_strides(6);
std::vector<double*> strides_total = su::make_strides(6);

su::unifrac(table, tree, su::weighted_normalized, strides, strides_total, 0, 3, 0);

su::task_parameters task_p;
task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6;

su::unifrac(table, tree, su::weighted_normalized, strides, strides_total, &task_p);
for(unsigned int i = 0; i < 3; i++) {
for(unsigned int j = 0; j < 6; j++) {
ASSERT(fabs(strides[i][j] - exp[i][j]) < 0.000001);
Expand Down Expand Up @@ -856,6 +944,7 @@ int main(int argc, char** argv) {
test_unweighted_unifrac();
test_unnormalized_weighted_unifrac();
test_normalized_weighted_unifrac();
test_generalized_unifrac();
test_unifrac_sample_counts();

printf("\n");
Expand Down
Loading