diff --git a/q2_state_unifrac/_methods.py b/q2_state_unifrac/_methods.py
index dbe015b9..ca9d4ae9 100644
--- a/q2_state_unifrac/_methods.py
+++ b/q2_state_unifrac/_methods.py
@@ -23,7 +23,8 @@ def _sanity():
         raise ValueError("ssu could not be located!")
 
 
-def _run(table_fp, tree_fp, output_fp, threads, method):
+def _run(table_fp, tree_fp, output_fp, threads, method,
+         variance_adjusted=False, alpha=None):
     cmd = [resource_filename(*ARGS),
            '-i', table_fp,
            '-t', tree_fp,
@@ -31,53 +32,64 @@ def _run(table_fp, tree_fp, output_fp, threads, method):
            '-n', threads,
            '-m', method]
 
+    if variance_adjusted:
+        cmd.append('--vaw')
+
+    if alpha is not None:
+        cmd.append('-a')
+        cmd.append(str(alpha))
+
     subprocess.run(cmd, check=True)
 
 
 def unweighted(table: BIOMV210Format,
                phylogeny: NewickFormat,
-               threads: int=1)-> skbio.DistanceMatrix:
+               threads: int=1,
+               variance_adjusted: bool=False)-> skbio.DistanceMatrix:
     _sanity()
 
     with tempfile.TemporaryDirectory() as tmp:
         output_fp = os.path.join(tmp, 'foo.dm')
         _run(str(table), str(phylogeny), output_fp, str(threads),
-             'unweighted')
+             'unweighted', variance_adjusted)
         return skbio.DistanceMatrix.read(output_fp)
 
 
 def weighted_normalized(table: BIOMV210Format,
                         phylogeny: NewickFormat,
-                        threads: int=1)-> skbio.DistanceMatrix:
+                        threads: int=1,
+                        variance_adjusted: bool=False)-> skbio.DistanceMatrix:
     _sanity()
 
     with tempfile.TemporaryDirectory() as tmp:
         output_fp = os.path.join(tmp, 'foo.dm')
         _run(str(table), str(phylogeny), output_fp, str(threads),
-             'weighted_normalized')
+             'weighted_normalized', variance_adjusted)
         return skbio.DistanceMatrix.read(output_fp)
 
 
 def weighted_unnormalized(table: BIOMV210Format,
                           phylogeny: NewickFormat,
-                          threads: int=1)-> skbio.DistanceMatrix:
+                          threads: int=1,
+                          variance_adjusted: bool=False) -> skbio.DistanceMatrix:  # noqa
     _sanity()
 
     with tempfile.TemporaryDirectory() as tmp:
         output_fp = os.path.join(tmp, 'foo.dm')
         _run(str(table), str(phylogeny), output_fp, str(threads),
-             'weighted_unnormalized')
+             'weighted_unnormalized', variance_adjusted)
         return skbio.DistanceMatrix.read(output_fp)
 
 
 def generalized(table: BIOMV210Format,
                 phylogeny: NewickFormat,
                 threads: int=1,
-                alpha: float=1.0)-> skbio.DistanceMatrix:
+                alpha: float=1.0,
+                variance_adjusted: bool=False)-> skbio.DistanceMatrix:
     _sanity()
 
     with tempfile.TemporaryDirectory() as tmp:
         output_fp = os.path.join(tmp, 'foo.dm')
         _run(str(table), str(phylogeny), output_fp, str(threads),
-             'generalized')
+             'generalized', variance_adjusted, alpha)
         return skbio.DistanceMatrix.read(output_fp)
diff --git a/q2_state_unifrac/plugin_setup.py b/q2_state_unifrac/plugin_setup.py
index caee7c6b..302b395e 100644
--- a/q2_state_unifrac/plugin_setup.py
+++ b/q2_state_unifrac/plugin_setup.py
@@ -6,7 +6,7 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from qiime2.plugin import (Plugin, Properties, Int, Float)
+from qiime2.plugin import (Plugin, Properties, Int, Float, Bool)
 from q2_types.feature_table import FeatureTable, Frequency
 from q2_types.distance_matrix import DistanceMatrix
 from q2_types.tree import Phylogeny, Rooted
@@ -28,8 +28,12 @@
     function=q2_state_unifrac.unweighted,
     inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
             'phylogeny': Phylogeny[Rooted]},
-    parameters={'threads': Int},
-    parameter_descriptions={'threads': 'The number of threads to use.'},
+    parameters={'threads': Int,
+                'variance_adjusted': Bool},
+    parameter_descriptions={'threads': 'The number of threads to use.',
+                            'variance_adjusted':
+                                ('Perform variance adjustment based on '
+                                 'Chang et al. BMC Bioinformatics 2011')},
     input_descriptions={
         'table': 'A rarefied FeatureTable',
         'phylogeny': ('A rooted phylogeny which relates the observations in '
@@ -47,8 +51,12 @@
     function=q2_state_unifrac.weighted_unnormalized,
     inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
             'phylogeny': Phylogeny[Rooted]},
-    parameters={'threads': Int},
-    parameter_descriptions={'threads': 'The number of threads to use.'},
+    parameters={'threads': Int,
+                'variance_adjusted': Bool},
+    parameter_descriptions={'threads': 'The number of threads to use.',
+                            'variance_adjusted':
+                                ('Perform variance adjustment based on '
+                                 'Chang et al. BMC Bioinformatics 2011')},
     input_descriptions={
         'table': 'A rarefied FeatureTable',
         'phylogeny': ('A rooted phylogeny which relates the observations in '
@@ -66,8 +74,12 @@
     function=q2_state_unifrac.weighted_normalized,
     inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
             'phylogeny': Phylogeny[Rooted]},
-    parameters={'threads': Int},
-    parameter_descriptions={'threads': 'The number of threads to use.'},
+    parameters={'threads': Int,
+                'variance_adjusted': Bool},
+    parameter_descriptions={'threads': 'The number of threads to use.',
+                            'variance_adjusted':
+                                ('Perform variance adjustment based on '
+                                 'Chang et al. BMC Bioinformatics 2011')},
     input_descriptions={
         'table': 'A rarefied FeatureTable',
         'phylogeny': ('A rooted phylogeny which relates the observations in '
@@ -86,8 +98,12 @@
     inputs={'table': FeatureTable[Frequency] % Properties('uniform-sampling'),
             'phylogeny': Phylogeny[Rooted]},
     parameters={'threads': Int,
+                'variance_adjusted': Bool,
                 'alpha': Float},
     parameter_descriptions={'threads': 'The number of threads to use.',
+                            'variance_adjusted':
+                                ('Perform variance adjustment based on '
+                                 'Chang et al. BMC Bioinformatics 2011'),
                             'alpha': ('The value of alpha controls importance '
                                       'of sample proportions. 1.0 is '
                                       'weighted normalized UniFrac. 0.0 is '
diff --git a/sucpp/Makefile b/sucpp/Makefile
index a5be99c5..2cd5ad8d 100644
--- a/sucpp/Makefile
+++ b/sucpp/Makefile
@@ -29,11 +29,11 @@ else
 endif
 
 
-test: tree.o test_su.cpp biom.o unifrac.o
-	$(CXX) $(CPPFLAGS) -Wno-unused-parameter test_su.cpp -o test_su tree.o biom.o unifrac.o 
+test: tree.o test_su.cpp biom.o unifrac.o unifrac_task.o
+	$(CXX) $(CPPFLAGS) -Wno-unused-parameter test_su.cpp -o test_su tree.o biom.o unifrac.o unifrac_task.o
 
-main: tree.o biom.o unifrac.o cmd.o
-	$(CXX) $(CPPFLAGS) su.cpp -o ssu tree.o biom.o unifrac.o cmd.o
+main: tree.o biom.o unifrac.o cmd.o unifrac_task.o
+	$(CXX) $(CPPFLAGS) su.cpp -o ssu tree.o biom.o unifrac.o cmd.o unifrac_task.o
 
 ocltest: opencl.o
 	clang++ -framework OpenCL opencl.cpp -c
diff --git a/sucpp/su.cpp b/sucpp/su.cpp
index f59edd8d..9a5c8bfc 100644
--- a/sucpp/su.cpp
+++ b/sucpp/su.cpp
@@ -9,14 +9,26 @@
 #include <thread>
 
 void usage() {
-    std::cout << "usage: ssu -i <biom> -o <out.dm> -m [METHOD] -t <newick> [-n threads] [-a alpha]" << std::endl;
+    std::cout << "usage: ssu -i <biom> -o <out.dm> -m [METHOD] -t <newick> [-n threads] [-a alpha] [--vaw]" << std::endl;
     std::cout << std::endl;
-    std::cout << "    -i\tThe input BIOM table." << std::endl;
-    std::cout << "    -t\tThe input phylogeny in newick." << std::endl;
-    std::cout << "    -m\tThe method, [unweighted | weighted_normalized | weighted_unnormalized | generalized]." << std::endl;
-    std::cout << "    -o\tThe output distance matrix." << std::endl;
-    std::cout << "    -n\t[OPTIONAL] The number of threads, default is 1." << std::endl;
-    std::cout << "    -a\t[OPTIONAL] Generalized UniFrac alpha, default is 1." << std::endl;
+    std::cout << "    -i\t\tThe input BIOM table." << std::endl;
+    std::cout << "    -t\t\tThe input phylogeny in newick." << std::endl;
+    std::cout << "    -m\t\tThe method, [unweighted | weighted_normalized | weighted_unnormalized | generalized]." << std::endl;
+    std::cout << "    -o\t\tThe output distance matrix." << std::endl;
+    std::cout << "    -n\t\t[OPTIONAL] The number of threads, default is 1." << std::endl;
+    std::cout << "    -a\t\t[OPTIONAL] Generalized UniFrac alpha, default is 1." << std::endl;
+    std::cout << "    --vaw\t[OPTIONAL] Variance adjusted, default is to not adjust for variance." << std::endl;
+    std::cout << std::endl;
+    std::cout << "Citations: " << std::endl;
+    std::cout << "    For UniFrac, please see:" << std::endl;
+    std::cout << "        Lozupone and Knight Appl Environ Microbiol 2005; DOI: 10.1128/AEM.71.12.8228-8235.2005" << std::endl;
+    std::cout << "        Lozupone et al. Appl Environ Microbiol 2007; DOI: 10.1128/AEM.01996-06" << std::endl;
+    std::cout << "        Hamady et al. ISME 2010; DOI: 10.1038/ismej.2009.97" << std::endl;
+    std::cout << "        Lozupone et al. ISME 2011; DOI: 10.1038/ismej.2010.133" << std::endl;
+    std::cout << "    For Generalized UniFrac, please see: " << std::endl;
+    std::cout << "        Chen et al. Bioinformatics 2012; DOI: 10.1093/bioinformatics/bts342" << std::endl;
+    std::cout << "    For Variance Adjusted UniFrac, please see: " << std::endl;
+    std::cout << "        Chang et al. BMC Bioinformatics 2011; DOI: 10.1186/1471-2105-12-118" << std::endl;
     std::cout << std::endl;
 }
 
@@ -97,6 +109,7 @@ int main(int argc, char **argv){
         nthreads = atoi(nthreads_arg.c_str());
     }
     
+    bool vaw = input.cmdOptionExists("--vaw"); 
     double g_unifrac_alpha;
     if(gunifrac_arg.empty()) {
         g_unifrac_alpha = 1.0;
@@ -150,13 +163,22 @@ int main(int argc, char **argv){
     tasks[threads.size() - 1].stop = end;
     
     for(unsigned int tid = 0; tid < threads.size(); tid++) {
-        threads[tid] = std::thread(su::unifrac, 
-                                   std::ref(table),
-                                   std::ref(tree_sheared), 
-                                   method, 
-                                   std::ref(dm_stripes), 
-                                   std::ref(dm_stripes_total), 
-                                   &tasks[tid]);
+        if(vaw)
+            threads[tid] = std::thread(su::unifrac_vaw, 
+                                       std::ref(table),
+                                       std::ref(tree_sheared), 
+                                       method, 
+                                       std::ref(dm_stripes), 
+                                       std::ref(dm_stripes_total), 
+                                       &tasks[tid]);
+        else
+            threads[tid] = std::thread(su::unifrac, 
+                                       std::ref(table),
+                                       std::ref(tree_sheared), 
+                                       method, 
+                                       std::ref(dm_stripes), 
+                                       std::ref(dm_stripes_total), 
+                                       &tasks[tid]);
     }
 
     for(unsigned int tid = 0; tid < threads.size(); tid++) {
diff --git a/sucpp/test_su.cpp b/sucpp/test_su.cpp
index 65055bfa..446c89ff 100644
--- a/sucpp/test_su.cpp
+++ b/sucpp/test_su.cpp
@@ -743,6 +743,48 @@ void test_generalized_unifrac() {
     SUITE_END();
 }
 
+void test_vaw_unifrac_weighted_normalized() {
+    SUITE_START("test vaw weighted normalized unifrac");
+    
+    std::vector<std::thread> threads(1);
+    su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);");
+    su::biom table = su::biom("test.biom");
+    
+    // as computed by GUniFrac, the original implementation of VAW-UniFrac
+    // could not be found.
+    //          Sample1   Sample2   Sample3   Sample4   Sample5   Sample6
+    //Sample1 0.0000000 0.4086040 0.6240185 0.4639481 0.2857143 0.2766318
+    //Sample2 0.4086040 0.0000000 0.3798594 0.6884992 0.6807616 0.4735781
+    //Sample3 0.6240185 0.3798594 0.0000000 0.7713254 0.8812897 0.5047114
+    //Sample4 0.4639481 0.6884992 0.7713254 0.0000000 0.6666667 0.2709298
+    //Sample5 0.2857143 0.6807616 0.8812897 0.6666667 0.0000000 0.4735991
+    //Sample6 0.2766318 0.4735781 0.5047114 0.2709298 0.4735991 0.0000000
+    // weighted normalized unifrac as computed above
+    
+    std::vector<double*> w_exp;
+    double w_stride1[] = {0.4086040, 0.3798594, 0.7713254, 0.6666667, 0.4735991, 0.2766318};
+    double w_stride2[] = {0.6240185, 0.6884992, 0.8812897, 0.2709298, 0.2857143, 0.4735781};
+    double w_stride3[] = {0.4639481, 0.6807616, 0.5047114, 0.4639481, 0.6807616, 0.5047114};
+    w_exp.push_back(w_stride1);
+    w_exp.push_back(w_stride2);
+    w_exp.push_back(w_stride3);
+    std::vector<double*> w_strides = su::make_strides(6);
+    std::vector<double*> w_strides_total = su::make_strides(6);
+    su::task_parameters w_task_p;
+    w_task_p.start = 0; w_task_p.stop = 3; w_task_p.tid = 0; w_task_p.n_samples = 6; 
+    w_task_p.g_unifrac_alpha = 1.0;
+    su::unifrac_vaw(table, tree, su::weighted_normalized, w_strides, w_strides_total, &w_task_p);
+    
+    for(unsigned int i = 0; i < 3; i++) {
+        for(unsigned int j = 0; j < 6; j++) {
+            ASSERT(fabs(w_strides[i][j] - w_exp[i][j]) < 0.000001);
+        }
+        free(w_strides[i]);
+    }
+    SUITE_END();
+}
+
+
 void test_make_strides() {
     SUITE_START("test make stripes");
     std::vector<double*> exp;
@@ -826,7 +868,7 @@ void test_bptree_shear_simple() {
     su::BPTree tree = su::BPTree("((3:2,4:3,(6:5)5:4)2:1,7:6,((10:9,11:10)9:8)8:7)r");
  
     // simple
-    std::unordered_set<std::string> to_keep = {"4", "6", "7", "10", "11"};	
+    std::unordered_set<std::string> to_keep = {"4", "6", "7", "10", "11"};    
     
     uint32_t exp_nparens = 20;
     std::vector<bool> exp_structure = {true, true, true, false, true, true, false, false, false, true, 
@@ -848,7 +890,7 @@ void test_bptree_shear_deep() {
     su::BPTree tree = su::BPTree("((3:2,4:3,(6:5)5:4)2:1,7:6,((10:9,11:10)9:8)8:7)r");
  
     // deep
-    std::unordered_set<std::string> to_keep = {"10", "11"};	
+    std::unordered_set<std::string> to_keep = {"10", "11"};    
     
     uint32_t exp_nparens = 10;
     std::vector<bool> exp_structure = {true, true, true, true, false, true, false, false, false, false};
@@ -893,7 +935,7 @@ void test_bptree_collapse_edge() {
     ASSERT(obs.names == exp.names);
     ASSERT(vec_almost_equal(obs.lengths, exp.lengths));
 
-	SUITE_END();
+    SUITE_END();
 }
 
 void test_unifrac_sample_counts() {
@@ -945,6 +987,7 @@ int main(int argc, char** argv) {
     test_unnormalized_weighted_unifrac();
     test_normalized_weighted_unifrac();
     test_generalized_unifrac();
+    test_vaw_unifrac_weighted_normalized();
     test_unifrac_sample_counts();
 
     printf("\n");
diff --git a/sucpp/unifrac.cpp b/sucpp/unifrac.cpp
index 74619a3b..528647d9 100644
--- a/sucpp/unifrac.cpp
+++ b/sucpp/unifrac.cpp
@@ -3,7 +3,6 @@
 #include "unifrac.hpp"
 #include <unordered_map>
 #include <cstdlib>
-#include <math.h>
 #include <algorithm>
 #include <thread>
 
@@ -52,11 +51,12 @@ double* PropStack::pop(uint32_t node) {
      * add it to our record of known vectors so we can track our mallocs
      */
     double *vec;
+    int err = 0;
     if(prop_stack.empty()) {
-        posix_memalign((void **)&vec, 32, sizeof(double) * defaultsize);
-        if(vec == NULL) {
-            fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", 
-                    sizeof(double) * defaultsize, __FILE__, __LINE__);
+        err = posix_memalign((void **)&vec, 32, sizeof(double) * defaultsize);
+        if(vec == NULL || err != 0) {
+            fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", 
+                    sizeof(double) * defaultsize, err, __FILE__, __LINE__);
             exit(EXIT_FAILURE);
         }
     }
@@ -307,10 +307,10 @@ void _unweighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes,
 
 
 void progressbar(float progress) {
-	// from http://stackoverflow.com/a/14539953
+    // from http://stackoverflow.com/a/14539953
     //
     // could encapsulate into a classs for displaying time elapsed etc
-	int barWidth = 70;
+    int barWidth = 70;
     std::cout << "[";
     int pos = barWidth * progress;
     for (int i = 0; i < barWidth; ++i) {
@@ -322,6 +322,59 @@ void progressbar(float progress) {
     std::cout.flush();
 }
 
+void initialize_embedded(double*& prop, const su::task_parameters* task_p) {
+    int err = 0;
+    err = posix_memalign((void **)&prop, 32, sizeof(double) * task_p->n_samples * 2);
+    if(prop == NULL || err != 0) {
+        fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", 
+                sizeof(double) * task_p->n_samples * 2, err, __FILE__, __LINE__);
+        exit(EXIT_FAILURE);
+    }
+}
+
+void initialize_sample_counts(double*& counts, const su::task_parameters* task_p, biom &table) {
+    int err = 0;
+    err = posix_memalign((void **)&counts, 32, sizeof(double) * task_p->n_samples * 2);
+    if(counts == NULL || err != 0) {
+        fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", 
+                sizeof(double) * task_p->n_samples, err, __FILE__, __LINE__);
+        exit(EXIT_FAILURE);
+    }
+    for(unsigned int i = 0; i < table.n_samples; i++) {
+        counts[i] = table.sample_counts[i];
+        counts[i + table.n_samples] = table.sample_counts[i];
+    }
+}
+
+void initialize_stripes(std::vector<double*> &dm_stripes, 
+                        std::vector<double*> &dm_stripes_total, 
+                        Method unifrac_method, 
+                        const su::task_parameters* task_p) {
+    int err = 0;
+    for(unsigned int i = task_p->start; i < task_p->stop; i++){
+        err = posix_memalign((void **)&dm_stripes[i], 32, sizeof(double) * task_p->n_samples);
+        if(dm_stripes[i] == NULL || err != 0) {
+            fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", 
+                    sizeof(double) * task_p->n_samples, err, __FILE__, __LINE__);
+            exit(EXIT_FAILURE);
+        }
+        for(unsigned int j = 0; j < task_p->n_samples; j++)
+            dm_stripes[i][j] = 0.;
+
+        if(unifrac_method == unweighted || unifrac_method == weighted_normalized) {
+            err = posix_memalign((void **)&dm_stripes_total[i], 32, sizeof(double) * task_p->n_samples);
+            if(dm_stripes_total[i] == NULL || err != 0) {
+                fprintf(stderr, "Failed to allocate %zd bytes err %d; [%s]:%d\n", 
+                        sizeof(double) * task_p->n_samples, err, __FILE__, __LINE__);
+                exit(EXIT_FAILURE);
+            }
+            for(unsigned int j = 0; j < task_p->n_samples; j++)
+                dm_stripes_total[i][j] = 0.;
+        }
+    }
+}
+
+
 void su::unifrac(biom &table,
                  BPTree &tree, 
                  Method unifrac_method,
@@ -342,53 +395,36 @@ void su::unifrac(biom &table,
 
     switch(unifrac_method) {
         case unweighted:
-            func = &_unweighted_unifrac_task;
+            func = &su::_unweighted_unifrac_task;
             break;
         case weighted_normalized:
-            func = &_normalized_weighted_unifrac_task;
+            func = &su::_normalized_weighted_unifrac_task;
             break;
         case weighted_unnormalized:
-            func = &_unnormalized_weighted_unifrac_task;
+            func = &su::_unnormalized_weighted_unifrac_task;
             break;
         case generalized:
-            func = &_generalized_unifrac_task;
+            func = &su::_generalized_unifrac_task;
+            break;
+        default:
+            func = NULL;
             break;
     }
+
+    if(func == NULL) {
+        fprintf(stderr, "Unknown unifrac task\n");
+        exit(1);
+    }
+
     PropStack propstack(table.n_samples);
 
     uint32_t node;
     double *node_proportions;
     double *embedded_proportions; 
     double length;
-	posix_memalign((void **)&embedded_proportions, 32, sizeof(double) * table.n_samples * 2);
-    if(embedded_proportions == NULL) {
-        fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", 
-                sizeof(double) * table.n_samples, __FILE__, __LINE__);
-        exit(EXIT_FAILURE);
-    }
 
-    // thread local memory
-    for(unsigned int i = task_p->start; i < task_p->stop; i++){
-        posix_memalign((void **)&dm_stripes[i], 32, sizeof(double) * table.n_samples);
-        if(dm_stripes[i] == NULL) {
-            fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", 
-                    sizeof(double) * table.n_samples, __FILE__, __LINE__);
-            exit(EXIT_FAILURE);
-        }
-        for(unsigned int j = 0; j < table.n_samples; j++)
-            dm_stripes[i][j] = 0.;
-
-        if(unifrac_method == unweighted || unifrac_method == weighted_normalized) {
-            posix_memalign((void **)&dm_stripes_total[i], 32, sizeof(double) * table.n_samples);
-            if(dm_stripes_total[i] == NULL) {
-                fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", 
-                        sizeof(double) * table.n_samples, __FILE__, __LINE__);
-                exit(EXIT_FAILURE);
-            }
-            for(unsigned int j = 0; j < table.n_samples; j++)
-                dm_stripes_total[i][j] = 0.;
-        }
-    }
+    initialize_embedded(embedded_proportions, task_p);
+    initialize_stripes(std::ref(dm_stripes), std::ref(dm_stripes_total), unifrac_method, task_p);
 
     // - 1 to avoid root   
     for(unsigned int k = 0; k < (tree.nparens / 2) - 1; k++) {
@@ -447,7 +483,7 @@ void su::unifrac(biom &table,
         
         // should make this compile-time support
         //if((tid == 0) && ((k % 1000) == 0))
- 	    //    progressbar((float)k / (float)(tree.nparens / 2));       
+         //    progressbar((float)k / (float)(tree.nparens / 2));       
     }
     
     if(unifrac_method == weighted_normalized || unifrac_method == unweighted || unifrac_method == generalized) {
@@ -461,15 +497,105 @@ void su::unifrac(biom &table,
     free(embedded_proportions);
 }
 
+void su::unifrac_vaw(biom &table,
+                     BPTree &tree, 
+                     Method unifrac_method,
+                     std::vector<double*> &dm_stripes,
+                     std::vector<double*> &dm_stripes_total,
+                     const su::task_parameters* task_p) {
+
+    if(table.n_samples != task_p->n_samples) {
+        fprintf(stderr, "Task and table n_samples not equal\n");
+        exit(EXIT_FAILURE);
+    }
+
+    void (*func)(std::vector<double*>&,  // dm_stripes
+                 std::vector<double*>&,  // dm_stripes_total
+                 double*,                // embedded_proportions
+                 double*,                // embedded_counts
+                 double*,                // sample total counts
+                 double,                 // length
+                 const su::task_parameters*);
+
+    switch(unifrac_method) {
+        case unweighted:
+            func = &su::_vaw_unweighted_unifrac_task;
+            break;
+        case weighted_normalized:
+            func = &su::_vaw_normalized_weighted_unifrac_task;
+            break;
+        case weighted_unnormalized:
+            func = &su::_vaw_unnormalized_weighted_unifrac_task;
+            break;
+        case generalized:
+            func = &su::_vaw_generalized_unifrac_task;
+            break;
+        default:
+            func = NULL;
+            break;
+    }
+    if(func == NULL) {
+        fprintf(stderr, "Unknown unifrac task\n");
+        exit(1);
+    }
+    PropStack propstack(table.n_samples);
+    PropStack countstack(table.n_samples);
+
+    uint32_t node;
+    double *node_proportions;
+    double *node_counts;
+    double *embedded_proportions; 
+    double *embedded_counts; 
+    double *sample_total_counts;
+    double length;
+
+    initialize_embedded(embedded_proportions, task_p);
+    initialize_embedded(embedded_counts, task_p);
+    initialize_sample_counts(sample_total_counts, task_p, table);
+    initialize_stripes(std::ref(dm_stripes), std::ref(dm_stripes_total), unifrac_method, task_p);
+
+    // - 1 to avoid root   
+    for(unsigned int k = 0; k < (tree.nparens / 2) - 1; k++) {
+        node = tree.postorderselect(k);
+        length = tree.lengths[node];
+
+        node_proportions = propstack.pop(node);
+        node_counts = countstack.pop(node);
+
+        set_proportions(node_proportions, tree, node, table, propstack);
+        set_proportions(node_counts, tree, node, table, countstack, false);
+
+        embed_proportions(embedded_proportions, node_proportions, task_p->n_samples);
+        embed_proportions(embedded_counts, node_counts, task_p->n_samples);
+
+        func(dm_stripes, dm_stripes_total, embedded_proportions, embedded_counts, sample_total_counts, length, task_p);
+    }
+    
+    if(unifrac_method == weighted_normalized || unifrac_method == unweighted || unifrac_method == generalized) {
+        for(unsigned int i = task_p->start; i < task_p->stop; i++) {
+            for(unsigned int j = 0; j < task_p->n_samples; j++) {
+                dm_stripes[i][j] = dm_stripes[i][j] / dm_stripes_total[i][j];
+            }
+        }
+    }
+    
+    free(embedded_proportions);
+    free(embedded_counts);
+    free(sample_total_counts);
+}
 void su::set_proportions(double* props, 
                          BPTree &tree, 
                          uint32_t node, 
                          biom &table, 
-                         PropStack &ps) {
+                         PropStack &ps,
+                         bool normalize) {
     if(tree.isleaf(node)) {
        table.get_obs_data(tree.names[node], props);
-       for(unsigned int i = 0; i < table.n_samples; i++)
-           props[i] = props[i] / table.sample_counts[i];
+       for(unsigned int i = 0; i < table.n_samples; i++) {
+           props[i] = props[i];
+           if(normalize)
+               props[i] /= table.sample_counts[i];
+       }
 
     } else {
         unsigned int current = tree.leftchild(node);
@@ -495,12 +621,13 @@ std::vector<double*> su::make_strides(unsigned int n_samples) {
     uint32_t n_rotations = (n_samples + 1) / 2;
     std::vector<double*> dm_stripes(n_rotations);
 
+    int err = 0;
     for(unsigned int i = 0; i < n_rotations; i++) {
         double* tmp;
-        posix_memalign((void **)&tmp, 32, sizeof(double) * n_samples);
-        if(tmp == NULL) {
-            fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", 
-                    sizeof(double) * n_samples, __FILE__, __LINE__);
+        err = posix_memalign((void **)&tmp, 32, sizeof(double) * n_samples);
+        if(tmp == NULL || err != 0) {
+            fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", 
+                    sizeof(double) * n_samples, err,  __FILE__, __LINE__);
             exit(EXIT_FAILURE);
         }
         for(unsigned int j = 0; j < n_samples; j++)
diff --git a/sucpp/unifrac.hpp b/sucpp/unifrac.hpp
index baa1fa84..d5403cf3 100644
--- a/sucpp/unifrac.hpp
+++ b/sucpp/unifrac.hpp
@@ -2,18 +2,11 @@
 #include <vector>
 #include <unordered_map>
 #include <thread>
+#include "unifrac_task.hpp"
 
 namespace su {
     enum Method {unweighted, weighted_normalized, weighted_unnormalized, generalized};
     
-    struct task_parameters {
-       uint32_t n_samples;          // number of samples
-       unsigned int start;          // starting stripe
-       unsigned int stop;           // stopping stripe
-       unsigned int tid;            // thread ID
-       double g_unifrac_alpha;      // generalized unifrac alpha
-    };
-
     class PropStack {
         private:
             std::stack<double*> prop_stack;
@@ -34,11 +27,19 @@ namespace su {
                  std::vector<double*> &dm_stripes_total,
                  const task_parameters* task_p);
     
+    void unifrac_vaw(biom &table, 
+                     BPTree &tree, 
+                     Method unifrac_method,
+                     std::vector<double*> &dm_stripes,
+                     std::vector<double*> &dm_stripes_total,
+                     const task_parameters* task_p);
+    
     double** deconvolute_stripes(std::vector<double*> &stripes, uint32_t n);
     void set_proportions(double* props, 
                          BPTree &tree, uint32_t node, 
                          biom &table, 
-                         PropStack &ps);
+                         PropStack &ps,
+                         bool normalize = true);
     std::vector<double*> make_strides(unsigned int n_samples);
     inline void embed_proportions(double* out, double* in, uint32_t n) {
         double val;
diff --git a/sucpp/unifrac_task.cpp b/sucpp/unifrac_task.cpp
new file mode 100644
index 00000000..d84a60b4
--- /dev/null
+++ b/sucpp/unifrac_task.cpp
@@ -0,0 +1,292 @@
+#include "unifrac_task.hpp"
+
+
+void su::_unnormalized_weighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                             std::vector<double*> &__restrict__ dm_stripes_total,
+                                             double* __restrict__ embedded_proportions,
+                                             double length,
+                                             const su::task_parameters* task_p) {
+    double *dm_stripe;
+    for(unsigned int stripe=task_p->start; stripe < task_p->stop; stripe++) {
+        dm_stripe = dm_stripes[stripe];
+
+        /* intrinsics yield about a 2x reduction in runtime on llvm. they
+         * were not effective on linux gcc 4.9.1 or 4.9.2. it is unclear
+         * if they would be effective on other versions of gcc.
+         *
+         * one reason they help is that these for loops are not easily
+         * autovectorizable. using the intrinsics effectively gets around
+         * this. ...although, it also appears that loop unrolling works.
+         *
+         * it may make sense to revisit the inclusion of intriniscs, however
+         * support must be tested at compile time, so it's rather annoying
+         * at the moment. basically, we can't assume the presence of avx2.
+         */
+        for(unsigned int j = 0; j < task_p->n_samples / 4; j++) {
+            int k = j * 4;
+            double u1 = embedded_proportions[k];
+            double u2 = embedded_proportions[k + 1];
+            double u3 = embedded_proportions[k + 2];
+            double u4 = embedded_proportions[k + 3];
+
+            double v1 = embedded_proportions[k + stripe + 1];
+            double v2 = embedded_proportions[k + stripe + 2];
+            double v3 = embedded_proportions[k + stripe + 3];
+            double v4 = embedded_proportions[k + stripe + 4];
+            
+            dm_stripe[k] += fabs(u1 - v1) * length;
+            dm_stripe[k + 1] += fabs(u2 - v2) * length;
+            dm_stripe[k + 2] += fabs(u3 - v3) * length;
+            dm_stripe[k + 3] += fabs(u4 - v4) * length;
+        }
+
+        if((task_p->n_samples % 4) != 0) {
+            for(unsigned int k = task_p->n_samples - (task_p->n_samples % 4); k < task_p->n_samples; k++) {
+                double u = embedded_proportions[k];
+                double v = embedded_proportions[k + stripe + 1];
+ 
+                dm_stripe[k] += fabs(u - v) * length;
+            }
+        }
+    }
+}
+
+void su::_vaw_unnormalized_weighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                                 std::vector<double*> &__restrict__ dm_stripes_total,
+                                                 double* __restrict__ embedded_proportions,
+                                                 double* __restrict__ embedded_counts,
+                                                 double* __restrict__ sample_total_counts,
+                                                 double length,
+                                                 const su::task_parameters* task_p) {
+    double *dm_stripe;
+    for(unsigned int stripe=task_p->start; stripe < task_p->stop; stripe++) {
+        dm_stripe = dm_stripes[stripe];
+
+        for(unsigned int j = 0; j < task_p->n_samples; j++) {
+            double u = embedded_proportions[j];
+            double v = embedded_proportions[j + stripe + 1];
+
+            double m = sample_total_counts[j] + sample_total_counts[j + stripe + 1];
+            double mi = embedded_counts[j] + embedded_counts[j + stripe + 1];
+            double vaw = sqrt(mi * (m - mi));
+
+            if(vaw > 0)
+                dm_stripe[j] += (fabs(u - v) * length) / vaw;
+        }
+    }
+}
+void su::_normalized_weighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                           std::vector<double*> &__restrict__ dm_stripes_total,
+                                           double* __restrict__ embedded_proportions, 
+                                           double length, 
+                                           const su::task_parameters* task_p) {
+    double *dm_stripe;
+    double *dm_stripe_total;
+
+    // point of thread
+    for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) {
+        dm_stripe = dm_stripes[stripe];
+        dm_stripe_total = dm_stripes_total[stripe];
+
+        for(unsigned int j = 0; j < task_p->n_samples / 4; j++) {
+            int k = j * 4;
+            double u1 = embedded_proportions[k];
+            double u2 = embedded_proportions[k + 1];
+            double u3 = embedded_proportions[k + 2];
+            double u4 = embedded_proportions[k + 3];
+         
+            double v1 = embedded_proportions[k + stripe + 1];
+            double v2 = embedded_proportions[k + stripe + 2];
+            double v3 = embedded_proportions[k + stripe + 3];
+            double v4 = embedded_proportions[k + stripe + 4];
+               
+            dm_stripe[k] += fabs(u1 - v1) * length;
+            dm_stripe[k + 1] += fabs(u2 - v2) * length;
+            dm_stripe[k + 2] += fabs(u3 - v3) * length;
+            dm_stripe[k + 3] += fabs(u4 - v4) * length;
+
+            dm_stripe_total[k] += (u1 + v1) * length;
+            dm_stripe_total[k + 1] += (u2 + v2) * length;
+            dm_stripe_total[k + 2] += (u3 + v3) * length;
+            dm_stripe_total[k + 3] += (u4 + v4) * length;
+        }
+
+        if((task_p->n_samples % 4) != 0) {
+            for(unsigned int k = task_p->n_samples - (task_p->n_samples % 4); k < task_p->n_samples; k++) {
+                double u = embedded_proportions[k];
+                double v = embedded_proportions[k + stripe + 1];
+                   
+                dm_stripe[k] += fabs(u - v) * length;
+                dm_stripe_total[k] += (u + v) * length;
+            }
+        }
+    }
+}
+
+void su::_vaw_normalized_weighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                               std::vector<double*> &__restrict__ dm_stripes_total,
+                                               double* __restrict__ embedded_proportions, 
+                                               double* __restrict__ embedded_counts, 
+                                               double* __restrict__ sample_total_counts,
+                                               double length, 
+                                               const su::task_parameters* task_p) {
+    double *dm_stripe;
+    double *dm_stripe_total;
+
+    // point of thread
+    for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) {
+        dm_stripe = dm_stripes[stripe];
+        dm_stripe_total = dm_stripes_total[stripe];
+
+        for(unsigned int j = 0; j < task_p->n_samples; j++) {
+            double u = embedded_proportions[j];
+            double v = embedded_proportions[j + stripe + 1];
+            
+            double m = sample_total_counts[j] + sample_total_counts[j + stripe + 1];
+            double mi = embedded_counts[j] + embedded_counts[j + stripe + 1];
+            double vaw = sqrt(mi * (m - mi));
+
+            if(vaw > 0) {   
+                dm_stripe[j] += (fabs(u - v) * length) / vaw;
+                dm_stripe_total[j] += ((u + v) * length) / vaw;
+            }
+        }
+    }
+}
+void su::_generalized_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                   std::vector<double*> &__restrict__ dm_stripes_total,
+                                   double* __restrict__ embedded_proportions, 
+                                   double length, 
+                                   const su::task_parameters* task_p) {
+    double *dm_stripe;
+    double *dm_stripe_total;
+
+    // point of thread
+    for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) {
+        dm_stripe = dm_stripes[stripe];
+        dm_stripe_total = dm_stripes_total[stripe];
+
+        for(unsigned int j = 0; j < task_p->n_samples; j++) {
+            double u1 = embedded_proportions[j];
+            double v1 = embedded_proportions[j + stripe + 1];
+            double sum1 = u1 + v1;
+            if(sum1 != 0.0) {
+                double sub1 = fabs(u1 - v1);
+                double sum_pow1 = pow(sum1, task_p->g_unifrac_alpha) * length;
+                dm_stripe[j] += sum_pow1 * (sub1 / sum1);
+                dm_stripe_total[j] += sum_pow1;
+            }
+        }
+    }
+}
+
+void su::_vaw_generalized_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                       std::vector<double*> &__restrict__ dm_stripes_total,
+                                       double* __restrict__ embedded_proportions, 
+                                       double* __restrict__ embedded_counts, 
+                                       double* __restrict__ sample_total_counts,
+                                       double length, 
+                                       const su::task_parameters* task_p) {
+    double *dm_stripe;
+    double *dm_stripe_total;
+
+    // point of thread
+    for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) {
+        dm_stripe = dm_stripes[stripe];
+        dm_stripe_total = dm_stripes_total[stripe];
+
+        for(unsigned int j = 0; j < task_p->n_samples; j++) {
+            double m = sample_total_counts[j] + sample_total_counts[j + stripe + 1];
+            double mi = embedded_counts[j] + embedded_counts[j + stripe + 1];
+            double vaw = sqrt(mi * (m - mi));
+            
+            double u1 = embedded_proportions[j];
+            double v1 = embedded_proportions[j + stripe + 1];
+            
+            if(vaw > 0.0) {
+                double sum1 = (u1 + v1) / vaw;
+                double sub1 = fabs(u1 - v1) / vaw;
+                double sum_pow1 = pow(sum1, task_p->g_unifrac_alpha) * length;
+                dm_stripe[j] += sum_pow1 * (sub1 / sum1);
+                dm_stripe_total[j] += sum_pow1;
+            }
+        }
+    }
+}
+void su::_unweighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                  std::vector<double*> &__restrict__ dm_stripes_total,
+                                  double* __restrict__ embedded_proportions, 
+                                  double length,  
+                                  const su::task_parameters* task_p) {
+    double *dm_stripe;
+    double *dm_stripe_total;
+    
+    for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) {
+        dm_stripe = dm_stripes[stripe];
+        dm_stripe_total = dm_stripes_total[stripe];
+
+        for(unsigned int j = 0; j < task_p->n_samples / 4; j++) {
+            int k = j * 4;
+            int32_t u1 = embedded_proportions[k] > 0;
+            int32_t u2 = embedded_proportions[k + 1] > 0;
+            int32_t u3 = embedded_proportions[k + 2] > 0;
+            int32_t u4 = embedded_proportions[k + 3] > 0;
+ 
+            int32_t v1 = embedded_proportions[k + stripe + 1] > 0;
+            int32_t v2 = embedded_proportions[k + stripe + 2] > 0;
+            int32_t v3 = embedded_proportions[k + stripe + 3] > 0;
+            int32_t v4 = embedded_proportions[k + stripe + 4] > 0;
+
+            dm_stripe[k] += (u1 ^ v1) * length;
+            dm_stripe[k + 1] += (u2 ^ v2) * length;
+            dm_stripe[k + 2] += (u3 ^ v3) * length;
+            dm_stripe[k + 3] += (u4 ^ v4) * length;
+ 
+            dm_stripe_total[k] += (u1 | v1) * length;
+            dm_stripe_total[k + 1] += (u2 | v2) * length;
+            dm_stripe_total[k + 2] += (u3 | v3) * length;
+            dm_stripe_total[k + 3] += (u4 | v4) * length;
+        }
+        
+        if((task_p->n_samples % 4) != 0) {
+            for(unsigned int k = task_p->n_samples - (task_p->n_samples % 4); k < task_p->n_samples; k++) {
+                int32_t u = embedded_proportions[k] > 0;
+                int32_t v = embedded_proportions[k + stripe + 1] > 0;
+
+                dm_stripe[k] += (u ^ v) * length;
+                dm_stripe_total[k] += (u | v) * length;
+            }
+        }
+    }
+}
+
+void su::_vaw_unweighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                      std::vector<double*> &__restrict__ dm_stripes_total,
+                                      double* __restrict__ embedded_proportions, 
+                                      double* __restrict__ embedded_counts, 
+                                      double* __restrict__ sample_total_counts,
+                                      double length,  
+                                      const su::task_parameters* task_p) {
+    double *dm_stripe;
+    double *dm_stripe_total;
+    
+    for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) {
+        dm_stripe = dm_stripes[stripe];
+        dm_stripe_total = dm_stripes_total[stripe];
+
+        for(unsigned int j = 0; j < task_p->n_samples; j++) {
+            int32_t u = embedded_proportions[j] > 0;
+            int32_t v = embedded_proportions[j + stripe + 1] > 0;
+
+            double m = sample_total_counts[j] + sample_total_counts[j + stripe + 1];
+            double mi = embedded_counts[j] + embedded_counts[j + stripe + 1];
+            double vaw = sqrt(mi * (m - mi));
+            
+            if(vaw > 0) {
+                dm_stripe[j] += ((u ^ v) * length) / vaw;
+                dm_stripe_total[j] += ((u | v) * length) / vaw;
+            }
+        }
+    }
+}
+
diff --git a/sucpp/unifrac_task.hpp b/sucpp/unifrac_task.hpp
new file mode 100644
index 00000000..4a2dec6c
--- /dev/null
+++ b/sucpp/unifrac_task.hpp
@@ -0,0 +1,108 @@
+#include <math.h>
+#include <vector>
+#include <stdint.h>
+
+namespace su {
+    /* task specific compute parameters
+     *
+     * n_samples <int> the number of samples being processed
+     * start <uint> the first stride to process
+     * stop <uint> the last stride to process
+     * tid <uint> the thread identifier
+     * g_unifrac_alpha <double> an alpha value for generalized unifrac
+     */
+    struct task_parameters {
+       uint32_t n_samples;          // number of samples
+       unsigned int start;          // starting stripe
+       unsigned int stop;           // stopping stripe
+       unsigned int tid;            // thread ID
+
+       // task specific arguments below
+       double g_unifrac_alpha;      // generalized unifrac alpha
+    };
+
+    /* void su::unifrac tasks
+     *
+     * all methods utilize the same function signature. that signature is as follows:
+     *
+     * dm_stripes vector<double> the stripes of the distance matrix being accumulated 
+     *      into for unique branch length
+     * dm_stripes vector<double> the stripes of the distance matrix being accumulated 
+     *      into for total branch length (e.g., to normalize unweighted unifrac)
+     * embedded_proportions <double*> the proportions vector for a sample, or rather
+     *      the counts vector normalized to 1. this vector is embedded as it is 
+     *      duplicated: if A, B and C are proportions for features A, B, and C, the
+     *      vector will look like [A B C A B C].
+     * length <double> the branch length of the current node to its parent.
+     * task_p <task_parameters*> task specific parameters.
+     */
+    void _unnormalized_weighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                             std::vector<double*> &__restrict__ dm_stripes_total,
+                                             double* __restrict__ embedded_proportions,
+                                             double length,
+                                             const su::task_parameters* task_p);
+    void _normalized_weighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                           std::vector<double*> &__restrict__ dm_stripes_total,
+                                           double* __restrict__ embedded_proportions,
+                                           double length,
+                                           const su::task_parameters* task_p);
+    void _unweighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                  std::vector<double*> &__restrict__ dm_stripes_total,
+                                  double* __restrict__ embedded_proportions,
+                                  double length,
+                                  const su::task_parameters* task_p);
+    void _generalized_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                   std::vector<double*> &__restrict__ dm_stripes_total,
+                                   double* __restrict__ embedded_proportions,
+                                   double length,
+                                   const su::task_parameters* task_p);
+    
+    /* void su::unifrac_vaw tasks
+     *
+     * all methods utilize the same function signature. that signature is as follows:
+     *
+     * dm_stripes vector<double> the stripes of the distance matrix being accumulated 
+     *      into for unique branch length
+     * dm_stripes vector<double> the stripes of the distance matrix being accumulated 
+     *      into for total branch length (e.g., to normalize unweighted unifrac)
+     * embedded_proportions <double*> the proportions vector for a sample, or rather
+     *      the counts vector normalized to 1. this vector is embedded as it is 
+     *      duplicated: if A, B and C are proportions for features A, B, and C, the
+     *      vector will look like [A B C A B C].
+     * embedded_counts <double*> the counts vector embedded in the same way and order as
+     *      embedded_proportions. the values of this array are unnormalized feature 
+     *      counts for the subtree.
+     * sample_total_counts <double*> the total unnormalized feature counts for all samples
+     *      embedded in the same way and order as embedded_proportions.
+     * length <double> the branch length of the current node to its parent.
+     * task_p <task_parameters*> task specific parameters.
+     */
+    void _vaw_unnormalized_weighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                                 std::vector<double*> &__restrict__ dm_stripes_total,
+                                                 double* __restrict__ embedded_proportions,
+                                                 double* __restrict__ embedded_counts,
+                                                 double* __restrict__ sample_total_counts,
+                                                 double length,
+                                                 const su::task_parameters* task_p);
+    void _vaw_normalized_weighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                               std::vector<double*> &__restrict__ dm_stripes_total,
+                                               double* __restrict__ embedded_proportions,
+                                               double* __restrict__ embedded_counts,
+                                               double* __restrict__ sample_total_counts,
+                                               double length,
+                                               const su::task_parameters* task_p);
+    void _vaw_unweighted_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                      std::vector<double*> &__restrict__ dm_stripes_total,
+                                      double* __restrict__ embedded_proportions,
+                                      double* __restrict__ embedded_counts,
+                                      double* __restrict__ sample_total_counts,
+                                      double length,
+                                      const su::task_parameters* task_p);
+    void _vaw_generalized_unifrac_task(std::vector<double*> &__restrict__ dm_stripes, 
+                                       std::vector<double*> &__restrict__ dm_stripes_total,
+                                       double* __restrict__ embedded_proportions,
+                                       double* __restrict__ embedded_counts,
+                                       double* __restrict__ sample_total_counts,
+                                       double length,
+                                       const su::task_parameters* task_p);
+}