Skip to content

Commit 20389ca

Browse files
committed
Integrate mutations (on a branch) to a likelihood vectors
1 parent fbf6c4b commit 20389ca

File tree

2 files changed

+295
-0
lines changed

2 files changed

+295
-0
lines changed

alignment/seqregions.h

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,22 @@ class SeqRegions : public std::vector<SeqRegion> {
119119
cmaple::PositionType seq_length,
120120
cmaple::StateType num_states,
121121
const cmaple::Params& params) const;
122+
123+
/**
124+
Integrate mutations at a branch to a likelihood vector
125+
This regions is the input likelihood vector
126+
@param output_regions the output regions
127+
@param mutations the vector of mutations
128+
@param aln the alignment
129+
@param inverse True to inverse the mutations
130+
@throw std::logic\_error if unexpected values/behaviors found during the
131+
operations
132+
*/
133+
template <const cmaple::StateType num_states>
134+
void integrateMutations(std::unique_ptr<SeqRegions>& output_regions,
135+
const SeqRegions& mutations,
136+
const Alignment* aln,
137+
const bool inverse = false) const;
122138

123139
/**
124140
Merge two likelihood vectors, one from above and one from below
@@ -620,6 +636,141 @@ auto updateMultLHwithMat(const RealNumType* mat_row,
620636
return sum_lh;
621637
}
622638

639+
template <const StateType num_states>
640+
void SeqRegions::integrateMutations(std::unique_ptr<SeqRegions>& output_regions,
641+
const SeqRegions& mutations,
642+
const Alignment* aln,
643+
const bool inverse) const {
644+
assert(mutations.size() > 0);
645+
assert(aln);
646+
647+
// init variables
648+
PositionType pos = 0;
649+
const SeqRegions& seq_regions = *this;
650+
const SeqRegions& mutation_regions = mutations;
651+
size_t iseq = 0;
652+
size_t imut = 0;
653+
const PositionType seq_length = static_cast<PositionType>(aln->ref_seq.size());
654+
655+
// init merged_regions
656+
if (output_regions) {
657+
output_regions->clear();
658+
} else {
659+
output_regions = cmaple::make_unique<SeqRegions>();
660+
}
661+
662+
// avoid realloc of vector data (minimize memory footprint)
663+
output_regions->reserve(countSharedSegments(
664+
mutation_regions, static_cast<size_t>(seq_length)));
665+
666+
#ifdef DEBUG
667+
// remember capacity (may be more than we 'reserved')
668+
const size_t max_elements = output_regions->capacity();
669+
#endif
670+
671+
while (pos < seq_length) {
672+
PositionType end_pos;
673+
674+
// get the next shared segment in the two sequences
675+
cmaple::SeqRegions::getNextSharedSegment(pos, seq_regions, mutation_regions,
676+
iseq, imut, end_pos);
677+
const auto* const seq_region = &seq_regions[iseq];
678+
const auto* const mutation = &mutation_regions[imut];
679+
680+
// extract the new state of the mutation
681+
// according to the direction
682+
cmaple::StateType mut_new_state = inverse ?
683+
mutation->prev_state : mutation->type;
684+
685+
// first add seq_region to the output lh vector
686+
output_regions->push_back(SeqRegion::clone(*seq_region));
687+
// extract the newly added seq_region
688+
SeqRegion& last_region = output_regions->back();
689+
690+
// case 1: seq_region = N
691+
if (seq_region->type == TYPE_N)
692+
{
693+
// force moving to the end of this region
694+
while (end_pos < seq_region->position)
695+
{
696+
// update pos
697+
pos = end_pos + 1;
698+
699+
// move to the next share segment
700+
cmaple::SeqRegions::getNextSharedSegment(
701+
pos, seq_regions, mutation_regions,
702+
iseq, imut, end_pos);
703+
}
704+
assert(end_pos == seq_region->position);
705+
}
706+
// case 2: seq_region = A/C/G/T
707+
else if (seq_region->type < num_states)
708+
{
709+
// if a mutation occurs at this position,
710+
// modify the type (i.e., current state) and the previous state
711+
if (mutation->type < num_states)
712+
{
713+
// if the current state is identical to the new state of the mutation
714+
// change it to R
715+
if (last_region.type == mut_new_state)
716+
{
717+
last_region.type = TYPE_R;
718+
last_region.prev_state = TYPE_N;
719+
}
720+
// otherwise, keep the current state but update the previous state
721+
else
722+
{
723+
last_region.prev_state = mut_new_state;
724+
}
725+
}
726+
}
727+
// case 3: seq_region = R
728+
else if (seq_region->type == TYPE_R)
729+
{
730+
// if a mutation occurs at this position,
731+
// modify the type (i.e., current state) and the previous state
732+
if (mutation->type < num_states)
733+
{
734+
// set the previous state of the newly-added region
735+
// as the new state of the mutation
736+
last_region.prev_state = mut_new_state;
737+
738+
// extract the previous state of the mutation
739+
// according to the direction
740+
cmaple::StateType mut_prev_state = inverse ?
741+
mutation->type : mutation->prev_state;
742+
743+
// the new state of the newly-added region
744+
// is R which is also the previous state of the mutation
745+
last_region.type = mut_prev_state;
746+
}
747+
// otherwise, update the last position of the newly-added R region
748+
{
749+
last_region.position = end_pos;
750+
}
751+
}
752+
// case 4: seq_region = O
753+
else
754+
{
755+
// if a mutation occurs at this position,
756+
// modify the previous state
757+
if (mutation->type < num_states)
758+
{
759+
last_region.prev_state = mut_new_state;
760+
}
761+
}
762+
763+
// update pos
764+
pos = end_pos + 1;
765+
}
766+
767+
#ifdef DEBUG
768+
// ensure we did the correct reserve, otherwise it was
769+
// a pessimization
770+
assert(output_regions->capacity() == max_elements);
771+
#endif
772+
}
773+
623774
template <const StateType num_states>
624775
void merge_N_O(const RealNumType lower_plength,
625776
const SeqRegion& reg_o,

unittest/seqregions_test.cpp

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2618,6 +2618,150 @@ void genTestData(std::unique_ptr<SeqRegions>& seqregions1,
26182618
*seqregions_3, 13e-8, tree.aln, tree.model, tree.cumulative_rate, threshold_prob);
26192619
}
26202620

2621+
/*
2622+
Test integrateMutations(std::unique_ptr<SeqRegions>& output_regions,
2623+
const SeqRegions& mutations,
2624+
const Alignment* aln,
2625+
const bool inverse) const
2626+
*/
2627+
TEST(SeqRegions, integrateMutations)
2628+
{
2629+
Alignment aln = loadAln5K();
2630+
Model model(cmaple::ModelBase::GTR);
2631+
Tree tree(&aln, &model);
2632+
std::unique_ptr<Params>& params = tree.params;
2633+
const PositionType seq_length = aln.ref_seq.size();
2634+
const StateType num_states = aln.num_states;
2635+
2636+
std::unique_ptr<SeqRegions> seqregions1 = nullptr;
2637+
std::unique_ptr<SeqRegions> mutations = nullptr;
2638+
std::unique_ptr<SeqRegions> output_regions_ptr = nullptr;
2639+
2640+
// dummy variables
2641+
const RealNumType threshold_prob = params->threshold_prob;
2642+
2643+
// ----- Test 1 -----
2644+
genTestData(seqregions1, mutations, tree, threshold_prob, 1);
2645+
2646+
seqregions1->integrateMutations<4>(output_regions_ptr,
2647+
*mutations, tree.aln, false);
2648+
2649+
std::string output = "";
2650+
for (auto i = 0; i < output_regions_ptr->size(); ++i)
2651+
{
2652+
output += "State: " + convertIntToString(output_regions_ptr->at(i).type) + " ; Prev_state: " + convertIntToString(output_regions_ptr->at(i).prev_state) + " ; Position: " + convertIntToString(output_regions_ptr->at(i).position) + "; ";
2653+
}
2654+
2655+
std::string expected_output = "State: 250 ; Prev_state: 252 ; Position: 239; State: 250 ; Prev_state: 252 ; Position: 240; State: 250 ; Prev_state: 252 ; Position: 377; State: 250 ; Prev_state: 252 ; Position: 378; State: 250 ; Prev_state: 252 ; Position: 1705; State: 251 ; Prev_state: 252 ; Position: 1706; State: 250 ; Prev_state: 252 ; Position: 3035; State: 250 ; Prev_state: 252 ; Position: 3036; State: 250 ; Prev_state: 252 ; Position: 14406; State: 250 ; Prev_state: 252 ; Position: 14407; State: 250 ; Prev_state: 252 ; Position: 23401; State: 250 ; Prev_state: 252 ; Position: 23402; State: 250 ; Prev_state: 252 ; Position: 26445; State: 250 ; Prev_state: 252 ; Position: 26446; State: 250 ; Prev_state: 252 ; Position: 28142; State: 250 ; Prev_state: 252 ; Position: 28143; State: 250 ; Prev_state: 252 ; Position: 28248; State: 250 ; Prev_state: 252 ; Position: 28249; State: 250 ; Prev_state: 252 ; Position: 28252; State: 250 ; Prev_state: 252 ; Position: 28253; State: 250 ; Prev_state: 252 ; Position: 29890; ";
2656+
2657+
EXPECT_EQ(output, expected_output);
2658+
// ----- Test 1 -----
2659+
2660+
// ----- Test 2 -----
2661+
genTestData(seqregions1, mutations, tree, threshold_prob, 2);
2662+
2663+
seqregions1->integrateMutations<4>(output_regions_ptr,
2664+
*mutations, tree.aln, false);
2665+
2666+
output = "";
2667+
for (auto i = 0; i < output_regions_ptr->size(); ++i)
2668+
{
2669+
output += "State: " + convertIntToString(output_regions_ptr->at(i).type) + " ; Prev_state: " + convertIntToString(output_regions_ptr->at(i).prev_state) + " ; Position: " + convertIntToString(output_regions_ptr->at(i).position) + "; ";
2670+
}
2671+
2672+
expected_output = "State: 250 ; Prev_state: 252 ; Position: 14; State: 250 ; Prev_state: 252 ; Position: 239; State: 250 ; Prev_state: 252 ; Position: 240; State: 250 ; Prev_state: 252 ; Position: 3035; State: 250 ; Prev_state: 252 ; Position: 3036; State: 250 ; Prev_state: 252 ; Position: 8780; State: 251 ; Prev_state: 3 ; Position: 8781; State: 250 ; Prev_state: 252 ; Position: 14406; State: 250 ; Prev_state: 252 ; Position: 14407; State: 250 ; Prev_state: 252 ; Position: 17745; State: 250 ; Prev_state: 252 ; Position: 17746; State: 250 ; Prev_state: 252 ; Position: 17856; State: 250 ; Prev_state: 252 ; Position: 17857; State: 250 ; Prev_state: 252 ; Position: 18058; State: 250 ; Prev_state: 252 ; Position: 18059; State: 250 ; Prev_state: 252 ; Position: 23401; State: 250 ; Prev_state: 252 ; Position: 23402; State: 250 ; Prev_state: 252 ; Position: 28142; State: 250 ; Prev_state: 252 ; Position: 28143; State: 250 ; Prev_state: 252 ; Position: 29887; State: 251 ; Prev_state: 252 ; Position: 29888; State: 250 ; Prev_state: 252 ; Position: 29890; ";
2673+
2674+
EXPECT_EQ(output, expected_output);
2675+
// ----- Test 2 -----
2676+
2677+
// ----- Test 3 -----
2678+
genTestData(seqregions1, mutations, tree, threshold_prob, 3);
2679+
seqregions1->at(0).type = TYPE_N;
2680+
seqregions1->at(1).type = 3;
2681+
seqregions1->at(4).type = TYPE_N;
2682+
2683+
seqregions1->integrateMutations<4>(output_regions_ptr,
2684+
*mutations, tree.aln, false);
2685+
2686+
output = "";
2687+
for (auto i = 0; i < output_regions_ptr->size(); ++i)
2688+
{
2689+
output += "State: " + convertIntToString(output_regions_ptr->at(i).type) + " ; Prev_state: " + convertIntToString(output_regions_ptr->at(i).prev_state) + " ; Position: " + convertIntToString(output_regions_ptr->at(i).position) + "; ";
2690+
}
2691+
2692+
expected_output = "State: 252 ; Prev_state: 252 ; Position: 239; State: 3 ; Prev_state: 1 ; Position: 240; State: 250 ; Prev_state: 252 ; Position: 3035; State: 250 ; Prev_state: 252 ; Position: 3036; State: 252 ; Prev_state: 252 ; Position: 14406; State: 250 ; Prev_state: 252 ; Position: 14407; State: 250 ; Prev_state: 252 ; Position: 18623; State: 251 ; Prev_state: 252 ; Position: 18624; State: 250 ; Prev_state: 252 ; Position: 23401; State: 250 ; Prev_state: 252 ; Position: 23402; State: 250 ; Prev_state: 252 ; Position: 24032; State: 251 ; Prev_state: 252 ; Position: 24033; State: 250 ; Prev_state: 252 ; Position: 28075; State: 251 ; Prev_state: 252 ; Position: 28076; State: 250 ; Prev_state: 252 ; Position: 28142; State: 250 ; Prev_state: 252 ; Position: 28143; State: 250 ; Prev_state: 252 ; Position: 29843; State: 250 ; Prev_state: 252 ; Position: 29890; ";
2693+
2694+
EXPECT_EQ(output, expected_output);
2695+
// ----- Test 3 -----
2696+
2697+
// ----- Test 4 -----
2698+
genTestData(seqregions1, mutations, tree, threshold_prob, 4);
2699+
seqregions1->at(1).type = TYPE_N;
2700+
seqregions1->at(2).type = 3;
2701+
seqregions1->at(2).prev_state = 2;
2702+
seqregions1->at(4).type = TYPE_O;
2703+
seqregions1->at(8).type = 2;
2704+
seqregions1->at(9).type = TYPE_N;
2705+
2706+
seqregions1->integrateMutations<4>(output_regions_ptr,
2707+
*mutations, tree.aln, true);
2708+
2709+
output = "";
2710+
for (auto i = 0; i < output_regions_ptr->size(); ++i)
2711+
{
2712+
output += "State: " + convertIntToString(output_regions_ptr->at(i).type) + " ; Prev_state: " + convertIntToString(output_regions_ptr->at(i).prev_state) + " ; Position: " + convertIntToString(output_regions_ptr->at(i).position) + "; ";
2713+
}
2714+
2715+
expected_output = "State: 250 ; Prev_state: 252 ; Position: 0; State: 252 ; Prev_state: 252 ; Position: 239; State: 3 ; Prev_state: 252 ; Position: 240; State: 250 ; Prev_state: 252 ; Position: 3035; State: 251 ; Prev_state: 252 ; Position: 3036; State: 250 ; Prev_state: 252 ; Position: 8780; State: 251 ; Prev_state: 252 ; Position: 8781; State: 250 ; Prev_state: 252 ; Position: 14406; State: 2 ; Prev_state: 252 ; Position: 14407; State: 252 ; Prev_state: 252 ; Position: 23401; State: 0 ; Prev_state: 252 ; Position: 23402; State: 250 ; Prev_state: 252 ; Position: 28142; State: 1 ; Prev_state: 252 ; Position: 28143; State: 250 ; Prev_state: 252 ; Position: 29882; State: 250 ; Prev_state: 252 ; Position: 29890; ";
2716+
2717+
EXPECT_EQ(output, expected_output);
2718+
2719+
// ----- Test 4 -----
2720+
2721+
// ----- Test 5 -----
2722+
genTestData(seqregions1, mutations, tree, threshold_prob, 5);
2723+
seqregions1->at(9).type = TYPE_N;
2724+
seqregions1->at(2).type = TYPE_O;
2725+
seqregions1->at(2).prev_state = 2;
2726+
mutations->at(5).type = 3;
2727+
mutations->at(5).type = 2;
2728+
2729+
/*std::cout << "seqregions1: " << std::endl;
2730+
for (auto i = 0; i < seqregions1->size(); ++i)
2731+
{
2732+
std::cout << "State: " << seqregions1->at(i).type <<" ; Prev_state: " << seqregions1->at(i).prev_state << " ; Position: " << seqregions1->at(i).position << std::endl;
2733+
}
2734+
2735+
std::cout << "mutations: " << std::endl;
2736+
for (auto i = 0; i < mutations->size(); ++i)
2737+
{
2738+
std::cout << "State: " << mutations->at(i).type <<" ; Prev_state: " << mutations->at(i).prev_state << " ; Position: " << mutations->at(i).position << std::endl;
2739+
}*/
2740+
2741+
seqregions1->integrateMutations<4>(output_regions_ptr,
2742+
*mutations, tree.aln, true);
2743+
2744+
/*std::cout << "output_regions_ptr: " << std::endl;
2745+
for (auto i = 0; i < output_regions_ptr->size(); ++i)
2746+
{
2747+
std::cout << "State: " << output_regions_ptr->at(i).type <<" ; Prev_state: " << output_regions_ptr->at(i).prev_state << " ; Position: " << output_regions_ptr->at(i).position << std::endl;
2748+
}*/
2749+
2750+
output = "";
2751+
for (auto i = 0; i < output_regions_ptr->size(); ++i)
2752+
{
2753+
output += "State: " + convertIntToString(output_regions_ptr->at(i).type) + " ; Prev_state: " + convertIntToString(output_regions_ptr->at(i).prev_state) + " ; Position: " + convertIntToString(output_regions_ptr->at(i).position) + "; ";
2754+
}
2755+
2756+
// std::cout << std::endl << std::endl << output << std::endl;
2757+
2758+
expected_output = "State: 250 ; Prev_state: 252 ; Position: 1; State: 250 ; Prev_state: 252 ; Position: 16; State: 250 ; Prev_state: 252 ; Position: 239; State: 251 ; Prev_state: 252 ; Position: 240; State: 250 ; Prev_state: 252 ; Position: 3035; State: 1 ; Prev_state: 252 ; Position: 3036; State: 250 ; Prev_state: 252 ; Position: 8780; State: 251 ; Prev_state: 252 ; Position: 8781; State: 250 ; Prev_state: 252 ; Position: 14406; State: 1 ; Prev_state: 252 ; Position: 14407; State: 252 ; Prev_state: 252 ; Position: 23401; State: 0 ; Prev_state: 252 ; Position: 23402; State: 250 ; Prev_state: 252 ; Position: 28142; State: 1 ; Prev_state: 252 ; Position: 28143; State: 250 ; Prev_state: 252 ; Position: 29869; State: 250 ; Prev_state: 252 ; Position: 29875; State: 250 ; Prev_state: 252 ; Position: 29890; ";
2759+
2760+
EXPECT_EQ(output, expected_output);
2761+
2762+
// ----- Test 5 -----
2763+
}
2764+
26212765
/*
26222766
Test mergeUpperLower<4>(SeqRegions* &merged_regions, RealNumType upper_plength,
26232767
const SeqRegions& lower_regions, RealNumType lower_plength, const

0 commit comments

Comments
 (0)