From 25c78b1c5b4113f299a2864b5fa062372e8ca985 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Fri, 10 Oct 2025 17:55:22 +0200 Subject: [PATCH 1/2] `<regex>`: Process non-greedy and longest-mode simple loops non-recursively --- stl/inc/regex | 100 ++++++++++++------ .../std/tests/VSO_0000000_regex_use/test.cpp | 15 +++ 2 files changed, 80 insertions(+), 35 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 22f1947ff7b..560a35d14f9 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1680,6 +1680,7 @@ enum class _Rx_unwind_ops { _Disjunction_eval_alt_on_failure, _Disjunction_eval_alt_always, _Do_nothing, + _Loop_simple_nongreedy, }; template @@ -1814,7 +1815,7 @@ private: void _Decrease_stack_usage_count(); void _Increase_complexity_count(); - bool _Do_rep0(_Node_rep*, bool); + bool _Do_rep0(_Node_rep*); bool _Do_rep(_Node_rep*, bool, int); void _Prepare_rep(_Node_rep*); bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*); @@ -3413,11 +3414,11 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Increase_complexity_coun } template -bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node, bool _Greedy) { +bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node) { // apply repetition to loop with no nested if/do int _Ix = _Node->_Min; const size_t _Frame_idx = _Loop_vals[_Node->_Loop_number]._Loop_frame_idx; - _Loop_vals[_Node->_Loop_number]._Loop_idx = _Ix + 1; + _Loop_vals[_Node->_Loop_number]._Loop_idx = _Ix + 2; _Tgt_state_t<_It> _Final; bool _Matched0 = false; @@ -3425,10 +3426,6 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node bool _Done = false; if (_Match_pat(_Node->_End_rep->_Next)) { - if (!_Greedy) { - return true; // go with current match - } - // record an acceptable match and continue _Final = _Tgt_state; _Matched0 = true; @@ -3449,10 +3446,6 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node } 
else { _Saved_pos = _Tgt_state._Cur; if (_Match_pat(_Node->_End_rep->_Next)) { - if (!_Greedy) { - return true; // go with current match - } - // record match and continue _Final = _Tgt_state; _Matched0 = true; @@ -3472,10 +3465,6 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node // since loop is branchless, empty rep match is not possible at this point _Saved_pos = _Tgt_state._Cur; if (_Match_pat(_Node->_End_rep->_Next)) { - if (!_Greedy) { - return true; // go with current match - } - // record match and continue _Final = _Tgt_state; _Matched0 = true; @@ -4135,13 +4124,25 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N if (_Node->_Simple_loop == 1) { auto& _Sav = _Loop_vals[_Node->_Loop_number]; - _Sav._Loop_idx = 1; _Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing); - if (_Node->_Min == 0) { - _Failed = !_Do_rep0(_Node, _Greedy); - _Next = nullptr; - } else { + if (_Node->_Min > 0) { // try to match a rep _Increase_complexity_count(); + _Sav._Loop_idx = 1; + // _Next is already assigned correctly for matching a rep + } else if (!_Greedy || _Longest) { // non-greedy matching + _Increase_complexity_count(); + + // try tail first + _Sav._Loop_idx = 0; + _Next = _Node->_End_rep->_Next; + + // set up stack unwinding for non-greedy matching if at least one rep is allowed + if (_Node->_Max == -1 || 0 < _Node->_Max) { + _Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Node); + } + } else { + _Failed = !_Do_rep0(_Node); + _Next = nullptr; } } else { _Failed = !_Do_rep(_Node, _Greedy, 0); @@ -4155,29 +4156,41 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N { _Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep; auto& _Sav = _Loop_vals[_Nr->_Loop_number]; + bool _Greedy = (_Nr->_Flags & _Fl_greedy) != 0; if (_Nr->_Simple_loop != 0) { - if (_Sav._Loop_idx <= _Nr->_Min) { - if (_Sav._Loop_idx == 1 - && _Tgt_state._Cur == 
_Frames[_Sav._Loop_frame_idx]._Match_state._Cur) { // match empty - // loop is branchless, so it will only ever match empty strings - // -> skip all other matches as they don't change state and immediately try tail + if (_Sav._Loop_idx == 1 + && _Tgt_state._Cur + == _Frames[_Sav._Loop_frame_idx]._Match_state._Cur) { // initial match empty + // loop is branchless, so it will only ever match empty strings + // -> we only try tail for POSIX or if minimum number of reps is non-zero + if ((_Sflags & regex_constants::_Any_posix) || _Nr->_Min > 0) { _Increase_complexity_count(); // _Next is already assigned correctly for matching tail - } else if (_Sav._Loop_idx < _Nr->_Min) { // needs at least one more rep to reach minimum - _Increase_complexity_count(); - // GH-5365: We have to reset the capture groups from the second iteration on. - _Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid; - _Next = _Nr->_Next; - ++_Sav._Loop_idx; - } else { // minimum number of reps reached - _Failed = !_Do_rep0(_Nr, (_Nr->_Flags & _Fl_greedy) != 0); - _Next = nullptr; + } else { + _Failed = true; } + } else if (_Sav._Loop_idx < _Nr->_Min) { // at least one more rep to reach minimum + _Increase_complexity_count(); + + _Next = _Nr->_Next; + // GH-5365: We have to reset the capture groups from the second iteration on. 
+ _Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid; + ++_Sav._Loop_idx; + } else if (_Longest || !_Greedy) { + _Increase_complexity_count(); + // set up stack unwinding for non-greedy matching if one more rep is allowed + if (_Nr->_Max == -1 || _Sav._Loop_idx < _Nr->_Max) { + _Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Nr); + } + // _Next is already assigned correctly for matching tail + } else if (_Sav._Loop_idx == _Nr->_Min) { // greedy and minimum number of reps reached + _Failed = !_Do_rep0(_Nr); + _Next = nullptr; } else { // internal _Match_pat(_Node->_Next) call in _Do_rep0() _Next = nullptr; } } else { - _Failed = !_Do_rep(_Nr, (_Nr->_Flags & _Fl_greedy) != 0, _Sav._Loop_idx); + _Failed = !_Do_rep(_Nr, _Greedy, _Sav._Loop_idx); _Next = nullptr; } break; @@ -4267,6 +4280,23 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N case _Rx_unwind_ops::_Do_nothing: break; + case _Rx_unwind_ops::_Loop_simple_nongreedy: + // try one more rep after matching tail if necessary + if (_Longest || _Failed) { + auto _Node = static_cast<_Node_rep*>(_Frame._Node); + auto& _Sav = _Loop_vals[_Node->_Loop_number]; + + _Increase_complexity_count(); + _Nx = _Node->_Next; + _Tgt_state._Cur = _Frame._Match_state._Cur; + _Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid; + _Failed = false; + if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx + ++_Sav._Loop_idx; + } + } + break; + default: #if _ITERATOR_DEBUG_LEVEL != 0 _STL_REPORT_ERROR("internal stack of regex matcher corrupted"); diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index ea6e246adb6..b4fc9b2f8fa 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -2129,6 +2129,20 @@ void test_gh_5672() { } } +void test_gh_5774() { + // GH-5774: Process non-greedy and longest-mode simple loops 
non-recursively. + // This extends our test coverage on non-greedy simple loops with bounded number of repetitions. + g_regexTester.should_not_match("", "a+?"); + g_regexTester.should_not_match("ab", "a{0}?b"); + g_regexTester.should_match("ab", "a{0,1}?b"); + g_regexTester.should_not_match("aab", "a{0,1}?b"); + g_regexTester.should_match("aab", "a{0,2}?b"); + g_regexTester.should_match("aab", "a{1,2}?b"); + g_regexTester.should_not_match("aab", "a{1}?b"); + g_regexTester.should_not_match("aaab", "a{1,2}?b"); + g_regexTester.should_match("aaab", "a{1,3}?b"); +} + int main() { test_dev10_449367_case_insensitivity_should_work(); test_dev11_462743_regex_collate_should_not_disable_regex_icase(); @@ -2180,6 +2194,7 @@ int main() { test_gh_5509(); test_gh_5576(); test_gh_5672(); + test_gh_5774(); return g_regexTester.result(); } From 78fbd39f31d682eee5699c1818de976d076536c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Sat, 11 Oct 2025 15:37:11 +0200 Subject: [PATCH 2/2] simplify check if maximum reps have been reached, explicitly document _Max value range --- stl/inc/regex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 560a35d14f9..d886365a2f3 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1586,7 +1586,7 @@ public: _Loop_number(_Number) {} const int _Min; - const int _Max; + const int _Max; // non-negative if bounded, -1 if unbounded _Node_end_rep* _End_rep; unsigned int _Loop_number; int _Simple_loop = -1; // -1 undetermined, 0 contains if/do, 1 simple @@ -4137,7 +4137,7 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N _Next = _Node->_End_rep->_Next; // set up stack unwinding for non-greedy matching if at least one rep is allowed - if (_Node->_Max == -1 || 0 < _Node->_Max) { + if (_Node->_Max != 0) { _Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Node); } } else { @@ -4179,7 +4179,7 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, 
_Alloc>::_Match_pat(_Node_base* _N } else if (_Longest || !_Greedy) { _Increase_complexity_count(); // set up stack unwinding for non-greedy matching if one more rep is allowed - if (_Nr->_Max == -1 || _Sav._Loop_idx < _Nr->_Max) { + if (_Sav._Loop_idx != _Nr->_Max) { _Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Nr); } // _Next is already assigned correctly for matching tail