From 7bdb47d173af86205e732053dbad20bde670dd3e Mon Sep 17 00:00:00 2001 From: Edward Nolan Date: Sat, 13 Dec 2025 15:16:15 +0000 Subject: [PATCH] Have the CPOs reject arrays of char --- README.md | 2 +- examples/readme_examples.cpp | 14 ++++---- include/beman/utf_view/code_unit_view.hpp | 3 +- include/beman/utf_view/detail/concepts.hpp | 23 ++++++++----- include/beman/utf_view/to_utf_view.hpp | 12 ++----- paper/P2728.md | 40 ++++++++++++---------- tests/beman/utf_view/to_utf_view.t.cpp | 23 +++---------- 7 files changed, 52 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 1e51dcb..a5546ad 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Transcoding a UTF-8 string literal to a `std::u32string`: ```cpp std::u32string hello_world = - u8"こんにちは世界" | beman::utf_view::to_utf32 | std::ranges::to(); + u8"こんにちは世界"sv | beman::utf_view::to_utf32 | std::ranges::to(); ``` Sanitizing potentially invalid Unicode C strings by replacing invalid code units with replacement characters: diff --git a/examples/readme_examples.cpp b/examples/readme_examples.cpp index 1809288..ded50d3 100644 --- a/examples/readme_examples.cpp +++ b/examples/readme_examples.cpp @@ -24,6 +24,8 @@ namespace beman::utf_view::examples { +using namespace std::string_view_literals; + template std::basic_string sanitize(CharT const* str) { return null_term(str) | to_utf | std::ranges::to>(); @@ -184,10 +186,10 @@ static_assert(take_five_c("Brubeck") == "Brube"sv); // passes #endif #ifndef _MSC_VER -static_assert((u8"\xf0\x9f\x99\x82" | to_utf32 | std::ranges::to()) == U"\x0001F642"); -static_assert((u8"\xf0\x9f\x99\x82" | std::views::take(3) | to_utf32 | std::ranges::to()) == U"�"); +static_assert((u8"\xf0\x9f\x99\x82"sv | to_utf32 | std::ranges::to()) == U"\x0001F642"); +static_assert((u8"\xf0\x9f\x99\x82"sv | std::views::take(3) | to_utf32 | std::ranges::to()) == U"�"); static_assert( - *(u8"\xf0\x9f\x99\x82" | std::views::take(3) | to_utf32_or_error).begin() == + 
*(u8"\xf0\x9f\x99\x82"sv | std::views::take(3) | to_utf32_or_error).begin() == std::unexpected{utf_transcoding_error::truncated_utf8_sequence}); #endif @@ -215,9 +217,9 @@ bool basis_operation() { } static_assert( - !std::ranges::equal(u8"foo" | to_utf32, std::array{U'f', U'o', U'o', U'\0'})); + !std::ranges::equal(u8"foo"sv | to_utf32, std::array{U'f', U'o', U'o', U'\0'})); static_assert( - std::ranges::equal(u8"foo" | to_utf32, std::array{U'f', U'o', U'o'})); + std::ranges::equal(u8"foo"sv | to_utf32, std::array{U'f', U'o', U'o'})); #endif template @@ -229,7 +231,7 @@ bool readme_examples() { using namespace std::string_view_literals; #ifndef _MSC_VER std::u32string hello_world = - u8"こんにちは世界" | to_utf32 | std::ranges::to(); + u8"こんにちは世界"sv | to_utf32 | std::ranges::to(); if (hello_world != U"こんにちは世界") { return false; } diff --git a/include/beman/utf_view/code_unit_view.hpp b/include/beman/utf_view/code_unit_view.hpp index 934a517..02e9185 100644 --- a/include/beman/utf_view/code_unit_view.hpp +++ b/include/beman/utf_view/code_unit_view.hpp @@ -38,7 +38,8 @@ namespace detail { struct as_code_unit_impl : std::ranges::range_adaptor_closure> { template - requires std::convertible_to, Char> + requires std::convertible_to, Char> && + is_not_array_of_char constexpr auto operator()(R&& r) const { using T = std::remove_cvref_t; if constexpr (detail::is_empty_view) { diff --git a/include/beman/utf_view/detail/concepts.hpp b/include/beman/utf_view/detail/concepts.hpp index d76b588..eaa64aa 100644 --- a/include/beman/utf_view/detail/concepts.hpp +++ b/include/beman/utf_view/detail/concepts.hpp @@ -15,6 +15,15 @@ namespace beman::utf_view { +/* PAPER */ + +template +concept exposition_only_code_unit = std::same_as, char8_t> || + std::same_as, char16_t> || + std::same_as, char32_t>; + +/* !PAPER */ + namespace detail { template @@ -22,6 +31,11 @@ namespace detail { template constexpr bool is_empty_view> = true; + template + concept is_not_array_of_char = + !(std::is_array_v> 
&& + exposition_only_code_unit>>); + } // namespace detail /* PAPER: namespace std::ranges { */ @@ -30,15 +44,6 @@ template using exposition_only_maybe_const = std::conditional_t; // exposition only -/* PAPER */ - -template -concept exposition_only_code_unit = std::same_as, char8_t> || - std::same_as, char16_t> || - std::same_as, char32_t>; - -/* !PAPER */ - } // namespace beman::utf_view /* PAPER: } */ diff --git a/include/beman/utf_view/to_utf_view.hpp b/include/beman/utf_view/to_utf_view.hpp index 5a28a19..72f0a54 100644 --- a/include/beman/utf_view/to_utf_view.hpp +++ b/include/beman/utf_view/to_utf_view.hpp @@ -857,7 +857,8 @@ namespace detail { template struct to_utf_impl : std::ranges::range_adaptor_closure> { - template + template + requires is_not_array_of_char constexpr auto operator()(R&& r) const { using T = std::remove_cvref_t; if constexpr (detail::is_empty_view) { @@ -866,15 +867,6 @@ namespace detail { } else { return std::ranges::empty_view>{}; } - } else if constexpr (std::is_bounded_array_v) { - constexpr auto n = std::extent_v; - auto first{std::ranges::begin(r)}; - auto last{std::ranges::end(r)}; - if (n && !r[n - 1]) { - --last; - } - std::ranges::subrange subrange(first, last); - return to_utf_view(std::move(subrange), detail::nontype, to_utf_tag); } else { return to_utf_view(std::forward(r), detail::nontype, to_utf_tag); } diff --git a/paper/P2728.md b/paper/P2728.md index 3f1ef95..993be18 100644 --- a/paper/P2728.md +++ b/paper/P2728.md @@ -308,7 +308,7 @@ an exposition-only transformation functor that performs the needed cast. 
```cpp std::u32string hello_world = - u8"こんにちは世界" | std::views::to_utf32 | std::ranges::to(); + u8"こんにちは世界"sv | std::views::to_utf32 | std::ranges::to(); ``` ## Sanitizing Potentially Invalid Unicode @@ -361,7 +361,7 @@ std::basic_string transcode_or_throw(std::basic_string_view in ```cpp // prints: "error at position 2: truncated_utf8_sequence" transcode_or_throw( - u8"hi🙂" | std::views::take(5) | std::ranges::to()); + u8"hi🙂"sv | std::views::take(5) | std::ranges::to()); ``` ## Changing the Suits of Unicode Playing Card Characters @@ -459,7 +459,8 @@ Let `views::to_utfN` denote any of the aforementioned range adaptor objects, let `Char` be its corresponding character type, and let `Error` be its corresponding `to_utf_view_error_kind`. Let `E` be an expression and let `T` be `remove_cvref_t`. If `decltype((E))` does not model -`@*utf-range*@`, `to_utfN(E)` is ill-formed. The expression `to_utfN(E)` is +`@*utf-range*@`, or if `T` is an array of `char8_t`, `char16_t`, or +`char32_t`, `to_utfN(E)` is ill-formed. The expression `to_utfN(E)` is expression-equivalent to: - If `E` is a specialization of `empty_view` ([range.empty.view]): @@ -909,8 +910,9 @@ objects ([range.adaptor.object]). Let `as_charN_t` denote any one of `as_char8_t`, `as_char16_t`, and `as_char32_t`. Let `Char` be the corresponding character type for `as_charN_t`, let `E` be an expression and let `T` be `remove_cvref_t`. If `ranges::range_reference_t` -does not model `convertible_to`, `as_charN_t(E)` is ill-formed. The -expression `as_charN_t(E)` is expression-equivalent to: +does not model `convertible_to`, or if `T` is an array of `char8_t`, +`char16_t`, or `char32_t`, `as_charN_t(E)` is ill-formed. The expression +`as_charN_t(E)` is expression-equivalent to: - If `T` is a specialization of `empty_view` ([range.empty.view]), then `empty_view{}`. 
@@ -948,26 +950,25 @@ adoption of this paper: # Design Discussion and Alternatives -## Special CPO Logic for String Literals +## CPO Rejection of String Literals -The `to_utfN` CPOs use a heuristic to detect null-terminated ranges and omit -the null terminator so that it doesn't appear in the output. If the input -range satisfies `is_bounded_array_v`, is nonempty, and its last element is 0, -then the last element is omitted. +String literals are arrays of character types that include a null terminator: -Without this logic, you'd have: - -```c++ -static_assert( - std::ranges::equal(u8"foo" | to_utf32, std::array{U'f', U'o', U'o', U'\0'})); +``` +static_assert(std::is_same_v<std::remove_reference_t<decltype("foo")>, const char[4]>); +static_assert(std::ranges::equal("foo", std::array{'f', 'o', 'o', '\0'})); ``` -Instead, with the heuristic in place, the null terminator is eliminated: +Because they are ranges, a naive implementation of the `to_utfN` CPO would +result in null terminators in the output: -```c++ -static_assert( - std::ranges::equal(u8"foo" | to_utf32, std::array{U'f', U'o', U'o'})); ``` +u8"foo" | to_utf32 | std::ranges::to<std::u32string>() +// results in a std::u32string of length 4 containing U'f', U'o', U'o', U'\0' +``` + +To avoid this situation, the `to_utfN` CPOs reject all inputs that are arrays +of `char8_t`, `char16_t`, or `char32_t`, as do the `as_charN_t` casting CPOs. 
## The `_or_error` Views Are Basis Operations for Other Error Handling Behaviors @@ -1203,6 +1204,7 @@ These concepts are true when the type in question is the iterator/sentinel of a after it was pointed out in Kona 2025 that it has precedent in `views::adjacent_transform` and elsewhere - Don't cache `begin()` +- Reject arrays of `charN_t` in the CPOs ## Changes since R8 diff --git a/tests/beman/utf_view/to_utf_view.t.cpp b/tests/beman/utf_view/to_utf_view.t.cpp index 4cb6a1c..ed88703 100644 --- a/tests/beman/utf_view/to_utf_view.t.cpp +++ b/tests/beman/utf_view/to_utf_view.t.cpp @@ -19,10 +19,13 @@ #include #include #include +#include #include namespace beman::utf_view::tests { +using namespace std::string_view_literals; + static_assert( std::input_iterator< std::ranges::iterator_t< @@ -490,7 +493,7 @@ constexpr bool to_utf_test() { auto empty_utf_view{empty_view | to_utf8}; static_assert( std::is_same_v>); - auto u8_string_literal_utf_view{u8"foo" | to_utf32}; + auto u8_string_literal_utf_view{u8"foo"sv | to_utf32}; auto u8_string_literal_utf_view_it{u8_string_literal_utf_view.begin()}; if (*u8_string_literal_utf_view_it != U'f') { return false; @@ -507,24 +510,6 @@ constexpr bool to_utf_test() { if (u8_string_literal_utf_view_it != u8_string_literal_utf_view.end()) { return false; } - char8_t const non_null_terminated_array[3]{u8'a', u8'b', u8'c'}; - auto non_null_terminated_array_utf_view{non_null_terminated_array | to_utf32}; - auto non_null_terminated_array_utf_view_it{non_null_terminated_array_utf_view.begin()}; - if (*non_null_terminated_array_utf_view_it != U'a') { - return false; - } - ++non_null_terminated_array_utf_view_it; - if (*non_null_terminated_array_utf_view_it != U'b') { - return false; - } - ++non_null_terminated_array_utf_view_it; - if (*non_null_terminated_array_utf_view_it != U'c') { - return false; - } - ++non_null_terminated_array_utf_view_it; - if (non_null_terminated_array_utf_view_it != non_null_terminated_array_utf_view.end()) { - 
return false; - } std::initializer_list arr{u8'b', u8'a', u8'r'}; test_input_iterator input_it(arr); std::ranges::subrange subrange{std::move(input_it), std::default_sentinel};