Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Transcoding a UTF-8 string literal to a `std::u32string`:

```cpp
std::u32string hello_world =
u8"こんにちは世界" | beman::utf_view::to_utf32 | std::ranges::to<std::u32string>();
u8"こんにちは世界"sv | beman::utf_view::to_utf32 | std::ranges::to<std::u32string>();
```

Sanitizing potentially invalid Unicode C strings by replacing invalid code units with replacement characters:
Expand Down
14 changes: 8 additions & 6 deletions examples/readme_examples.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

namespace beman::utf_view::examples {

using namespace std::string_view_literals;

template <typename CharT>
std::basic_string<CharT> sanitize(CharT const* str) {
return null_term(str) | to_utf<CharT> | std::ranges::to<std::basic_string<CharT>>();
Expand Down Expand Up @@ -184,10 +186,10 @@ static_assert(take_five_c("Brubeck") == "Brube"sv); // passes
#endif

#ifndef _MSC_VER
static_assert((u8"\xf0\x9f\x99\x82" | to_utf32 | std::ranges::to<std::u32string>()) == U"\x0001F642");
static_assert((u8"\xf0\x9f\x99\x82" | std::views::take(3) | to_utf32 | std::ranges::to<std::u32string>()) == U"�");
static_assert((u8"\xf0\x9f\x99\x82"sv | to_utf32 | std::ranges::to<std::u32string>()) == U"\x0001F642");
static_assert((u8"\xf0\x9f\x99\x82"sv | std::views::take(3) | to_utf32 | std::ranges::to<std::u32string>()) == U"�");
static_assert(
*(u8"\xf0\x9f\x99\x82" | std::views::take(3) | to_utf32_or_error).begin() ==
*(u8"\xf0\x9f\x99\x82"sv | std::views::take(3) | to_utf32_or_error).begin() ==
std::unexpected{utf_transcoding_error::truncated_utf8_sequence});
#endif

Expand Down Expand Up @@ -215,9 +217,9 @@ bool basis_operation() {
}

static_assert(
!std::ranges::equal(u8"foo" | to_utf32, std::array{U'f', U'o', U'o', U'\0'}));
!std::ranges::equal(u8"foo"sv | to_utf32, std::array{U'f', U'o', U'o', U'\0'}));
static_assert(
std::ranges::equal(u8"foo" | to_utf32, std::array{U'f', U'o', U'o'}));
std::ranges::equal(u8"foo"sv | to_utf32, std::array{U'f', U'o', U'o'}));
#endif

template <typename FromCharT, typename ToCharT>
Expand All @@ -229,7 +231,7 @@ bool readme_examples() {
using namespace std::string_view_literals;
#ifndef _MSC_VER
std::u32string hello_world =
u8"こんにちは世界" | to_utf32 | std::ranges::to<std::u32string>();
u8"こんにちは世界"sv | to_utf32 | std::ranges::to<std::u32string>();
if (hello_world != U"こんにちは世界") {
return false;
}
Expand Down
3 changes: 2 additions & 1 deletion include/beman/utf_view/code_unit_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ namespace detail {
struct as_code_unit_impl
: std::ranges::range_adaptor_closure<as_code_unit_impl<Char>> {
template <std::ranges::range R>
requires std::convertible_to<std::ranges::range_reference_t<R>, Char>
requires std::convertible_to<std::ranges::range_reference_t<R>, Char> &&
is_not_array_of_char<R>
constexpr auto operator()(R&& r) const {
using T = std::remove_cvref_t<R>;
if constexpr (detail::is_empty_view<T>) {
Expand Down
23 changes: 14 additions & 9 deletions include/beman/utf_view/detail/concepts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,27 @@

namespace beman::utf_view {

/* PAPER */

template <class T>
concept exposition_only_code_unit = std::same_as<std::remove_cv_t<T>, char8_t> ||
std::same_as<std::remove_cv_t<T>, char16_t> ||
std::same_as<std::remove_cv_t<T>, char32_t>;

/* !PAPER */

namespace detail {

template <class T>
constexpr bool is_empty_view = false;
template <class T>
constexpr bool is_empty_view<std::ranges::empty_view<T>> = true;

// Satisfied unless `T`, after removing references and cv-qualifiers, is an
// array whose element type models `exposition_only_code_unit` (`char8_t`,
// `char16_t`, or `char32_t`, possibly cv-qualified). Despite the name, arrays
// of plain `char` or `wchar_t` are not matched and still satisfy the concept.
template <typename T>
concept is_not_array_of_char =
!(std::is_array_v<std::remove_cvref_t<T>> &&
exposition_only_code_unit<std::remove_extent_t<std::remove_cvref_t<T>>>);

} // namespace detail

/* PAPER: namespace std::ranges { */
Expand All @@ -30,15 +44,6 @@ template <bool Const, class T>
using exposition_only_maybe_const =
std::conditional_t<Const, const T, T>; // exposition only

/* PAPER */

template <class T>
concept exposition_only_code_unit = std::same_as<std::remove_cv_t<T>, char8_t> ||
std::same_as<std::remove_cv_t<T>, char16_t> ||
std::same_as<std::remove_cv_t<T>, char32_t>;

/* !PAPER */

} // namespace beman::utf_view

/* PAPER: } */
Expand Down
12 changes: 2 additions & 10 deletions include/beman/utf_view/to_utf_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -857,7 +857,8 @@ namespace detail {

template <to_utf_view_error_kind E, exposition_only_code_unit ToType>
struct to_utf_impl : std::ranges::range_adaptor_closure<to_utf_impl<E, ToType>> {
template <class R>
template <std::ranges::range R>
requires is_not_array_of_char<R>
constexpr auto operator()(R&& r) const {
using T = std::remove_cvref_t<R>;
if constexpr (detail::is_empty_view<T>) {
Expand All @@ -866,15 +867,6 @@ namespace detail {
} else {
return std::ranges::empty_view<std::expected<ToType, utf_transcoding_error>>{};
}
} else if constexpr (std::is_bounded_array_v<T>) {
constexpr auto n = std::extent_v<T>;
auto first{std::ranges::begin(r)};
auto last{std::ranges::end(r)};
if (n && !r[n - 1]) {
--last;
}
std::ranges::subrange subrange(first, last);
return to_utf_view(std::move(subrange), detail::nontype<E>, to_utf_tag<ToType>);
} else {
return to_utf_view(std::forward<R>(r), detail::nontype<E>, to_utf_tag<ToType>);
}
Expand Down
40 changes: 21 additions & 19 deletions paper/P2728.md
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ an exposition-only transformation functor that performs the needed cast.

```cpp
std::u32string hello_world =
u8"こんにちは世界" | std::views::to_utf32 | std::ranges::to<std::u32string>();
u8"こんにちは世界"sv | std::views::to_utf32 | std::ranges::to<std::u32string>();
```

## Sanitizing Potentially Invalid Unicode
Expand Down Expand Up @@ -361,7 +361,7 @@ std::basic_string<ToChar> transcode_or_throw(std::basic_string_view<FromChar> in
```cpp
// prints: "error at position 2: truncated_utf8_sequence"
transcode_or_throw<char8_t, char16_t>(
u8"hi🙂" | std::views::take(5) | std::ranges::to<std::u8string>());
u8"hi🙂"sv | std::views::take(5) | std::ranges::to<std::u8string>());
```

## Changing the Suits of Unicode Playing Card Characters
Expand Down Expand Up @@ -459,7 +459,8 @@ Let `views::to_utfN` denote any of the aforementioned range adaptor objects,
let `Char` be its corresponding character type, and let `Error` be its
corresponding `to_utf_view_error_kind`. Let `E` be an expression and let `T`
be `remove_cvref_t<decltype((E))>`. If `decltype((E))` does not model
`@*utf-range*@`, `to_utfN(E)` is ill-formed. The expression `to_utfN(E)` is
`@*utf-range*@`, or if `T` is an array of `char8_t`, `char16_t`, or
`char32_t`, `to_utfN(E)` is ill-formed. The expression `to_utfN(E)` is
expression-equivalent to:

- If `T` is a specialization of `empty_view` ([range.empty.view]):
Expand Down Expand Up @@ -909,8 +910,9 @@ objects ([range.adaptor.object]). Let `as_charN_t` denote any one of
`as_char8_t`, `as_char16_t`, and `as_char32_t`. Let `Char` be the
corresponding character type for `as_charN_t`, let `E` be an expression and
let `T` be `remove_cvref_t<decltype((E))>`. If `ranges::range_reference_t<T>`
does not model `convertible_to<Char>`, `as_charN_t(E)` is ill-formed. The
expression `as_charN_t(E)` is expression-equivalent to:
does not model `convertible_to<Char>`, or if `T` is an array of `char8_t`,
`char16_t`, or `char32_t`, `as_charN_t(E)` is ill-formed. The expression
`as_charN_t(E)` is expression-equivalent to:

- If `T` is a specialization of `empty_view` ([range.empty.view]), then
`empty_view<Char>{}`.
Expand Down Expand Up @@ -948,26 +950,25 @@ adoption of this paper:

# Design Discussion and Alternatives

## Special CPO Logic for String Literals
## CPO Rejection of String Literals

The `to_utfN` CPOs use a heuristic to detect null-terminated ranges and omit
the null terminator so that it doesn't appear in the output. If the input
range satisfies `is_bounded_array_v`, is nonempty, and its last element is 0,
then the last element is omitted.
String literals are arrays of char types that include a null terminator:

Without this logic, you'd have:

```c++
static_assert(
std::ranges::equal(u8"foo" | to_utf32, std::array{U'f', U'o', U'o', U'\0'}));
```
static_assert(std::is_same_v<std::remove_reference_t<decltype("foo")>, const char[4]>);
static_assert(std::ranges::equal("foo", std::array{'f', 'o', 'o', '\0'}));
```

Instead, with the heuristic in place, the null terminator is eliminated:
Because they are ranges, a naive implementation of the `to_utfN` CPO would
result in null terminators in the output:

```c++
static_assert(
std::ranges::equal(u8"foo" | to_utf32, std::array{U'f', U'o', U'o'}));
```
u8"foo" | to_utf32 | std::ranges::to<std::u32string>()
// results in a std::u32string of length 4 containing U'f', U'o', U'o', U'\0'
```

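To avoid this situation, the `to_utfN` CPOs reject all inputs that are arrays
of `char8_t`, `char16_t`, or `char32_t`, as do the `as_charN_t` casting CPOs.
A string literal can still be transcoded by first wrapping it in a string
view, e.g. `u8"foo"sv | to_utf32`, which does not include the null terminator.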

## The `_or_error` Views Are Basis Operations for Other Error Handling Behaviors

Expand Down Expand Up @@ -1203,6 +1204,7 @@ These concepts are true when the type in question is the iterator/sentinel of a
after it was pointed out in Kona 2025 that it has precedent in
`views::adjacent_transform<N>` and elsewhere
- Don't cache `begin()`
- Reject arrays of `charN_t` in the CPOs

## Changes since R8

Expand Down
23 changes: 4 additions & 19 deletions tests/beman/utf_view/to_utf_view.t.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@
#include <ranges>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>

namespace beman::utf_view::tests {

using namespace std::string_view_literals;

static_assert(
std::input_iterator<
std::ranges::iterator_t<
Expand Down Expand Up @@ -490,7 +493,7 @@ constexpr bool to_utf_test() {
auto empty_utf_view{empty_view | to_utf8};
static_assert(
std::is_same_v<decltype(empty_utf_view), std::ranges::empty_view<char8_t>>);
auto u8_string_literal_utf_view{u8"foo" | to_utf32};
auto u8_string_literal_utf_view{u8"foo"sv | to_utf32};
auto u8_string_literal_utf_view_it{u8_string_literal_utf_view.begin()};
if (*u8_string_literal_utf_view_it != U'f') {
return false;
Expand All @@ -507,24 +510,6 @@ constexpr bool to_utf_test() {
if (u8_string_literal_utf_view_it != u8_string_literal_utf_view.end()) {
return false;
}
char8_t const non_null_terminated_array[3]{u8'a', u8'b', u8'c'};
auto non_null_terminated_array_utf_view{non_null_terminated_array | to_utf32};
auto non_null_terminated_array_utf_view_it{non_null_terminated_array_utf_view.begin()};
if (*non_null_terminated_array_utf_view_it != U'a') {
return false;
}
++non_null_terminated_array_utf_view_it;
if (*non_null_terminated_array_utf_view_it != U'b') {
return false;
}
++non_null_terminated_array_utf_view_it;
if (*non_null_terminated_array_utf_view_it != U'c') {
return false;
}
++non_null_terminated_array_utf_view_it;
if (non_null_terminated_array_utf_view_it != non_null_terminated_array_utf_view.end()) {
return false;
}
std::initializer_list<char8_t> arr{u8'b', u8'a', u8'r'};
test_input_iterator input_it(arr);
std::ranges::subrange subrange{std::move(input_it), std::default_sentinel};
Expand Down