From f6b178dd389b6c26bdcc5ed32f20a73c656b51c6 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 21 Jan 2025 16:14:09 +0100 Subject: [PATCH] Add compactStringSerializer --- .../interface/compactStringSerializer.h | 140 +++++++ .../Utilities/src/compactStringSerializer.cc | 19 + .../test_catch2_compactStringSerializer.cc | 365 ++++++++++++++++++ 3 files changed, 524 insertions(+) create mode 100644 FWCore/Utilities/interface/compactStringSerializer.h create mode 100644 FWCore/Utilities/src/compactStringSerializer.cc create mode 100644 FWCore/Utilities/test/test_catch2_compactStringSerializer.cc diff --git a/FWCore/Utilities/interface/compactStringSerializer.h b/FWCore/Utilities/interface/compactStringSerializer.h new file mode 100644 index 0000000000000..80c289355995c --- /dev/null +++ b/FWCore/Utilities/interface/compactStringSerializer.h @@ -0,0 +1,140 @@ +#ifndef FWCore_Utilities_interface_compactStringSerializer_h +#define FWCore_Utilities_interface_compactStringSerializer_h + +#include +#include +#include +#include +#include +#include + +namespace edm::compactString { + namespace detail { + constexpr std::string_view kDelimiters = "\x1d\x1e"; + constexpr char kContainerDelimiter = kDelimiters[0]; // "group separator" in ASCII + constexpr char kElementDelimiter = kDelimiters[1]; // "record separator" in ASCII + + void throwIfContainsDelimiters(std::string const& str); + } // namespace detail + + /** + * Following three functions serialize a sequence of strings and containers of strings + * + * Each top-level string or container of strings is separated with kContainerDelimiter + * In case of container of strings, each element is separated with kElementDelimiter + * The serialized string will end with kContainerDelimiter and a null character + * + * The functions throw an exception if the serialized strings + * contain any of the delimiter characters. The underlying string + * operations may also throw exceptions. 
+ */ + inline std::string serialize(std::string arg) noexcept(false) { + detail::throwIfContainsDelimiters(arg); + arg += detail::kContainerDelimiter; + return arg; + } + + template + requires std::ranges::input_range and std::is_same_v, std::string> + std::string serialize(R const& arg) noexcept(false) { + std::string ret; + + if (not std::ranges::empty(arg)) { + for (std::string const& elem : arg) { + ret.reserve(ret.size() + elem.size() + 1); + detail::throwIfContainsDelimiters(elem); + ret += elem; + ret += detail::kElementDelimiter; + } + } + + ret += detail::kContainerDelimiter; + return ret; + } + + template + requires(sizeof...(Args) >= 1) + std::string serialize(T&& arg0, Args&&... args) noexcept(false) { + return serialize(std::forward(arg0)) + serialize(std::forward(args)...); + } + + /** + * Following three functions deserialize a string 'input' into a + * sequence of strings and containers of strings + * + * The 'input' string is assumed to be serialized with the + * serialize() functions above. + * + * The output arguments following the 'input' define the schema of + * the deserialization. + * - std::string& for strings + * - output iterator for containers of strings (e.g. std::back_inserter(vector)) + * + * Upon success, the return value is the position in `input` for the + * next possible element (i.e. the position after the + * kContainerDelimiter), that is also the number of characters + * consumed by the deserialization. + * + * Upon failure, returns 0 to denote the beginning of `input`. The + * output arguments may have been modified. + * + * The functions do not explicitly throw exceptions, but underlying + * operations may throw exceptions. 
+ */ + inline std::string_view::size_type deserialize(std::string_view input, std::string& arg) { + auto const pos = input.find_first_of(detail::kDelimiters); + if (pos == std::string_view::npos or input[pos] != detail::kContainerDelimiter) { + return 0; + } + arg = input.substr(0, pos); + return pos + 1; // skip delimiter + } + + template I> + inline std::string_view::size_type deserialize(std::string_view input, I arg) { + auto pos = input.find_first_of(detail::kDelimiters); + // invalid input + if (pos == std::string_view::npos) { + return 0; + } + // no elements + if (input[pos] == detail::kContainerDelimiter) { + // invalid input for empty container + if (pos != 0) { + return 0; + } + // skip delimiter + return pos + 1; + } + + std::string_view::size_type prev = 0; + while (pos != std::string_view::npos and input[pos] == detail::kElementDelimiter) { + *arg = std::string(input.substr(prev, pos - prev)); + ++arg; + prev = pos + 1; //skip delimiter + pos = input.find_first_of(detail::kDelimiters, prev); + } + + // every container must end with kContainerDelimiter + // reaching npos is an error + if (pos == std::string_view::npos) { + return 0; + } + assert(input[pos] == detail::kContainerDelimiter); + + return pos + 1; // skip delimiter + } + + template + requires(sizeof...(Args) >= 1) + std::string_view::size_type deserialize(std::string_view input, T&& arg0, Args&&... args) { + auto pos = deserialize(input, std::forward(arg0)); + if (pos != 0) { + auto const ret = deserialize(input.substr(pos), std::forward(args)...); + pos = (ret == 0) ? 
0 : pos + ret; + } + return pos; + } +} // namespace edm::compactString + +#endif diff --git a/FWCore/Utilities/src/compactStringSerializer.cc b/FWCore/Utilities/src/compactStringSerializer.cc new file mode 100644 index 0000000000000..1a933ca33cc66 --- /dev/null +++ b/FWCore/Utilities/src/compactStringSerializer.cc @@ -0,0 +1,19 @@ +#include "FWCore/Utilities/interface/compactStringSerializer.h" +#include "FWCore/Utilities/interface/Exception.h" + +namespace edm::compactString::detail { + void throwIfContainsDelimiters(std::string const& str) { + auto pos = str.find_first_of(kDelimiters); + if (pos != std::string::npos) { + cms::Exception ex("compactString"); + ex << "Serialized string '" << str << "' contains "; + if (str[pos] == kContainerDelimiter) { + ex << "container"; + } else { + ex << "element"; + } + ex << " delimiter at position " << pos; + throw ex; + } + } +} // namespace edm::compactString::detail diff --git a/FWCore/Utilities/test/test_catch2_compactStringSerializer.cc b/FWCore/Utilities/test/test_catch2_compactStringSerializer.cc new file mode 100644 index 0000000000000..ebe51424a9646 --- /dev/null +++ b/FWCore/Utilities/test/test_catch2_compactStringSerializer.cc @@ -0,0 +1,365 @@ +#include "catch.hpp" + +#include +#include +#include +#include + +#include "FWCore/Utilities/interface/compactStringSerializer.h" +#include "FWCore/Utilities/interface/Exception.h" + +namespace cs = edm::compactString; + +TEST_CASE("Test edm::compactString serializer", "[edm::compactString]") { + using namespace std::string_literals; + SECTION("Empty inputs") { + SECTION("Serialization") { + SECTION("Empty string") { + auto result = cs::serialize(""s); + CHECK(result.size() == 1); // one delimiter + result = cs::serialize(""); + CHECK(result.size() == 1); // one delimiter + } + + SECTION("Two empty strings") { + auto result = cs::serialize(""s, ""s); + CHECK(result.size() == 2); + result = cs::serialize(""s, ""); + CHECK(result.size() == 2); + result = cs::serialize("", 
""s); + CHECK(result.size() == 2); + result = cs::serialize("", ""); + CHECK(result.size() == 2); + } + + SECTION("Empty vector of strings and empty string") { + auto result = cs::serialize(std::vector()); + CHECK(result.size() == 1); + result = cs::serialize(""s, std::vector()); + CHECK(result.size() == 2); + result = cs::serialize(std::vector(), ""); + CHECK(result.size() == 2); + result = cs::serialize(std::vector(), std::vector()); + CHECK(result.size() == 2); + } + + SECTION("Empty list and vector of strings and empty string") { + auto result = cs::serialize(std::list()); + CHECK(result.size() == 1); + result = cs::serialize(""s, std::list()); + CHECK(result.size() == 2); + result = cs::serialize(std::list(), ""); + CHECK(result.size() == 2); + result = cs::serialize(std::list(), std::list()); + CHECK(result.size() == 2); + result = cs::serialize(std::vector(), std::list()); + CHECK(result.size() == 2); + result = cs::serialize(std::list(), std::vector()); + CHECK(result.size() == 2); + } + + SECTION("Vectors of empty strings") { + auto result = cs::serialize(std::vector{""}); + CHECK(result.size() == 2); + result = cs::serialize(std::vector{"", ""}); + CHECK(result.size() == 3); + result = cs::serialize(std::vector{""}, std::vector{}); + CHECK(result.size() == 3); + result = cs::serialize(std::vector{"", ""}, std::vector{}); + CHECK(result.size() == 4); + result = cs::serialize(std::vector{""}, std::vector{""}); + CHECK(result.size() == 4); + result = cs::serialize(std::vector{"", ""}, std::vector{""}); + CHECK(result.size() == 5); + result = cs::serialize(std::vector{""}, std::vector{"", ""}); + CHECK(result.size() == 5); + result = cs::serialize(std::vector{"", ""}, std::vector{"", ""}); + CHECK(result.size() == 6); + } + } + + SECTION("Serialization and deserialization") { + SECTION("Empty string") { + std::string res; + auto ret = cs::deserialize(cs::serialize(""), res); + CHECK(ret == 1); + CHECK(res.empty()); + } + + SECTION("Two empty strings") { + 
std::string res1, res2; + auto ret = cs::deserialize(cs::serialize("", ""), res1, res2); + CHECK(ret == 2); + CHECK(res1.empty()); + CHECK(res2.empty()); + } + + SECTION("Empty vector") { + std::vector res; + auto ret = cs::deserialize(cs::serialize(std::vector()), std::back_inserter(res)); + CHECK(ret == 1); + CHECK(res.empty()); + } + + SECTION("Two empty vectors") { + std::vector res1, res2; + auto ret = cs::deserialize(cs::serialize(std::vector(), std::vector()), + std::back_inserter(res1), + std::back_inserter(res2)); + CHECK(ret == 2); + CHECK(res1.empty()); + CHECK(res2.empty()); + } + + SECTION("Mixture") { + std::string res1; + std::vector res2; + std::list res3; + auto ret = cs::deserialize(cs::serialize("", std::vector(), std::list()), + res1, + std::back_inserter(res2), + std::back_inserter(res3)); + CHECK(ret == 3); + CHECK(res1.empty()); + CHECK(res2.empty()); + CHECK(res3.empty()); + + ret = cs::deserialize(cs::serialize(std::vector(), "", std::list()), + std::back_inserter(res3), + res1, + std::back_inserter(res2)); + CHECK(ret == 3); + CHECK(res1.empty()); + CHECK(res2.empty()); + CHECK(res3.empty()); + } + } + } + + SECTION("Inputs with values") { + SECTION("Strings") { + std::string res1, res2; + auto serial = cs::serialize("foo"); + REQUIRE(serial == "foo"s + cs::detail::kContainerDelimiter); + auto ret = cs::deserialize(serial, res1); + CHECK(ret == 3 + 1); + CHECK(res1 == "foo"); + + serial = cs::serialize("foo", "bar"); + REQUIRE(serial == "foo"s + cs::detail::kContainerDelimiter + "bar"s + cs::detail::kContainerDelimiter); + ret = cs::deserialize(serial, res1, res2); + CHECK(ret == 3 + 1 + 3 + 1); + CHECK(res1 == "foo"); + CHECK(res2 == "bar"); + } + + SECTION("Vector of strings") { + std::vector res; + auto serial = cs::serialize(std::vector{"foo"}); + REQUIRE(serial == "foo"s + cs::detail::kElementDelimiter + cs::detail::kContainerDelimiter); + auto ret = cs::deserialize(serial, std::back_inserter(res)); + CHECK(ret == 3 + 2); + 
REQUIRE(res.size() == 1); + REQUIRE(res[0] == "foo"); + res.clear(); + + serial = cs::serialize(std::vector{"foo", "bar"}); + REQUIRE(serial == "foo"s + cs::detail::kElementDelimiter + "bar"s + cs::detail::kElementDelimiter + + cs::detail::kContainerDelimiter); + ret = cs::deserialize(serial, std::back_inserter(res)); + CHECK(ret == 3 + 1 + 3 + 2); + REQUIRE(res.size() == 2); + CHECK(res[0] == "foo"); + CHECK(res[1] == "bar"); + res.clear(); + + serial = cs::serialize(std::vector{"foo", "bar", "xyzzy"}); + ret = cs::deserialize(serial, std::back_inserter(res)); + CHECK(ret == serial.size()); + REQUIRE(res.size() == 3); + CHECK(res[0] == "foo"); + CHECK(res[1] == "bar"); + CHECK(res[2] == "xyzzy"); + res.clear(); + + SECTION("Deserialize to list") { + std::list res2; + ret = cs::deserialize(serial, std::front_inserter(res2)); + CHECK(ret == serial.size()); + REQUIRE(res2.size() == 3); + auto it = res2.begin(); + CHECK(*it == "xyzzy"); + ++it; + CHECK(*it == "bar"); + ++it; + CHECK(*it == "foo"); + } + } + + SECTION("Vectors of strings") { + std::vector res1, res2; + ; + auto serial = + cs::serialize(std::vector{"foo", "bar", "xyzzy"}, std::vector{"fred", "wilma"}); + auto ret = cs::deserialize(serial, std::back_inserter(res1), std::back_inserter(res2)); + CHECK(ret == serial.size()); + REQUIRE(res1.size() == 3); + CHECK(res1[0] == "foo"); + CHECK(res1[1] == "bar"); + CHECK(res1[2] == "xyzzy"); + REQUIRE(res2.size() == 2); + CHECK(res2[0] == "fred"); + CHECK(res2[1] == "wilma"); + } + + SECTION("Mixture") { + auto serial = cs::serialize( + "foobar", std::vector{"fred", "wilma"}, "xyzzy", std::list{"one", "two", "th ree"}); + std::string res1, res3; + std::vector res2, res4; + auto ret = cs::deserialize(serial, res1, std::back_inserter(res2), res3, std::back_inserter(res4)); + CHECK(ret == serial.size()); + CHECK(res1 == "foobar"); + REQUIRE(res2.size() == 2); + CHECK(res2[0] == "fred"); + CHECK(res2[1] == "wilma"); + CHECK(res3 == "xyzzy"); + REQUIRE(res4.size() == 
3); + CHECK(res4[0] == "one"); + CHECK(res4[1] == "two"); + CHECK(res4[2] == "th ree"); + } + } + SECTION("Deserialize only part of the serialized content") { + SECTION("String") { + std::string res; + auto serial = cs::serialize("foo", "bar"); + auto ret = cs::deserialize(serial, res); + CHECK(ret != 0); + CHECK(ret != serial.size()); + CHECK(res == "foo"); + res.clear(); + + serial = cs::serialize("bar", std::vector{"foo"}); + ret = cs::deserialize(serial, res); + CHECK(ret != 0); + CHECK(ret != serial.size()); + CHECK(res == "bar"); + } + + SECTION("Vector of strings") { + std::vector res; + auto serial = cs::serialize(std::vector{"foo", "bar"}, std::vector{"fred", "wilma"}); + auto ret = cs::deserialize(serial, std::back_inserter(res)); + CHECK(ret != 0); + CHECK(ret != serial.size()); + REQUIRE(res.size() == 2); + CHECK(res[0] == "foo"); + CHECK(res[1] == "bar"); + res.clear(); + + serial = cs::serialize(std::vector{"wilma", "fred"}, "fintstones"); + ret = cs::deserialize(serial, std::back_inserter(res)); + CHECK(ret != 0); + CHECK(ret != serial.size()); + REQUIRE(res.size() == 2); + CHECK(res[0] == "wilma"); + CHECK(res[1] == "fred"); + } + } + + SECTION("Serialization error cases") { + CHECK_THROWS_AS(cs::serialize(""s + cs::detail::kElementDelimiter), cms::Exception); + CHECK_THROWS_AS(cs::serialize("foo"s + cs::detail::kElementDelimiter), cms::Exception); + CHECK_THROWS_AS(cs::serialize(cs::detail::kElementDelimiter + "bar"s), cms::Exception); + CHECK_THROWS_AS(cs::serialize("foo"s + cs::detail::kElementDelimiter + "bar"s), cms::Exception); + CHECK_THROWS_AS(cs::serialize(""s + cs::detail::kContainerDelimiter), cms::Exception); + CHECK_THROWS_AS(cs::serialize("foo"s + cs::detail::kContainerDelimiter), cms::Exception); + CHECK_THROWS_AS(cs::serialize(cs::detail::kContainerDelimiter + "bar"s), cms::Exception); + CHECK_THROWS_AS(cs::serialize("foo"s + cs::detail::kContainerDelimiter + "bar"s), cms::Exception); + + std::string str = "foo"s + 
cs::detail::kContainerDelimiter; + std::vector vstr{str}; + CHECK_THROWS_AS(cs::serialize(str, std::vector{"foo"}), cms::Exception); + CHECK_THROWS_AS(cs::serialize(std::vector{"foo"}, str), cms::Exception); + CHECK_THROWS_AS(cs::serialize(vstr, "foo"), cms::Exception); + CHECK_THROWS_AS(cs::serialize("foo", vstr), cms::Exception); + } + + SECTION("Deserialization error cases") { + SECTION("Invalid input") { + SECTION("Deserializing to string") { + std::string res; + CHECK(cs::deserialize("", res) == 0); + CHECK(cs::deserialize(" ", res) == 0); + CHECK(cs::deserialize("foo", res) == 0); + CHECK(cs::deserialize("foo"s + cs::detail::kElementDelimiter + "bar"s, res) == 0); + CHECK(cs::deserialize("foo"s + cs::detail::kElementDelimiter + "bar"s + cs::detail::kContainerDelimiter, res) == + 0); + } + + SECTION("Deserializing to container") { + std::vector res; + CHECK(cs::deserialize("", std::back_inserter(res)) == 0); + CHECK(cs::deserialize(" ", std::back_inserter(res)) == 0); + CHECK(cs::deserialize("foo", std::back_inserter(res)) == 0); + CHECK(cs::deserialize("foo"s + cs::detail::kElementDelimiter + "bar"s, std::back_inserter(res)) == 0); + CHECK(cs::deserialize("foo"s + cs::detail::kContainerDelimiter, std::back_inserter(res)) == 0); + } + } + + SECTION("Schema mismatch") { + // Note: empty container and empty string have the same + // representation, but this behavior is not tested here as one + // should not rely on it + + SECTION("Deserializing container as string") { + std::string res; + auto ret = cs::deserialize(cs::serialize(std::vector{""}), res); + CHECK(ret == 0); + ret = cs::deserialize(cs::serialize(std::vector{"foo"}), res); + CHECK(ret == 0); + ret = cs::deserialize(cs::serialize(std::vector{"foo", "bar"}), res); + CHECK(ret == 0); + } + + SECTION("Deserializing string as container") { + std::vector res; + auto ret = cs::deserialize(cs::serialize("foo"), std::back_inserter(res)); + CHECK(ret == 0); + ret = cs::deserialize(cs::serialize("foo", "bar"), 
std::back_inserter(res)); + CHECK(ret == 0); + } + } + + SECTION("Deserializing too much") { + SECTION("Strings") { + std::string res1, res2; + auto ret = cs::deserialize(cs::serialize("foo"), res1, res2); + CHECK(ret == 0); + CHECK(res2.empty()); + } + + SECTION("Vector of strings") { + std::vector res1, res2; + auto ret = cs::deserialize( + cs::serialize(std::vector{"foo", "bar"}), std::back_inserter(res1), std::back_inserter(res2)); + CHECK(ret == 0); + CHECK(res2.empty()); + } + + SECTION("Mixture") { + std::string ress; + std::vector resv; + auto ret = cs::deserialize(cs::serialize("foo"), ress, std::back_inserter(resv)); + CHECK(ret == 0); + CHECK(resv.empty()); + ress.clear(); + + ret = cs::deserialize(cs::serialize(std::vector{"foo"}), std::back_inserter(resv), ress); + CHECK(ret == 0); + CHECK(ress.empty()); + } + } + } +}