diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 2f127b2..8aef98b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -16,6 +16,13 @@ jobs:
           - '98'
           - '11'
           - '20'
+        encoding:
+          - ICONV
+          - ICU
+        exclude:
+          # Exclude ICU on C++98, as modern ICU library requires at least C++11
+          - cpp-standard: '98'
+            encoding: ICU
     steps:
       - uses: actions/checkout@v4
       - name: Install GoogleTest
@@ -26,15 +33,20 @@ jobs:
         run: sudo apt-get update
       - name: Install packages
         run: sudo apt-get install -y iwyu valgrind
+      - name: Install ICU
+        run: sudo apt-get install -y libicu-dev
+        if: matrix.encoding == 'ICU'
       - name: Build
         env:
           CPP_STANDARD: ${{ matrix.cpp-standard }}
+          ENCODING_TYPE: ${{ matrix.encoding }}
           # This tells the C++ compiler to produce debugging info that Valgrind needs to report line numbers.
           # See also https://valgrind.org/docs/manual/manual-core.html#manual-core.started
           CMAKE_BUILD_TYPE: Debug
         run: |
           .build/build \
             -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
+            -DSTRING_ENCODING_TYPE="$ENCODING_TYPE" \
             -DCMAKE_CXX_STANDARD="$CPP_STANDARD" -DCMAKE_CXX_STANDARD_REQUIRED=ON -DCMAKE_CXX_EXTENSIONS=OFF \
             -DCMAKE_CXX_INCLUDE_WHAT_YOU_USE='include-what-you-use;-Xiwyu;--verbose=3'
       - name: Run tests
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9172ad6..fae5b89 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,6 +10,8 @@ set (CMAKE_INCLUDE_CURRENT_DIR ON)
 find_package(ZLIB)
 find_package(Iconv)
 
+find_package(ICU COMPONENTS uc)
+
 set (HEADERS
     kaitai/kaitaistream.h
     kaitai/kaitaistruct.h
@@ -20,7 +22,7 @@ set (SOURCES
     kaitai/kaitaistream.cpp
 )
 
-set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")
+set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|ICU|NONE|...)")
 
 set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
@@ -37,6 +39,10 @@ if(Iconv_FOUND)
     target_link_libraries(${PROJECT_NAME} PRIVATE Iconv::Iconv)
 endif()
 
+if(ICU_FOUND)
+    target_link_libraries(${PROJECT_NAME} PRIVATE ICU::uc)
+endif()
+
 include(Common.cmake)
 
 install(TARGETS ${PROJECT_NAME}
diff --git a/Common.cmake b/Common.cmake
index 31d8116..f26a8d2 100644
--- a/Common.cmake
+++ b/Common.cmake
@@ -2,6 +2,8 @@ if (STRING_ENCODING_TYPE STREQUAL "ICONV")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
 elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
+elseif (STRING_ENCODING_TYPE STREQUAL "ICU")
+    target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICU)
 elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
 else()
diff --git a/kaitai/kaitaistream.cpp b/kaitai/kaitaistream.cpp
index 2ad14a4..6efe77e 100644
--- a/kaitai/kaitaistream.cpp
+++ b/kaitai/kaitaistream.cpp
@@ -1197,6 +1197,99 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
     return utf8;
 }
 
+#elif defined(KS_STR_ENCODING_ICU)
+#include <unicode/ucnv.h>
+#include <unicode/ustring.h>
+
+#include <vector>
+
+/**
+ * Converts `src` from the encoding named by `src_enc` to UTF-8 using ICU,
+ * going through an intermediate UTF-16 buffer.
+ *
+ * @param src bytes to convert
+ * @param src_enc source encoding name, passed to `ucnv_open()`
+ * @return the UTF-8 bytes written by the second conversion step
+ * @throws unknown_encoding if opening the source converter fails with
+ *         `U_FILE_ACCESS_ERROR`
+ * @throws bytes_to_str_error if opening either converter fails for any other reason
+ * @throws illegal_seq_in_encoding if configuring a converter or either
+ *         conversion step fails
+ */
+std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
+    // Owns a converter and closes it on scope exit, so every return and every
+    // throw below (including allocation failures) releases it exactly once.
+    struct converter_guard {
+        UConverter* ptr;
+        explicit converter_guard(UConverter* p) : ptr(p) {}
+        ~converter_guard() {
+            if (ptr != nullptr) {
+                ucnv_close(ptr);
+            }
+        }
+        converter_guard(const converter_guard&) = delete;
+        converter_guard& operator=(const converter_guard&) = delete;
+    };
+
+    UErrorCode err = U_ZERO_ERROR;
+
+    // Open the source converter
+    const converter_guard conv(ucnv_open(src_enc, &err));
+    if (U_FAILURE(err)) {
+        if (err == U_FILE_ACCESS_ERROR) {
+            throw unknown_encoding(src_enc);
+        }
+        throw bytes_to_str_error(u_errorName(err));
+    }
+
+    // Open UTF-8 converter
+    const converter_guard utf8Conv(ucnv_open("UTF-8", &err));
+    if (U_FAILURE(err)) {
+        throw bytes_to_str_error(u_errorName(err));
+    }
+
+    // Configure source converter to stop on illegal sequences
+    err = U_ZERO_ERROR;
+    ucnv_setToUCallBack(conv.ptr, UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err);
+    if (U_FAILURE(err)) {
+        throw illegal_seq_in_encoding(u_errorName(err));
+    }
+
+    // Buffer for the UTF-16 intermediate representation; std::vector frees it
+    // on every exit path
+    const int32_t srcLength = static_cast<int32_t>(src.length());
+    const int32_t uniStrCapacity = UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(conv.ptr));
+    std::vector<UChar> uniStr(uniStrCapacity);
+
+    // Convert from source encoding to UTF-16
+    err = U_ZERO_ERROR;
+    const int32_t uniLength = ucnv_toUChars(
+        conv.ptr, uniStr.data(), uniStrCapacity, src.c_str(), srcLength, &err);
+    if (U_FAILURE(err)) {
+        throw illegal_seq_in_encoding(u_errorName(err));
+    }
+
+    // Configure target converter to stop on illegal sequences
+    err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(utf8Conv.ptr, UCNV_FROM_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err);
+    if (U_FAILURE(err)) {
+        throw illegal_seq_in_encoding(u_errorName(err));
+    }
+
+    // Buffer for the UTF-8 output
+    const int32_t dstCapacity = UCNV_GET_MAX_BYTES_FOR_STRING(uniLength, ucnv_getMaxCharSize(utf8Conv.ptr));
+    std::vector<char> dst(dstCapacity);
+
+    // Convert from UTF-16 to UTF-8
+    err = U_ZERO_ERROR;
+    const int32_t outputLength = ucnv_fromUChars(
+        utf8Conv.ptr, dst.data(), dstCapacity, uniStr.data(), uniLength, &err);
+    if (U_FAILURE(err)) {
+        throw illegal_seq_in_encoding(u_errorName(err));
+    }
+
+    return std::string(dst.data(), outputLength);
+}
 #else
-#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
+#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_ICU, KS_STR_ENCODING_NONE
 #endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 39e3c17..f91e7fd 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -23,4 +23,8 @@ target_compile_options(unittest PRIVATE
 # Link the test executable with the main library and the test framework/library
 target_link_libraries(unittest PRIVATE kaitai_struct_cpp_stl_runtime GTest::GTest GTest::Main)
 
+if(ICU_FOUND)
+    target_link_libraries(unittest PRIVATE ICU::uc)
+endif()
+
 add_test(NAME unittest COMMAND unittest)
diff --git a/tests/unittest.cpp b/tests/unittest.cpp
index 3a79694..2ed1454 100644
--- a/tests/unittest.cpp
+++ b/tests/unittest.cpp
@@ -540,6 +540,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_euc_jp_too_short)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -556,6 +558,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -581,6 +585,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_two_bytes)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EILSEQ"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_ILLEGAL_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -598,6 +604,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf16le_odd_bytes)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: incomplete"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -616,6 +624,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf16le_incomplete_high_surrogat
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: WideCharToMultiByte"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -623,8 +633,23 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf16le_incomplete_high_surrogat
 }
 #endif
 
+#if defined(KS_STR_ENCODING_ICU)
+#include <unicode/uclean.h>
+#endif
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
-    return RUN_ALL_TESTS();
+    const int ret = RUN_ALL_TESTS();
+#if defined(KS_STR_ENCODING_ICU)
+    // See the ICU documentation:
+    //
+    // > When an application is terminating it should call the function `u_cleanup()`,
+    // > which frees all heap storage and other system resources that are held internally
+    // > by the ICU library. While the use of `u_cleanup()` is not strictly required,
+    // > failure to call it will cause memory leak checking tools to report problems for
+    // > resources being held by ICU library.
+    u_cleanup();
+#endif
+    return ret;
 }