From a74d83eb26282d127612f6fe8d65377c0abb9d14 Mon Sep 17 00:00:00 2001 From: jinhelin Date: Mon, 10 Mar 2025 12:32:01 +0800 Subject: [PATCH] tests: SegmentBitmapFilterTest supports common handle (#9954) ref pingcap/tiflash#9963 - Make `SegmentBitmapFilterTest` support common handle. - Introduce parameter `including_right_boundary` in some test utils to support generating test data with `std::numeric_limits::max()`. Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../DeltaMerge/VersionChain/ColumnView.h | 158 +++++++ .../src/Storages/DeltaMerge/tests/DMTestEnv.h | 47 +- .../tests/gtest_dm_vector_index.cpp | 28 +- .../DeltaMerge/tests/gtest_segment_bitmap.cpp | 436 +++++++++++------- .../tests/gtest_segment_test_basic.cpp | 244 ++++++++-- .../tests/gtest_segment_test_basic.h | 46 +- .../DeltaMerge/tests/gtest_segment_util.cpp | 142 ++++-- .../DeltaMerge/tests/gtest_segment_util.h | 35 +- .../gtest_skippable_block_input_stream.cpp | 23 +- 9 files changed, 864 insertions(+), 295 deletions(-) create mode 100644 dbms/src/Storages/DeltaMerge/VersionChain/ColumnView.h diff --git a/dbms/src/Storages/DeltaMerge/VersionChain/ColumnView.h b/dbms/src/Storages/DeltaMerge/VersionChain/ColumnView.h new file mode 100644 index 00000000000..0220432f018 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/VersionChain/ColumnView.h @@ -0,0 +1,158 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace DB::DM +{ +// `ColumnView` is a class that provides unified access to both Int64 handles and String handles. +template +class ColumnView +{ + static_assert(false, "Only support Int64 and String"); +}; + +template <> +class ColumnView +{ +public: + ColumnView(const IColumn & col) + : data(toColumnVectorData(col)) + {} + + auto begin() const { return data.begin(); } + + auto end() const { return data.end(); } + + Int64 operator[](size_t index) const + { + assert(index < data.size()); + return data[index]; + } + + size_t size() const { return data.size(); } + +private: + const PaddedPODArray & data; +}; + +template <> +class ColumnView +{ +public: + ColumnView(const IColumn & col) + : offsets(typeid_cast(col).getOffsets()) + , chars(typeid_cast(col).getChars()) + {} + + class Iterator + { + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = std::string_view; + using difference_type = std::ptrdiff_t; + + Iterator(const IColumn::Offsets & offsets, const ColumnString::Chars_t & chars, size_t pos) + : pos(pos) + , offsets(&offsets) + , chars(&chars) + {} + + value_type operator*() const + { + assert((*offsets)[-1] == 0); + const auto off = (*offsets)[pos - 1]; + const auto size = (*offsets)[pos] - (*offsets)[pos - 1] - 1; + return std::string_view(reinterpret_cast(chars->data() + off), size); + } + + Iterator operator+(difference_type n) { return Iterator{*offsets, *chars, pos + n}; } + + Iterator operator-(difference_type n) { return Iterator{*offsets, *chars, pos - n}; } + + difference_type operator-(const Iterator & other) const { return pos - other.pos; } + + Iterator & operator++() + { + ++pos; + return *this; + } + + Iterator & operator--() + { + --pos; + return *this; + } + + Iterator operator++(int) + { + Iterator tmp = *this; + ++pos; + return tmp; + } + + Iterator operator--(int) + { + Iterator tmp = *this; + --pos; + return tmp; + } + + Iterator & operator+=(difference_type n) + { + pos += n; + return *this; + } + + Iterator & operator-=(difference_type n) + { + pos -= n; + return *this; + } + + // Perform a lexicographic comparison of elements. + // Assume `this->offsets == other.offsets && this->chars == other.chars`, + // so it equal to `this->pos <=> other.pos`. + auto operator<=>(const Iterator & other) const = default; + + private: + size_t pos = 0; + const IColumn::Offsets * offsets; // Using pointer for operator assignment + const ColumnString::Chars_t * chars; + }; + + auto begin() const { return Iterator(offsets, chars, 0); } + + auto end() const { return Iterator(offsets, chars, offsets.size()); } + + std::string_view operator[](size_t index) const + { + assert(index < offsets.size()); + const auto off = offsets[index - 1]; + const auto size = offsets[index] - offsets[index - 1] - 1; + return std::string_view(reinterpret_cast(chars.data() + off), size); + } + + size_t size() const { return offsets.size(); } + +private: + const IColumn::Offsets & offsets; + const ColumnString::Chars_t & chars; +}; + +} // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h b/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h index fadeacd785e..6b737c8fe82 100644 --- a/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h +++ b/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h @@ -31,11 +31,7 @@ #include -namespace DB -{ -namespace DM -{ -namespace tests +namespace DB::DM::tests { #define GET_REGION_RANGE(start, end, table_id) \ RowKeyRange::fromHandleRange(::DB::DM::HandleRange((start), (end))).toRegionRange((table_id)) @@ -112,6 +108,14 @@ inline String genMockCommonHandle(Int64 value, size_t rowkey_column_size) return ss.releaseStr(); } +inline Int64 decodeMockCommonHandle(const String & s) +{ + size_t cursor = 0; + auto flag = ::DB::DecodeUInt(cursor, s); + RUNTIME_CHECK(flag == static_cast(TiDB::CodecFlagInt), flag); + return ::DB::DecodeInt64(cursor, s); +} + class DMTestEnv { public: @@ -305,36 +309,32 @@ class DMTestEnv size_t rowkey_column_size = 1, bool with_internal_columns = true, bool is_deleted = false, - bool with_nullable_uint64 = false) + bool with_nullable_uint64 = false, + bool including_right_boundary = false) // [beg, end) or [beg, end] { Block block; - const size_t num_rows = (end - beg); + const size_t num_rows = (end - beg) + including_right_boundary; + std::vector handles(num_rows); + std::iota(handles.begin(), handles.end(), beg); + if (reversed) + std::reverse(handles.begin(), handles.end()); if (is_common_handle) { - // common_pk_col Strings values; - for (size_t i = 0; i < num_rows; i++) - { - Int64 value = reversed ? end - 1 - i : beg + i; - values.emplace_back(genMockCommonHandle(value, rowkey_column_size)); - } + for (Int64 h : handles) + values.emplace_back(genMockCommonHandle(h, rowkey_column_size)); block.insert(DB::tests::createColumn(std::move(values), pk_name_, pk_col_id)); } else { // int-like pk_col - block.insert(ColumnWithTypeAndName{ - DB::tests::makeColumn(pk_type, createNumbers(beg, end, reversed)), - pk_type, - pk_name_, - pk_col_id}); + block.insert( + ColumnWithTypeAndName{DB::tests::makeColumn(pk_type, handles), pk_type, pk_name_, pk_col_id}); // add extra column if need if (pk_col_id != MutSup::extra_handle_id) { block.insert(ColumnWithTypeAndName{ - DB::tests::makeColumn( - MutSup::getExtraHandleColumnIntType(), - createNumbers(beg, end, reversed)), + DB::tests::makeColumn(MutSup::getExtraHandleColumnIntType(), handles), MutSup::getExtraHandleColumnIntType(), MutSup::extra_handle_column_name, MutSup::extra_handle_id}); @@ -568,7 +568,4 @@ class DMTestEnv return num++; } }; - -} // namespace tests -} // namespace DM -} // namespace DB +} // namespace DB::DM::tests diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index.cpp index 7bfee0b8e8d..bf5eefd293d 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index.cpp @@ -1196,7 +1196,7 @@ class VectorIndexSegmentTestBase ColumnDefines columns_to_read, ANNQueryInfoPtr ann_query) { - auto range = buildRowKeyRange(begin, end); + auto range = buildRowKeyRange(begin, end, /*is_common_handle*/ false); auto [segment, snapshot] = getSegmentForRead(segment_id); // load DMilePackFilterResult for each DMFile DMFilePackFilterResults pack_filter_results; @@ -1228,9 +1228,15 @@ class VectorIndexSegmentTestBase ColumnDefine cdPK() { return getExtraHandleColumnDefine(options.is_common_handle); } protected: - Block prepareWriteBlockImpl(Int64 start_key, Int64 end_key, bool is_deleted) override - { - auto block = SegmentTestBasic::prepareWriteBlockImpl(start_key, end_key, is_deleted); + Block prepareWriteBlockImpl( + Int64 start_key, + Int64 end_key, + bool is_deleted, + bool including_right_boundary, + std::optional ts) override + { + auto block + = SegmentTestBasic::prepareWriteBlockImpl(start_key, end_key, is_deleted, including_right_boundary, ts); block.insert(colVecFloat32(fmt::format("[{}, {})", start_key, end_key), vec_column_name, vec_column_id)); return block; } @@ -1666,9 +1672,19 @@ class VectorIndexSegmentExtraColumnTest return ColumnDefine(extra_column_id, extra_column_name, tests::typeFromString("Int64")); } - Block prepareWriteBlockImpl(Int64 start_key, Int64 end_key, bool is_deleted) override + Block prepareWriteBlockImpl( + Int64 start_key, + Int64 end_key, + bool is_deleted, + bool including_right_boundary, + std::optional ts) override { - auto block = VectorIndexSegmentTestBase::prepareWriteBlockImpl(start_key, end_key, is_deleted); + auto block = VectorIndexSegmentTestBase::prepareWriteBlockImpl( + start_key, + end_key, + is_deleted, + including_right_boundary, + ts); block.insert( colInt64(fmt::format("[{}, {})", start_key + 1000, end_key + 1000), extra_column_name, extra_column_id)); return block; diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_bitmap.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_bitmap.cpp index aadbf595707..5831561028d 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_bitmap.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_bitmap.cpp @@ -17,21 +17,30 @@ #include #include #include +#include #include #include #include #include #include - using namespace std::chrono_literals; using namespace DB::tests; namespace DB::DM::tests { -class SegmentBitmapFilterTest : public SegmentTestBasic +class SegmentBitmapFilterTest + : public SegmentTestBasic + , public testing::WithParamInterface { +public: + void SetUp() override + { + is_common_handle = GetParam(); + SegmentTestBasic::SetUp(SegmentTestOptions{.is_common_handle = is_common_handle}); + } + protected: DB::LoggerPtr log = DB::Logger::get("SegmentBitmapFilterTest"); static constexpr auto SEG_ID = DELTA_MERGE_FIRST_SEGMENT_ID; @@ -39,11 +48,21 @@ class SegmentBitmapFilterTest : public SegmentTestBasic ColumnPtr hold_handle; RowKeyRanges read_ranges; - void setRowKeyRange(Int64 begin, Int64 end) + void setRowKeyRange(Int64 begin, Int64 end, bool including_right_boundary) { auto itr = segments.find(SEG_ID); RUNTIME_CHECK(itr != segments.end(), SEG_ID); - itr->second->rowkey_range = buildRowKeyRange(begin, end); + itr->second->rowkey_range = buildRowKeyRange(begin, end, is_common_handle, including_right_boundary); + } + + void writeSegmentGeneric( + std::string_view seg_data, + std::optional> rowkey_range = std::nullopt) + { + if (is_common_handle) + writeSegment(seg_data, rowkey_range); + else + writeSegment(seg_data, rowkey_range); } /* @@ -61,12 +80,16 @@ class SegmentBitmapFilterTest : public SegmentTestBasic Returns {row_id, handle}. */ - std::pair *, const PaddedPODArray *> writeSegment( + template + std::pair *, const std::optional>> writeSegment( std::string_view seg_data, - std::optional> rowkey_range = std::nullopt) + std::optional> rowkey_range = std::nullopt) { if (rowkey_range) - setRowKeyRange(rowkey_range->first, rowkey_range->second); + { + const auto & [left, right, including_right_boundary] = *rowkey_range; + setRowKeyRange(left, right, including_right_boundary); + } auto seg_data_units = parseSegData(seg_data); for (const auto & unit : seg_data_units) { @@ -77,43 +100,42 @@ class SegmentBitmapFilterTest : public SegmentTestBasic if (hold_row_id == nullptr) { RUNTIME_CHECK(hold_handle == nullptr); - return {nullptr, nullptr}; + return {nullptr, std::nullopt}; } else { - RUNTIME_CHECK(hold_handle != nullptr); - return {toColumnVectorDataPtr(hold_row_id), toColumnVectorDataPtr(hold_handle)}; + return {toColumnVectorDataPtr(hold_row_id), ColumnView(*hold_handle)}; } } void writeSegment(const SegDataUnit & unit) { const auto & type = unit.type; - auto [begin, end] = unit.range; - + auto [begin, end, including_right_boundary] = unit.range; + const auto write_count = end - begin + including_right_boundary; if (type == "d_mem") { - SegmentTestBasic::writeSegment(SEG_ID, end - begin, begin); + SegmentTestBasic::writeToCache(SEG_ID, write_count, begin, unit.shuffle, unit.ts); } else if (type == "d_mem_del") { - SegmentTestBasic::writeSegmentWithDeletedPack(SEG_ID, end - begin, begin); + SegmentTestBasic::writeSegmentWithDeletedPack(SEG_ID, write_count, begin); } else if (type == "d_tiny") { - SegmentTestBasic::writeSegment(SEG_ID, end - begin, begin); + SegmentTestBasic::writeSegment(SEG_ID, write_count, begin, unit.shuffle, unit.ts); SegmentTestBasic::flushSegmentCache(SEG_ID); } else if (type == "d_tiny_del") { - SegmentTestBasic::writeSegmentWithDeletedPack(SEG_ID, end - begin, begin); + SegmentTestBasic::writeSegmentWithDeletedPack(SEG_ID, write_count, begin); SegmentTestBasic::flushSegmentCache(SEG_ID); } else if (type == "d_big") { SegmentTestBasic::ingestDTFileIntoDelta( SEG_ID, - end - begin, + write_count, begin, false, unit.pack_size, @@ -121,18 +143,29 @@ class SegmentBitmapFilterTest : public SegmentTestBasic } else if (type == "d_dr") { - SegmentTestBasic::writeSegmentWithDeleteRange(SEG_ID, begin, end); + SegmentTestBasic::writeSegmentWithDeleteRange( + SEG_ID, + begin, + end, + is_common_handle, + including_right_boundary); } else if (type == "s") { - SegmentTestBasic::writeSegment(SEG_ID, end - begin, begin); - if (unit.pack_size) - { - db_context->getSettingsRef().dt_segment_stable_pack_rows = *(unit.pack_size); - reloadDMContext(); - ASSERT_EQ(dm_context->stable_pack_rows, *(unit.pack_size)); - } - SegmentTestBasic::mergeSegmentDelta(SEG_ID); + SegmentTestBasic::writeSegment(SEG_ID, write_count, begin, unit.shuffle, unit.ts); + SegmentTestBasic::mergeSegmentDelta(SEG_ID, /*check_rows*/ true, unit.pack_size); + } + else if (type == "compact_delta") + { + SegmentTestBasic::compactSegmentDelta(SEG_ID); + } + else if (type == "flush_cache") + { + SegmentTestBasic::flushSegmentCache(SEG_ID); + } + else if (type == "merge_delta") + { + SegmentTestBasic::mergeSegmentDelta(SEG_ID, /*check_rows*/ true, unit.pack_size); } else { @@ -142,42 +175,40 @@ class SegmentBitmapFilterTest : public SegmentTestBasic struct TestCase { - TestCase( - std::string_view seg_data_, - size_t expected_size_, - std::string_view expected_row_id_, - std::string_view expected_handle_, - std::optional> rowkey_range_ = std::nullopt) - : seg_data(seg_data_) - , expected_size(expected_size_) - , expected_row_id(expected_row_id_) - , expected_handle(expected_handle_) - , rowkey_range(rowkey_range_) - {} std::string seg_data; size_t expected_size; std::string expected_row_id; std::string expected_handle; - std::optional> rowkey_range; + std::optional> rowkey_range; }; - void runTestCase(TestCase test_case) + void runTestCaseGeneric(TestCase test_case, int caller_line) + { + if (is_common_handle) + runTestCase(test_case, caller_line); + else + runTestCase(test_case, caller_line); + } + + template + void runTestCase(TestCase test_case, int caller_line) { - auto [row_id, handle] = writeSegment(test_case.seg_data, test_case.rowkey_range); + auto info = fmt::format("caller_line={}", caller_line); + auto [row_id, handle] = writeSegment(test_case.seg_data, test_case.rowkey_range); if (test_case.expected_size == 0) { - ASSERT_EQ(nullptr, row_id); - ASSERT_EQ(nullptr, handle); + ASSERT_EQ(nullptr, row_id) << info; + ASSERT_EQ(std::nullopt, handle) << info; } else { - ASSERT_EQ(test_case.expected_size, row_id->size()); + ASSERT_EQ(test_case.expected_size, row_id->size()) << info; auto expected_row_id = genSequence(test_case.expected_row_id); - ASSERT_TRUE(sequenceEqual(expected_row_id, *row_id)); + ASSERT_TRUE(sequenceEqual(expected_row_id, *row_id)) << info; - ASSERT_EQ(test_case.expected_size, handle->size()); - auto expected_handle = genSequence(test_case.expected_handle); - ASSERT_TRUE(sequenceEqual(expected_handle, *handle)); + ASSERT_EQ(test_case.expected_size, handle->size()) << info; + auto expected_handle = genHandleSequence(test_case.expected_handle); + ASSERT_TRUE(sequenceEqual(expected_handle, *handle)) << info; } } @@ -192,176 +223,264 @@ class SegmentBitmapFilterTest : public SegmentTestBasic } return results; } + + void checkHandle(PageIdU64 seg_id, std::string_view seq_ranges, int caller_line) + { + auto info = fmt::format("caller_line={}", caller_line); + auto handle = getSegmentHandle(seg_id, {}); + if (is_common_handle) + { + auto expected_handle = genHandleSequence(seq_ranges); + ASSERT_TRUE(sequenceEqual(expected_handle, ColumnView{*handle})) << info; + } + else + { + auto expected_handle = genHandleSequence(seq_ranges); + ASSERT_TRUE(sequenceEqual(expected_handle, ColumnView{*handle})) << info; + } + } + + bool is_common_handle = false; }; -TEST_F(SegmentBitmapFilterTest, InMemory1) +INSTANTIATE_TEST_CASE_P(MVCC, SegmentBitmapFilterTest, /* is_common_handle */ ::testing::Bool()); + +TEST_P(SegmentBitmapFilterTest, InMemory1) try { - runTestCase(TestCase("d_mem:[0, 1000)", 1000, "[0, 1000)", "[0, 1000)")); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_mem:[0, 1000)", + .expected_size = 1000, + .expected_row_id = "[0, 1000)", + .expected_handle = "[0, 1000)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, InMemory2) +TEST_P(SegmentBitmapFilterTest, InMemory2) try { - runTestCase(TestCase{"d_mem:[0, 1000)|d_mem:[0, 1000)", 1000, "[1000, 2000)", "[0, 1000)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_mem:[0, 1000)|d_mem:[0, 1000)", + .expected_size = 1000, + .expected_row_id = "[1000, 2000)", + .expected_handle = "[0, 1000)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, InMemory3) +TEST_P(SegmentBitmapFilterTest, InMemory3) try { - runTestCase(TestCase{"d_mem:[0, 1000)|d_mem:[100, 200)", 1000, "[0, 100)|[1000, 1100)|[200, 1000)", "[0, 1000)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_mem:[0, 1000)|d_mem:[100, 200)", + .expected_size = 1000, + .expected_row_id = "[0, 100)|[1000, 1100)|[200, 1000)", + .expected_handle = "[0, 1000)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, InMemory4) +TEST_P(SegmentBitmapFilterTest, InMemory4) try { - runTestCase(TestCase{"d_mem:[0, 1000)|d_mem:[-100, 100)", 1100, "[1000, 1200)|[100, 1000)", "[-100, 1000)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_mem:[0, 1000)|d_mem:[-100, 100)", + .expected_size = 1100, + .expected_row_id = "[1000, 1200)|[100, 1000)", + .expected_handle = "[-100, 1000)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, InMemory5) +TEST_P(SegmentBitmapFilterTest, InMemory5) try { - runTestCase(TestCase{"d_mem:[0, 1000)|d_mem_del:[0, 1000)", 0, "", ""}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_mem:[0, 1000)|d_mem_del:[0, 1000)", + .expected_size = 0, + .expected_row_id = "", + .expected_handle = ""}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, InMemory6) +TEST_P(SegmentBitmapFilterTest, InMemory6) try { - runTestCase(TestCase{"d_mem:[0, 1000)|d_mem_del:[100, 200)", 900, "[0, 100)|[200, 1000)", "[0, 100)|[200, 1000)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_mem:[0, 1000)|d_mem_del:[100, 200)", + .expected_size = 900, + .expected_row_id = "[0, 100)|[200, 1000)", + .expected_handle = "[0, 100)|[200, 1000)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, InMemory7) +TEST_P(SegmentBitmapFilterTest, InMemory7) try { - runTestCase(TestCase{"d_mem:[0, 1000)|d_mem_del:[-100, 100)", 900, "[100, 1000)", "[100, 1000)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_mem:[0, 1000)|d_mem_del:[-100, 100)", + .expected_size = 900, + .expected_row_id = "[100, 1000)", + .expected_handle = "[100, 1000)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, Tiny1) +TEST_P(SegmentBitmapFilterTest, Tiny1) try { - runTestCase(TestCase{"d_tiny:[100, 500)|d_mem:[200, 1000)", 900, "[0, 100)|[400, 1200)", "[100, 1000)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_tiny:[100, 500)|d_mem:[200, 1000)", + .expected_size = 900, + .expected_row_id = "[0, 100)|[400, 1200)", + .expected_handle = "[100, 1000)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, TinyDel1) +TEST_P(SegmentBitmapFilterTest, TinyDel1) try { - runTestCase(TestCase{ - "d_tiny:[100, 500)|d_tiny_del:[200, 300)|d_mem:[0, 100)", - 400, - "[500, 600)|[0, 100)|[200, 400)", - "[0, 200)|[300, 500)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_tiny:[100, 500)|d_tiny_del:[200, 300)|d_mem:[0, 100)", + .expected_size = 400, + .expected_row_id = "[500, 600)|[0, 100)|[200, 400)", + .expected_handle = "[0, 200)|[300, 500)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, DeleteRange) +TEST_P(SegmentBitmapFilterTest, DeleteRange) try { - runTestCase(TestCase{ - "d_tiny:[100, 500)|d_dr:[250, 300)|d_mem:[240, 290)", - 390, - "[0, 140)|[400, 450)|[200, 400)", - "[100, 290)|[300, 500)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_tiny:[100, 500)|d_dr:[250, 300)|d_mem:[240, 290)", + .expected_size = 390, + .expected_row_id = "[0, 140)|[400, 450)|[200, 400)", + .expected_handle = "[100, 290)|[300, 500)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, Big) +TEST_P(SegmentBitmapFilterTest, Big) try { - runTestCase(TestCase{ - "d_tiny:[100, 500)|d_big:[250, 1000)|d_mem:[240, 290)", - 900, - "[0, 140)|[1150, 1200)|[440, 1150)", - "[100, 1000)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_tiny:[100, 500)|d_big:[250, 1000)|d_mem:[240, 290)", + .expected_size = 900, + .expected_row_id = "[0, 140)|[1150, 1200)|[440, 1150)", + .expected_handle = "[100, 1000)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, Stable1) +TEST_P(SegmentBitmapFilterTest, Stable1) try { - runTestCase(TestCase{"s:[0, 1024)", 1024, "[0, 1024)", "[0, 1024)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "s:[0, 1024)", + .expected_size = 1024, + .expected_row_id = "[0, 1024)", + .expected_handle = "[0, 1024)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, Stable2) +TEST_P(SegmentBitmapFilterTest, Stable2) try { - runTestCase(TestCase{"s:[0, 1024)|d_dr:[0, 1023)", 1, "[1023, 1024)", "[1023, 1024)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "s:[0, 1024)|d_dr:[0, 1023)", + .expected_size = 1, + .expected_row_id = "[1023, 1024)", + .expected_handle = "[1023, 1024)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, Stable3) +TEST_P(SegmentBitmapFilterTest, Stable3) try { - runTestCase(TestCase{ - "s:[0, 1024)|d_dr:[128, 256)|d_tiny_del:[300, 310)", - 886, - "[0, 128)|[256, 300)|[310, 1024)", - "[0, 128)|[256, 300)|[310, 1024)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "s:[0, 1024)|d_dr:[128, 256)|d_tiny_del:[300, 310)", + .expected_size = 886, + .expected_row_id = "[0, 128)|[256, 300)|[310, 1024)", + .expected_handle = "[0, 128)|[256, 300)|[310, 1024)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, Mix) +TEST_P(SegmentBitmapFilterTest, Mix) try { - runTestCase(TestCase{ - "s:[0, 1024)|d_dr:[128, 256)|d_tiny_del:[300, 310)|d_tiny:[200, 255)|d_mem:[298, 305)", - 946, - "[0, 128)|[1034, 1089)|[256, 298)|[1089, 1096)|[310, 1024)", - "[0, 128)|[200, 255)|[256, 305)|[310, 1024)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "s:[0, 1024)|d_dr:[128, 256)|d_tiny_del:[300, 310)|d_tiny:[200, 255)|d_mem:[298, 305)", + .expected_size = 946, + .expected_row_id = "[0, 128)|[1034, 1089)|[256, 298)|[1089, 1096)|[310, 1024)", + .expected_handle = "[0, 128)|[200, 255)|[256, 305)|[310, 1024)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, Ranges) +TEST_P(SegmentBitmapFilterTest, Ranges) try { - read_ranges.emplace_back(buildRowKeyRange(222, 244)); - read_ranges.emplace_back(buildRowKeyRange(300, 303)); - read_ranges.emplace_back(buildRowKeyRange(555, 666)); - runTestCase(TestCase{ - "s:[0, 1024)|d_dr:[128, 256)|d_tiny_del:[300, 310)|d_tiny:[200, 255)|d_mem:[298, 305)", - 136, - "[1056, 1078)|[1091, 1094)|[555, 666)", - "[222, 244)|[300, 303)|[555, 666)"}); + read_ranges.emplace_back(buildRowKeyRange(222, 244, is_common_handle)); + read_ranges.emplace_back(buildRowKeyRange(300, 303, is_common_handle)); + read_ranges.emplace_back(buildRowKeyRange(555, 666, is_common_handle)); + runTestCaseGeneric( + TestCase{ + .seg_data = "s:[0, 1024)|d_dr:[128, 256)|d_tiny_del:[300, 310)|d_tiny:[200, 255)|d_mem:[298, 305)", + .expected_size = 136, + .expected_row_id = "[1056, 1078)|[1091, 1094)|[555, 666)", + .expected_handle = "[222, 244)|[300, 303)|[555, 666)"}, + __LINE__); } CATCH -TEST_F(SegmentBitmapFilterTest, LogicalSplit) +TEST_P(SegmentBitmapFilterTest, LogicalSplit) try { - runTestCase(TestCase{ - "s:[0, 1024)|d_dr:[128, 256)|d_tiny_del:[300, 310)|d_tiny:[200, 255)|d_mem:[298, 305)", - 946, - "[0, 128)|[1034, 1089)|[256, 298)|[1089, 1096)|[310, 1024)", - "[0, 128)|[200, 255)|[256, 305)|[310, 1024)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "s:[0, 1024)|d_dr:[128, 256)|d_tiny_del:[300, 310)|d_tiny:[200, 255)|d_mem:[298, 305)", + .expected_size = 946, + .expected_row_id = "[0, 128)|[1034, 1089)|[256, 298)|[1089, 1096)|[310, 1024)", + .expected_handle = "[0, 128)|[200, 255)|[256, 305)|[310, 1024)"}, + __LINE__); auto new_seg_id = splitSegmentAt(SEG_ID, 512, Segment::SplitMode::Logical); ASSERT_TRUE(new_seg_id.has_value()); ASSERT_TRUE(areSegmentsSharingStable({SEG_ID, *new_seg_id})); - auto left_handle = getSegmentHandle(SEG_ID, {}); - const auto & left_h = toColumnVectorData(left_handle); - auto expected_left_handle = genSequence("[0, 128)|[200, 255)|[256, 305)|[310, 512)"); - ASSERT_TRUE(sequenceEqual(expected_left_handle, left_h)); + checkHandle(SEG_ID, "[0, 128)|[200, 255)|[256, 305)|[310, 512)", __LINE__); auto left_row_id = getSegmentRowId(SEG_ID, {}); const auto & left_r = toColumnVectorData(left_row_id); auto expected_left_row_id = genSequence("[0, 128)|[1034, 1089)|[256, 298)|[1089, 1096)|[310, 512)"); ASSERT_TRUE(sequenceEqual(expected_left_row_id, left_r)); - auto right_handle = getSegmentHandle(*new_seg_id, {}); - const auto & right_h = toColumnVectorData(right_handle); - auto expected_right_handle = genSequence("[512, 1024)"); - ASSERT_TRUE(sequenceEqual(expected_right_handle, right_h)); + checkHandle(*new_seg_id, "[512, 1024)", __LINE__); auto right_row_id = getSegmentRowId(*new_seg_id, {}); const auto & right_r = toColumnVectorData(right_row_id); @@ -370,9 +489,9 @@ try } CATCH -TEST_F(SegmentBitmapFilterTest, CleanStable) +TEST_P(SegmentBitmapFilterTest, CleanStable) { - writeSegment("d_mem:[0, 20000)|d_mem:[30000, 35000)"); + writeSegmentGeneric("d_mem:[0, 20000)|d_mem:[30000, 35000)"); mergeSegmentDelta(SEG_ID, true); auto [seg, snap] = getSegmentForRead(SEG_ID); ASSERT_EQ(seg->getDelta()->getRows(), 0); @@ -391,9 +510,9 @@ TEST_F(SegmentBitmapFilterTest, CleanStable) ASSERT_EQ(bitmap_filter->toDebugString(), expect_result); } -TEST_F(SegmentBitmapFilterTest, NotCleanStable) +TEST_P(SegmentBitmapFilterTest, NotCleanStable) { - writeSegment("d_mem:[0, 10000)|d_mem:[5000, 15000)"); + writeSegmentGeneric("d_mem:[0, 10000)|d_mem:[5000, 15000)"); mergeSegmentDelta(SEG_ID, true); auto [seg, snap] = getSegmentForRead(SEG_ID); ASSERT_EQ(seg->getDelta()->getRows(), 0); @@ -439,16 +558,16 @@ TEST_F(SegmentBitmapFilterTest, NotCleanStable) } } -TEST_F(SegmentBitmapFilterTest, StableRange) +TEST_P(SegmentBitmapFilterTest, StableRange) { - writeSegment("d_mem:[0, 50000)"); + writeSegmentGeneric("d_mem:[0, 50000)"); mergeSegmentDelta(SEG_ID, true); auto [seg, snap] = getSegmentForRead(SEG_ID); ASSERT_EQ(seg->getDelta()->getRows(), 0); ASSERT_EQ(seg->getDelta()->getDeletes(), 0); ASSERT_EQ(seg->getStable()->getRows(), 50000); - auto ranges = std::vector{buildRowKeyRange(10000, 50000)}; // [10000, 50000) + auto ranges = std::vector{buildRowKeyRange(10000, 50000, is_common_handle)}; // [10000, 50000) auto bitmap_filter = seg->buildBitmapFilterStableOnly( *dm_context, snap, @@ -464,10 +583,10 @@ TEST_F(SegmentBitmapFilterTest, StableRange) ASSERT_EQ(bitmap_filter->toDebugString(), expect_result); } -TEST_F(SegmentBitmapFilterTest, StableLogicalSplit) +TEST_P(SegmentBitmapFilterTest, StableLogicalSplit) try { - writeSegment("d_mem:[0, 50000)"); + writeSegmentGeneric("d_mem:[0, 50000)"); mergeSegmentDelta(SEG_ID, true); auto [seg, snap] = getSegmentForRead(SEG_ID); ASSERT_EQ(seg->getDelta()->getRows(), 0); @@ -479,20 +598,14 @@ try ASSERT_TRUE(new_seg_id.has_value()); ASSERT_TRUE(areSegmentsSharingStable({SEG_ID, *new_seg_id})); - auto left_handle = getSegmentHandle(SEG_ID, {}); - const auto & left_h = toColumnVectorData(left_handle); - auto expected_left_handle = genSequence("[0, 25000)"); - ASSERT_TRUE(sequenceEqual(expected_left_handle, left_h)); + checkHandle(SEG_ID, "[0, 25000)", __LINE__); auto left_row_id = getSegmentRowId(SEG_ID, {}); const auto & left_r = toColumnVectorData(left_row_id); auto expected_left_row_id = genSequence("[0, 25000)"); ASSERT_TRUE(sequenceEqual(expected_left_row_id, left_r)); - auto right_handle = getSegmentHandle(*new_seg_id, {}); - const auto & right_h = toColumnVectorData(right_handle); - auto expected_right_handle = genSequence("[25000, 50000)"); - ASSERT_TRUE(sequenceEqual(expected_right_handle, right_h)); + checkHandle(*new_seg_id, "[25000, 50000)", __LINE__); auto right_row_id = getSegmentRowId(*new_seg_id, {}); const auto & right_r = toColumnVectorData(right_row_id); @@ -501,17 +614,19 @@ try } CATCH -TEST_F(SegmentBitmapFilterTest, BigPart) +TEST_P(SegmentBitmapFilterTest, BigPart) try { // For ColumnFileBig, only packs that intersection with the rowkey range will be considered in BitmapFilter. // Packs in rowkey_range: [270, 280)|[280, 290)|[290, 300) - runTestCase(TestCase{ - /*seg_data*/ "d_big:[250, 1000):10", - /*expected_size*/ 20, - /*expected_row_id*/ "[5, 25)", - /*expected_handle*/ "[275, 295)", - /*rowkey_range*/ std::pair{275, 295}}); + runTestCaseGeneric( + TestCase{ + .seg_data = "d_big:[250, 1000):pack_size_10", + .expected_size = 20, + .expected_row_id = "[5, 25)", + .expected_handle = "[275, 295)", + .rowkey_range = std::tuple{275, 295, false}}, + __LINE__); auto [seg, snap] = getSegmentForRead(SEG_ID); auto bitmap_filter = seg->buildBitmapFilter( @@ -527,14 +642,16 @@ try } CATCH -TEST_F(SegmentBitmapFilterTest, StablePart) +TEST_P(SegmentBitmapFilterTest, StablePart) try { - runTestCase(TestCase{ - /*seg_data*/ "s:[250, 1000):10", - /*expected_size*/ 750, - /*expected_row_id*/ "[0, 750)", - /*expected_handle*/ "[250, 1000)"}); + runTestCaseGeneric( + TestCase{ + .seg_data = "s:[250, 1000):pack_size_10", + .expected_size = 750, + .expected_row_id = "[0, 750)", + .expected_handle = "[250, 1000)"}, + __LINE__); { auto [seg, snap] = getSegmentForRead(SEG_ID); @@ -542,7 +659,7 @@ try } // For Stable, all packs of DMFile will be considered in BitmapFilter. - setRowKeyRange(275, 295); // Shrinking range + setRowKeyRange(275, 295, false); // Shrinking range auto [seg, snap] = getSegmentForRead(SEG_ID); auto bitmap_filter = seg->buildBitmapFilter( *dm_context, @@ -565,7 +682,7 @@ try } CATCH -TEST_F(SegmentBitmapFilterTest, testSkipPackStableOnly) +TEST_P(SegmentBitmapFilterTest, testSkipPackStableOnly) { std::string expect_result; expect_result.append(std::string(200, '0')); @@ -581,14 +698,14 @@ TEST_F(SegmentBitmapFilterTest, testSkipPackStableOnly) reloadDMContext(); version = 0; - writeSegment("d_mem:[0, 1000)|d_mem:[500, 1500)|d_mem:[1500, 2000)"); + writeSegmentGeneric("d_mem:[0, 1000)|d_mem:[500, 1500)|d_mem:[1500, 2000)"); mergeSegmentDelta(SEG_ID, true); auto [seg, snap] = getSegmentForRead(SEG_ID); ASSERT_EQ(seg->getDelta()->getRows(), 0); ASSERT_EQ(seg->getDelta()->getDeletes(), 0); ASSERT_EQ(seg->getStable()->getRows(), 2500); - auto ranges = std::vector{buildRowKeyRange(200, 2000)}; + auto ranges = std::vector{buildRowKeyRange(200, 2000, is_common_handle)}; auto pack_filter_results = loadPackFilterResults(snap, ranges); if (pack_rows == 1) @@ -630,7 +747,7 @@ TEST_F(SegmentBitmapFilterTest, testSkipPackStableOnly) } } -TEST_F(SegmentBitmapFilterTest, testSkipPackNormal) +TEST_P(SegmentBitmapFilterTest, testSkipPackNormal) { std::string expect_result; expect_result.append(std::string(50, '0')); @@ -656,15 +773,15 @@ TEST_F(SegmentBitmapFilterTest, testSkipPackNormal) reloadDMContext(); version = 0; - writeSegment("d_mem:[0, 1000)|d_mem:[500, 1500)|d_mem:[1500, 2000)"); + writeSegmentGeneric("d_mem:[0, 1000)|d_mem:[500, 1500)|d_mem:[1500, 2000)"); mergeSegmentDelta(SEG_ID, true); - writeSegment("d_tiny:[99, 100)|d_dr:[355, 370)|d_dr:[409, 481)|d_mem:[200, 201)|d_mem:[301, 315)"); + writeSegmentGeneric("d_tiny:[99, 100)|d_dr:[355, 370)|d_dr:[409, 481)|d_mem:[200, 201)|d_mem:[301, 315)"); auto [seg, snap] = getSegmentForRead(SEG_ID); ASSERT_EQ(seg->getDelta()->getRows(), 16); ASSERT_EQ(seg->getDelta()->getDeletes(), 2); ASSERT_EQ(seg->getStable()->getRows(), 2500); - auto ranges = std::vector{buildRowKeyRange(50, 2000)}; + auto ranges = std::vector{buildRowKeyRange(50, 2000, is_common_handle)}; auto pack_filter_results = loadPackFilterResults(snap, ranges); UInt64 start_ts = 6; if (pack_rows == 10) @@ -754,5 +871,4 @@ TEST_F(SegmentBitmapFilterTest, testSkipPackNormal) deleteRangeSegment(SEG_ID); } } - } // namespace DB::DM::tests diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp index 641584987f4..29e663b1812 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -59,6 +59,26 @@ extern DMFilePtr writeIntoNewDMFile( namespace DB::DM::tests { +namespace +{ + +// a + b +bool isSumOverflow(Int64 a, Int64 b) +{ + return (b > 0 && a > std::numeric_limits::max() - b) || (b < 0 && a < std::numeric_limits::min() - b); +} + +// a - b +auto isDiffOverflow(Int64 a, Int64 b) +{ + assert(a > b); + if (b < 0) + return a > std::numeric_limits::max() + b; + else + return false; +} +} // namespace + void SegmentTestBasic::reloadWithOptions(SegmentTestOptions config) { { @@ -302,9 +322,24 @@ void SegmentTestBasic::mergeSegment(const PageIdU64s & segments_id, bool check_r operation_statistics["mergeTwo"]++; } -void SegmentTestBasic::mergeSegmentDelta(PageIdU64 segment_id, bool check_rows) +void SegmentTestBasic::mergeSegmentDelta(PageIdU64 segment_id, bool check_rows, std::optional pack_size) { - LOG_INFO(logger_op, "mergeSegmentDelta, segment_id={}", segment_id); + LOG_INFO(logger_op, "mergeSegmentDelta, segment_id={}, pack_size={}", segment_id, pack_size); + + const size_t initial_pack_rows = db_context->getSettingsRef().dt_segment_stable_pack_rows; + if (pack_size) + { + db_context->getSettingsRef().dt_segment_stable_pack_rows = *pack_size; + reloadDMContext(); + ASSERT_EQ(dm_context->stable_pack_rows, *pack_size); + } + SCOPE_EXIT({ + if (initial_pack_rows != db_context->getSettingsRef().dt_segment_stable_pack_rows) + { + db_context->getSettingsRef().dt_segment_stable_pack_rows = initial_pack_rows; + reloadDMContext(); + } + }); RUNTIME_CHECK(segments.find(segment_id) != segments.end()); auto segment = segments[segment_id]; @@ -330,6 +365,18 @@ void SegmentTestBasic::flushSegmentCache(PageIdU64 segment_id) operation_statistics["flush"]++; } +void SegmentTestBasic::compactSegmentDelta(PageIdU64 segment_id) +{ + LOG_INFO(logger_op, "compactSegmentDelta, segment_id={}", segment_id); + + RUNTIME_CHECK(segments.find(segment_id) != segments.end()); + auto segment = segments[segment_id]; + size_t segment_row_num = getSegmentRowNum(segment_id); + segment->compactDelta(*dm_context); + EXPECT_EQ(getSegmentRowNum(segment_id), segment_row_num); + operation_statistics["compact"]++; +} + std::pair SegmentTestBasic::getSegmentKeyRange(PageIdU64 segment_id) const { RUNTIME_CHECK(segments.find(segment_id) != segments.end()); @@ -367,29 +414,43 @@ std::pair SegmentTestBasic::getSegmentKeyRange(PageIdU64 segment_i return {start_key, end_key}; } -Block SegmentTestBasic::prepareWriteBlockImpl(Int64 start_key, Int64 end_key, bool is_deleted) +Block SegmentTestBasic::prepareWriteBlockImpl( + Int64 start_key, + Int64 end_key, + bool is_deleted, + bool including_right_boundary, + std::optional ts) { - RUNTIME_CHECK(start_key <= end_key); - if (end_key == start_key) + RUNTIME_CHECK(start_key <= end_key, start_key, end_key); + if (end_key == start_key && !including_right_boundary) return Block{}; - version++; + + UInt64 v = ts.value_or(version + 1); + version = std::max(v, version + 1); // Increase version return DMTestEnv::prepareSimpleWriteBlock( start_key, // end_key, false, - version, + v, DMTestEnv::pk_name, MutSup::extra_handle_id, options.is_common_handle ? MutSup::getExtraHandleColumnStringType() : MutSup::getExtraHandleColumnIntType(), options.is_common_handle, 1, true, - is_deleted); + is_deleted, + /*with_null_uint64*/ false, + including_right_boundary); } -Block SegmentTestBasic::prepareWriteBlock(Int64 start_key, Int64 end_key, bool is_deleted) +Block SegmentTestBasic::prepareWriteBlock( + Int64 start_key, + Int64 end_key, + bool is_deleted, + bool including_right_boundary, + std::optional ts) { - return prepareWriteBlockImpl(start_key, end_key, is_deleted); + return prepareWriteBlockImpl(start_key, end_key, is_deleted, including_right_boundary, ts); } Block sortvstackBlocks(std::vector && blocks) @@ -406,28 +467,48 @@ Block sortvstackBlocks(std::vector && blocks) Block SegmentTestBasic::prepareWriteBlockInSegmentRange( PageIdU64 segment_id, - UInt64 total_write_rows, + Int64 total_write_rows, std::optional write_start_key, - bool is_deleted) + bool is_deleted, + std::optional ts) { - RUNTIME_CHECK(total_write_rows < std::numeric_limits::max()); + RUNTIME_CHECK(0 < total_write_rows, total_write_rows); - RUNTIME_CHECK(segments.find(segment_id) != segments.end()); - auto [segment_start_key, segment_end_key] = getSegmentKeyRange(segment_id); - auto segment_max_rows = static_cast(segment_end_key - segment_start_key); - - if (segment_max_rows == 0) - return {}; + // For example, write_start_key is int64_max and total_write_rows is 1, + // We will generate block with one row with int64_max. + auto is_including_right_boundary = [](Int64 write_start_key, Int64 total_write_rows) { + return std::numeric_limits::max() - total_write_rows + 1 == write_start_key; + }; + auto seg = segments.find(segment_id); + RUNTIME_CHECK(seg != segments.end()); + auto segment_range = seg->second->getRowKeyRange(); + auto [segment_start_key, segment_end_key] = getSegmentKeyRange(segment_id); + bool including_right_boundary = false; if (write_start_key.has_value()) { - // When write start key is specified, the caller must know exactly the segment range. - RUNTIME_CHECK(*write_start_key >= segment_start_key); - RUNTIME_CHECK(static_cast(segment_end_key - *write_start_key) > 0); - } + RUNTIME_CHECK_MSG( + segment_start_key <= *write_start_key + && (*write_start_key < segment_end_key || segment_range.isEndInfinite()), + "write_start_key={} segment_range={}", + *write_start_key, + segment_range.toDebugString()); - if (!write_start_key.has_value()) + including_right_boundary = is_including_right_boundary(*write_start_key, total_write_rows); + RUNTIME_CHECK( + including_right_boundary || !isSumOverflow(*write_start_key, total_write_rows), + *write_start_key, + total_write_rows); + } + else { + auto segment_max_rows = isDiffOverflow(segment_end_key, segment_start_key) + ? std::numeric_limits::max() // UInt64 is more accurate, but Int64 is enough and for simplicity. + : segment_end_key - segment_start_key; + + if (segment_max_rows == 0) + return {}; + // When write start key is unspecified, we will: // A. If the segment is large enough, we randomly pick a write start key in the range. // B. If the segment is small, we write from the beginning. @@ -460,10 +541,10 @@ Block SegmentTestBasic::prepareWriteBlockInSegmentRange( RUNTIME_CHECK(write_rows_this_round > 0); Int64 write_end_key_this_round = *write_start_key + static_cast(write_rows_this_round); RUNTIME_CHECK(write_end_key_this_round <= segment_end_key); - - Block block = prepareWriteBlock(*write_start_key, write_end_key_this_round, is_deleted); + Block block + = prepareWriteBlock(*write_start_key, write_end_key_this_round, is_deleted, including_right_boundary, ts); blocks.emplace_back(block); - remaining_rows -= write_rows_this_round; + remaining_rows -= write_rows_this_round + including_right_boundary; LOG_DEBUG( logger, @@ -478,7 +559,52 @@ Block SegmentTestBasic::prepareWriteBlockInSegmentRange( return sortvstackBlocks(std::move(blocks)); } -void SegmentTestBasic::writeSegment(PageIdU64 segment_id, UInt64 write_rows, std::optional start_at) +void SegmentTestBasic::writeToCache( + PageIdU64 segment_id, + UInt64 write_rows, + Int64 start_at, + bool shuffle, + std::optional ts) +{ + LOG_INFO(logger_op, "writeToCache, segment_id={} write_rows={}", segment_id, write_rows); + + if (write_rows == 0) + return; + + RUNTIME_CHECK(segments.find(segment_id) != segments.end()); + auto segment = segments[segment_id]; + size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); + auto [start_key, end_key] = getSegmentKeyRange(segment_id); + LOG_DEBUG( + logger, + "write to segment, segment={} segment_rows={} start_key={} end_key={}", + segment->info(), + segment_row_num, + start_key, + end_key); + + auto block = prepareWriteBlockInSegmentRange(segment_id, write_rows, start_at, /* is_deleted */ false, ts); + + if (shuffle) + { + IColumn::Permutation perm(block.rows()); + std::iota(perm.begin(), perm.end(), 0); + std::shuffle(perm.begin(), perm.end(), std::mt19937(std::random_device()())); + for (auto & column : block) + column.column = column.column->permute(perm, 0); + } + segment->writeToCache(*dm_context, block, 0, block.rows()); + + EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows); + operation_statistics["write"]++; +} + +void SegmentTestBasic::writeSegment( + PageIdU64 segment_id, + UInt64 write_rows, + std::optional start_at, + bool shuffle, + std::optional ts) { LOG_INFO(logger_op, "writeSegment, segment_id={} write_rows={}", segment_id, write_rows); @@ -497,7 +623,16 @@ void SegmentTestBasic::writeSegment(PageIdU64 segment_id, UInt64 write_rows, std start_key, end_key); - auto block = prepareWriteBlockInSegmentRange(segment_id, write_rows, start_at, /* is_deleted */ false); + auto block = prepareWriteBlockInSegmentRange(segment_id, write_rows, start_at, /* is_deleted */ false, ts); + + if (shuffle) + { + IColumn::Permutation perm(block.rows()); + std::iota(perm.begin(), perm.end(), 0); + std::shuffle(perm.begin(), perm.end(), std::mt19937(std::random_device()())); + for (auto & column : block) + column.column = column.column->permute(perm, 0); + } segment->write(*dm_context, block, false); EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows); @@ -521,7 +656,10 @@ BlockInputStreamPtr SegmentTestBasic::getIngestDTFileInputStream( rows_per_block = std::min(rows_per_block, write_rows - written); std::optional start; if (start_at) + { + RUNTIME_CHECK(!isSumOverflow(*start_at, written), *start_at, written); start.emplace(*start_at + written); + } if (check_range) { @@ -530,9 +668,13 @@ BlockInputStreamPtr SegmentTestBasic::getIngestDTFileInputStream( } else { - auto start_key = start ? *start : 0; - auto end_key = start_key + rows_per_block; - auto block = prepareWriteBlock(start_key, end_key); + Int64 start_key = start.value_or(0); + bool overflow = std::numeric_limits::max() - static_cast(rows_per_block) < start_key; + Int64 end_key = overflow ? std::numeric_limits::max() : start_key + rows_per_block; + // if overflow, write [start_key, end_key] + // if not overflow, write [start_key, end_key) + rows_per_block += overflow; + auto block = prepareWriteBlock(start_key, end_key, /*is_deleted*/ false, overflow); streams.push_back(std::make_shared(std::move(block))); } } @@ -1053,10 +1195,33 @@ Block mergeSegmentRowIds(std::vector && blocks) return accumulated_block; } -RowKeyRange SegmentTestBasic::buildRowKeyRange(Int64 begin, Int64 end) +RowKeyRange SegmentTestBasic::buildRowKeyRange( + Int64 begin, + Int64 end, + bool is_common_handle, + bool including_right_boundary) { - HandleRange range(begin, end); - return RowKeyRange::fromHandleRange(range); + // `including_right_boundary` is for creating range like [begin, std::numeric_limits::max()) or [begin, std::numeric_limits::max()] + if (including_right_boundary) + RUNTIME_CHECK(end == std::numeric_limits::max()); + + if (is_common_handle) + { + auto create_rowkey_value = [](Int64 v) { + WriteBufferFromOwnString ss; + DB::EncodeUInt(static_cast(TiDB::CodecFlagInt), ss); + DB::EncodeInt64(v, ss); + return std::make_shared(ss.releaseStr()); + }; + auto left = RowKeyValue{is_common_handle, create_rowkey_value(begin)}; + auto right = including_right_boundary ? RowKeyValue::COMMON_HANDLE_MAX_KEY + : RowKeyValue{is_common_handle, create_rowkey_value(end)}; + return RowKeyRange{left, right, is_common_handle, 1}; + } + + auto left = RowKeyValue::fromHandle(begin); + auto right = including_right_boundary ? RowKeyValue::INT_HANDLE_MAX_KEY : RowKeyValue::fromHandle(end); + return RowKeyRange{left, right, is_common_handle, 1}; } std::pair SegmentTestBasic::getSegmentForRead(PageIdU64 segment_id) @@ -1126,9 +1291,14 @@ ColumnPtr SegmentTestBasic::getSegmentHandle(PageIdU64 segment_id, const RowKeyR } } -void SegmentTestBasic::writeSegmentWithDeleteRange(PageIdU64 segment_id, Int64 begin, Int64 end) +void SegmentTestBasic::writeSegmentWithDeleteRange( + PageIdU64 segment_id, + Int64 begin, + Int64 end, + bool is_common_handle, + bool including_right_boundary) { - auto range = buildRowKeyRange(begin, end); + auto range = buildRowKeyRange(begin, end, is_common_handle, including_right_boundary); RUNTIME_CHECK(segments.find(segment_id) != segments.end()); auto segment = segments[segment_id]; RUNTIME_CHECK(segment->write(*dm_context, range)); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h index 0ac5d82cb76..0eb91754d38 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h @@ -65,14 +65,24 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic Segment::SplitMode split_mode = Segment::SplitMode::Auto, bool check_rows = true); void mergeSegment(const std::vector & segments, bool check_rows = true); - void mergeSegmentDelta(PageIdU64 segment_id, bool check_rows = true); + void mergeSegmentDelta( + PageIdU64 segment_id, + bool check_rows = true, + std::optional pack_size = std::nullopt); void flushSegmentCache(PageIdU64 segment_id); + void compactSegmentDelta(PageIdU64 segment_id); /** * When begin_key is specified, new rows will be written from specified key. Otherwise, new rows may be * written randomly in the segment range. */ - void writeSegment(PageIdU64 segment_id, UInt64 write_rows = 100, std::optional start_at = std::nullopt); + void writeToCache(PageIdU64 segment_id, UInt64 write_rows, Int64 start_at, bool shuffle, std::optional ts); + void writeSegment( + PageIdU64 segment_id, + UInt64 write_rows = 100, + std::optional start_at = std::nullopt, + bool shuffle = false, + std::optional ts = std::nullopt); void ingestDTFileIntoDelta( PageIdU64 segment_id, UInt64 write_rows = 100, @@ -113,12 +123,18 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic */ bool ensureSegmentDeltaLocalIndex(PageIdU64 segment_id, const LocalIndexInfosPtr & local_index_infos); - Block prepareWriteBlock(Int64 start_key, Int64 end_key, bool is_deleted = false); + Block prepareWriteBlock( + Int64 start_key, + Int64 end_key, + bool is_deleted = false, + bool including_right_boundary = false, + std::optional ts = std::nullopt); Block prepareWriteBlockInSegmentRange( PageIdU64 segment_id, - UInt64 total_write_rows, + Int64 total_write_rows, std::optional write_start_key = std::nullopt, - bool is_deleted = false); + bool is_deleted = false, + std::optional ts = std::nullopt); size_t getSegmentRowNumWithoutMVCC(PageIdU64 segment_id); size_t getSegmentRowNum(PageIdU64 segment_id); @@ -138,9 +154,18 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic std::vector readSegment(PageIdU64 segment_id, bool need_row_id, const RowKeyRanges & ranges); ColumnPtr getSegmentRowId(PageIdU64 segment_id, const RowKeyRanges & ranges); ColumnPtr getSegmentHandle(PageIdU64 segment_id, const RowKeyRanges & ranges); - void writeSegmentWithDeleteRange(PageIdU64 segment_id, Int64 begin, Int64 end); + void writeSegmentWithDeleteRange( + PageIdU64 segment_id, + Int64 begin, + Int64 end, + bool is_common_handle, + bool including_right_boundary); RowKeyValue buildRowKeyValue(Int64 key); - static RowKeyRange buildRowKeyRange(Int64 begin, Int64 end); + static RowKeyRange buildRowKeyRange( + Int64 begin, + Int64 end, + bool is_common_handle, + bool including_right_boundary = false); size_t getPageNumAfterGC(StorageType type, NamespaceID ns_id) const; @@ -161,7 +186,12 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic const ColumnDefinesPtr & tableColumns() const { return table_columns; } - virtual Block prepareWriteBlockImpl(Int64 start_key, Int64 end_key, bool is_deleted); + virtual Block prepareWriteBlockImpl( + Int64 start_key, + Int64 end_key, + bool is_deleted, + bool including_right_boundary, + std::optional ts); virtual void prepareColumns(const ColumnDefinesPtr &) {} diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_util.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_util.cpp index 7790c81dc4f..9aba27700ac 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_util.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_util.cpp @@ -21,58 +21,103 @@ namespace DB::DM::tests namespace { -// "[a, b)" => std::pair{a, b} +Strings splitAndTrim(std::string_view s, std::string_view delimiter, std::optional expected_size = std::nullopt) +{ + Strings results; + boost::split(results, s, boost::is_any_of(delimiter)); + if (expected_size) + RUNTIME_CHECK(results.size() == *expected_size, s, delimiter, expected_size); + else + RUNTIME_CHECK(!results.empty(), s, delimiter); + for (auto & r : results) + boost::trim(r); + return results; +} + template -std::pair parseRange(String & str_range) +SegDataRange parseRange(std::string_view s) { - boost::algorithm::trim(str_range); - RUNTIME_CHECK(str_range.front() == '[' && str_range.back() == ')', str_range); - std::vector values; - str_range = str_range.substr(1, str_range.size() - 2); - boost::split(values, str_range, boost::is_any_of(",")); - RUNTIME_CHECK(values.size() == 2, str_range); - return {static_cast(std::stol(values[0])), static_cast(std::stol(values[1]))}; + auto str_range = boost::trim_copy(s); + RUNTIME_CHECK(str_range.front() == '[' && (str_range.back() == ')' || str_range.back() == ']'), str_range); + auto values = splitAndTrim(str_range.substr(1, str_range.size() - 2), ",", 2); + return SegDataRange{ + .left = static_cast(std::stol(values[0])), + .right = static_cast(std::stol(values[1])), + .including_right_boundary = str_range.back() == ']'}; } -// "[a, b)|[c, d)" => [std::pair{a, b}, std::pair{c, d}] +// "[a, b)|[c, d]" => [{a, b, false}, {c, d, true}] template -std::vector> parseRanges(std::string_view str_ranges) +std::vector> parseRanges(std::string_view s) { - std::vector ranges; - boost::split(ranges, str_ranges, boost::is_any_of("|")); - RUNTIME_CHECK(!ranges.empty(), str_ranges); - std::vector> vector_ranges; + auto str_range = boost::trim_copy(s); + RUNTIME_CHECK(str_range.front() == '[' && (str_range.back() == ')' || str_range.back() == ']'), str_range); + auto ranges = splitAndTrim(str_range, "|"); + std::vector> vector_ranges; vector_ranges.reserve(ranges.size()); for (auto & r : ranges) - { vector_ranges.emplace_back(parseRange(r)); - } return vector_ranges; } -// "type:[a, b)" => SegDataUnit -SegDataUnit parseSegDataUnit(String & s) +const std::unordered_set segment_commands = {"flush_cache", "compact_delta", "merge_delta"}; +const std::unordered_set delta_small_data_types = {"d_mem", "d_mem_del", "d_tiny", "d_tiny_del"}; +const std::unordered_set segment_data_types + = {"d_mem", "d_mem_del", "d_tiny", "d_tiny_del", "d_dr", "s"}; + +void parseSegUnitAttr(std::string_view attr, SegDataUnit & unit) { - boost::algorithm::trim(s); - std::vector values; - boost::split(values, s, boost::is_any_of(":")); - if (values.size() == 2) + // Pack size for DMFile + static const std::string_view attr_pack_size_prefix{"pack_size_"}; + // Shuffle data ColumnFileTiny and ColumnFileMemory + static const std::string_view attr_shuffle{"shuffle"}; + // Timestamp for generated data + static const std::string_view attr_timestamp_prefix{"ts_"}; + + if (attr.starts_with(attr_pack_size_prefix)) + { + RUNTIME_CHECK(unit.type == "d_big" || unit.type == "s" || unit.type == "merge_delta", attr, unit.type); + unit.pack_size = std::stoul(String(attr.substr(attr_pack_size_prefix.size()))); + return; + } + + if (attr == attr_shuffle) { - return SegDataUnit{ - .type = boost::algorithm::trim_copy(values[0]), - .range = parseRange(values[1]), - }; + RUNTIME_CHECK(delta_small_data_types.contains(unit.type), attr, unit.type, delta_small_data_types); + unit.shuffle = true; + return; } - else if (values.size() == 3) + + if (attr.starts_with(attr_timestamp_prefix)) { - RUNTIME_CHECK(values[0] == "d_big" || values[0] == "s", s); - return SegDataUnit{ - .type = boost::algorithm::trim_copy(values[0]), - .range = parseRange(values[1]), - .pack_size = std::stoul(values[2]), - }; + RUNTIME_CHECK( + delta_small_data_types.contains(unit.type) || unit.type == "s", + attr, + unit.type, + delta_small_data_types); + unit.ts = std::stoul(String(attr.substr(attr_timestamp_prefix.size()))); + return; } - RUNTIME_CHECK_MSG(false, "parseSegDataUnit failed: {}", s); + + RUNTIME_CHECK_MSG(false, "{} is unsupported", attr); +} + +// data_type:[left, right):attr1:attr2 +// cmd_type:attr1:attr2 +SegDataUnit parseSegDataUnit(std::string_view s) +{ + auto s_trim = boost::trim_copy(s); + auto values = splitAndTrim(s_trim, ":"); + size_t i = 0; + SegDataUnit unit{.type = values[i++]}; + if (!segment_commands.contains(unit.type)) + { + RUNTIME_CHECK(values.size() >= i, s, values); + unit.range = parseRange(values[i++]); + } + for (; i < values.size(); i++) + parseSegUnitAttr(values[i], unit); + return unit; } void check(const std::vector & seg_data_units) @@ -83,6 +128,9 @@ void check(const std::vector & seg_data_units) for (size_t i = 0; i < seg_data_units.size(); i++) { const auto & type = seg_data_units[i].type; + if (segment_commands.contains(type)) + continue; + if (type == "s") { stable_units.emplace_back(i); @@ -91,31 +139,29 @@ void check(const std::vector & seg_data_units) { mem_units.emplace_back(i); } - auto [begin, end] = seg_data_units[i].range; - RUNTIME_CHECK(begin < end, begin, end); + auto [begin, end, including_right_boundary] = seg_data_units[i].range; + RUNTIME_CHECK(end - begin + including_right_boundary > 0, begin, end, including_right_boundary); } + // If stable exists, it should be the first one. RUNTIME_CHECK(stable_units.empty() || (stable_units.size() == 1 && stable_units[0] == 0)); - std::vector expected_mem_units(mem_units.size()); - std::iota(expected_mem_units.begin(), expected_mem_units.end(), seg_data_units.size() - mem_units.size()); - RUNTIME_CHECK(mem_units == expected_mem_units, expected_mem_units, mem_units); } template -std::vector genSequence(T begin, T end) +std::vector genSequence(T begin, T end, bool including_right_boundary) { - auto size = end - begin; + auto size = end - begin + including_right_boundary; std::vector v(size); std::iota(v.begin(), v.end(), begin); return v; } template -std::vector genSequence(const std::vector> & ranges) +std::vector genSequence(const std::vector> & ranges) { std::vector res; - for (auto [begin, end] : ranges) + for (auto [begin, end, including_right_boundary] : ranges) { - auto v = genSequence(begin, end); + auto v = genSequence(begin, end, including_right_boundary); res.insert(res.end(), v.begin(), v.end()); } return res; @@ -124,12 +170,10 @@ std::vector genSequence(const std::vector> & ranges) std::vector parseSegData(std::string_view seg_data) { - std::vector str_seg_data_units; - boost::split(str_seg_data_units, seg_data, boost::is_any_of("|")); - RUNTIME_CHECK(!str_seg_data_units.empty(), seg_data); + auto str_seg_data_units = splitAndTrim(seg_data, "|"); std::vector seg_data_units; seg_data_units.reserve(str_seg_data_units.size()); - for (auto & s : str_seg_data_units) + for (const auto & s : str_seg_data_units) { seg_data_units.emplace_back(parseSegDataUnit(s)); } diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_util.h b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_util.h index ef1ffffb18a..3f90a9cae92 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_util.h +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_util.h @@ -14,17 +14,34 @@ #pragma once +#include #include #include #include namespace DB::DM::tests { + +// If including_right_boundary is false, it means [left, right). +// If including_right_boundary is true, it means [left, right]. +// `including_right_boundary` is required if we want to generate data with std::numeric_limits::max(). +// Theoretically, we could enforce the use of closed intervals, thereby eliminating the need for the parameter 'including_right_boundary'. +// However, a multitude of existing tests are predicated on the assumption that the interval is left-closed and right-open. +template +struct SegDataRange +{ + T left; + T right; + bool including_right_boundary; +}; + struct SegDataUnit { String type; - std::pair range; // Data range + SegDataRange range; std::optional pack_size; // For DMFile + bool shuffle = false; // For ColumnFileTiny and ColumnFileMemory + std::optional ts; }; std::vector parseSegData(std::string_view seg_data); @@ -32,6 +49,22 @@ std::vector parseSegData(std::string_view seg_data); template std::vector genSequence(std::string_view str_ranges); +template +std::vector genHandleSequence(std::string_view str_ranges) +{ + auto v = genSequence(str_ranges); + if constexpr (std::is_same_v) + return v; + else + { + static_assert(std::is_same_v); + std::vector res(v.size()); + for (size_t i = 0; i < v.size(); i++) + res[i] = genMockCommonHandle(v[i], 1); + return res; + } +} + template ::testing::AssertionResult sequenceEqual(const E & expected, const A & actual) { diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_skippable_block_input_stream.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_skippable_block_input_stream.cpp index e4ebbf2f6a0..ce47ee3807e 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_skippable_block_input_stream.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_skippable_block_input_stream.cpp @@ -269,37 +269,42 @@ class SkippableBlockInputStreamTest : public SegmentTestBasic void writeSegment(const SegDataUnit & unit) { const auto & type = unit.type; - auto [begin, end] = unit.range; - + const auto [begin, end, including_right_boundary] = unit.range; + const auto write_count = end - begin + including_right_boundary; if (type == "d_mem") { - SegmentTestBasic::writeSegment(SEG_ID, end - begin, begin); + SegmentTestBasic::writeSegment(SEG_ID, write_count, begin); } else if (type == "d_mem_del") { - SegmentTestBasic::writeSegmentWithDeletedPack(SEG_ID, end - begin, begin); + SegmentTestBasic::writeSegmentWithDeletedPack(SEG_ID, write_count, begin); } else if (type == "d_tiny") { - SegmentTestBasic::writeSegment(SEG_ID, end - begin, begin); + SegmentTestBasic::writeSegment(SEG_ID, write_count, begin); SegmentTestBasic::flushSegmentCache(SEG_ID); } else if (type == "d_tiny_del") { - SegmentTestBasic::writeSegmentWithDeletedPack(SEG_ID, end - begin, begin); + SegmentTestBasic::writeSegmentWithDeletedPack(SEG_ID, write_count, begin); SegmentTestBasic::flushSegmentCache(SEG_ID); } else if (type == "d_big") { - SegmentTestBasic::ingestDTFileIntoDelta(SEG_ID, end - begin, begin, false); + SegmentTestBasic::ingestDTFileIntoDelta(SEG_ID, write_count, begin, false); } else if (type == "d_dr") { - SegmentTestBasic::writeSegmentWithDeleteRange(SEG_ID, begin, end); + SegmentTestBasic::writeSegmentWithDeleteRange( + SEG_ID, + begin, + end, + /*is_common_handle*/ false, + including_right_boundary); } else if (type == "s") { - SegmentTestBasic::writeSegment(SEG_ID, end - begin, begin); + SegmentTestBasic::writeSegment(SEG_ID, write_count, begin); SegmentTestBasic::mergeSegmentDelta(SEG_ID); } else