-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PHOENIX-7499 : Update stream metadata when data table regions merge #2057
base: master
Are you sure you want to change the base?
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,8 +23,10 @@ | |
import java.sql.SQLException; | ||
import java.sql.Types; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.stream.Collectors; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.hbase.TableName; | ||
|
@@ -60,9 +62,13 @@ public class PhoenixMasterObserver implements MasterObserver, MasterCoprocessor | |
private static final String PARTITION_UPSERT_SQL | ||
= "UPSERT INTO " + SYSTEM_CDC_STREAM_NAME + " VALUES (?,?,?,?,?,?,?,?)"; | ||
|
||
private static final String PARENT_PARTITION_QUERY | ||
private static final String PARENT_PARTITION_QUERY_FOR_SPLIT | ||
= "SELECT PARTITION_ID, PARENT_PARTITION_ID FROM " + SYSTEM_CDC_STREAM_NAME | ||
+ " WHERE TABLE_NAME = ? AND STREAM_NAME = ? "; | ||
+ " WHERE TABLE_NAME = ? AND STREAM_NAME = ? AND PARTITION_END_TIME IS NULL "; | ||
|
||
private static final String PARENT_PARTITION_QUERY_FOR_MERGE | ||
= "SELECT PARENT_PARTITION_ID FROM " + SYSTEM_CDC_STREAM_NAME | ||
+ " WHERE TABLE_NAME = ? AND STREAM_NAME = ? AND PARTITION_ID = ?"; | ||
|
||
private static final String PARENT_PARTITION_UPDATE_END_TIME_SQL | ||
= "UPSERT INTO " + SYSTEM_CDC_STREAM_NAME + " (TABLE_NAME, STREAM_NAME, PARTITION_ID, " | ||
|
@@ -74,7 +80,7 @@ public Optional<MasterObserver> getMasterObserver() { | |
} | ||
|
||
/** | ||
* Update parent -> daughter relationship for CDC Streams. | ||
* Update parent -> daughter relationship for CDC Streams when a region splits. | ||
* - find parent partition id using start/end keys of daughters | ||
* - upsert partition metadata for the 2 daughters | ||
* - update the end time on the parent's partition metadata | ||
|
@@ -95,52 +101,92 @@ public void postCompletedSplitRegionAction(final ObserverContext<MasterCoprocess | |
regionInfoA.getTable()); | ||
return; | ||
} | ||
// find streamName with ENABLED status | ||
String tableName = phoenixTable.getName().getString(); | ||
PreparedStatement pstmt = conn.prepareStatement(STREAM_STATUS_QUERY); | ||
pstmt.setString(1, tableName); | ||
ResultSet rs = pstmt.executeQuery(); | ||
if (rs.next()) { | ||
String streamName = rs.getString(1); | ||
LOGGER.info("Updating partition metadata for table={}, stream={} daughters {} {}", | ||
String streamName = getStreamName(conn, tableName); | ||
if (streamName != null) { | ||
LOGGER.info("Updating split partition metadata for table={}, stream={} daughters {} {}", | ||
tableName, streamName, regionInfoA.getEncodedName(), regionInfoB.getEncodedName()); | ||
// ancestorIDs = [parentId, grandparentId1] | ||
List<String> ancestorIDs = getAncestorIds(conn, tableName, streamName, regionInfoA, regionInfoB); | ||
upsertDaughterPartition(conn, tableName, streamName, ancestorIDs.get(0), regionInfoA); | ||
upsertDaughterPartition(conn, tableName, streamName, ancestorIDs.get(0), regionInfoB); | ||
updateParentPartitionEndTime(conn, tableName, streamName, ancestorIDs, regionInfoA.getRegionId()); | ||
// ancestorIDs = [parentId, grandparentId1, grandparentId2...] | ||
List<String> ancestorIDs | ||
= getAncestorIdsForSplit(conn, tableName, streamName, regionInfoA, regionInfoB); | ||
|
||
upsertDaughterPartitions(conn, tableName, streamName, ancestorIDs.subList(0, 1), | ||
Arrays.asList(regionInfoA, regionInfoB)); | ||
|
||
updateParentPartitionEndTime(conn, tableName, streamName, ancestorIDs, | ||
regionInfoA.getRegionId()); | ||
} else { | ||
LOGGER.info("{} does not have a stream enabled, skipping partition metadata update.", | ||
regionInfoA.getTable()); | ||
} | ||
} catch (SQLException e) { | ||
LOGGER.error("Unable to update CDC Stream Partition metadata during split with daughter regions: {} {}", | ||
regionInfoA.getEncodedName(), regionInfoB.getEncodedName(), e); | ||
} | ||
} | ||
|
||
private PTable getPhoenixTable(Connection conn, TableName tableName) throws SQLException { | ||
PTable pTable; | ||
try { | ||
pTable = PhoenixRuntime.getTable(conn, tableName.toString()); | ||
} catch (TableNotFoundException e) { | ||
return null; | ||
/** | ||
* Update parent -> daughter relationship for CDC Streams when regions merge. | ||
* - upsert partition metadata for the daughter with each parent | ||
* - update the end time on all the parents' partition metadata | ||
* @param c the environment to interact with the framework and master | ||
* @param regionsToMerge parent regions which merged | ||
* @param mergedRegion daughter region | ||
*/ | ||
@Override | ||
public void postCompletedMergeRegionsAction(final ObserverContext<MasterCoprocessorEnvironment> c, | ||
final RegionInfo[] regionsToMerge, | ||
final RegionInfo mergedRegion) { | ||
Configuration conf = c.getEnvironment().getConfiguration(); | ||
try (Connection conn = QueryUtil.getConnectionOnServer(conf)) { | ||
// CDC will be enabled on Phoenix tables only | ||
PTable phoenixTable = getPhoenixTable(conn, mergedRegion.getTable()); | ||
if (phoenixTable == null) { | ||
LOGGER.info("{} is not a Phoenix Table, skipping partition metadata update.", | ||
mergedRegion.getTable()); | ||
return; | ||
} | ||
String tableName = phoenixTable.getName().getString(); | ||
String streamName = getStreamName(conn, tableName); | ||
if (streamName != null) { | ||
LOGGER.info("Updating merged partition metadata for table={}, stream={} daughter {}", | ||
tableName, streamName, mergedRegion.getEncodedName()); | ||
// upsert a row for daughter-parent for each merged region | ||
upsertDaughterPartitions(conn, tableName, streamName, | ||
Arrays.stream(regionsToMerge).map(RegionInfo::getEncodedName).collect(Collectors.toList()), | ||
Arrays.asList(mergedRegion)); | ||
|
||
// lookup all ancestors of a merged region and update the endTime | ||
for (RegionInfo ri : regionsToMerge) { | ||
List<String> ancestorIDs = getAncestorIdsForMerge(conn, tableName, streamName, ri); | ||
updateParentPartitionEndTime(conn, tableName, streamName, ancestorIDs, | ||
mergedRegion.getRegionId()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit optimization: In case of merge, the parent will appear multiple times, but needs to be updated only once. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @haridsv During merges, we decided to insert one row for the daughter per parent. (parent partition id is part of PK) Parents in a merge can also be results of merges in the past and hence they can have multiple rows in the table. We would need to update all those rows with the end time. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By looking up the parent of the parent, we are actually updating the grand parents right? Is that really the intention? See my other comment visualizing the operations and let me know what I am missing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just looking up the grandparent to form the complete PK for updating the parent end time. |
||
} | ||
} else { | ||
LOGGER.info("{} does not have a stream enabled, skipping partition metadata update.", | ||
mergedRegion.getTable()); | ||
} | ||
} catch (SQLException e) { | ||
LOGGER.error("Unable to update CDC Stream Partition metadata during merge with " + | ||
"parent regions: {} and daughter region {}", | ||
regionsToMerge, mergedRegion.getEncodedName(), e); | ||
} | ||
return pTable; | ||
} | ||
|
||
/** | ||
* Lookup parent's partition id (region's encoded name) in SYSTEM.CDC_STREAM. | ||
* Lookup a split parent's partition id (region's encoded name) in SYSTEM.CDC_STREAM. | ||
* RegionInfoA is left daughter and RegionInfoB is right daughter so parent's key range would | ||
* be [RegionInfoA startKey, RegionInfoB endKey] | ||
* Return both parent and grandparent partition ids. | ||
* Return parent and all grandparent partition ids. | ||
* | ||
* TODO: When we implement merges in this coproc, there could be multiple grandparents. | ||
*/ | ||
private List<String> getAncestorIds(Connection conn, String tableName, String streamName, | ||
private List<String> getAncestorIdsForSplit(Connection conn, String tableName, String streamName, | ||
RegionInfo regionInfoA, RegionInfo regionInfoB) | ||
throws SQLException { | ||
byte[] parentStartKey = regionInfoA.getStartKey(); | ||
byte[] parentEndKey = regionInfoB.getEndKey(); | ||
|
||
StringBuilder qb = new StringBuilder(PARENT_PARTITION_QUERY); | ||
StringBuilder qb = new StringBuilder(PARENT_PARTITION_QUERY_FOR_SPLIT); | ||
if (parentStartKey.length == 0) { | ||
qb.append(" AND PARTITION_START_KEY IS NULL "); | ||
} else { | ||
|
@@ -173,50 +219,113 @@ private List<String> getAncestorIds(Connection conn, String tableName, String st | |
Bytes.toStringBinary(regionInfoB.getStartKey()), | ||
Bytes.toStringBinary(regionInfoB.getEndKey()))); | ||
} | ||
// if parent was a result of a merge, there will be multiple grandparents. | ||
while (rs.next()) { | ||
ancestorIDs.add(rs.getString(2)); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the situation in which a split will have multiple ancestors? In fact, isn't it an error if we actually find 2 regions with exactly the same start and end keys? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @haridsv By ancestors here I mean both parent and grandparent. The parent in a split can be a result of a merge. For example: A and B merged to form C. We would have inserted 2 rows for the daughter C with 2 parents, (C-->A) and (C-->B), with the same start/end key. If C splits into D and E, we will insert (D-->C) and (E-->C). When we lookup the parent of D and E, we will find those 2 rows with the same start/end key - both of which will need to be marked with the end time. (Parent's partition id is now part of the PK for CDC_STREAM table, so we will have to find all grandparents to mark the end time for a parent which has split). Let me know if I am missing something here or over-complicating things. FYI @virajjasani There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't get the part about grand parents, but then again I must be missing something, please see my other comment with visualization and let me know what I am missing. |
||
return ancestorIDs; | ||
} | ||
|
||
/** | ||
* Insert partition metadata for a daughter region from the split. | ||
* Lookup the parent of a merged region. | ||
* If the merged region was an output of a merge in the past, it will have multiple parents. | ||
*/ | ||
private void upsertDaughterPartition(Connection conn, String tableName, | ||
String streamName, String parentPartitionID, | ||
RegionInfo regionInfo) | ||
throws SQLException { | ||
String partitionId = regionInfo.getEncodedName(); | ||
long startTime = regionInfo.getRegionId(); | ||
byte[] startKey = regionInfo.getStartKey(); | ||
byte[] endKey = regionInfo.getEndKey(); | ||
PreparedStatement pstmt = conn.prepareStatement(PARTITION_UPSERT_SQL); | ||
private List<String> getAncestorIdsForMerge(Connection conn, String tableName, String streamName, | ||
RegionInfo parent) throws SQLException { | ||
List<String> ancestorIDs = new ArrayList<>(); | ||
ancestorIDs.add(parent.getEncodedName()); | ||
PreparedStatement pstmt = conn.prepareStatement(PARENT_PARTITION_QUERY_FOR_MERGE); | ||
pstmt.setString(1, tableName); | ||
pstmt.setString(2, streamName); | ||
pstmt.setString(3, partitionId); | ||
pstmt.setString(4, parentPartitionID); | ||
pstmt.setLong(5, startTime); | ||
// endTime in not set when inserting a new partition | ||
pstmt.setNull(6, Types.BIGINT); | ||
pstmt.setBytes(7, startKey.length == 0 ? null : startKey); | ||
pstmt.setBytes(8, endKey.length == 0 ? null : endKey); | ||
pstmt.executeUpdate(); | ||
pstmt.setString(3, parent.getEncodedName()); | ||
ResultSet rs = pstmt.executeQuery(); | ||
if (rs.next()) { | ||
ancestorIDs.add(rs.getString(1)); | ||
} else { | ||
throw new SQLException(String.format( | ||
"Could not find parent of the provided merged region: %s", parent.getEncodedName())); | ||
} | ||
// if parent was a result of a merge, there will be multiple grandparents. | ||
while (rs.next()) { | ||
ancestorIDs.add(rs.getString(1)); | ||
} | ||
return ancestorIDs; | ||
} | ||
|
||
/** | ||
* Insert partition metadata for a daughter region from a split or a merge. | ||
* split: 2 daughters, 1 parent | ||
* merge: 1 daughter, N parents | ||
*/ | ||
private void upsertDaughterPartitions(Connection conn, String tableName, | ||
String streamName, List<String> parentPartitionIDs, | ||
List<RegionInfo> daughters) | ||
throws SQLException { | ||
PreparedStatement pstmt = conn.prepareStatement(PARTITION_UPSERT_SQL); | ||
for (RegionInfo daughter : daughters) { | ||
for (String parentPartitionID : parentPartitionIDs) { | ||
String partitionId = daughter.getEncodedName(); | ||
long startTime = daughter.getRegionId(); | ||
byte[] startKey = daughter.getStartKey(); | ||
byte[] endKey = daughter.getEndKey(); | ||
pstmt.setString(1, tableName); | ||
pstmt.setString(2, streamName); | ||
pstmt.setString(3, partitionId); | ||
pstmt.setString(4, parentPartitionID); | ||
pstmt.setLong(5, startTime); | ||
// endTime is not set when inserting a new partition | ||
pstmt.setNull(6, Types.BIGINT); | ||
pstmt.setBytes(7, startKey.length == 0 ? null : startKey); | ||
pstmt.setBytes(8, endKey.length == 0 ? null : endKey); | ||
pstmt.executeUpdate(); | ||
} | ||
} | ||
conn.commit(); | ||
palashc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
/** | ||
* Update endTime in all rows of parent partition by setting it to daughter's startTime. | ||
* parent came from a split : there will only be one record | ||
* parent came from a merge : there will be multiple rows, one per grandparent | ||
* | ||
* TODO: When we implement merges in this coproc, update all rows of the parent. | ||
*/ | ||
private void updateParentPartitionEndTime(Connection conn, String tableName, | ||
String streamName, List<String> ancestorIDs, | ||
long daughterStartTime) throws SQLException { | ||
// ancestorIDs = [parentID, grandparentID] | ||
long daughterStartTime) throws SQLException { | ||
// ancestorIDs = [parentID, grandparentID1, grandparentID2...] | ||
PreparedStatement pstmt = conn.prepareStatement(PARENT_PARTITION_UPDATE_END_TIME_SQL); | ||
pstmt.setString(1, tableName); | ||
pstmt.setString(2, streamName); | ||
pstmt.setString(3, ancestorIDs.get(0)); | ||
pstmt.setString(4, ancestorIDs.get(1)); | ||
pstmt.setLong(5, daughterStartTime); | ||
pstmt.executeUpdate(); | ||
for (int i=1; i<ancestorIDs.size(); i++) { | ||
pstmt.setString(1, tableName); | ||
pstmt.setString(2, streamName); | ||
pstmt.setString(3, ancestorIDs.get(0)); | ||
pstmt.setString(4, ancestorIDs.get(i)); | ||
pstmt.setLong(5, daughterStartTime); | ||
pstmt.executeUpdate(); | ||
} | ||
conn.commit(); | ||
} | ||
|
||
/** | ||
* Get the stream name on the given table if one exists in ENABLED state. | ||
*/ | ||
private String getStreamName(Connection conn, String tableName) throws SQLException { | ||
PreparedStatement pstmt = conn.prepareStatement(STREAM_STATUS_QUERY); | ||
pstmt.setString(1, tableName); | ||
ResultSet rs = pstmt.executeQuery(); | ||
if (rs.next()) { | ||
return rs.getString(1); | ||
} else { | ||
return null; | ||
} | ||
} | ||
|
||
private PTable getPhoenixTable(Connection conn, TableName tableName) throws SQLException { | ||
PTable pTable; | ||
try { | ||
pTable = PhoenixRuntime.getTable(conn, tableName.toString()); | ||
} catch (TableNotFoundException e) { | ||
return null; | ||
} | ||
return pTable; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be nice to list the column names here to make it easier to read the code instead of having to jump to QueryConstants for the name and order of the columns.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I put the list of columns in a comment just above this line.