Admin/XMover: Implement suggestions by CodeRabbit

amotl · amotl · commit 2c6ef7f8ef91 · 2025-09-19T19:48:27.000+02:00
diff --git a/cratedb_toolkit/admin/xmover/analysis/shard.py b/cratedb_toolkit/admin/xmover/analysis/shard.py
@@ -38,7 +38,7 @@ def __init__(self, client: CrateDBClient):
         self.shards: List[ShardInfo] = []
 
         # Initialize session-based caches for performance.
-        self._zone_conflict_cache: Dict[Tuple[str, int, str], Union[str, None]] = {}
+        self._zone_conflict_cache: Dict[Tuple[str, str, int, str], Union[str, None]] = {}
         self._node_lookup_cache: Dict[str, Union[NodeInfo, None]] = {}
         self._target_nodes_cache: Dict[Tuple[float, frozenset[Any], float, float], List[NodeInfo]] = {}
         self._cache_hits = 0
@@ -206,7 +206,7 @@ def generate_rebalancing_recommendations(
         # Get moveable shards (only healthy ones for actual operations)
         moveable_shards = self.find_moveable_shards(constraints.min_size, constraints.max_size, constraints.table_name)
 
-        print(
+        logger.info(
             f"Analyzing {len(moveable_shards)} candidate shards "
             f"in size range {constraints.min_size}-{constraints.max_size}GB..."
         )
@@ -239,12 +239,11 @@ def generate_rebalancing_recommendations(
         # Optimize processing: if filtering by source node, only process those shards
         if constraints.source_node:
             processing_shards = [s for s in moveable_shards if s.node_name == constraints.source_node]
-            print(f"Focusing on {len(processing_shards)} shards from node {constraints.source_node}")
+            logger.info(f"Focusing on {len(processing_shards)} shards from node {constraints.source_node}")
         else:
             processing_shards = moveable_shards
 
         # Generate move recommendations
-        safe_recommendations = 0  # noqa: F841
         total_evaluated = 0
 
         for i, shard in enumerate(processing_shards):
@@ -368,12 +367,12 @@ def generate_rebalancing_recommendations(
 
         if len(processing_shards) > 100:
             print()  # New line after progress dots
-        print(f"Generated {len(recommendations)} move recommendations (evaluated {total_evaluated} shards)")
-        print(f"Performance: {self.get_cache_stats()}")
+        logger.info(f"Generated {len(recommendations)} move recommendations (evaluated {total_evaluated} shards)")
+        logger.info(f"Performance: {self.get_cache_stats()}")
         return recommendations
 
     def validate_move_safety(
-        self, recommendation: ShardRelocationResponse, max_disk_usage_percent: float = 90.0
+        self, recommendation: ShardRelocationResponse, max_disk_usage_percent: float = 90.0, buffer_gb: float = 50.0
     ) -> Tuple[bool, str]:
         """Validate that a move recommendation is safe to execute"""
         # Find target node (with caching)
@@ -388,7 +387,7 @@ def validate_move_safety(
             return False, zone_conflict
 
         # Check available space
-        required_space_gb = recommendation.size_gb + 50  # 50GB buffer
+        required_space_gb = recommendation.size_gb + buffer_gb
         if target_node.available_space_gb < required_space_gb:
             return (
                 False,
@@ -423,7 +422,7 @@ def _check_zone_conflict_cached(self, recommendation: ShardRelocationResponse) -
         """Check zone conflicts with caching"""
         # Create cache key: table, shard, target zone
         target_zone = self._get_node_zone(recommendation.to_node)
-        cache_key = (recommendation.table_name, recommendation.shard_id, target_zone)
+        cache_key = (recommendation.schema_name, recommendation.table_name, recommendation.shard_id, target_zone)
 
         if cache_key in self._zone_conflict_cache:
             self._cache_hits += 1
diff --git a/cratedb_toolkit/admin/xmover/analysis/table.py b/cratedb_toolkit/admin/xmover/analysis/table.py
@@ -14,6 +14,7 @@
 from rich.console import Console
 from rich.table import Table
 
+from cratedb_toolkit.admin.xmover.model import NodeInfo
 from cratedb_toolkit.admin.xmover.util.database import CrateDBClient
 
 logger = logging.getLogger(__name__)
@@ -97,6 +98,9 @@ def find_table_by_name(self, table_name: str) -> Optional[str]:
 
             try:
                 choice = input("\nSelect table (enter number): ").strip()
+                if not choice:
+                    rprint("[yellow]No selection made[/yellow]")
+                    return None
                 idx = int(choice) - 1
                 if 0 <= idx < len(rows):
                     schema, table = rows[idx]
@@ -292,14 +296,9 @@ def format_table_health_report(self, table_dist: TableDistribution) -> None:
             zone_distribution = {}
             for node_name, node_data in table_dist.node_distributions.items():
                 # Try to get zone info for each node
-                node_info = next((n for n in all_nodes_info if n.name == node_name), None)
-                if (
-                    node_info
-                    and hasattr(node_info, "attributes")
-                    and node_info.attributes
-                    and "zone" in node_info.attributes
-                ):
-                    zone = node_info.attributes["zone"]
+                node_info: NodeInfo = next((n for n in all_nodes_info if n.name == node_name), None)
+                if node_info.zone:
+                    zone = node_info.zone
                     if zone not in zone_distribution:
                         zone_distribution[zone] = {"nodes": 0, "shards": 0, "size": 0}
                     zone_distribution[zone]["nodes"] += 1
diff --git a/cratedb_toolkit/admin/xmover/cli.py b/cratedb_toolkit/admin/xmover/cli.py
@@ -4,7 +4,6 @@
 Command Line Interface.
 """
 
-import sys
 import time
 from typing import Optional
 
@@ -46,11 +45,11 @@ def main(ctx):
         if not client.test_connection():
             console.print("[red]Error: Could not connect to CrateDB[/red]")
             console.print("Please check your CRATE_CONNECTION_STRING in .env file")
-            sys.exit(1)
+            raise click.Abort()
         ctx.obj["client"] = client
     except Exception as e:
         console.print(f"[red]Error connecting to CrateDB: {e}[/red]")
-        sys.exit(1)
+        raise click.Abort() from e
 
 
 @main.command()
@@ -170,11 +169,11 @@ def test_connection(ctx, connection_string: Optional[str]):
                 console.print(f"  • {node.name} (zone: {node.zone})")
         else:
             console.print("[red]✗ Connection failed[/red]")
-            sys.exit(1)
+            raise click.Abort()
 
     except Exception as e:
         console.print(f"[red]✗ Connection error: {e}[/red]")
-        sys.exit(1)
+        raise click.Abort() from e
 
 
 @main.command()
@@ -525,13 +524,14 @@ def monitor_recovery(
         xmover monitor-recovery --watch                # Continuous monitoring
         xmover monitor-recovery --recovery-type PEER  # Only PEER recoveries
     """
+    effective_recovery_type = None if recovery_type == "all" else recovery_type
     recovery_monitor = RecoveryMonitor(
         client=ctx.obj["client"],
         options=RecoveryOptions(
             table=table,
             node=node,
             refresh_interval=refresh_interval,
-            recovery_type=recovery_type,
+            recovery_type=effective_recovery_type,
             include_transitioning=include_transitioning,
         ),
     )
diff --git a/cratedb_toolkit/admin/xmover/model.py b/cratedb_toolkit/admin/xmover/model.py
@@ -1,4 +1,3 @@
-import dataclasses
 from dataclasses import dataclass
 from typing import Dict, Optional
 
@@ -149,6 +148,12 @@ def safety_score(self) -> float:
         if "rebalancing" in self.reason.lower():
             score += 0.2
 
+        # Consider shard size - smaller shards are safer to move
+        if self.size_gb < 10:
+            score += 0.1
+        elif self.size_gb > 100:
+            score -= 0.2
+
         # Ensure score stays in valid range
         return max(0.0, min(1.0, score))
 
@@ -165,15 +170,15 @@ class DistributionStats:
     node_balance_score: float  # 0-100, higher is better
 
 
-@dataclasses.dataclass
+@dataclass
 class SizeCriteria:
     min_size: float = 40.0
     max_size: float = 60.0
     table_name: Optional[str] = None
     source_node: Optional[str] = None
 
 
-@dataclasses.dataclass
+@dataclass
 class ShardRelocationConstraints:
     min_size: float = SizeCriteria().min_size
     max_size: float = SizeCriteria().max_size
diff --git a/cratedb_toolkit/admin/xmover/operational/monitor.py b/cratedb_toolkit/admin/xmover/operational/monitor.py
@@ -37,7 +37,7 @@ def get_cluster_recovery_status(self) -> List[RecoveryInfo]:
         )
 
         # Apply recovery type filter
-        if self.options.recovery_type is not None:
+        if self.options.recovery_type is not None and self.options.recovery_type.lower() != "all":
             recoveries = [r for r in recoveries if r.recovery_type.upper() == self.options.recovery_type.upper()]
 
         return recoveries
@@ -178,7 +178,6 @@ def start(self, watch: bool, debug: bool = False):
 
                     # Track previous state for change detection
                     previous_recoveries: Dict[str, Dict[str, Any]] = {}
-                    previous_timestamp = None
                     first_run = True
 
                     while True:
@@ -307,7 +306,6 @@ def start(self, watch: bool, debug: bool = False):
                                 elif active_count > 0:
                                     console.print(f"{current_time} | {status} (no changes)")
 
-                        previous_timestamp = current_time  # noqa: F841
                         first_run = False
                         time.sleep(self.options.refresh_interval)
 
diff --git a/cratedb_toolkit/admin/xmover/operational/recommend.py b/cratedb_toolkit/admin/xmover/operational/recommend.py
@@ -123,8 +123,8 @@ def validate(self, request: ShardRelocationRequest):
             console.print()
             console.print("[dim]# Monitor shard health after execution[/dim]")
             console.print(
-                "[dim]# Check with: SELECT * FROM sys.shards "
-                "WHERE table_name = '{table_name}' AND id = {shard_id};[/dim]"
+                "[dim]# Check with: SELECT * FROM sys.shards "  # noqa: S608
+                f"WHERE table_name = '{table_name}' AND id = {request.shard_id};[/dim]"
             )
         else:
             console.print("[red]✗ VALIDATION FAILED - Move not safe[/red]")
@@ -323,7 +323,7 @@ def execute(
                         rec, max_disk_usage_percent=constraints.max_disk_usage
                     )
                     if not is_safe:
-                        if "Zone conflict" in safety_msg:
+                        if "zone conflict" in safety_msg.lower():
                             zone_conflicts += 1
                             console.print(f"-- Move {i}: SKIPPED - {safety_msg}")
                             console.print(
@@ -340,7 +340,7 @@ def execute(
 
             # Auto-execution if requested
             if auto_execute:
-                self._execute_recommendations_safely(recommendations, validate)
+                self._execute_recommendations_safely(constraints, recommendations, validate)
 
         if validate and safe_moves < len(recommendations):
             if zone_conflicts > 0:
@@ -352,14 +352,16 @@ def execute(
                 f"[yellow]Warning: Only {safe_moves} of {len(recommendations)} moves passed safety validation[/yellow]"
             )
 
-    def _execute_recommendations_safely(self, recommendations, validate: bool):
+    def _execute_recommendations_safely(self, constraints, recommendations, validate: bool):
         """Execute recommendations with extensive safety measures"""
 
         # Filter to only safe recommendations
         safe_recommendations = []
         if validate:
             for rec in recommendations:
-                is_safe, safety_msg = self.analyzer.validate_move_safety(rec, max_disk_usage_percent=95.0)
+                is_safe, safety_msg = self.analyzer.validate_move_safety(
+                    rec, max_disk_usage_percent=constraints.max_disk_usage
+                )
                 if is_safe:
                     safe_recommendations.append(rec)
         else:
@@ -423,7 +425,8 @@ def _execute_recommendations_safely(self, recommendations, validate: bool):
                 # Execute the SQL command
                 result = self.client.execute_query(sql_command)
 
-                if result.get("rowcount", 0) >= 0:  # Success indicator for ALTER statements
+                # ALTER TABLE REROUTE commands don't return a rowcount, so check for the absence of an error instead.
+                if "error" not in result:
                     console.print("    [green]✅ SUCCESS[/green] - Move initiated")
                     successful_moves += 1
 
@@ -482,7 +485,8 @@ def _wait_for_recovery_capacity(self, max_concurrent_recoveries: int = 5):
         while True:
             # Check active recoveries (including transitioning)
             recoveries = recovery_monitor.get_cluster_recovery_status()
-            active_count = len([r for r in recoveries if r.overall_progress < 100.0 or r.stage != "DONE"])
+            # Count recoveries that are actively running (not completed)
+            active_count = len([r for r in recoveries if r.overall_progress < 100.0])
             status = f"{active_count}/{max_concurrent_recoveries}"
             if active_count < max_concurrent_recoveries:
                 if wait_time > 0:
diff --git a/cratedb_toolkit/admin/xmover/util/database.py b/cratedb_toolkit/admin/xmover/util/database.py
@@ -39,6 +39,8 @@ def __init__(self, connection_string: Optional[str] = None):
         if not self.connection_string.endswith("/_sql"):
             self.connection_string = self.connection_string.rstrip("/") + "/_sql"
 
+        self.session = requests.Session()
+
     def execute_query(self, query: str, parameters: Optional[List] = None) -> Dict[str, Any]:
         """Execute a SQL query against CrateDB"""
         payload: Dict[str, Any] = {"stmt": query}
@@ -51,11 +53,18 @@ def execute_query(self, query: str, parameters: Optional[List] = None) -> Dict[s
             auth = (self.username, self.password)
 
         try:
-            response = requests.post(
+            response = self.session.post(
                 self.connection_string, json=payload, auth=auth, verify=self.ssl_verify, timeout=30
             )
             response.raise_for_status()
-            return response.json()
+            data = response.json()
+            # CrateDB may include an "error" field even with 200 OK
+            if isinstance(data, dict) and "error" in data and data["error"]:
+                # Best-effort message extraction
+                err = data["error"]
+                msg = err.get("message") if isinstance(err, dict) else str(err)
+                raise Exception(f"CrateDB error: {msg}")
+            return data
         except requests.exceptions.RequestException as e:
             raise Exception(f"Failed to execute query: {e}") from e
 
@@ -335,13 +344,13 @@ def get_recovery_details(self, schema_name: str, table_name: str, shard_id: int)
             s."primary",
             s.translog_stats['size'] as translog_size
         FROM sys.shards s
-        WHERE s.table_name = ? AND s.id = ?
+        WHERE s.schema_name = ? AND s.table_name = ? AND s.id = ?
         AND (s.state = 'RECOVERING' OR s.routing_state IN ('INITIALIZING', 'RELOCATING'))
         ORDER BY s.schema_name
         LIMIT 1
         """
 
-        result = self.execute_query(query, [table_name, shard_id])
+        result = self.execute_query(query, [schema_name, table_name, shard_id])
 
         if not result.get("rows"):
             return None
diff --git a/cratedb_toolkit/admin/xmover/util/error.py b/cratedb_toolkit/admin/xmover/util/error.py
@@ -1,12 +1,23 @@
-from typing import List, Optional, cast
+from typing import List, Optional
 
-from rich.console import Console
+from rich import get_console
 from rich.panel import Panel
 
-console = Console()
+console = get_console()
 
 
 def explain_cratedb_error(error_message: Optional[str]):
+    """
+    Decode and troubleshoot common CrateDB shard allocation errors.
+
+    Parameters
+    ----------
+    error_message:
+        Raw CrateDB error message. If None, the user is prompted to paste the
+        message (finish with two blank lines). If the message is still empty
+        afterwards, "No error message provided" is printed and the function
+        returns early.
+    """
     console.print(Panel.fit("[bold blue]CrateDB Error Message Decoder[/bold blue]"))
     console.print("[dim]Helps decode and troubleshoot CrateDB shard allocation errors[/dim]")
     console.print()
@@ -24,7 +35,7 @@ def explain_cratedb_error(error_message: Optional[str]):
                 break
         error_message = "\n".join(lines)
 
-    if not error_message.strip():
+    if not (error_message or "").strip():
         console.print("[yellow]No error message provided[/yellow]")
         return
 
@@ -96,7 +107,7 @@ def explain_cratedb_error(error_message: Optional[str]):
     error_lower = error_message.lower()
 
     for pattern_info in error_patterns:
-        if cast(str, pattern_info["pattern"]).lower() in error_lower:
+        if pattern_info["pattern"].lower() in error_lower:  # type: ignore[attr-defined]
             matches.append(pattern_info)
 
     if matches:
diff --git a/cratedb_toolkit/admin/xmover/util/format.py b/cratedb_toolkit/admin/xmover/util/format.py
diff --git a/doc/admin/xmover/handbook.md b/doc/admin/xmover/handbook.md
diff --git a/doc/admin/xmover/queries.md b/doc/admin/xmover/queries.md
diff --git a/tests/admin/test_recovery_monitor.py b/tests/admin/test_recovery_monitor.py