Skip to content

Commit 1ca98b1

Browse files
Stop repair scheduler if two major versions are detected
Do not allow the repair scheduler to run repair when two major versions are detected in the cluster. This can happen during a major version upgrade, and repair should not run then because the data streamed for repair can be incompatible across major versions. Users can override this with a cassandra.yaml setting if they want to. This does not affect minor version upgrades or repair jobs that are run manually. https://issues.apache.org/jira/browse/CASSANDRA-20048
1 parent b91731b commit 1ca98b1

File tree

14 files changed

+143
-6
lines changed

14 files changed

+143
-6
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@
220220
* Add ELAPSED command to cqlsh (CASSANDRA-18861)
221221
* Add the ability to disable bulk loading of SSTables (CASSANDRA-18781)
222222
* Clean up obsolete functions and simplify cql_version handling in cqlsh (CASSANDRA-18787)
223+
* Stop repair scheduler if two major versions are detected (CASSANDRA-20048)
223224
Merged from 5.0:
224225
* Sort SSTable TOC entries for determinism (CASSANDRA-20494)
225226
* Do not source cassandra-env.sh unnecessarily in nodetool and other tooling (CASSANDRA-20745)

conf/cassandra.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2782,6 +2782,9 @@ storage_compatibility_mode: NONE
27822782
# # The scheduler needs to adjust its order when nodes leave the ring. Deleted hosts are tracked in metadata
27832783
# # for a specified duration to ensure they are indeed removed before adjustments are made to the schedule.
27842784
# history_clear_delete_hosts_buffer_interval: 2h
2785+
# # By default repair is disabled if there are mixed major versions detected - which would happen
2786+
# # if a major version upgrade is being performed on the cluster, but a user can enable it using this flag
2787+
# mixed_major_version_repair_enabled: false
27852788
# # NOTE: Each of the below settings can be overridden per repair type under repair_type_overrides
27862789
# global_settings:
27872790
# # If true, attempts to group tables in the same keyspace into one repair; otherwise, each table is repaired

conf/cassandra_latest.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2467,6 +2467,9 @@ storage_compatibility_mode: NONE
24672467
# # The scheduler needs to adjust its order when nodes leave the ring. Deleted hosts are tracked in metadata
24682468
# # for a specified duration to ensure they are indeed removed before adjustments are made to the schedule.
24692469
# history_clear_delete_hosts_buffer_interval: 2h
2470+
# # By default repair is disabled if there are mixed major versions detected - which would happen
2471+
# # if a major version upgrade is being performed on the cluster, but a user can enable it using this flag
2472+
# mixed_major_version_repair_enabled: false
24702473
# # NOTE: Each of the below settings can be overridden per repair type under repair_type_overrides
24712474
# global_settings:
24722475
# # If true, attempts to group tables in the same keyspace into one repair; otherwise, each table is repaired

doc/modules/cassandra/pages/managing/operating/auto_repair.adoc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,10 @@ is time to schedule repairs.
167167
| history_clear_delete_hosts_buffer_interval | 2h | The scheduler needs to adjust its order when nodes leave the ring.
168168
Deleted hosts are tracked in metadata for a specified duration to ensure they are indeed removed before adjustments
169169
are made to the schedule.
170+
| mixed_major_version_repair_enabled | false | Enable/Disable running repairs on the cluster when there are mixed
171+
major versions detected, which usually occurs when the cluster is being upgraded. Repairs between nodes of
172+
different major versions are not tested, so this may lead to data compatibility issues.
173+
It is strongly discouraged to set this to true without doing extensive testing beforehand.
170174
|===
171175

172176

src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,11 @@ public void repair(AutoRepairConfig.RepairType repairType)
165165
logger.debug("Auto-repair is disabled for repair type {}", repairType);
166166
return;
167167
}
168+
if (!config.isMixedMajorVersionRepairEnabled() && AutoRepairUtils.hasMultipleLiveMajorVersions())
169+
{
170+
logger.info("Auto-repair is disabled when nodes in the cluster have different major versions");
171+
return;
172+
}
168173
AutoRepairService.instance.checkCanRun(repairType);
169174
AutoRepairState repairState = repairStates.get(repairType);
170175
try

src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ public class AutoRepairConfig implements Serializable
5959
// Minimum duration for the execution of a single repair task. This prevents the scheduler from overwhelming
6060
// the node by scheduling too many repair tasks in a short period of time.
6161
public volatile DurationSpec.LongSecondsBound repair_task_min_duration = new DurationSpec.LongSecondsBound("5s");
62+
// by default repair is disabled if there are mixed major versions detected, but you can enable it using this flag
63+
public volatile boolean mixed_major_version_repair_enabled = false;
6264

6365
// global_settings overrides Options.defaultOptions for all repair types
6466
public volatile Options global_settings;
@@ -149,6 +151,11 @@ public void setAutoRepairSchedulingEnabled(boolean enabled)
149151
this.enabled = enabled;
150152
}
151153

154+
/**
* Reports whether auto-repair may run while mixed major versions are detected.
*
* @return the current value of the mixed_major_version_repair_enabled field
*/
public boolean isMixedMajorVersionRepairEnabled()
155+
{
156+
return mixed_major_version_repair_enabled;
157+
}
158+
152159
public DurationSpec.IntSecondsBound getAutoRepairHistoryClearDeleteHostsBufferInterval()
153160
{
154161
return history_clear_delete_hosts_buffer_interval;
@@ -366,6 +373,16 @@ public void setRepairRetryBackoff(RepairType repairType, String interval)
366373
getOptions(repairType).repair_retry_backoff = new DurationSpec.LongSecondsBound(interval);
367374
}
368375

376+
/**
* Getter-style accessor for the mixed_major_version_repair_enabled field; it reads the
* same field as isMixedMajorVersionRepairEnabled().
*
* @return the current value of the mixed_major_version_repair_enabled field
*/
public boolean getMixedMajorVersionRepairEnabled()
377+
{
378+
return this.mixed_major_version_repair_enabled;
379+
}
380+
381+
/**
* Updates the mixed_major_version_repair_enabled field.
*
* @param enabled the new value to store in mixed_major_version_repair_enabled
*/
public void setMixedMajorVersionRepairEnabled(boolean enabled)
382+
{
383+
this.mixed_major_version_repair_enabled = enabled;
384+
}
385+
369386
@VisibleForTesting
370387
static IAutoRepairTokenRangeSplitter newAutoRepairTokenRangeSplitter(RepairType repairType, ParameterizedClass parameterizedClass) throws ConfigurationException
371388
{

src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,21 @@ public static CurrentRepairStatus getCurrentRepairStatus(RepairType repairType,
425425
return null;
426426
}
427427

428+
/**
429+
* Checks whether the cluster's minimum and maximum Cassandra versions differ in their major number.
430+
* Both versions are read from the same ClusterMetadata.current() result so they are compared consistently.
431+
* @return true if the cluster's maximum and minimum versions have different major numbers,
432+
* false if both have the same major number
433+
*
434+
*/
435+
public static boolean hasMultipleLiveMajorVersions()
436+
{
437+
ClusterMetadata metadata = ClusterMetadata.current();
438+
int maxMajorVersion = metadata.directory.clusterMaxVersion.cassandraVersion.major;
439+
int minMajorVersion = metadata.directory.clusterMinVersion.cassandraVersion.major;
440+
return maxMajorVersion != minMajorVersion;
441+
}
442+
428443
@VisibleForTesting
429444
protected static TreeSet<UUID> getHostIdsInCurrentRing(RepairType repairType, Collection<NodeAddresses> allNodesInRing)
430445
{

src/java/org/apache/cassandra/service/AutoRepairService.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ public String getAutoRepairConfiguration()
102102
appendConfig(sb, "repair_check_interval", config.getRepairCheckInterval());
103103
appendConfig(sb, "repair_task_min_duration", config.getRepairTaskMinDuration());
104104
appendConfig(sb, "history_clear_delete_hosts_buffer_interval", config.getAutoRepairHistoryClearDeleteHostsBufferInterval());
105+
appendConfig(sb, "mixed_major_version_repair_enabled", config.getMixedMajorVersionRepairEnabled());
105106
for (RepairType repairType : RepairType.values())
106107
{
107108
sb.append(formatRepairTypeConfig(repairType, config));
@@ -271,6 +272,12 @@ public void setAutoRepairRetryBackoff(String repairType, String interval)
271272
config.setRepairRetryBackoff(RepairType.parse(repairType), interval);
272273
}
273274

275+
/**
* Forwards the new flag value to config.setMixedMajorVersionRepairEnabled(boolean).
*
* @param enabled the value passed through to the auto-repair config
*/
@Override
276+
public void setMixedMajorVersionRepairEnabled(boolean enabled)
277+
{
278+
config.setMixedMajorVersionRepairEnabled(enabled);
279+
}
280+
274281
private String formatRepairTypeConfig(RepairType repairType, AutoRepairConfig config)
275282
{
276283
StringBuilder sb = new StringBuilder();

src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,6 @@ public interface AutoRepairServiceMBean
7474
public void setAutoRepairMaxRetriesCount(String repairType, int retries);
7575

7676
public void setAutoRepairRetryBackoff(String repairType, String interval);
77+
78+
public void setMixedMajorVersionRepairEnabled(boolean enabled);
7779
}

src/java/org/apache/cassandra/tools/NodeProbe.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2677,6 +2677,11 @@ public GuardrailsMBean getGuardrailsMBean()
26772677
{
26782678
return grProxy;
26792679
}
2680+
2681+
/**
* Forwards the flag value to autoRepairProxy.setMixedMajorVersionRepairEnabled(boolean).
*
* @param enabled the value passed through to the auto-repair proxy
*/
public void setMixedMajorVersionRepairEnabled(boolean enabled)
2682+
{
2683+
autoRepairProxy.setMixedMajorVersionRepairEnabled(enabled);
2684+
}
26802685
}
26812686

26822687
class ColumnFamilyStoreMBeanIterator implements Iterator<Map.Entry<String, ColumnFamilyStoreMBean>>

0 commit comments

Comments
 (0)