Skip to content

[FLINK-33634] Add Conditions to Flink CRD's Status field #957

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,20 @@

import org.apache.flink.annotation.Experimental;
import org.apache.flink.kubernetes.operator.api.spec.FlinkDeploymentSpec;
import org.apache.flink.kubernetes.operator.api.utils.ConditionUtils;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import io.fabric8.kubernetes.api.model.Condition;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.ToString;
import lombok.experimental.SuperBuilder;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Last observed status of the Flink deployment. */
Expand All @@ -55,4 +59,13 @@ public class FlinkDeploymentStatus extends CommonStatus<FlinkDeploymentSpec> {

/** Information about the TaskManagers for the scale subresource. */
private TaskManagerInfo taskManager;

/** Conditions of the CR. */
private List<Condition> conditions = new ArrayList<>();

/**
 * Returns the condition derived from the current status, wrapped in a list.
 *
 * <p>The condition is recomputed on every call via {@link ConditionUtils#getCondition}. Its
 * last transition time is then set by {@link ConditionUtils#updateLastTransitionTime}, which
 * compares it against the first entry of the stored {@code conditions} list.
 *
 * @return an immutable single-element list holding the computed condition, or an empty list
 *     when no condition could be derived
 */
public List<Condition> getConditions() {
    Condition condition = ConditionUtils.getCondition(this);
    if (condition == null) {
        return List.of();
    }
    // The stored list may be null (e.g. assigned null through a setter, builder or
    // deserialization - NOTE(review): class annotations are not visible here, confirm).
    // updateLastTransitionTime calls isEmpty() on it, so substitute an empty list.
    List<Condition> previousConditions = conditions == null ? List.of() : conditions;
    ConditionUtils.updateLastTransitionTime(previousConditions, condition);
    return List.of(condition);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,36 @@
public enum JobManagerDeploymentStatus {

/** JobManager is running and ready to receive REST API calls. */
READY,
READY("JobManagerReady", "JobManager is running and ready to receive REST API calls"),

/** JobManager is running but not ready yet to receive REST API calls. */
DEPLOYED_NOT_READY,
DEPLOYED_NOT_READY(
"DeployedNotReady",
"JobManager is running but not yet ready to receive REST API calls"),

/** JobManager process is starting up. */
DEPLOYING,
DEPLOYING("JobManagerIsDeploying", "JobManager process is starting up"),

/** JobManager deployment not found, probably not started or killed by user. */
// TODO: currently a mix of SUSPENDED and ERROR, needs cleanup
MISSING,
MISSING("JobManagerDeploymentMissing", "JobManager deployment not found"),

/** Deployment in terminal error, requires spec change for reconciliation to continue. */
ERROR;
ERROR("Error", "JobManager deployment failed");

/** Short reason string for this status, exposed through {@link #getReason()}. */
private final String reason;

/** Human-readable description of this status, exposed through {@link #getMessage()}. */
private final String message;

/**
 * Creates an enum constant carrying a fixed reason and message.
 *
 * @param reason short reason string for the status
 * @param message human-readable description of the status
 */
JobManagerDeploymentStatus(String reason, String message) {
    this.reason = reason;
    this.message = message;
}

/**
 * Returns the reason string supplied to this constant's constructor.
 *
 * @return the reason associated with this status
 */
public String getReason() {
    return this.reason;
}

/**
 * Returns the message string supplied to this constant's constructor.
 *
 * @return the message associated with this status
 */
public String getMessage() {
    return this.message;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.kubernetes.operator.api.utils;

import org.apache.flink.api.common.JobStatus;
import org.apache.flink.kubernetes.operator.api.status.FlinkDeploymentStatus;
import org.apache.flink.kubernetes.operator.api.status.JobManagerDeploymentStatus;

import io.fabric8.kubernetes.api.model.Condition;
import io.fabric8.kubernetes.api.model.ConditionBuilder;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;

import static org.apache.flink.api.common.JobStatus.RUNNING;
import static org.apache.flink.kubernetes.operator.api.status.JobManagerDeploymentStatus.READY;

/** Creates a condition object with the type, status, message and reason. */
public class ConditionUtils {
public static final String CONDITION_TYPE_RUNNING = "Running";

public static Condition getCondition(FlinkDeploymentStatus flinkDeploymentStatus) {
Copy link

@davidradl davidradl May 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only the parameters that are required should be passed, not the complete FlinkDeploymentStatus.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think, based on the comment linked here, that @gyfora meant for that util method to operate on the status.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think, based on the comment linked here, that @gyfora meant for that util method to operate on the status.

OK I see

org.apache.flink.kubernetes.operator.api.status.JobStatus status =
flinkDeploymentStatus.getJobStatus();
Condition conditionToAdd = null;
if (status != null) {

JobStatus jobStatus = status.getState();

conditionToAdd =
jobStatus == null
? getSessionModeCondition(
flinkDeploymentStatus.getJobManagerDeploymentStatus())
: getApplicationModeCondition(jobStatus);
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we not update the last transition time here?

return conditionToAdd;
}

public static void updateLastTransitionTime(List<Condition> conditions, Condition condition) {
if (condition == null) {
return;
}
Condition existingCondition = conditions.isEmpty() ? null : conditions.get(0);
if (isLastTransactionTimeStampUpdateRequired(existingCondition, condition)) {
condition.setLastTransitionTime(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The above is from flink-autoscaler, but flink-kubernetes-operator-api doesn't have a dependency on flink-autoscaler, so we can't use it.

new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").format(new Date()));
} else {
condition.setLastTransitionTime(existingCondition.getLastTransitionTime());
}
}

private static Condition getApplicationModeCondition(JobStatus jobStatus) {
return new ConditionBuilder()
.withType(CONDITION_TYPE_RUNNING)
.withStatus(jobStatus == RUNNING ? "True" : "False")
.withReason(toCamelCase(jobStatus.name()))
.withMessage("Job state " + jobStatus.name())

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the message say "Job status", as that is what we are reading?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes,

.build();
}

/**
 * Builds the {@code Running} condition from the JobManager deployment status.
 *
 * <p>The condition status is {@code "True"} only when the JobManager deployment status is
 * {@code READY}; the reason and message are taken from the enum constant itself.
 *
 * @param jmStatus JobManager deployment status to translate
 * @return the resulting condition
 */
private static Condition getSessionModeCondition(JobManagerDeploymentStatus jmStatus) {
    boolean jobManagerReady = jmStatus == READY;
    String reason = jmStatus.getReason();
    String message = jmStatus.getMessage();
    return new ConditionBuilder()
            .withType(CONDITION_TYPE_RUNNING)
            .withStatus(jobManagerReady ? "True" : "False")
            .withReason(reason)
            .withMessage(message)
            .build();
}

private static String toCamelCase(String reason) {
reason = reason.toLowerCase();
return reason.substring(0, 1).toUpperCase() + reason.substring(1);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Camel case is not just upper-casing the first letter. We may need to upper-case letters within the string as well. I suggest mapping each lower-case name to the appropriate camel-cased reason.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, but that is not needed for now, as per the comment linked here. If we go with a mapping, we end up with a new method; do we require that?

}

/**
 * Decides whether the new condition needs a fresh last transition time.
 *
 * <p>A fresh timestamp is required when there is no existing condition, or when the status
 * value differs between the existing and the new condition. The comparison is null-safe, so
 * an existing condition without a status does not cause a {@link NullPointerException}.
 *
 * <p>NOTE(review): the method name says "Transaction" but the value concerned is the last
 * transition time; the name is kept because the caller refers to it.
 *
 * @param existingCondition previously stored condition, may be {@code null}
 * @param newCondition newly computed condition, must not be {@code null}
 * @return {@code true} if the last transition time should be set to the current time
 */
private static boolean isLastTransactionTimeStampUpdateRequired(
        Condition existingCondition, Condition newCondition) {
    if (existingCondition == null) {
        return true;
    }
    String previousStatus = existingCondition.getStatus();
    String currentStatus = newCondition.getStatus();
    // Treat a missing previous status as "changed" unless the new status is missing too.
    return previousStatus == null
            ? currentStatus != null
            : !previousStatus.equals(currentStatus);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
import static org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions.OPERATOR_JOB_UPGRADE_LAST_STATE_FALLBACK_ENABLED;
import static org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions.SNAPSHOT_RESOURCE_ENABLED;
import static org.apache.flink.kubernetes.operator.utils.EventRecorder.Reason.ValidationError;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
Expand Down Expand Up @@ -280,6 +281,13 @@ public void verifyFailedDeployment() throws Exception {
// Validate status
assertNotNull(appCluster.getStatus().getError());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.hasSize(1)
.extracting("reason")
.contains("Reconciling");

// next cycle should not create another event
updateControl =
testController.reconcile(
Expand Down Expand Up @@ -364,6 +372,13 @@ public void verifyInProgressDeploymentWithError(String reason) throws Exception
org.apache.flink.api.common.JobStatus.RECONCILING,
appCluster.getStatus().getJobStatus().getState());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.hasSize(1)
.extracting("reason")
.contains("Reconciling");

// Validate status
assertNotNull(appCluster.getStatus().getError());

Expand Down Expand Up @@ -448,6 +463,12 @@ public void verifyUpgradeFromSavepointLegacyMode(FlinkVersion flinkVersion) thro
assertEquals(
"savepoint_1", appCluster.getStatus().getJobStatus().getUpgradeSavepointPath());

// Validate status conditions

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use a Java parameterized test (or more than one, as appropriate) to cover all the permutations of the tests?

assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.extracting("reason")
.contains("Finished");

// Resume from last savepoint
appCluster.getSpec().getJob().setState(JobState.RUNNING);
testController.reconcile(appCluster, context);
Expand Down Expand Up @@ -697,6 +718,12 @@ public void verifyReconcileWithAChangedOperatorModeToSession() throws Exception
JobManagerDeploymentStatus.DEPLOYING,
appCluster.getStatus().getJobManagerDeploymentStatus());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.extracting("reason")
.contains("Reconciling");

updateControl = testController.reconcile(appCluster, context);
JobStatus jobStatus = appCluster.getStatus().getJobStatus();
assertFalse(updateControl.isUpdateStatus());
Expand All @@ -706,6 +733,12 @@ public void verifyReconcileWithAChangedOperatorModeToSession() throws Exception
// jobStatus has not been set at this time
assertEquals(org.apache.flink.api.common.JobStatus.RECONCILING, jobStatus.getState());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.extracting("reason")
.contains("Reconciling");

// Switches operator mode to SESSION
appCluster.getSpec().setJob(null);
// Validation fails and JobObserver should still be used
Expand All @@ -727,6 +760,10 @@ public void verifyReconcileWithAChangedOperatorModeToSession() throws Exception
assertEquals(expectedJobStatus.getJobId().toHexString(), jobStatus.getJobId());
assertEquals(expectedJobStatus.getJobName(), jobStatus.getJobName());
assertEquals(expectedJobStatus.getJobState(), jobStatus.getState());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions()).extracting("reason").contains("Running");
}

@Test
Expand All @@ -741,12 +778,26 @@ public void verifyReconcileWithAChangedOperatorModeToApplication() throws Except
JobManagerDeploymentStatus.DEPLOYING,
appCluster.getStatus().getJobManagerDeploymentStatus());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.hasSize(1)
.extracting("reason")
.contains("JobManagerIsDeploying");

updateControl = testController.reconcile(appCluster, context);
JobStatus jobStatus = appCluster.getStatus().getJobStatus();
assertFalse(updateControl.isUpdateStatus());
assertEquals(
JobManagerDeploymentStatus.DEPLOYED_NOT_READY,
appCluster.getStatus().getJobManagerDeploymentStatus());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.hasSize(1)
.extracting("reason")
.contains("DeployedNotReady");
// jobStatus has not been set at this time
assertNull(jobStatus.getState());

Expand All @@ -764,6 +815,13 @@ public void verifyReconcileWithAChangedOperatorModeToApplication() throws Except
.getError()
.contains("Cannot switch from session to job cluster"));
assertNull(ReconciliationUtils.getDeployedSpec(appCluster).getJob());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.hasSize(1)
.extracting("reason")
.contains("JobManagerReady");
}

private void testUpgradeNotReadyCluster(FlinkDeployment appCluster) throws Exception {
Expand Down Expand Up @@ -909,6 +967,10 @@ private void testUpgradeNotReadyCluster(FlinkDeployment appCluster) throws Excep
assertEquals(
JobManagerDeploymentStatus.READY,
appCluster.getStatus().getJobManagerDeploymentStatus());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions()).extracting("reason").contains("Running");
}

@Test
Expand Down Expand Up @@ -1155,6 +1217,12 @@ private void verifyReconcileInitialSuspendedDeployment(FlinkDeployment appCluste
assertNull(appCluster.getStatus().getError());
assertNull(reconciliationStatus.deserializeLastReconciledSpec());
assertNull(reconciliationStatus.getLastStableSpec());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions())
.hasSize(1)
.extracting("message")
.contains("JobManager deployment not found");
}

private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws Exception {
Expand All @@ -1167,6 +1235,14 @@ private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws E
org.apache.flink.api.common.JobStatus.RECONCILING,
appCluster.getStatus().getJobStatus().getState());
assertEquals(4, testController.getInternalStatusUpdateCount());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.hasSize(1)
.extracting("reason")
.contains("Reconciling");

assertFalse(updateControl.isUpdateStatus());
assertEquals(
Optional.of(
Expand All @@ -1190,6 +1266,14 @@ private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws E
org.apache.flink.api.common.JobStatus.RECONCILING,
appCluster.getStatus().getJobStatus().getState());
assertEquals(5, testController.getInternalStatusUpdateCount());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions())
.hasSize(1)
.extracting("reason")
.contains("Reconciling");

assertFalse(updateControl.isUpdateStatus());
assertEquals(
Optional.of(
Expand All @@ -1203,6 +1287,11 @@ private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws E
assertEquals(
org.apache.flink.api.common.JobStatus.RUNNING,
appCluster.getStatus().getJobStatus().getState());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions()).extracting("reason").contains("Running");

assertEquals(6, testController.getInternalStatusUpdateCount());
assertFalse(updateControl.isUpdateStatus());
assertEquals(
Expand All @@ -1225,6 +1314,10 @@ private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws E
configManager.getOperatorConfiguration().getReconcileInterval().toMillis()),
updateControl.getScheduleDelay());

// Validate status conditions
assertThat(appCluster.getStatus().getConditions()).isNotNull();
assertThat(appCluster.getStatus().getConditions()).extracting("reason").contains("Running");

// Validate job status
JobStatus jobStatus = appCluster.getStatus().getJobStatus();
JobStatusMessage expectedJobStatus = flinkService.listJobs().get(0).f1;
Expand Down
Loading