diff --git a/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/FlinkDeploymentStatus.java b/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/FlinkDeploymentStatus.java index 136d3415f3..9ba4456d56 100644 --- a/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/FlinkDeploymentStatus.java +++ b/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/FlinkDeploymentStatus.java @@ -19,8 +19,10 @@ import org.apache.flink.annotation.Experimental; import org.apache.flink.kubernetes.operator.api.spec.FlinkDeploymentSpec; +import org.apache.flink.kubernetes.operator.api.utils.ConditionUtils; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import io.fabric8.kubernetes.api.model.Condition; import lombok.AllArgsConstructor; import lombok.Data; import lombok.EqualsAndHashCode; @@ -28,7 +30,9 @@ import lombok.ToString; import lombok.experimental.SuperBuilder; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; /** Last observed status of the Flink deployment. */ @@ -55,4 +59,13 @@ public class FlinkDeploymentStatus extends CommonStatus { /** Information about the TaskManagers for the scale subresource. */ private TaskManagerInfo taskManager; + + /** Condition of the CR . */ + private List conditions = new ArrayList<>(); + + public List getConditions() { + Condition condition = ConditionUtils.getCondition(this); + ConditionUtils.updateLastTransitionTime(conditions, condition); + return condition == null ? List.of() : List.of(condition); + } } diff --git a/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/JobManagerDeploymentStatus.java b/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/JobManagerDeploymentStatus.java index 54a0181bc0..faecf29f85 100644 --- a/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/JobManagerDeploymentStatus.java +++ b/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/JobManagerDeploymentStatus.java @@ -21,18 +21,36 @@ public enum JobManagerDeploymentStatus { /** JobManager is running and ready to receive REST API calls. */ - READY, + READY("JobManagerReady", "JobManager is running and ready to receive REST API calls"), /** JobManager is running but not ready yet to receive REST API calls. */ - DEPLOYED_NOT_READY, + DEPLOYED_NOT_READY( + "DeployedNotReady", + "JobManager is running but not yet ready to receive REST API calls"), /** JobManager process is starting up. */ - DEPLOYING, + DEPLOYING("JobManagerIsDeploying", "JobManager process is starting up"), /** JobManager deployment not found, probably not started or killed by user. */ // TODO: currently a mix of SUSPENDED and ERROR, needs cleanup - MISSING, + MISSING("JobManagerDeploymentMissing", "JobManager deployment not found"), /** Deployment in terminal error, requires spec change for reconciliation to continue. */ - ERROR; + ERROR("Error", "JobManager deployment failed"); + + private String reason; + private String message; + + JobManagerDeploymentStatus(String reason, String message) { + this.reason = reason; + this.message = message; + } + + public String getReason() { + return reason; + } + + public String getMessage() { + return message; + } } diff --git a/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/utils/ConditionUtils.java b/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/utils/ConditionUtils.java new file mode 100644 index 0000000000..11b5573650 --- /dev/null +++ b/flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/utils/ConditionUtils.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.kubernetes.operator.api.utils; + +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.kubernetes.operator.api.status.FlinkDeploymentStatus; +import org.apache.flink.kubernetes.operator.api.status.JobManagerDeploymentStatus; + +import io.fabric8.kubernetes.api.model.Condition; +import io.fabric8.kubernetes.api.model.ConditionBuilder; + +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.List; + +import static org.apache.flink.api.common.JobStatus.RUNNING; +import static org.apache.flink.kubernetes.operator.api.status.JobManagerDeploymentStatus.READY; + +/** Creates a condition object with the type, status, message and reason. */ +public class ConditionUtils { + public static final String CONDITION_TYPE_RUNNING = "Running"; + + public static Condition getCondition(FlinkDeploymentStatus flinkDeploymentStatus) { + org.apache.flink.kubernetes.operator.api.status.JobStatus status = + flinkDeploymentStatus.getJobStatus(); + Condition conditionToAdd = null; + if (status != null) { + + JobStatus jobStatus = status.getState(); + + conditionToAdd = + jobStatus == null + ? getSessionModeCondition( + flinkDeploymentStatus.getJobManagerDeploymentStatus()) + : getApplicationModeCondition(jobStatus); + } + + return conditionToAdd; + } + + public static void updateLastTransitionTime(List conditions, Condition condition) { + if (condition == null) { + return; + } + Condition existingCondition = conditions.isEmpty() ? null : conditions.get(0); + if (isLastTransactionTimeStampUpdateRequired(existingCondition, condition)) { + condition.setLastTransitionTime( + new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").format(new Date())); + } else { + condition.setLastTransitionTime(existingCondition.getLastTransitionTime()); + } + } + + private static Condition getApplicationModeCondition(JobStatus jobStatus) { + return new ConditionBuilder() + .withType(CONDITION_TYPE_RUNNING) + .withStatus(jobStatus == RUNNING ? "True" : "False") + .withReason(toCamelCase(jobStatus.name())) + .withMessage("Job state " + jobStatus.name()) + .build(); + } + + private static Condition getSessionModeCondition(JobManagerDeploymentStatus jmStatus) { + return new ConditionBuilder() + .withType(CONDITION_TYPE_RUNNING) + .withStatus(jmStatus == READY ? "True" : "False") + .withReason(jmStatus.getReason()) + .withMessage(jmStatus.getMessage()) + .build(); + } + + private static String toCamelCase(String reason) { + reason = reason.toLowerCase(); + return reason.substring(0, 1).toUpperCase() + reason.substring(1); + } + + private static boolean isLastTransactionTimeStampUpdateRequired( + Condition existingCondition, Condition newCondition) { + return existingCondition == null + || !existingCondition.getStatus().equals(newCondition.getStatus()); + } +} diff --git a/flink-kubernetes-operator/src/test/java/org/apache/flink/kubernetes/operator/controller/FlinkDeploymentControllerTest.java b/flink-kubernetes-operator/src/test/java/org/apache/flink/kubernetes/operator/controller/FlinkDeploymentControllerTest.java index 0354a1256e..aa39acb548 100644 --- a/flink-kubernetes-operator/src/test/java/org/apache/flink/kubernetes/operator/controller/FlinkDeploymentControllerTest.java +++ b/flink-kubernetes-operator/src/test/java/org/apache/flink/kubernetes/operator/controller/FlinkDeploymentControllerTest.java @@ -69,6 +69,7 @@ import static org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions.OPERATOR_JOB_UPGRADE_LAST_STATE_FALLBACK_ENABLED; import static org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions.SNAPSHOT_RESOURCE_ENABLED; import static org.apache.flink.kubernetes.operator.utils.EventRecorder.Reason.ValidationError; +import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; @@ -280,6 +281,13 @@ public void verifyFailedDeployment() throws Exception { // Validate status assertNotNull(appCluster.getStatus().getError()); + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .hasSize(1) + .extracting("reason") + .contains("Reconciling"); + // next cycle should not create another event updateControl = testController.reconcile( @@ -364,6 +372,13 @@ public void verifyInProgressDeploymentWithError(String reason) throws Exception org.apache.flink.api.common.JobStatus.RECONCILING, appCluster.getStatus().getJobStatus().getState()); + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .hasSize(1) + .extracting("reason") + .contains("Reconciling"); + // Validate status status assertNotNull(appCluster.getStatus().getError()); @@ -448,6 +463,12 @@ public void verifyUpgradeFromSavepointLegacyMode(FlinkVersion flinkVersion) thro assertEquals( "savepoint_1", appCluster.getStatus().getJobStatus().getUpgradeSavepointPath()); + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .extracting("reason") + .contains("Finished"); + // Resume from last savepoint appCluster.getSpec().getJob().setState(JobState.RUNNING); testController.reconcile(appCluster, context); @@ -697,6 +718,12 @@ public void verifyReconcileWithAChangedOperatorModeToSession() throws Exception JobManagerDeploymentStatus.DEPLOYING, appCluster.getStatus().getJobManagerDeploymentStatus()); + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .extracting("reason") + .contains("Reconciling"); + updateControl = testController.reconcile(appCluster, context); JobStatus jobStatus = appCluster.getStatus().getJobStatus(); assertFalse(updateControl.isUpdateStatus()); @@ -706,6 +733,12 @@ public void verifyReconcileWithAChangedOperatorModeToSession() throws Exception // jobStatus has not been set at this time assertEquals(org.apache.flink.api.common.JobStatus.RECONCILING, jobStatus.getState()); + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .extracting("reason") + .contains("Reconciling"); + // Switches operator mode to SESSION appCluster.getSpec().setJob(null); // Validation fails and JobObserver should still be used @@ -727,6 +760,10 @@ public void verifyReconcileWithAChangedOperatorModeToSession() throws Exception assertEquals(expectedJobStatus.getJobId().toHexString(), jobStatus.getJobId()); assertEquals(expectedJobStatus.getJobName(), jobStatus.getJobName()); assertEquals(expectedJobStatus.getJobState(), jobStatus.getState()); + + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()).extracting("reason").contains("Running"); } @Test @@ -741,12 +778,26 @@ public void verifyReconcileWithAChangedOperatorModeToApplication() throws Except JobManagerDeploymentStatus.DEPLOYING, appCluster.getStatus().getJobManagerDeploymentStatus()); + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .hasSize(1) + .extracting("reason") + .contains("JobManagerIsDeploying"); + updateControl = testController.reconcile(appCluster, context); JobStatus jobStatus = appCluster.getStatus().getJobStatus(); assertFalse(updateControl.isUpdateStatus()); assertEquals( JobManagerDeploymentStatus.DEPLOYED_NOT_READY, appCluster.getStatus().getJobManagerDeploymentStatus()); + + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .hasSize(1) + .extracting("reason") + .contains("DeployedNotReady"); // jobStatus has not been set at this time assertNull(jobStatus.getState()); @@ -764,6 +815,13 @@ public void verifyReconcileWithAChangedOperatorModeToApplication() throws Except .getError() .contains("Cannot switch from session to job cluster")); assertNull(ReconciliationUtils.getDeployedSpec(appCluster).getJob()); + + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .hasSize(1) + .extracting("reason") + .contains("JobManagerReady"); } private void testUpgradeNotReadyCluster(FlinkDeployment appCluster) throws Exception { @@ -909,6 +967,10 @@ private void testUpgradeNotReadyCluster(FlinkDeployment appCluster) throws Excep assertEquals( JobManagerDeploymentStatus.READY, appCluster.getStatus().getJobManagerDeploymentStatus()); + + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()).extracting("reason").contains("Running"); } @Test @@ -1155,6 +1217,12 @@ private void verifyReconcileInitialSuspendedDeployment(FlinkDeployment appCluste assertNull(appCluster.getStatus().getError()); assertNull(reconciliationStatus.deserializeLastReconciledSpec()); assertNull(reconciliationStatus.getLastStableSpec()); + + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()) + .hasSize(1) + .extracting("message") + .contains("JobManager deployment not found"); } private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws Exception { @@ -1167,6 +1235,14 @@ private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws E org.apache.flink.api.common.JobStatus.RECONCILING, appCluster.getStatus().getJobStatus().getState()); assertEquals(4, testController.getInternalStatusUpdateCount()); + + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .hasSize(1) + .extracting("reason") + .contains("Reconciling"); + assertFalse(updateControl.isUpdateStatus()); assertEquals( Optional.of( @@ -1190,6 +1266,14 @@ private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws E org.apache.flink.api.common.JobStatus.RECONCILING, appCluster.getStatus().getJobStatus().getState()); assertEquals(5, testController.getInternalStatusUpdateCount()); + + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()) + .hasSize(1) + .extracting("reason") + .contains("Reconciling"); + assertFalse(updateControl.isUpdateStatus()); assertEquals( Optional.of( @@ -1203,6 +1287,11 @@ private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws E assertEquals( org.apache.flink.api.common.JobStatus.RUNNING, appCluster.getStatus().getJobStatus().getState()); + + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()).extracting("reason").contains("Running"); + assertEquals(6, testController.getInternalStatusUpdateCount()); assertFalse(updateControl.isUpdateStatus()); assertEquals( @@ -1225,6 +1314,10 @@ private void verifyReconcileNormalLifecycle(FlinkDeployment appCluster) throws E configManager.getOperatorConfiguration().getReconcileInterval().toMillis()), updateControl.getScheduleDelay()); + // Validate status conditions + assertThat(appCluster.getStatus().getConditions()).isNotNull(); + assertThat(appCluster.getStatus().getConditions()).extracting("reason").contains("Running"); + // Validate job status JobStatus jobStatus = appCluster.getStatus().getJobStatus(); JobStatusMessage expectedJobStatus = flinkService.listJobs().get(0).f1; diff --git a/helm/flink-kubernetes-operator/crds/flinkdeployments.flink.apache.org-v1.yml b/helm/flink-kubernetes-operator/crds/flinkdeployments.flink.apache.org-v1.yml index f498ae2431..8230132547 100644 --- a/helm/flink-kubernetes-operator/crds/flinkdeployments.flink.apache.org-v1.yml +++ b/helm/flink-kubernetes-operator/crds/flinkdeployments.flink.apache.org-v1.yml @@ -10229,6 +10229,23 @@ spec: additionalProperties: type: string type: object + conditions: + items: + properties: + lastTransitionTime: + type: string + message: + type: string + observedGeneration: + type: integer + reason: + type: string + status: + type: string + type: + type: string + type: object + type: array error: type: string jobManagerDeploymentStatus: