Update with external checkpoints #199

Open · wants to merge 7 commits into base: master
3 changes: 3 additions & 0 deletions deploy/crd.yaml
@@ -78,6 +78,9 @@ spec:
parallelism:
type: integer
minimum: 1
updateMode:
type: string
enum: [Savepoint, Checkpoint, NoStateRestore]
deleteMode:
type: string
enum: [Savepoint, None, ForceCancel]
17 changes: 17 additions & 0 deletions pkg/apis/app/v1beta1/types.go
@@ -54,13 +54,21 @@ type FlinkApplicationSpec struct {
Volumes []apiv1.Volume `json:"volumes,omitempty"`
VolumeMounts []apiv1.VolumeMount `json:"volumeMounts,omitempty"`
RestartNonce string `json:"restartNonce"`
UpdateMode UpdateMode `json:"updateMode,omitempty"`
DeleteMode DeleteMode `json:"deleteMode,omitempty"`
AllowNonRestoredState bool `json:"allowNonRestoredState,omitempty"`
ForceRollback bool `json:"forceRollback"`
MaxCheckpointRestoreAgeSeconds *int32 `json:"maxCheckpointRestoreAgeSeconds,omitempty"`
TearDownVersionHash string `json:"tearDownVersionHash,omitempty"`
}

func (spec FlinkApplicationSpec) SavepointingDisabled() bool {
if spec.SavepointDisabled {
return spec.SavepointDisabled
}
return spec.UpdateMode == UpdateModeNoState
}

type FlinkConfig map[string]interface{}

// Workaround for https://github.com/kubernetes-sigs/kubebuilder/issues/528
@@ -302,6 +310,14 @@ const (
DeleteModeNone DeleteMode = "None"
)

type UpdateMode string

const (
UpdateModeSavepoint UpdateMode = "Savepoint"
UpdateModeCheckpoint UpdateMode = "Checkpoint"
UpdateModeNoState UpdateMode = "NoStateRestore"
)

type HealthStatus string

const (
@@ -353,6 +369,7 @@ const (
GetJobConfig FlinkMethod = "GetJobConfig"
GetTaskManagers FlinkMethod = "GetTaskManagers"
GetCheckpointCounts FlinkMethod = "GetCheckpointCounts"
GetCheckpointConfig FlinkMethod = "GetCheckpointConfig"
GetJobOverview FlinkMethod = "GetJobOverview"
SavepointJob FlinkMethod = "SavepointJob"
)
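For context (not part of this diff), a minimal usage sketch of how the new helper resolves the legacy SavepointDisabled flag against the new UpdateMode; the import path for the operator's API types is assumed here:

```go
package main

import (
	"fmt"

	// Assumed import path for the operator's API types.
	"github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1"
)

func main() {
	// A spec that requests a stateless restart via the new field rather
	// than the legacy SavepointDisabled flag.
	spec := v1beta1.FlinkApplicationSpec{
		SavepointDisabled: false,
		UpdateMode:        v1beta1.UpdateModeNoState,
	}
	// SavepointingDisabled() is true if either the legacy flag is set
	// or UpdateMode is NoStateRestore.
	fmt.Println(spec.SavepointingDisabled()) // true
}
```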
101 changes: 63 additions & 38 deletions pkg/controller/flink/client/api.go
@@ -58,6 +58,7 @@ type FlinkAPIInterface interface {
GetJobConfig(ctx context.Context, url string, jobID string) (*JobConfigResponse, error)
GetTaskManagers(ctx context.Context, url string) (*TaskManagersResponse, error)
GetCheckpointCounts(ctx context.Context, url string, jobID string) (*CheckpointResponse, error)
GetCheckpointConfig(ctx context.Context, url string, jobID string) (*CheckpointConfigResponse, error)
GetJobOverview(ctx context.Context, url string, jobID string) (*FlinkJobOverview, error)
}

@@ -66,49 +67,53 @@ type FlinkJobManagerClient struct {
}

type flinkJobManagerClientMetrics struct {
scope promutils.Scope
submitJobSuccessCounter labeled.Counter
submitJobFailureCounter labeled.Counter
cancelJobSuccessCounter labeled.Counter
cancelJobFailureCounter labeled.Counter
forceCancelJobSuccessCounter labeled.Counter
forceCancelJobFailureCounter labeled.Counter
checkSavepointSuccessCounter labeled.Counter
checkSavepointFailureCounter labeled.Counter
getJobsSuccessCounter labeled.Counter
getJobsFailureCounter labeled.Counter
getJobConfigSuccessCounter labeled.Counter
getJobConfigFailureCounter labeled.Counter
getClusterSuccessCounter labeled.Counter
getClusterFailureCounter labeled.Counter
getCheckpointsSuccessCounter labeled.Counter
getCheckpointsFailureCounter labeled.Counter
savepointJobSuccessCounter labeled.Counter
savepointJobFailureCounter labeled.Counter
scope promutils.Scope
submitJobSuccessCounter labeled.Counter
submitJobFailureCounter labeled.Counter
cancelJobSuccessCounter labeled.Counter
cancelJobFailureCounter labeled.Counter
forceCancelJobSuccessCounter labeled.Counter
forceCancelJobFailureCounter labeled.Counter
checkSavepointSuccessCounter labeled.Counter
checkSavepointFailureCounter labeled.Counter
getJobsSuccessCounter labeled.Counter
getJobsFailureCounter labeled.Counter
getJobConfigSuccessCounter labeled.Counter
getJobConfigFailureCounter labeled.Counter
getClusterSuccessCounter labeled.Counter
getClusterFailureCounter labeled.Counter
getCheckpointsSuccessCounter labeled.Counter
getCheckpointsFailureCounter labeled.Counter
savepointJobSuccessCounter labeled.Counter
savepointJobFailureCounter labeled.Counter
getCheckpointsConfigSuccessCounter labeled.Counter
getCheckpointsConfigFailureCounter labeled.Counter
}

func newFlinkJobManagerClientMetrics(scope promutils.Scope) *flinkJobManagerClientMetrics {
flinkJmClientScope := scope.NewSubScope("flink_jm_client")
return &flinkJobManagerClientMetrics{
scope: scope,
submitJobSuccessCounter: labeled.NewCounter("submit_job_success", "Flink job submission successful", flinkJmClientScope),
submitJobFailureCounter: labeled.NewCounter("submit_job_failure", "Flink job submission failed", flinkJmClientScope),
cancelJobSuccessCounter: labeled.NewCounter("cancel_job_success", "Flink job cancellation successful", flinkJmClientScope),
cancelJobFailureCounter: labeled.NewCounter("cancel_job_failure", "Flink job cancellation failed", flinkJmClientScope),
forceCancelJobSuccessCounter: labeled.NewCounter("force_cancel_job_success", "Flink forced job cancellation successful", flinkJmClientScope),
forceCancelJobFailureCounter: labeled.NewCounter("force_cancel_job_failure", "Flink forced job cancellation failed", flinkJmClientScope),
checkSavepointSuccessCounter: labeled.NewCounter("check_savepoint_status_success", "Flink check savepoint status successful", flinkJmClientScope),
checkSavepointFailureCounter: labeled.NewCounter("check_savepoint_status_failure", "Flink check savepoint status failed", flinkJmClientScope),
getJobsSuccessCounter: labeled.NewCounter("get_jobs_success", "Get flink jobs succeeded", flinkJmClientScope),
getJobsFailureCounter: labeled.NewCounter("get_jobs_failure", "Get flink jobs failed", flinkJmClientScope),
getJobConfigSuccessCounter: labeled.NewCounter("get_job_config_success", "Get flink job config succeeded", flinkJmClientScope),
getJobConfigFailureCounter: labeled.NewCounter("get_job_config_failure", "Get flink job config failed", flinkJmClientScope),
getClusterSuccessCounter: labeled.NewCounter("get_cluster_success", "Get cluster overview succeeded", flinkJmClientScope),
getClusterFailureCounter: labeled.NewCounter("get_cluster_failure", "Get cluster overview failed", flinkJmClientScope),
getCheckpointsSuccessCounter: labeled.NewCounter("get_checkpoints_success", "Get checkpoint request succeeded", flinkJmClientScope),
getCheckpointsFailureCounter: labeled.NewCounter("get_checkpoints_failed", "Get checkpoint request failed", flinkJmClientScope),
savepointJobSuccessCounter: labeled.NewCounter("savepoint_job_success", "Savepoint job request succeeded", flinkJmClientScope),
savepointJobFailureCounter: labeled.NewCounter("savepoint_job_failed", "Savepoint job request failed", flinkJmClientScope),
scope: scope,
submitJobSuccessCounter: labeled.NewCounter("submit_job_success", "Flink job submission successful", flinkJmClientScope),
submitJobFailureCounter: labeled.NewCounter("submit_job_failure", "Flink job submission failed", flinkJmClientScope),
cancelJobSuccessCounter: labeled.NewCounter("cancel_job_success", "Flink job cancellation successful", flinkJmClientScope),
cancelJobFailureCounter: labeled.NewCounter("cancel_job_failure", "Flink job cancellation failed", flinkJmClientScope),
forceCancelJobSuccessCounter: labeled.NewCounter("force_cancel_job_success", "Flink forced job cancellation successful", flinkJmClientScope),
forceCancelJobFailureCounter: labeled.NewCounter("force_cancel_job_failure", "Flink forced job cancellation failed", flinkJmClientScope),
checkSavepointSuccessCounter: labeled.NewCounter("check_savepoint_status_success", "Flink check savepoint status successful", flinkJmClientScope),
checkSavepointFailureCounter: labeled.NewCounter("check_savepoint_status_failure", "Flink check savepoint status failed", flinkJmClientScope),
getJobsSuccessCounter: labeled.NewCounter("get_jobs_success", "Get flink jobs succeeded", flinkJmClientScope),
getJobsFailureCounter: labeled.NewCounter("get_jobs_failure", "Get flink jobs failed", flinkJmClientScope),
getJobConfigSuccessCounter: labeled.NewCounter("get_job_config_success", "Get flink job config succeeded", flinkJmClientScope),
getJobConfigFailureCounter: labeled.NewCounter("get_job_config_failure", "Get flink job config failed", flinkJmClientScope),
getClusterSuccessCounter: labeled.NewCounter("get_cluster_success", "Get cluster overview succeeded", flinkJmClientScope),
getClusterFailureCounter: labeled.NewCounter("get_cluster_failure", "Get cluster overview failed", flinkJmClientScope),
getCheckpointsSuccessCounter: labeled.NewCounter("get_checkpoints_success", "Get checkpoint request succeeded", flinkJmClientScope),
getCheckpointsFailureCounter: labeled.NewCounter("get_checkpoints_failed", "Get checkpoint request failed", flinkJmClientScope),
savepointJobSuccessCounter: labeled.NewCounter("savepoint_job_success", "Savepoint job request succeeded", flinkJmClientScope),
savepointJobFailureCounter: labeled.NewCounter("savepoint_job_failed", "Savepoint job request failed", flinkJmClientScope),
getCheckpointsConfigSuccessCounter: labeled.NewCounter("get_checkpoints_config_success", "Get checkpoint config request succeeded", flinkJmClientScope),
getCheckpointsConfigFailureCounter: labeled.NewCounter("get_checkpoints_config_failed", "Get checkpoint config request failed", flinkJmClientScope),
}
Author:

I love go fmt, but it can also be disruptive: formatting-only changes bloat the PR and hide the actual diff.

I could add a line break to separate out the new, longer struct members and minimize the line-change count on things like this.

Contributor:

Agreed. Not a problem, ok to leave as is :)
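For illustration (not part of this PR), the line-break trick mentioned above: gofmt only aligns fields within contiguous groups, so a blank line before the new, longer member keeps the existing lines untouched:

```go
package metricsdemo

type counter struct{}

// Without a separating blank line, gofmt aligns the whole block to the
// longest field name, so adding one long member re-indents every line.
type alignedTogether struct {
	getCheckpointsSuccessCounter       counter
	getCheckpointsConfigSuccessCounter counter
}

// A blank line starts a new alignment group: existing fields keep their
// original columns and the diff only shows the added lines.
type alignedInGroups struct {
	getCheckpointsSuccessCounter counter

	getCheckpointsConfigSuccessCounter counter
}
```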

}

@@ -369,6 +374,26 @@ func (c *FlinkJobManagerClient) GetCheckpointCounts(ctx context.Context, url str
c.metrics.getCheckpointsSuccessCounter.Inc(ctx)
return &checkpointResponse, nil
}
func (c *FlinkJobManagerClient) GetCheckpointConfig(ctx context.Context, url string, jobID string) (*CheckpointConfigResponse, error) {
endpoint := fmt.Sprintf(url+checkpointsURL+"/config", jobID)
response, err := c.executeRequest(ctx, httpGet, endpoint, nil)
if err != nil {
c.metrics.getCheckpointsConfigFailureCounter.Inc(ctx)
return nil, GetRetryableError(err, v1beta1.GetCheckpointConfig, GlobalFailure, DefaultRetries)
}
if response != nil && !response.IsSuccess() {
c.metrics.getCheckpointsConfigFailureCounter.Inc(ctx)
return nil, GetRetryableError(err, v1beta1.GetCheckpointConfig, response.Status(), DefaultRetries)
}

var checkpointConfigResponse CheckpointConfigResponse
if err = json.Unmarshal(response.Body(), &checkpointConfigResponse); err != nil {
logger.Errorf(ctx, "Failed to unmarshal checkpointConfigResponse %v, err %v", response, err)
}

c.metrics.getCheckpointsConfigSuccessCounter.Inc(ctx)
return &checkpointConfigResponse, nil
}

func (c *FlinkJobManagerClient) GetJobOverview(ctx context.Context, url string, jobID string) (*FlinkJobOverview, error) {
endpoint := fmt.Sprintf(url+GetJobsOverviewURL, jobID)
9 changes: 9 additions & 0 deletions pkg/controller/flink/client/entities.go
@@ -139,12 +139,21 @@ type LatestCheckpoints struct {
Restored *CheckpointStatistics `json:"restored,omitempty"`
}

type ExternalizedCheckpoints struct {
Enabled bool `json:"enable,omitempty"`
DeleteOnCancellation bool `json:"delete_on_cancellation,omitempty"`
}

type CheckpointResponse struct {
Counts map[string]int32 `json:"counts"`
Latest LatestCheckpoints `json:"latest"`
History []CheckpointStatistics `json:"history"`
}

type CheckpointConfigResponse struct {
Externalization *ExternalizedCheckpoints `json:"externalization"`
}

type TaskManagerStats struct {
Path string `json:"path"`
DataPort int32 `json:"dataPort"`
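A quick sketch (not in this PR) of how a checkpoint-config payload decodes into the new types; the JSON here is shaped to match the struct tags declared above, and the real Flink REST response may differ slightly:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Local copies of the new types, with the JSON tags as declared in this PR.
type ExternalizedCheckpoints struct {
	Enabled              bool `json:"enable,omitempty"`
	DeleteOnCancellation bool `json:"delete_on_cancellation,omitempty"`
}

type CheckpointConfigResponse struct {
	Externalization *ExternalizedCheckpoints `json:"externalization"`
}

func main() {
	// Example payload shaped to match the struct tags above.
	body := []byte(`{"externalization":{"enable":true,"delete_on_cancellation":false}}`)

	var cfg CheckpointConfigResponse
	if err := json.Unmarshal(body, &cfg); err != nil {
		panic(err)
	}
	// Externalized checkpoints retained on cancellation are safe to restore from.
	fmt.Println(cfg.Externalization.Enabled, cfg.Externalization.DeleteOnCancellation) // true false
}
```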
9 changes: 9 additions & 0 deletions pkg/controller/flink/client/mock/mock_api.go
@@ -16,6 +16,7 @@ type GetLatestCheckpointFunc func(ctx context.Context, url string, jobID string)
type GetJobConfigFunc func(ctx context.Context, url string, jobID string) (*client.JobConfigResponse, error)
type GetTaskManagersFunc func(ctx context.Context, url string) (*client.TaskManagersResponse, error)
type GetCheckpointCountsFunc func(ctx context.Context, url string, jobID string) (*client.CheckpointResponse, error)
type GetCheckpointConfigFunc func(ctx context.Context, url string, jobID string) (*client.CheckpointConfigResponse, error)
type GetJobOverviewFunc func(ctx context.Context, url string, jobID string) (*client.FlinkJobOverview, error)
type SavepointJobFunc func(ctx context.Context, url string, jobID string) (string, error)
type JobManagerClient struct {
@@ -29,6 +30,7 @@ type JobManagerClient struct {
GetLatestCheckpointFunc GetLatestCheckpointFunc
GetTaskManagersFunc GetTaskManagersFunc
GetCheckpointCountsFunc GetCheckpointCountsFunc
GetCheckpointConfigFunc GetCheckpointConfigFunc
GetJobOverviewFunc GetJobOverviewFunc
SavepointJobFunc SavepointJobFunc
}
@@ -103,6 +105,13 @@ func (m *JobManagerClient) GetCheckpointCounts(ctx context.Context, url string,
return nil, nil
}

func (m *JobManagerClient) GetCheckpointConfig(ctx context.Context, url string, jobID string) (*client.CheckpointConfigResponse, error) {
if m.GetCheckpointConfigFunc != nil {
return m.GetCheckpointConfigFunc(ctx, url, jobID)
}
return nil, nil
}

func (m *JobManagerClient) GetJobOverview(ctx context.Context, url string, jobID string) (*client.FlinkJobOverview, error) {
if m.GetJobOverviewFunc != nil {
return m.GetJobOverviewFunc(ctx, url, jobID)
1 change: 0 additions & 1 deletion pkg/controller/flink/config.go
@@ -71,7 +71,6 @@ func getInternalMetricsQueryPort(app *v1beta1.FlinkApplication) int32 {
func getMaxCheckpointRestoreAgeSeconds(app *v1beta1.FlinkApplication) int32 {
return firstNonNil(app.Spec.MaxCheckpointRestoreAgeSeconds, MaxCheckpointRestoreAgeSeconds)
}

func getTaskManagerMemory(application *v1beta1.FlinkApplication) int64 {
tmResources := application.Spec.TaskManagerConfig.Resources
if tmResources == nil {
25 changes: 22 additions & 3 deletions pkg/controller/flink/flink.go
@@ -85,6 +85,9 @@ type ControllerInterface interface {
// able to savepoint for some reason.
FindExternalizedCheckpoint(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, error)

// Ensures that application is configured to externalize and *not* delete checkpoints on cancel.
FindExternalizedCheckpointForSavepoint(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, error)

// Logs an event to the FlinkApplication resource and to the operator log
LogEvent(ctx context.Context, app *v1beta1.FlinkApplication, eventType string, reason string, message string)

@@ -468,8 +471,8 @@ func (f *Controller) DeleteOldResourcesForApp(ctx context.Context, app *v1beta1.
return nil
}

func (f *Controller) FindExternalizedCheckpoint(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, error) {
checkpoint, err := f.flinkClient.GetLatestCheckpoint(ctx, f.getURLFromApp(application, hash), f.GetLatestJobID(ctx, application))
func (f *Controller) findExternalizedCheckpoint(ctx context.Context, application *v1beta1.FlinkApplication, hash string, checkpointMaxAge int32) (string, error) {
checkpoint, err := f.flinkClient.GetLatestCheckpoint(ctx, f.getURLFromApp(application, hash), application.Status.JobStatus.JobID)
var checkpointPath string
var checkpointTime int64
if err != nil {
@@ -490,12 +493,28 @@ func (f *Controller) FindExternalizedCheckpoint(ctx context.Context, application
return "", nil
}

if isCheckpointOldToRecover(checkpointTime, getMaxCheckpointRestoreAgeSeconds(application)) {
if isCheckpointOldToRecover(checkpointTime, checkpointMaxAge) {
logger.Info(ctx, "Found checkpoint to restore from, but was too old")
return "", nil
}

return checkpointPath, nil

}

func (f *Controller) FindExternalizedCheckpoint(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, error) {
return f.findExternalizedCheckpoint(ctx, application, hash, getMaxCheckpointRestoreAgeSeconds(application))
}

func (f *Controller) FindExternalizedCheckpointForSavepoint(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, error) {
checkpointConfig, err := f.flinkClient.GetCheckpointConfig(ctx, f.getURLFromApp(application, hash), application.Status.JobStatus.JobID)
Contributor:

Is there opportunity to combine FindExternalizedCheckpoint and FindExternalizedCheckpointForSavepoint into a single method? I believe that the checkpoint configuration check (to ensure it's RETAIN_ON_CANCELLATION) is applicable to both cases.
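One possible shape for a combined method (a rough sketch only, reusing the helpers already in this file):

```go
// Sketch: always validate the checkpoint configuration before looking up
// the latest externalized checkpoint, for both the savepoint-fallback and
// checkpoint-update paths.
func (f *Controller) FindExternalizedCheckpoint(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, error) {
	config, err := f.flinkClient.GetCheckpointConfig(ctx, f.getURLFromApp(application, hash), application.Status.JobStatus.JobID)
	if err != nil {
		return "", err
	}
	// Only externalized checkpoints retained on cancellation are safe to
	// restore from once the old job has been cancelled.
	if !config.Externalization.Enabled || config.Externalization.DeleteOnCancellation {
		return "", fmt.Errorf("checkpoint configuration is not compatible with restoring from checkpoints")
	}
	return f.findExternalizedCheckpoint(ctx, application, hash, getMaxCheckpointRestoreAgeSeconds(application))
}
```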

if err != nil {
return "", err
}
if !checkpointConfig.Externalization.Enabled || checkpointConfig.Externalization.DeleteOnCancellation {
return "", fmt.Errorf("checkpoint configuration is not compatible with restoring from checkpoints")
}
return f.findExternalizedCheckpoint(ctx, application, hash, getMaxCheckpointRestoreAgeSeconds(application))
}

func isCheckpointOldToRecover(checkpointTime int64, maxCheckpointRecoveryAgeSec int32) bool {
10 changes: 10 additions & 0 deletions pkg/controller/flink/flink_test.go
@@ -826,6 +826,16 @@ func TestJobStatusUpdated(t *testing.T) {
}, nil
}

mockJmClient.GetCheckpointConfigFunc = func(ctx context.Context, url string, jobID string) (*client.CheckpointConfigResponse, error) {
assert.Equal(t, url, "http://app-name-hash.ns:8081")
return &client.CheckpointConfigResponse{
Externalization: &client.ExternalizedCheckpoints{
Enabled: true,
DeleteOnCancellation: false,
},
}, nil
}

flinkApp.Status.JobStatus.JobID = "abc"
expectedTime := metaV1.NewTime(time.Unix(startTime/1000, 0))
_, err = flinkControllerForTest.CompareAndUpdateJobStatus(context.Background(), &flinkApp, "hash")
64 changes: 37 additions & 27 deletions pkg/controller/flink/mock/mock_flink.go
@@ -22,6 +22,7 @@ type GetJobsForApplicationFunc func(ctx context.Context, application *v1beta1.Fl
type GetJobForApplicationFunc func(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (*client.FlinkJobOverview, error)
type GetCurrentDeploymentsForAppFunc func(ctx context.Context, application *v1beta1.FlinkApplication) (*common.FlinkDeployment, error)
type FindExternalizedCheckpointFunc func(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, error)
type FindExternalizedCheckpointForSavepointFunc func(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, error)
type CompareAndUpdateClusterStatusFunc func(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (bool, error)
type CompareAndUpdateJobStatusFunc func(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (bool, error)
type GetLatestClusterStatusFunc func(ctx context.Context, app *v1beta1.FlinkApplication) v1beta1.FlinkClusterStatus
@@ -36,34 +37,36 @@ type DeleteStatusPostTeardownFunc func(ctx context.Context, application *v1beta1
type GetJobToDeleteForApplicationFunc func(ctx context.Context, app *v1beta1.FlinkApplication, hash string) (*client.FlinkJobOverview, error)
type GetVersionAndJobIDForHashFunc func(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, string, error)
type GetVersionAndHashPostTeardownFunc func(ctx context.Context, application *v1beta1.FlinkApplication) (v1beta1.FlinkApplicationVersion, string)

type FlinkController struct {
CreateClusterFunc CreateClusterFunc
DeleteOldResourcesForAppFunc DeleteOldResourcesForApp
SavepointFunc SavepointFunc
ForceCancelFunc ForceCancelFunc
StartFlinkJobFunc StartFlinkJobFunc
GetSavepointStatusFunc GetSavepointStatusFunc
IsClusterReadyFunc IsClusterReadyFunc
IsServiceReadyFunc IsServiceReadyFunc
GetJobsForApplicationFunc GetJobsForApplicationFunc
GetJobForApplicationFunc GetJobForApplicationFunc
GetCurrentDeploymentsForAppFunc GetCurrentDeploymentsForAppFunc
FindExternalizedCheckpointFunc FindExternalizedCheckpointFunc
Events []corev1.Event
CompareAndUpdateClusterStatusFunc CompareAndUpdateClusterStatusFunc
CompareAndUpdateJobStatusFunc CompareAndUpdateJobStatusFunc
GetLatestClusterStatusFunc GetLatestClusterStatusFunc
GetLatestJobStatusFunc GetLatestJobStatusFunc
GetLatestJobIDFunc GetLatestJobIDFunc
UpdateLatestJobIDFunc UpdateLatestJobIDFunc
UpdateLatestJobStatusFunc UpdateLatestJobStatusFunc
UpdateLatestClusterStatusFunc UpdateLatestClusterStatusFunc
UpdateLatestVersionAndHashFunc UpdateLatestVersionAndHashFunc
DeleteResourcesForAppWithHashFunc DeleteResourcesForAppWithHashFunc
DeleteStatusPostTeardownFunc DeleteStatusPostTeardownFunc
GetJobToDeleteForApplicationFunc GetJobToDeleteForApplicationFunc
GetVersionAndJobIDForHashFunc GetVersionAndJobIDForHashFunc
GetVersionAndHashPostTeardownFunc GetVersionAndHashPostTeardownFunc
CreateClusterFunc CreateClusterFunc
DeleteOldResourcesForAppFunc DeleteOldResourcesForApp
SavepointFunc SavepointFunc
ForceCancelFunc ForceCancelFunc
StartFlinkJobFunc StartFlinkJobFunc
GetSavepointStatusFunc GetSavepointStatusFunc
IsClusterReadyFunc IsClusterReadyFunc
IsServiceReadyFunc IsServiceReadyFunc
GetJobsForApplicationFunc GetJobsForApplicationFunc
GetJobForApplicationFunc GetJobForApplicationFunc
GetCurrentDeploymentsForAppFunc GetCurrentDeploymentsForAppFunc
FindExternalizedCheckpointFunc FindExternalizedCheckpointFunc
FindExternalizedCheckpointForSavepointFunc FindExternalizedCheckpointForSavepointFunc
Events []corev1.Event
CompareAndUpdateClusterStatusFunc CompareAndUpdateClusterStatusFunc
CompareAndUpdateJobStatusFunc CompareAndUpdateJobStatusFunc
GetLatestClusterStatusFunc GetLatestClusterStatusFunc
GetLatestJobStatusFunc GetLatestJobStatusFunc
GetLatestJobIDFunc GetLatestJobIDFunc
UpdateLatestJobIDFunc UpdateLatestJobIDFunc
UpdateLatestJobStatusFunc UpdateLatestJobStatusFunc
UpdateLatestClusterStatusFunc UpdateLatestClusterStatusFunc
UpdateLatestVersionAndHashFunc UpdateLatestVersionAndHashFunc
DeleteResourcesForAppWithHashFunc DeleteResourcesForAppWithHashFunc
DeleteStatusPostTeardownFunc DeleteStatusPostTeardownFunc
GetJobToDeleteForApplicationFunc GetJobToDeleteForApplicationFunc
GetVersionAndJobIDForHashFunc GetVersionAndJobIDForHashFunc
GetVersionAndHashPostTeardownFunc GetVersionAndHashPostTeardownFunc
}

func (m *FlinkController) GetCurrentDeploymentsForApp(ctx context.Context, application *v1beta1.FlinkApplication) (*common.FlinkDeployment, error) {
@@ -151,6 +154,13 @@ func (m *FlinkController) FindExternalizedCheckpoint(ctx context.Context, applic
return "", nil
}

func (m *FlinkController) FindExternalizedCheckpointForSavepoint(ctx context.Context, application *v1beta1.FlinkApplication, hash string) (string, error) {
if m.FindExternalizedCheckpointForSavepointFunc != nil {
return m.FindExternalizedCheckpointForSavepointFunc(ctx, application, hash)
}
return "", nil
}

func (m *FlinkController) LogEvent(ctx context.Context, app *v1beta1.FlinkApplication, eventType string, reason string, message string) {
m.Events = append(m.Events, corev1.Event{
InvolvedObject: corev1.ObjectReference{
29 changes: 27 additions & 2 deletions pkg/controller/flinkapplication/flink_state_machine.go
@@ -233,6 +233,7 @@ func (s *FlinkStateMachine) IsTimeToHandlePhase(application *v1beta1.FlinkApplic

// In this state we create a new cluster, either due to an entirely new FlinkApplication or due to an update.
func (s *FlinkStateMachine) handleNewOrUpdating(ctx context.Context, application *v1beta1.FlinkApplication) (bool, error) {

// TODO: add up-front validation on the FlinkApplication resource
if rollback, reason := s.shouldRollback(ctx, application); rollback {
// we've failed to make progress; move to deploy failed
@@ -305,16 +306,34 @@ func (s *FlinkStateMachine) handleClusterStarting(ctx context.Context, applicati

logger.Infof(ctx, "Flink cluster has started successfully")
// TODO: in single mode move to submitting job
if application.Spec.SavepointDisabled && !v1beta1.IsBlueGreenDeploymentMode(application.Status.DeploymentMode) {
if application.Spec.SavepointingDisabled() && !v1beta1.IsBlueGreenDeploymentMode(application.Status.DeploymentMode) {
s.updateApplicationPhase(application, v1beta1.FlinkApplicationCancelling)
} else if application.Spec.SavepointDisabled && v1beta1.IsBlueGreenDeploymentMode(application.Status.DeploymentMode) {
} else if application.Spec.SavepointingDisabled() && v1beta1.IsBlueGreenDeploymentMode(application.Status.DeploymentMode) {
// Blue Green deployment and no savepoint required implies, we directly transition to submitting job
s.updateApplicationPhase(application, v1beta1.FlinkApplicationSubmittingJob)
} else {
s.updateApplicationPhase(application, v1beta1.FlinkApplicationSavepointing)
}
return statusChanged, nil
}
func (s *FlinkStateMachine) handleApplicationSavepointingWithCheckpoint(ctx context.Context, application *v1beta1.FlinkApplication) (bool, error) {
Contributor:

I think there's scope to abstract out some common code between this method and handleApplicationRecovering()?
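For discussion, a rough sketch of a shared helper both paths could call (assuming handleApplicationRecovering follows the same find-checkpoint / force-cancel / resubmit pattern):

```go
// Sketch only: cancel the running job and fall back to an externalized
// checkpoint, then move on to submitting the new job.
func (s *FlinkStateMachine) cancelAndRestoreFromCheckpoint(ctx context.Context, application *v1beta1.FlinkApplication, checkpointPath string) (bool, error) {
	jobID := s.flinkController.GetLatestJobID(ctx, application)
	if err := s.flinkController.ForceCancel(ctx, application, application.Status.DeployHash, jobID); err != nil {
		return statusUnchanged, err
	}
	s.flinkController.LogEvent(ctx, application, corev1.EventTypeNormal, "CancellingJob",
		fmt.Sprintf("Cancelling job %s with a final checkpoint", jobID))
	application.Status.JobStatus.JobID = ""
	application.Status.SavepointPath = checkpointPath
	s.updateApplicationPhase(application, v1beta1.FlinkApplicationSubmittingJob)
	return statusChanged, nil
}
```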

Contributor:

Also, minor suggestion: could we rename this method to not include the term Savepoint (since it's not really using a savepoint)?

checkpointPath, err := s.flinkController.FindExternalizedCheckpointForSavepoint(ctx, application, application.Status.DeployHash)
if err != nil {
return statusUnchanged, err
}

jobID := s.flinkController.GetLatestJobID(ctx, application)
if err := s.flinkController.ForceCancel(ctx, application, application.Status.DeployHash, jobID); err != nil {
return statusUnchanged, err
}

s.flinkController.LogEvent(ctx, application, corev1.EventTypeNormal, "CancellingJob",
fmt.Sprintf("Cancelling job job %s with a final checkpoint", jobID))
application.Status.JobStatus.JobID = ""
application.Status.SavepointPath = checkpointPath
s.updateApplicationPhase(application, v1beta1.FlinkApplicationSubmittingJob)
return statusChanged, nil
}

func (s *FlinkStateMachine) initializeAppStatusIfEmpty(application *v1beta1.FlinkApplication) {
if v1beta1.IsBlueGreenDeploymentMode(application.Status.DeploymentMode) {
@@ -331,6 +350,7 @@ func (s *FlinkStateMachine) initializeAppStatusIfEmpty(application *v1beta1.Flin
func (s *FlinkStateMachine) handleApplicationSavepointing(ctx context.Context, application *v1beta1.FlinkApplication) (bool, error) {
// we've already savepointed (or this is our first deploy), continue on
if application.Status.SavepointPath != "" || application.Status.DeployHash == "" {
logger.Debugf(ctx, "Using SavepointPath: %s", application.Status.SavepointPath)
s.updateApplicationPhase(application, v1beta1.FlinkApplicationSubmittingJob)
return statusChanged, nil
}
@@ -342,6 +362,11 @@ func (s *FlinkStateMachine) handleApplicationSavepointing(ctx context.Context, a
s.updateApplicationPhase(application, v1beta1.FlinkApplicationRecovering)
return statusChanged, nil
}
// use checkpoints in place of savepoints
if application.Spec.UpdateMode == v1beta1.UpdateModeCheckpoint {
return s.handleApplicationSavepointingWithCheckpoint(ctx, application)
Contributor:

Do you think it would make sense to make the restoration from a checkpoint an explicit state in the state machine?

Author:

Hmm, my gut was that it doesn't warrant it, but I suppose that since there's a max age and a window where the controller is polling to find the next, fresher checkpoint, the controller might do well to expose that state and have a clearer exit condition. I can prototype it if you'd like.

Contributor:

Yeah I agree that it would be clearer to expose it as a state. Happy to help review the prototype!
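A rough sketch of what an explicit phase might look like (names are hypothetical; assumes the phase type in v1beta1 is FlinkApplicationPhase):

```go
// Hypothetical phase, which would live alongside the other phases in v1beta1:
//   FlinkApplicationRestoringFromCheckpoint FlinkApplicationPhase = "RestoringFromCheckpoint"

// handleRestoringFromCheckpoint would poll for a fresh-enough externalized
// checkpoint and give the phase a clear exit condition: either a usable
// checkpoint is found or the max checkpoint age is exceeded.
func (s *FlinkStateMachine) handleRestoringFromCheckpoint(ctx context.Context, application *v1beta1.FlinkApplication) (bool, error) {
	checkpointPath, err := s.flinkController.FindExternalizedCheckpointForSavepoint(ctx, application, application.Status.DeployHash)
	if err != nil {
		return statusUnchanged, err
	}
	if checkpointPath == "" {
		// No acceptable checkpoint yet; stay in this phase and retry.
		return statusUnchanged, nil
	}
	application.Status.SavepointPath = checkpointPath
	s.updateApplicationPhase(application, v1beta1.FlinkApplicationSubmittingJob)
	return statusChanged, nil
}
```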

}

cancelFlag := getCancelFlag(application)
// we haven't started savepointing yet; do so now
// TODO: figure out the idempotence of this