Skip to content

Commit cffd894

Browse files
author
Meeth Gala
committed
address PR comments and add advance and clean up dag tasks
1 parent 0026724 commit cffd894

22 files changed

+649
-264
lines changed

gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/AdvanceDagProc.java

Lines changed: 188 additions & 53 deletions
Large diffs are not rendered by default.
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.gobblin.service.modules.orchestration;
19+
20+
import java.io.IOException;
21+
22+
import org.apache.gobblin.annotation.Alpha;
23+
24+
25+
/**
26+
* An implementation of {@link DagTask} that is responsible for advancing the dag to the next node based
27+
* on its current flow and job status. It is added to the {@link DagTaskStream} by the
28+
* {@link org.apache.gobblin.service.monitoring.KafkaJobStatusMonitor} after it consumes the appropriate
29+
* {@link org.apache.gobblin.metrics.GobblinTrackingEvent} for the {@link org.apache.gobblin.service.modules.flowgraph.Dag}
30+
*/
31+
32+
@Alpha
33+
public class AdvanceDagTask extends DagTask {
34+
35+
protected DagManager.DagId advanceDagId;
36+
37+
@Override
38+
void initialize(Object state, long triggerTimeStamp) {
39+
40+
}
41+
42+
@Override
43+
AdvanceDagProc host(DagTaskVisitor visitor) throws IOException, InstantiationException, IllegalAccessException {
44+
return (AdvanceDagProc) visitor.meet(this);
45+
}
46+
}
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.gobblin.service.modules.orchestration;
19+
20+
import java.io.IOException;
21+
import java.util.LinkedList;
22+
import java.util.concurrent.ExecutionException;
23+
import java.util.concurrent.TimeUnit;
24+
25+
import com.google.common.base.Optional;
26+
27+
import lombok.extern.slf4j.Slf4j;
28+
29+
import org.apache.gobblin.annotation.Alpha;
30+
import org.apache.gobblin.metrics.MetricContext;
31+
import org.apache.gobblin.metrics.event.EventSubmitter;
32+
import org.apache.gobblin.metrics.event.TimingEvent;
33+
import org.apache.gobblin.runtime.api.MultiActiveLeaseArbiter;
34+
import org.apache.gobblin.service.FlowId;
35+
import org.apache.gobblin.service.modules.flowgraph.Dag;
36+
import org.apache.gobblin.service.modules.orchestration.exception.MaybeRetryableException;
37+
import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
38+
import org.apache.gobblin.service.monitoring.FlowStatusGenerator;
39+
import org.apache.gobblin.service.monitoring.JobStatus;
40+
41+
42+
/**
43+
* An implementation of {@link DagProc} that is responsible for clean up {@link Dag} that has been completed
44+
* or has reached an end state likewise: FAILED, COMPLETE or CANCELED
45+
*
46+
*/
47+
@Slf4j
48+
@Alpha
49+
public class CleanUpDagProc extends DagProc {
50+
51+
private MultiActiveLeaseArbiter multiActiveLeaseArbiter;
52+
53+
private DagManagementStateStore dagManagementStateStore;
54+
55+
private MetricContext metricContext;
56+
57+
private Optional<EventSubmitter> eventSubmitter;
58+
59+
private DagManagerMetrics dagManagerMetrics;
60+
61+
private DagStateStore dagStateStore;
62+
63+
private DagStateStore failedDagStateStore;
64+
65+
private DagTaskStream dagTaskStream;
66+
67+
private static final long DAG_FLOW_STATUS_TOLERANCE_TIME_MILLIS = TimeUnit.MINUTES.toMillis(5);
68+
69+
//TODO: instantiate an object of this class
70+
71+
@Override
72+
protected Object initialize() throws MaybeRetryableException, IOException {
73+
String dagIdToClean = ""; //TODO: implement this dagID
74+
if(!this.dagManagementStateStore.hasRunningJobs(dagIdToClean)) {
75+
Dag<JobExecutionPlan> dag = this.dagManagementStateStore.getDagIdToDags().get(dagIdToClean);
76+
return dag;
77+
}
78+
return null;
79+
}
80+
81+
@Override
82+
protected Object act(Object state) throws ExecutionException, InterruptedException, IOException {
83+
Dag<JobExecutionPlan> dag = (Dag<JobExecutionPlan>) state;
84+
DagManager.DagId dagId = DagManagerUtils.generateDagId(dag);
85+
LinkedList<Dag.DagNode<JobExecutionPlan>> dagNodeList = this.dagManagementStateStore.getDagToJobs().get(dagId);
86+
while (!dagNodeList.isEmpty()) {
87+
Dag.DagNode<JobExecutionPlan> dagNode = dagNodeList.poll();
88+
this.dagManagementStateStore.deleteJobState(dagId.toString(), dagNode);
89+
}
90+
if (dag.getFlowEvent() == null) {
91+
// If the dag flow event is not set, then it is successful
92+
dag.setFlowEvent(TimingEvent.FlowTimings.FLOW_SUCCEEDED);
93+
} else {
94+
addFailedDag(dagId.toString(), dag);
95+
}
96+
JobStatus flowStatus = dagTaskStream.pollFlowStatus(dag);
97+
if (flowStatus != null && FlowStatusGenerator.FINISHED_STATUSES.contains(flowStatus.getEventName())) {
98+
FlowId flowId = DagManagerUtils.getFlowId(dag);
99+
100+
switch (dag.getFlowEvent()) {
101+
case TimingEvent.FlowTimings.FLOW_SUCCEEDED:
102+
this.dagManagerMetrics.emitFlowSuccessMetrics(flowId);
103+
this.dagManagerMetrics.conditionallyMarkFlowAsState(flowId, DagManager.FlowState.SUCCESSFUL);
104+
break;
105+
case TimingEvent.FlowTimings.FLOW_FAILED:
106+
this.dagManagerMetrics.emitFlowFailedMetrics(flowId);
107+
this.dagManagerMetrics.conditionallyMarkFlowAsState(flowId, DagManager.FlowState.FAILED);
108+
break;
109+
case TimingEvent.FlowTimings.FLOW_CANCELLED:
110+
this.dagManagerMetrics.emitFlowSlaExceededMetrics(flowId);
111+
this.dagManagerMetrics.conditionallyMarkFlowAsState(flowId, DagManager.FlowState.FAILED);
112+
break;
113+
default:
114+
log.warn("Unexpected flow event {} for dag {}", dag.getFlowEvent(), dagId);
115+
}
116+
log.info("Dag {} has finished with status {}; Cleaning up dag from the state store.", dagId, dag.getFlowEvent());
117+
cleanUpDag(dagId.toString());
118+
}
119+
return null;
120+
}
121+
122+
@Override
123+
protected void sendNotification(Object result) throws MaybeRetryableException {
124+
long cleanUpProcessingTime = System.currentTimeMillis();
125+
Dag<JobExecutionPlan> dag = (Dag<JobExecutionPlan>) result;
126+
String dagId = DagManagerUtils.generateDagId(dag).toString();
127+
DagManagerUtils.emitFlowEvent(this.eventSubmitter, this.dagManagementStateStore.getDagIdToDags().get(dagId), dag.getFlowEvent());
128+
dag.setEventEmittedTimeMillis(cleanUpProcessingTime);
129+
}
130+
131+
/**
132+
* Add a dag to failed dag state store
133+
*/
134+
private synchronized void addFailedDag(String dagId, Dag<JobExecutionPlan> dag) {
135+
try {
136+
log.info("Adding dag " + dagId + " to failed dag state store");
137+
this.failedDagStateStore.writeCheckpoint(this.dagManagementStateStore.getDagIdToDags().get(dagId));
138+
} catch (IOException e) {
139+
log.error("Failed to add dag " + dagId + " to failed dag state store", e);
140+
}
141+
this.dagManagementStateStore.getFailedDagIds().add(dagId);
142+
}
143+
144+
/**
145+
* Note that removal of a {@link Dag} entry in {@link #dags} needs to be happen after {@link #cleanUp()}
146+
* since the real {@link Dag} object is required for {@link #cleanUp()},
147+
* and cleaning of all relevant states need to be atomic
148+
* @param dagId
149+
*/
150+
private synchronized void cleanUpDag(String dagId) {
151+
log.info("Cleaning up dagId {}", dagId);
152+
// clears flow event after cancelled job to allow resume event status to be set
153+
this.dagManagementStateStore.getDagIdToDags().get(dagId).setFlowEvent(null);
154+
try {
155+
this.dagStateStore.cleanUp(this.dagManagementStateStore.getDagIdToDags().get(dagId));
156+
} catch (IOException ioe) {
157+
log.error(String.format("Failed to clean %s from backStore due to:", dagId), ioe);
158+
}
159+
this.dagManagementStateStore.getDagIdToDags().remove(dagId);
160+
this.dagManagementStateStore.getDagToJobs().remove(dagId);
161+
}
162+
}

gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/WorkInProgress.java renamed to gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/CleanUpDagTask.java

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,32 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17+
1718
package org.apache.gobblin.service.modules.orchestration;
1819

19-
import java.lang.annotation.ElementType;
20-
import java.lang.annotation.Retention;
21-
import java.lang.annotation.RetentionPolicy;
22-
import java.lang.annotation.Target;
20+
import org.apache.gobblin.annotation.Alpha;
21+
import org.apache.gobblin.service.modules.flowgraph.Dag;
2322

2423

2524
/**
26-
* Custom Annotation for classes that are under development.
27-
* It will make the classes available only during compilation phase, and not in the build.
25+
* An implementation of {@link DagTask} that is responsible for cleaning up a {@link Dag} that has completed
26+
* or has reached an end state such as FAILED, COMPLETE or CANCELED. It is added to the {@link DagTaskStream}
27+
* by the {@link org.apache.gobblin.service.monitoring.KafkaJobStatusMonitor} after it consumes the appropriate
28+
* {@link org.apache.gobblin.metrics.GobblinTrackingEvent}.
29+
*
2830
*/
29-
@Retention(RetentionPolicy.SOURCE)
30-
@Target(ElementType.TYPE)
31-
public @interface WorkInProgress {
32-
String value() default "This class/interface is a work in progress.";
33-
}
31+
@Alpha
32+
public class CleanUpDagTask extends DagTask {
33+
34+
protected DagManager.DagId cleanUpDagId;
3435

36+
@Override
37+
void initialize(Object state, long triggerTimeStamp) {
38+
39+
}
40+
41+
@Override
42+
CleanUpDagProc host(DagTaskVisitor visitor) {
43+
return (CleanUpDagProc) visitor.meet(this);
44+
}
45+
}

gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/DagManagement.java

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import java.io.IOException;
2121
import java.util.concurrent.ExecutionException;
2222

23+
import org.apache.gobblin.annotation.Alpha;
2324
import org.apache.gobblin.service.modules.flowgraph.Dag;
2425
import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
2526
import org.apache.gobblin.service.monitoring.JobStatus;
@@ -30,23 +31,37 @@
3031
* and flow completion deadlines
3132
*
3233
*/
33-
@WorkInProgress
34+
@Alpha
3435
public interface DagManagement {
36+
3537
/**
3638
* Currently, it is handling just the launch of a {@link Dag} request via REST client for adhoc flows
37-
* @param launchDagTask
39+
* @param flowGroup
40+
* @param flowName
41+
* @param triggerTimeStamp
3842
*/
39-
void launchFlow(LaunchDagTask launchDagTask);
43+
void launchFlow(String flowGroup, String flowName, long triggerTimeStamp);
44+
4045
/**
4146
* Currently, it is handling just the resume of a {@link Dag} request via REST client for adhoc flows
42-
* @param resumeDagTask
47+
* @param flowGroup
48+
* @param flowName
49+
* @param flowExecutionId
50+
* @param triggerTimeStamp
51+
* @throws IOException
4352
*/
44-
void resumeFlow(ResumeDagTask resumeDagTask) throws IOException;
53+
void resumeFlow(String flowGroup, String flowName, String flowExecutionId, long triggerTimeStamp)
54+
throws IOException, InterruptedException;
55+
4556
/**
4657
* Currently, it is handling just the kill/cancel of a {@link Dag} request via REST client for adhoc flows
47-
* @param killDagTask
58+
* @param flowGroup
59+
* @param flowName
60+
* @param flowExecutionId
61+
* @param triggerTimeStamp
4862
*/
49-
void killFlow(KillDagTask killDagTask);
63+
void killFlow(String flowGroup, String flowName, String flowExecutionId, long triggerTimeStamp)
64+
throws InterruptedException;
5065

5166
boolean enforceFlowCompletionDeadline(Dag.DagNode<JobExecutionPlan> node) throws ExecutionException, InterruptedException;
5267

gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/DagManagementStateStore.java

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@
2222
import java.util.LinkedList;
2323
import java.util.List;
2424
import java.util.Map;
25+
import java.util.Optional;
2526
import java.util.Set;
2627

27-
import com.google.common.base.Optional;
2828
import com.google.common.collect.Lists;
2929

3030
import lombok.Getter;
31+
import lombok.Synchronized;
3132

33+
import org.apache.gobblin.annotation.Alpha;
3234
import org.apache.gobblin.runtime.api.DagActionStore;
3335
import org.apache.gobblin.service.modules.flowgraph.Dag;
3436
import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
@@ -42,27 +44,27 @@
4244
* Going forward, each of these in-memory references will be read/write from MySQL store.
4345
* Thus, the {@link DagManager} would then be stateless and operate independently.
4446
*/
45-
@Getter
46-
@WorkInProgress
47+
@Getter(onMethod_={@Synchronized})
48+
@Alpha
4749
public class DagManagementStateStore {
4850
private final Map<Dag.DagNode<JobExecutionPlan>, Dag<JobExecutionPlan>> jobToDag = new HashMap<>();
49-
private final Map<String, Dag<JobExecutionPlan>> dags = new HashMap<>();
51+
private final Map<String, Dag<JobExecutionPlan>> dagIdToDags = new HashMap<>();
5052
private final Set<String> failedDagIds = new HashSet<>();
51-
private final Map<String, Dag<JobExecutionPlan>> resumingDags = new HashMap<>();
53+
private final Map<String, Dag<JobExecutionPlan>> dagIdToResumingDags = new HashMap<>();
5254
// dagToJobs holds a map of dagId to running jobs of that dag
5355
final Map<String, LinkedList<Dag.DagNode<JobExecutionPlan>>> dagToJobs = new HashMap<>();
5456
final Map<String, Long> dagToSLA = new HashMap<>();
5557
private final Set<String> dagIdstoClean = new HashSet<>();
5658
private Optional<DagActionStore> dagActionStore;
5759

58-
protected void deleteJobState(String dagId, Dag.DagNode<JobExecutionPlan> dagNode) {
60+
protected synchronized void deleteJobState(String dagId, Dag.DagNode<JobExecutionPlan> dagNode) {
5961
this.jobToDag.remove(dagNode);
6062
this.dagToJobs.get(dagId).remove(dagNode);
6163
this.dagToSLA.remove(dagId);
6264
}
6365

64-
protected void addJobState(String dagId, Dag.DagNode<JobExecutionPlan> dagNode) {
65-
Dag<JobExecutionPlan> dag = this.dags.get(dagId);
66+
protected synchronized void addJobState(String dagId, Dag.DagNode<JobExecutionPlan> dagNode) {
67+
Dag<JobExecutionPlan> dag = this.dagIdToDags.get(dagId);
6668
this.jobToDag.put(dagNode, dag);
6769
if (this.dagToJobs.containsKey(dagId)) {
6870
this.dagToJobs.get(dagId).add(dagNode);
@@ -73,11 +75,11 @@ protected void addJobState(String dagId, Dag.DagNode<JobExecutionPlan> dagNode)
7375
}
7476
}
7577

76-
protected boolean hasRunningJobs(String dagId) {
78+
protected synchronized boolean hasRunningJobs(String dagId) {
7779
List<Dag.DagNode<JobExecutionPlan>> dagNodes = this.dagToJobs.get(dagId);
7880
return dagNodes != null && !dagNodes.isEmpty();
7981
}
80-
protected void removeDagActionFromStore(DagManager.DagId dagId, DagActionStore.FlowActionType flowActionType) throws IOException {
82+
protected synchronized void removeDagActionFromStore(DagManager.DagId dagId, DagActionStore.FlowActionType flowActionType) throws IOException {
8183
if (this.dagActionStore.isPresent()) {
8284
this.dagActionStore.get().deleteDagAction(
8385
new DagActionStore.DagAction(dagId.flowGroup, dagId.flowName, dagId.flowExecutionId, flowActionType));

0 commit comments

Comments
 (0)