Skip to content

Commit a2c6314

Browse files
committed
DRILL-4706: Fragment planning causes Drillbits to read remote chunks when local copies are available.
New fragment placement algorithm based on locality of data.
1 parent 4b1902c commit a2c6314

File tree

14 files changed

+965
-8
lines changed

14 files changed

+965
-8
lines changed

contrib/storage-kudu/src/main/java/org/apache/drill/exec/store/kudu/KuduGroupScan.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,13 @@ public EndpointByteMap getByteMap() {
145145
public int compareTo(CompleteWork o) {
146146
return 0;
147147
}
148+
149+
@Override
150+
public DrillbitEndpoint getPreferredEndpoint() {
151+
// TODO: we should populate this if we want to do locality based
152+
// scan (localAffinity) for kudu.
153+
return null;
154+
}
148155
}
149156

150157
/**

exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,8 @@ public interface ExecConstants {
169169
String PARQUET_PAGEREADER_USE_FADVISE = "store.parquet.reader.pagereader.usefadvise";
170170
OptionValidator PARQUET_PAGEREADER_USE_FADVISE_VALIDATOR = new BooleanValidator(PARQUET_PAGEREADER_USE_FADVISE, false);
171171

172+
// Option name; presumably toggles the locality based (local affinity) fragment placement for Parquet scans — confirm at the usage site.
String PARQUET_LOCAL_AFFINITY = "store.parquet.use_local_affinity";
173+
// Validator for the option above. NOTE(review): the second argument looks like the default value (false) — confirm against BooleanValidator.
OptionValidator PARQUET_LOCAL_AFFINITY_IMPLEMENTATION_VALIDATOR = new BooleanValidator(PARQUET_LOCAL_AFFINITY, false);
172174
OptionValidator COMPILE_SCALAR_REPLACEMENT = new BooleanValidator("exec.compile.scalar_replacement", false);
173175

174176
String JSON_ALL_TEXT_MODE = "store.json.all_text_mode";

exec/java-exec/src/main/java/org/apache/drill/exec/physical/EndpointAffinity.java

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ public class EndpointAffinity {
3131
private final DrillbitEndpoint endpoint;
3232
private double affinity = 0.0d;
3333

34+
// work assignments for this endpoint
35+
private int numLocalWorkUnits;
36+
3437
// Requires including this endpoint at least once? Default is not required.
3538
private boolean mandatory;
3639

@@ -68,13 +71,31 @@ public EndpointAffinity(DrillbitEndpoint endpoint, double affinity) {
6871
* @param mandatory Whether this endpoint requires at least one mandatory assignment.
6972
* @param maxWidth Maximum allowed assignments for this endpoint.
7073
*/
74+
public EndpointAffinity(final DrillbitEndpoint endpoint, final double affinity, final boolean mandatory, final int maxWidth) {
  // Delegate to the five-argument constructor so the validation and field setup live in one place.
  // Passing 0 matches the default value the int field numLocalWorkUnits would otherwise have.
  this(endpoint, affinity, mandatory, maxWidth, 0);
}
81+
82+
/**
 * Creates EndpointAffinity instance for given DrillbitEndpoint, affinity,
 * mandatory assignment requirement flag, maximum width and numLocalWorkUnits.
 *
 * @param endpoint Drillbit endpoint
 * @param affinity Initial affinity value
 * @param mandatory Whether this endpoint requires at least one mandatory assignment.
 * @param maxWidth Maximum allowed assignments for this endpoint; must be at least one.
 * @param numLocalWorkUnits Number of local work units.
 */
public EndpointAffinity(final DrillbitEndpoint endpoint, final double affinity, final boolean mandatory,
    final int maxWidth, final int numLocalWorkUnits) {
  // Reject a non-positive width up front, before any field is assigned.
  Preconditions.checkArgument(maxWidth >= 1, "MaxWidth for given endpoint should be at least one.");
  this.endpoint = endpoint;
  this.affinity = affinity;
  this.mandatory = mandatory;
  this.maxWidth = maxWidth;
  this.numLocalWorkUnits = numLocalWorkUnits;
}
79100

80101
/**
@@ -142,6 +163,14 @@ public void setMaxWidth(final int maxWidth) {
142163
this.maxWidth = Math.min(this.maxWidth, maxWidth);
143164
}
144165

166+
public void setNumLocalWorkUnits(final int localWorkUnits) {
167+
numLocalWorkUnits = localWorkUnits;
168+
}
169+
170+
public int getNumLocalWorkUnits() {
171+
return numLocalWorkUnits;
172+
}
173+
145174
@Override
146175
public int hashCode() {
147176
final int prime = 31;

exec/java-exec/src/main/java/org/apache/drill/exec/planner/fragment/DistributionAffinity.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@ public enum DistributionAffinity {
2727
*/
2828
NONE(SoftAffinityFragmentParallelizer.INSTANCE),
2929

30+
/**
31+
* Local distribution affinity to one or more endpoints.
32+
* Fragments will be scheduled on nodes based on locality of data.
33+
*/
34+
LOCAL(LocalAffinityFragmentParallelizer.INSTANCE),
35+
3036
/**
3137
* Operator has soft distribution affinity to one or more endpoints. Operator performs better when fragments are
3238
* assigned to the endpoints with affinity, but not a mandatory requirement.
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
* <p/>
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
* <p/>
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.drill.exec.planner.fragment;
20+
21+
import com.google.common.collect.Lists;
22+
import com.google.common.collect.Ordering;
23+
import org.apache.drill.exec.physical.EndpointAffinity;
24+
import org.apache.drill.exec.physical.PhysicalOperatorSetupException;
25+
import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
26+
27+
import java.util.Map;
28+
import java.util.List;
29+
import java.util.Collection;
30+
import java.util.HashMap;
31+
import java.util.Collections;
32+
import org.slf4j.Logger;
33+
34+
/**
35+
* Implementation of {@link FragmentParallelizer} where fragment has zero or more endpoints.
36+
* Fragment placement is done preferring data locality.
37+
*/
38+
public class LocalAffinityFragmentParallelizer implements FragmentParallelizer {
39+
private static final Logger logger = org.slf4j.LoggerFactory.getLogger(LocalAffinityFragmentParallelizer.class);
40+
public static final LocalAffinityFragmentParallelizer INSTANCE = new LocalAffinityFragmentParallelizer();
41+
private static String EOL = System.getProperty("line.separator");
42+
43+
// Sort a list of map entries in decreasing order by values.
44+
Ordering<Map.Entry<DrillbitEndpoint, Integer>> sortByValues = new Ordering<Map.Entry<DrillbitEndpoint, Integer>>() {
45+
@Override
46+
public int compare(Map.Entry<DrillbitEndpoint, Integer> left, Map.Entry<DrillbitEndpoint, Integer> right) {
47+
return right.getValue().compareTo(left.getValue());
48+
}
49+
};
50+
51+
@Override
52+
public void parallelizeFragment(final Wrapper fragmentWrapper, final ParallelizationParameters parameters,
53+
final Collection<DrillbitEndpoint> activeEndpoints) throws PhysicalOperatorSetupException {
54+
final Stats stats = fragmentWrapper.getStats();
55+
final ParallelizationInfo parallelizationInfo = stats.getParallelizationInfo();
56+
logger.trace("LocalAffinity Fragment Parallelizer: " + "MaxCost: {}, " + "SliceTarget: {}, " +
57+
"Parallelization MaxWidth: {}," + EOL + "Parallelization MinWidth: {}," + "MaxGlobalWidth: {}," +
58+
"MaxWidthPerNode {}," + EOL + "ActiveEndPoints: {}",
59+
stats.getMaxCost(), parameters.getSliceTarget(), parallelizationInfo.getMaxWidth(),
60+
parameters.getMaxGlobalWidth(), parameters.getMaxWidthPerNode(), activeEndpoints);
61+
62+
final Map<DrillbitEndpoint, EndpointAffinity> endpointAffinityMap =
63+
fragmentWrapper.getStats().getParallelizationInfo().getEndpointAffinityMap();
64+
int totalLocalWorkUnits = 0;
65+
Map<DrillbitEndpoint, Integer> localEndpointPool = new HashMap<>(); // Nodes with data locality.
66+
67+
// Get the total number of work units and list of endPoints with data locality to schedule fragments on
68+
for (Map.Entry<DrillbitEndpoint, EndpointAffinity> epAff : endpointAffinityMap.entrySet()) {
69+
if (epAff.getValue().getNumLocalWorkUnits() > 0) {
70+
totalLocalWorkUnits += epAff.getValue().getNumLocalWorkUnits();
71+
localEndpointPool.put(epAff.getKey(), epAff.getValue().getNumLocalWorkUnits());
72+
}
73+
}
74+
75+
// Find the parallelization width of fragment
76+
// 1. Find the parallelization based on cost. Use max cost of all operators in this fragment;
77+
int width = (int) Math.ceil(stats.getMaxCost() / parameters.getSliceTarget());
78+
79+
// 2. Cap the parallelization width by fragment level width limit and system level per query width limit
80+
width = Math.min(width, Math.min(parallelizationInfo.getMaxWidth(), parameters.getMaxGlobalWidth()));
81+
82+
// 3. Cap the parallelization width by system level per node width limit
83+
width = Math.min(width, parameters.getMaxWidthPerNode() * activeEndpoints.size());
84+
85+
// 4. Make sure width is at least the min width enforced by operators
86+
width = Math.max(parallelizationInfo.getMinWidth(), width);
87+
88+
// 5. Make sure width is at most the max width enforced by operators
89+
width = Math.min(parallelizationInfo.getMaxWidth(), width);
90+
91+
// 6: Finally make sure the width is at least one
92+
width = Math.max(1, width);
93+
94+
List<DrillbitEndpoint> assignedEndPoints = Lists.newArrayList();
95+
int totalAssigned = 0;
96+
97+
// Sort the endpointPool based on numLocalWorkUnits. This sorting is done because we are doing
98+
// round robin allocation and we stop when we reach the width. We want to allocate
99+
// on endpoints which have higher numLocalWorkUnits first.
100+
List<Map.Entry<DrillbitEndpoint, Integer>> sortedEndpointPool = Lists.newArrayList(localEndpointPool.entrySet());
101+
Collections.sort(sortedEndpointPool, sortByValues);
102+
103+
// Keep track of number of fragments allocated to each endpoint.
104+
Map<DrillbitEndpoint, Integer> endpointAssignments = new HashMap<>();
105+
106+
// Keep track of how many more to assign to each endpoint.
107+
Map<DrillbitEndpoint, Integer> remainingEndpointAssignments = new HashMap<>();
108+
109+
// localWidth is the width that we can allocate up to if we allocate only on nodes with locality.
110+
int localWidth = Math.min(width, parameters.getMaxWidthPerNode() * localEndpointPool.size());
111+
112+
logger.trace("LocalAffinity Fragment Parallelizer: " + "width: {}, " + "totalLocalworkUnits: {}, " +
113+
"localWidth: {}," + EOL + "localEndpointPool: {}",
114+
width, totalLocalWorkUnits, localWidth, localEndpointPool);
115+
116+
// Calculate the target allocation for each endPoint with data locality based on work it has to do
117+
// Assign one fragment (minimum) to these endPoints.
118+
for (DrillbitEndpoint ep : localEndpointPool.keySet()) {
119+
final int numWorkUnits = endpointAffinityMap.get(ep).getNumLocalWorkUnits();
120+
final int targetAllocation = Math.min(numWorkUnits,
121+
(int) Math.ceil(localWidth * ((double)numWorkUnits/totalLocalWorkUnits)));
122+
assignedEndPoints.add(ep);
123+
totalAssigned++;
124+
endpointAssignments.put(ep, 1);
125+
remainingEndpointAssignments.put(ep, targetAllocation - 1);
126+
if (totalAssigned == localWidth) { // do not allocate more than local width
127+
break;
128+
}
129+
}
130+
131+
// Keep allocating from endpoints in a round robin fashion up to min(targetAllocation, maxwidthPerNode)
132+
// for each endpoint with data locality and upto localWidth all together.
133+
while(totalAssigned < localWidth) {
134+
int assignedThisRound = 0;
135+
for (Map.Entry<DrillbitEndpoint, Integer> epEntry : sortedEndpointPool) {
136+
DrillbitEndpoint ep = epEntry.getKey();
137+
final int remainingAssignments = remainingEndpointAssignments.get(ep);
138+
final int currentAssignments = endpointAssignments.get(ep);
139+
if (remainingAssignments > 0 && currentAssignments < parameters.getMaxWidthPerNode()) {
140+
assignedEndPoints.add(ep);
141+
remainingEndpointAssignments.put(ep, remainingAssignments - 1);
142+
totalAssigned++;
143+
assignedThisRound++;
144+
endpointAssignments.put(ep, currentAssignments + 1);
145+
}
146+
if (totalAssigned == localWidth) {
147+
break;
148+
}
149+
}
150+
if (assignedThisRound == 0) {
151+
break;
152+
}
153+
}
154+
// At this point, we have taken care of allocating fragments for totalLocalWorkUnits, i.e. workUnits which
155+
// have data locality information.
156+
// For the workUnits which do not have data locality information (For the case where drillbits are not running
157+
// on endPoints which have data and local filesystem), allocate them from the active endpoint pool.
158+
// If we have already scheduled parallelizationInfo.getMaxWidth() fragments, do not schedule any more.
159+
// Else, figure out fragments to schedule (how many and where ?) on active end points for unAssignedWorkUnits.
160+
// We can assign max of upto width.
161+
int unAssigned = parallelizationInfo.getMaxWidth() > totalLocalWorkUnits ?
162+
(parallelizationInfo.getMaxWidth() - totalLocalWorkUnits) : 0;
163+
while (totalAssigned < width && unAssigned > 0) {
164+
for (DrillbitEndpoint ep : activeEndpoints) {
165+
if (endpointAssignments.containsKey(ep) &&
166+
endpointAssignments.get(ep) >= parameters.getMaxWidthPerNode()) {
167+
continue;
168+
}
169+
assignedEndPoints.add(ep);
170+
totalAssigned++;
171+
unAssigned--;
172+
if (endpointAssignments.containsKey(ep)) {
173+
endpointAssignments.put(ep, endpointAssignments.get(ep) + 1);
174+
} else {
175+
endpointAssignments.put(ep, 1);
176+
}
177+
if (unAssigned == 0 || totalAssigned == width) {
178+
break;
179+
}
180+
}
181+
}
182+
183+
logger.trace("LocalAffinity Fragment Parallelizer: " + "Total Assigned: {}" + EOL +
184+
"Endpoint Assignments: {}", totalAssigned, endpointAssignments);
185+
186+
fragmentWrapper.setWidth(assignedEndPoints.size());
187+
fragmentWrapper.assignEndpoints(assignedEndPoints);
188+
}
189+
}

exec/java-exec/src/main/java/org/apache/drill/exec/planner/fragment/ParallelizationInfo.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ public static ParallelizationInfo create(int minWidth, int maxWidth) {
5454
public static ParallelizationInfo create(int minWidth, int maxWidth, List<EndpointAffinity> endpointAffinities) {
5555
Map<DrillbitEndpoint, EndpointAffinity> affinityMap = Maps.newHashMap();
5656

57-
for(EndpointAffinity epAffinity : endpointAffinities) {
57+
for (EndpointAffinity epAffinity : endpointAffinities) {
5858
affinityMap.put(epAffinity.getEndpoint(), epAffinity);
5959
}
6060

exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ public class SystemOptionManager extends BaseOptionManager implements AutoClosea
108108
ExecConstants.PARQUET_PAGEREADER_BUFFER_SIZE_VALIDATOR,
109109
ExecConstants.PARQUET_PAGEREADER_USE_FADVISE_VALIDATOR,
110110
ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP_VALIDATOR,
111+
ExecConstants.PARQUET_LOCAL_AFFINITY_IMPLEMENTATION_VALIDATOR,
111112
ExecConstants.JSON_READER_ALL_TEXT_MODE_VALIDATOR,
112113
ExecConstants.ENABLE_UNION_TYPE,
113114
ExecConstants.TEXT_ESTIMATED_ROW_SIZE,

0 commit comments

Comments
 (0)