Skip to content

Commit 79fd9fe

Browse files
authored
Speed up advancing within a block. (#13692)
Advancing within a block consists of finding the first index within an array of 128 values whose value is greater than or equal to a target. Given the small size, it's not obvious whether it's better to perform a linear search, a binary search or something else... It is surprisingly hard to beat the linear search that we are using today. Experiments suggested that the following approach works in practice: - First check if the next item in the array is greater than or equal to the target. - Then find the first 4-value interval that contains our target. - Then perform a branchless binary search within this interval of 4 values. This approach still biases heavily towards the case when the target is very close to the current index, though slightly less so than a linear search.
1 parent d55b92b commit 79fd9fe

File tree

3 files changed

+486
-33
lines changed

3 files changed

+486
-33
lines changed
Lines changed: 376 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,376 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.benchmark.jmh;
18+
19+
import java.util.Arrays;
20+
import java.util.Random;
21+
import java.util.concurrent.TimeUnit;
22+
import org.apache.lucene.search.DocIdSetIterator;
23+
import org.openjdk.jmh.annotations.Benchmark;
24+
import org.openjdk.jmh.annotations.BenchmarkMode;
25+
import org.openjdk.jmh.annotations.CompilerControl;
26+
import org.openjdk.jmh.annotations.Fork;
27+
import org.openjdk.jmh.annotations.Level;
28+
import org.openjdk.jmh.annotations.Measurement;
29+
import org.openjdk.jmh.annotations.Mode;
30+
import org.openjdk.jmh.annotations.OutputTimeUnit;
31+
import org.openjdk.jmh.annotations.Scope;
32+
import org.openjdk.jmh.annotations.Setup;
33+
import org.openjdk.jmh.annotations.State;
34+
import org.openjdk.jmh.annotations.Warmup;
35+
36+
@BenchmarkMode(Mode.Throughput)
37+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
38+
@State(Scope.Benchmark)
39+
@Warmup(iterations = 5, time = 1)
40+
@Measurement(iterations = 5, time = 1)
41+
@Fork(
42+
value = 1,
43+
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
44+
public class AdvanceBenchmark {
45+
46+
// Block searched by every benchmark variant: 129 slots. setup() stores the sorted values
// 0..127 in the first 128 slots and the NO_MORE_DOCS sentinel in the last one.
private final long[] values = new long[129];
47+
// Index from which each of the 1,000 benchmark searches starts (filled by setup()).
private final int[] startIndexes = new int[1_000];
48+
// Value searched for in each of the benchmark searches; parallel to startIndexes.
private final long[] targets = new long[startIndexes.length];
49+
50+
@Setup(Level.Trial)
51+
public void setup() throws Exception {
52+
for (int i = 0; i < 128; ++i) {
53+
values[i] = i;
54+
}
55+
values[128] = DocIdSetIterator.NO_MORE_DOCS;
56+
Random r = new Random(0);
57+
for (int i = 0; i < startIndexes.length; ++i) {
58+
startIndexes[i] = r.nextInt(64);
59+
targets[i] = startIndexes[i] + 1 + r.nextInt(1 << r.nextInt(7));
60+
}
61+
}
62+
63+
@Benchmark
64+
public void binarySearch() {
65+
for (int i = 0; i < startIndexes.length; ++i) {
66+
binarySearch(values, targets[i], startIndexes[i]);
67+
}
68+
}
69+
70+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
71+
private static int binarySearch(long[] values, long target, int startIndex) {
72+
// Standard binary search
73+
int i = Arrays.binarySearch(values, startIndex, values.length, target);
74+
if (i < 0) {
75+
i = -1 - i;
76+
}
77+
return i;
78+
}
79+
80+
@Benchmark
81+
public void binarySearch2() {
82+
for (int i = 0; i < startIndexes.length; ++i) {
83+
binarySearch2(values, targets[i], startIndexes[i]);
84+
}
85+
}
86+
87+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
88+
private static int binarySearch2(long[] values, long target, int startIndex) {
89+
// Try to help the compiler by providing predictable start/end offsets.
90+
int i = Arrays.binarySearch(values, 0, 128, target);
91+
if (i < 0) {
92+
i = -1 - i;
93+
}
94+
return i;
95+
}
96+
97+
@Benchmark
98+
public void binarySearch3() {
99+
for (int i = 0; i < startIndexes.length; ++i) {
100+
binarySearch3(values, targets[i], startIndexes[i]);
101+
}
102+
}
103+
104+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
105+
private static int binarySearch3(long[] values, long target, int startIndex) {
106+
// Organize code the same way as suggested in https://quickwit.io/blog/search-a-sorted-block,
107+
// which proved to help with LLVM.
108+
int start = 0;
109+
int length = 128;
110+
111+
while (length > 1) {
112+
length /= 2;
113+
if (values[start + length - 1] < target) {
114+
start += length;
115+
}
116+
}
117+
return start;
118+
}
119+
120+
@Benchmark
121+
public void binarySearch4() {
122+
for (int i = 0; i < startIndexes.length; ++i) {
123+
binarySearch4(values, targets[i], startIndexes[i]);
124+
}
125+
}
126+
127+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
128+
private static int binarySearch4(long[] values, long target, int startIndex) {
129+
// Explicitly inline the binary-search logic to see if it helps the compiler.
130+
int start = 0;
131+
132+
if (values[63] < target) {
133+
start += 64;
134+
}
135+
if (values[start + 31] < target) {
136+
start += 32;
137+
}
138+
if (values[start + 15] < target) {
139+
start += 16;
140+
}
141+
if (values[start + 7] < target) {
142+
start += 8;
143+
}
144+
if (values[start + 3] < target) {
145+
start += 4;
146+
}
147+
if (values[start + 1] < target) {
148+
start += 2;
149+
}
150+
if (values[start] < target) {
151+
start += 1;
152+
}
153+
154+
return start;
155+
}
156+
157+
@Benchmark
158+
public void binarySearch5() {
159+
for (int i = 0; i < startIndexes.length; ++i) {
160+
binarySearch5(values, targets[i], startIndexes[i]);
161+
}
162+
}
163+
164+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
165+
private static int binarySearch5(long[] values, long target, int startIndex) {
166+
// Other way to write a binary search
167+
int start = 0;
168+
169+
for (int shift = 6; shift >= 0; --shift) {
170+
int halfRange = 1 << shift;
171+
if (values[start + halfRange - 1] < target) {
172+
start += halfRange;
173+
}
174+
}
175+
176+
return start;
177+
}
178+
179+
@Benchmark
180+
public void binarySearch6() {
181+
for (int i = 0; i < startIndexes.length; ++i) {
182+
binarySearch6(values, targets[i], startIndexes[i]);
183+
}
184+
}
185+
186+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
187+
private static int binarySearch6(long[] values, long target, int startIndex) {
188+
// Other way to write a binary search
189+
int start = 0;
190+
191+
for (int halfRange = 64; halfRange > 0; halfRange >>= 1) {
192+
if (values[start + halfRange - 1] < target) {
193+
start += halfRange;
194+
}
195+
}
196+
197+
return start;
198+
}
199+
200+
@Benchmark
201+
public void linearSearch() {
202+
for (int i = 0; i < startIndexes.length; ++i) {
203+
linearSearch(values, targets[i], startIndexes[i]);
204+
}
205+
}
206+
207+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
208+
private static int linearSearch(long[] values, long target, int startIndex) {
209+
// Naive linear search.
210+
for (int i = startIndex; i < values.length; ++i) {
211+
if (values[i] >= target) {
212+
return i;
213+
}
214+
}
215+
return values.length;
216+
}
217+
218+
@Benchmark
219+
public void bruteForceSearch() {
220+
for (int i = 0; i < startIndexes.length; ++i) {
221+
bruteForceSearch(values, targets[i], startIndexes[i]);
222+
}
223+
}
224+
225+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
226+
private static int bruteForceSearch(long[] values, long target, int startIndex) {
227+
// Linear search with predictable start/end offsets to see if it helps the compiler.
228+
for (int i = 0; i < 128; ++i) {
229+
if (values[i] >= target) {
230+
return i;
231+
}
232+
}
233+
return values.length;
234+
}
235+
236+
@Benchmark
237+
public void linearSearch2() {
238+
for (int i = 0; i < startIndexes.length; ++i) {
239+
linearSearch2(values, targets[i], startIndexes[i]);
240+
}
241+
}
242+
243+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
244+
private static int linearSearch2(long[] values, long target, int startIndex) {
245+
// Two-level linear search, first checking every 8-th value, then values within an 8-value range
246+
int rangeStart = values.length - 8;
247+
248+
for (int i = startIndex; i + 8 <= values.length; i += 8) {
249+
if (values[i + 7] >= target) {
250+
rangeStart = i;
251+
break;
252+
}
253+
}
254+
255+
for (int i = 0; i < 8; ++i) {
256+
if (values[rangeStart + i] >= target) {
257+
return rangeStart + i;
258+
}
259+
}
260+
261+
return values.length;
262+
}
263+
264+
@Benchmark
265+
public void linearSearch3() {
266+
for (int i = 0; i < startIndexes.length; ++i) {
267+
linearSearch3(values, targets[i], startIndexes[i]);
268+
}
269+
}
270+
271+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
272+
private static int linearSearch3(long[] values, long target, int startIndex) {
273+
// Iteration over linearSearch that tries to reduce branches
274+
while (startIndex + 4 <= values.length) {
275+
int count = values[startIndex] < target ? 1 : 0;
276+
if (values[startIndex + 1] < target) {
277+
count++;
278+
}
279+
if (values[startIndex + 2] < target) {
280+
count++;
281+
}
282+
if (values[startIndex + 3] < target) {
283+
count++;
284+
}
285+
if (count != 4) {
286+
return startIndex + count;
287+
}
288+
startIndex += 4;
289+
}
290+
291+
for (int i = startIndex; i < values.length; ++i) {
292+
if (values[i] >= target) {
293+
return i;
294+
}
295+
}
296+
297+
return values.length;
298+
}
299+
300+
@Benchmark
301+
public void hybridSearch() {
302+
for (int i = 0; i < startIndexes.length; ++i) {
303+
hybridSearch(values, targets[i], startIndexes[i]);
304+
}
305+
}
306+
307+
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
308+
private static int hybridSearch(long[] values, long target, int startIndex) {
309+
// Two-level linear search, first checking every 8-th value, then values within an 8-value range
310+
int rangeStart = values.length - 8;
311+
312+
for (int i = startIndex; i + 8 <= values.length; i += 8) {
313+
if (values[i + 7] >= target) {
314+
rangeStart = i;
315+
break;
316+
}
317+
}
318+
319+
return binarySearchHelper8(values, target, rangeStart);
320+
}
321+
322+
// branchless binary search over 8 values
323+
private static int binarySearchHelper8(long[] values, long target, int start) {
324+
if (values[start + 3] < target) {
325+
start += 4;
326+
}
327+
if (values[start + 1] < target) {
328+
start += 2;
329+
}
330+
if (values[start] < target) {
331+
start += 1;
332+
}
333+
return start;
334+
}
335+
336+
private static void assertEquals(int expected, int actual) {
337+
if (expected != actual) {
338+
throw new AssertionError("Expected: " + expected + ", got " + actual);
339+
}
340+
}
341+
342+
public static void main(String[] args) {
343+
// For testing purposes
344+
long[] values = new long[129];
345+
for (int i = 0; i < 128; ++i) {
346+
values[i] = i;
347+
}
348+
values[128] = DocIdSetIterator.NO_MORE_DOCS;
349+
for (int start = 0; start < 128; ++start) {
350+
for (int targetIndex = start; targetIndex < 128; ++targetIndex) {
351+
int actualIndex = binarySearch(values, values[targetIndex], start);
352+
assertEquals(targetIndex, actualIndex);
353+
actualIndex = binarySearch2(values, values[targetIndex], start);
354+
assertEquals(targetIndex, actualIndex);
355+
actualIndex = binarySearch3(values, values[targetIndex], start);
356+
assertEquals(targetIndex, actualIndex);
357+
actualIndex = binarySearch4(values, values[targetIndex], start);
358+
assertEquals(targetIndex, actualIndex);
359+
actualIndex = binarySearch5(values, values[targetIndex], start);
360+
assertEquals(targetIndex, actualIndex);
361+
actualIndex = binarySearch6(values, values[targetIndex], start);
362+
assertEquals(targetIndex, actualIndex);
363+
actualIndex = bruteForceSearch(values, values[targetIndex], start);
364+
assertEquals(targetIndex, actualIndex);
365+
actualIndex = hybridSearch(values, values[targetIndex], start);
366+
assertEquals(targetIndex, actualIndex);
367+
actualIndex = linearSearch(values, values[targetIndex], start);
368+
assertEquals(targetIndex, actualIndex);
369+
actualIndex = linearSearch2(values, values[targetIndex], start);
370+
assertEquals(targetIndex, actualIndex);
371+
actualIndex = linearSearch3(values, values[targetIndex], start);
372+
assertEquals(targetIndex, actualIndex);
373+
}
374+
}
375+
}
376+
}

0 commit comments

Comments
 (0)