Skip to content

Commit b4eaeb2

Browse files
committed
Use bit arrays for predicate matching in search.
1 parent efc6215 commit b4eaeb2

File tree

5 files changed

+58
-69
lines changed

5 files changed

+58
-69
lines changed

app/bin/tools/search_benchmark.dart

+3
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ Future<void> main(List<String> args) async {
2525

2626
// NOTE: please add more queries to this list, especially if there is a performance bottleneck.
2727
final queries = [
28+
'sdk:dart',
29+
'sdk:flutter platform:android',
30+
'is:flutter-favorite',
2831
'chart',
2932
'json',
3033
'camera',

app/lib/search/mem_index.dart

+45-50
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import 'dart:math' as math;
66

77
import 'package:_pub_shared/search/search_form.dart';
8+
import 'package:bit_array/bit_array.dart';
89
import 'package:clock/clock.dart';
910
import 'package:collection/collection.dart';
1011
import 'package:logging/logging.dart';
@@ -29,10 +30,9 @@ class InMemoryPackageIndex {
2930
late final TokenIndex<IndexedApiDocPage> _apiSymbolIndex;
3031
late final _scorePool = ScorePool(_packageNameIndex._packageNames);
3132

32-
/// Maps the tag strings to a list of document index values
33-
/// (`PackageDocument doc.tags -> List<_documents.indexOf(doc)>`).
34-
final _tagDocumentIndices = <String, List<int>>{};
35-
final _documentTagIds = <List<int>>[];
33+
/// Maps each tag string to a bit array of the document indexes that carry it.
34+
/// - (`PackageDocument doc.tags -> BitArray(List<_documents.indexOf(doc)>)`).
35+
final _tagBitArrays = <String, BitArray>{};
3636

3737
/// Adjusted score takes the overall score and transforms
3838
/// it linearly into the [0.4-1.0] range.
@@ -63,12 +63,11 @@ class InMemoryPackageIndex {
6363
_documentsByName[doc.package] = doc;
6464

6565
// transform tags into numerical IDs
66-
final tagIds = <int>[];
6766
for (final tag in doc.tags) {
68-
_tagDocumentIndices.putIfAbsent(tag, () => []).add(i);
67+
_tagBitArrays
68+
.putIfAbsent(tag, () => BitArray(_documents.length))
69+
.setBit(i);
6970
}
70-
tagIds.sort();
71-
_documentTagIds.add(tagIds);
7271

7372
final apiDocPages = doc.apiDocPages;
7473
if (apiDocPages != null) {
@@ -137,62 +136,54 @@ class InMemoryPackageIndex {
137136
return PackageSearchResult.empty();
138137
}
139138
return _scorePool.withScore(
140-
value: 1.0,
139+
value: 0.0,
141140
fn: (score) {
142141
return _search(query, score);
143142
},
144143
);
145144
}
146145

147146
PackageSearchResult _search(
148-
ServiceSearchQuery query, IndexedScore<String> packageScores) {
149-
// filter on package prefix
150-
if (query.parsedQuery.packagePrefix != null) {
151-
final String prefix = query.parsedQuery.packagePrefix!.toLowerCase();
152-
packageScores.retainWhere(
153-
(i, _) => _documents[i].packageNameLowerCased.startsWith(prefix),
154-
);
155-
}
147+
ServiceSearchQuery query,
148+
IndexedScore<String> packageScores,
149+
) {
150+
// TODO: implement pooling of this object similarly to [ScorePool].
151+
final packages = BitArray(_documents.length)
152+
..setRange(0, _documents.length);
156153

157154
// filter on tags
158155
final combinedTagsPredicate =
159156
query.tagsPredicate.appendPredicate(query.parsedQuery.tagsPredicate);
160157
if (combinedTagsPredicate.isNotEmpty) {
161158
for (final entry in combinedTagsPredicate.entries) {
162-
final docIndexes = _tagDocumentIndices[entry.key];
163-
159+
final tagBits = _tagBitArrays[entry.key];
164160
if (entry.value) {
165-
// predicate is required, zeroing the gaps between index values
166-
if (docIndexes == null) {
167-
// the predicate is required, no document will match it
161+
if (tagBits == null) {
162+
// the predicate is not matched by any document
168163
return PackageSearchResult.empty();
169164
}
170-
171-
for (var i = 0; i < docIndexes.length; i++) {
172-
if (i == 0) {
173-
packageScores.fillRange(0, docIndexes[i], 0.0);
174-
continue;
175-
}
176-
packageScores.fillRange(docIndexes[i - 1] + 1, docIndexes[i], 0.0);
177-
}
178-
packageScores.fillRange(docIndexes.last + 1, _documents.length, 0.0);
165+
packages.and(tagBits);
179166
} else {
180-
// predicate is prohibited, zeroing the values
181-
182-
if (docIndexes == null) {
183-
// the predicate is prohibited, no document has it, always a match
167+
if (tagBits == null) {
168+
// negative predicate without an index entry: every document matches
184169
continue;
185170
}
186-
for (final i in docIndexes) {
187-
packageScores.setValue(i, 0.0);
188-
}
171+
packages.andNot(tagBits);
189172
}
190173
}
191174
}
192175

176+
// filter on package prefix
177+
if (query.parsedQuery.packagePrefix != null) {
178+
final prefix = query.parsedQuery.packagePrefix!.toLowerCase();
179+
packages.clearWhere(
180+
(i) => !_documents[i].packageNameLowerCased.startsWith(prefix),
181+
);
182+
}
183+
193184
// filter on dependency
194185
if (query.parsedQuery.hasAnyDependency) {
195-
packageScores.removeWhere((i, _) {
186+
packages.clearWhere((i) {
196187
final doc = _documents[i];
197188
if (doc.dependencies.isEmpty) return true;
198189
for (final dependency in query.parsedQuery.allDependencies) {
@@ -208,22 +199,29 @@ class InMemoryPackageIndex {
208199

209200
// filter on points
210201
if (query.minPoints != null && query.minPoints! > 0) {
211-
packageScores.removeWhere(
212-
(i, _) => _documents[i].grantedPoints < query.minPoints!);
202+
packages
203+
.clearWhere((i) => _documents[i].grantedPoints < query.minPoints!);
213204
}
214205

215206
// filter on updatedDuration
216207
final updatedDuration = query.parsedQuery.updatedDuration;
217208
if (updatedDuration != null && updatedDuration > Duration.zero) {
218209
final now = clock.now();
219-
packageScores.removeWhere(
220-
(i, _) => now.difference(_documents[i].updated) > updatedDuration);
210+
packages.clearWhere(
211+
(i) => now.difference(_documents[i].updated) > updatedDuration);
212+
}
213+
214+
// TODO: find a better way to handle predicate-only filtering and scoring
215+
for (final index in packages.asIntIterable()) {
216+
if (index >= _documents.length) break;
217+
packageScores.setValue(index, 1.0);
221218
}
222219

223220
// do text matching
224221
final parsedQueryText = query.parsedQuery.text;
225222
final textResults = _searchText(
226223
packageScores,
224+
packages,
227225
parsedQueryText,
228226
includeNameMatches: (query.offset ?? 0) == 0,
229227
textMatchExtent: query.textMatchExtent ?? TextMatchExtent.api,
@@ -334,6 +332,7 @@ class InMemoryPackageIndex {
334332

335333
_TextResults? _searchText(
336334
IndexedScore<String> packageScores,
335+
BitArray packages,
337336
String? text, {
338337
required bool includeNameMatches,
339338
required TextMatchExtent textMatchExtent,
@@ -345,12 +344,14 @@ class InMemoryPackageIndex {
345344
final sw = Stopwatch()..start();
346345
final words = splitForQuery(text);
347346
if (words.isEmpty) {
347+
// packages.clearAll();
348348
packageScores.fillRange(0, packageScores.length, 0);
349349
return _TextResults.empty();
350350
}
351351

352352
final matchName = textMatchExtent.shouldMatchName();
353353
if (!matchName) {
354+
// packages.clearAll();
354355
packageScores.fillRange(0, packageScores.length, 0);
355356
return _TextResults.empty(
356357
errorMessage:
@@ -373,12 +374,6 @@ class InMemoryPackageIndex {
373374
nameMatches.add(text);
374375
}
375376

376-
// Multiple words are scored separately, and then the individual scores
377-
// are multiplied. We can use a package filter that is applied after each
378-
// word to reduce the scope of the later words based on the previous results.
379-
/// However, API docs search should be filtered on the original list.
380-
final indexedPositiveList = packageScores.toIndexedPositiveList();
381-
382377
final matchDescription = textMatchExtent.shouldMatchDescription();
383378
final matchReadme = textMatchExtent.shouldMatchReadme();
384379
final matchApi = textMatchExtent.shouldMatchApi();
@@ -419,7 +414,7 @@ class InMemoryPackageIndex {
419414
if (value < 0.01) continue;
420415

421416
final doc = symbolPages.keys[i];
422-
if (!indexedPositiveList[doc.index]) continue;
417+
if (!packages[doc.index]) continue;
423418

424419
// skip if the previously found pages are better than the current one
425420
final pages =

app/lib/search/token_index.dart

-18
Original file line numberDiff line numberDiff line change
@@ -227,24 +227,6 @@ class IndexedScore<K> {
227227
_values.fillRange(start, end, fillValue);
228228
}
229229

230-
void removeWhere(bool Function(int index, K key) fn) {
231-
for (var i = 0; i < length; i++) {
232-
if (isNotPositive(i)) continue;
233-
if (fn(i, _keys[i])) {
234-
_values[i] = 0.0;
235-
}
236-
}
237-
}
238-
239-
void retainWhere(bool Function(int index, K key) fn) {
240-
for (var i = 0; i < length; i++) {
241-
if (isNotPositive(i)) continue;
242-
if (!fn(i, _keys[i])) {
243-
_values[i] = 0.0;
244-
}
245-
}
246-
}
247-
248230
void multiplyAllFrom(IndexedScore other) {
249231
multiplyAllFromValues(other._values);
250232
}

app/pubspec.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ dependencies:
4848
# pana version to be pinned
4949
pana: '0.22.20'
5050
# 3rd-party packages with pinned versions
51+
bit_array: 2.3.0
5152
mailer: '6.3.0'
5253
ulid: '2.0.1'
5354
tar: '2.0.0'

pubspec.lock

+9-1
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,14 @@ packages:
7070
url: "https://pub.dev"
7171
source: hosted
7272
version: "0.10.0"
73+
bit_array:
74+
dependency: transitive
75+
description:
76+
name: bit_array
77+
sha256: "1d7a488b29446431a586681c157db901434b5de7dbbe14db271b91ea3eabfbac"
78+
url: "https://pub.dev"
79+
source: hosted
80+
version: "2.3.0"
7381
boolean_selector:
7482
dependency: transitive
7583
description:
@@ -1015,4 +1023,4 @@ packages:
10151023
source: hosted
10161024
version: "2.2.2"
10171025
sdks:
1018-
dart: ">=3.6.0 <4.0.0"
1026+
dart: ">=3.7.0 <4.0.0"

0 commit comments

Comments
 (0)