Skip to content

Commit cee60a9

Browse files
Richard Francis authored and core-system-dev committed
Implement edge ngram tokenizer for Bloodhound (#93)
Add tests for ngram tokenizer. Add tokenizer examples to Bloodhound readme. Build dist. NPM & Bower version patch to 0.11.2.
1 parent 7f38a23 commit cee60a9

File tree

11 files changed

+137
-25
lines changed

11 files changed

+137
-25
lines changed

bower.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "corejs-typeahead",
3-
"version": "0.11.1",
3+
"version": "0.11.2",
44
"main": "dist/typeahead.bundle.js",
55
"dependencies": {
66
"jquery": ">=1.7"

dist/bloodhound.js

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*!
22
* typeahead.js 0.11.1
33
* https://github.com/twitter/typeahead.js
4-
* Copyright 2013-2015 Twitter, Inc. and other contributors; Licensed MIT
4+
* Copyright 2013-2016 Twitter, Inc. and other contributors; Licensed MIT
55
*/
66

77
(function(root, factory) {
@@ -12,7 +12,7 @@
1212
} else if (typeof exports === "object") {
1313
module.exports = factory(require("jquery"));
1414
} else {
15-
root["Bloodhound"] = factory(jQuery);
15+
root["Bloodhound"] = factory(root["jQuery"]);
1616
}
1717
})(this, function($) {
1818
var _ = function() {
@@ -157,9 +157,11 @@
157157
return {
158158
nonword: nonword,
159159
whitespace: whitespace,
160+
ngram: ngram,
160161
obj: {
161162
nonword: getObjTokenizer(nonword),
162-
whitespace: getObjTokenizer(whitespace)
163+
whitespace: getObjTokenizer(whitespace),
164+
ngram: getObjTokenizer(ngram)
163165
}
164166
};
165167
function whitespace(str) {
@@ -170,6 +172,19 @@
170172
str = _.toStr(str);
171173
return str ? str.split(/\W+/) : [];
172174
}
175+
function ngram(str) {
176+
str = _.toStr(str);
177+
var tokens = [], word = "";
178+
_.each(str.split(""), function(char) {
179+
if (char.match(/\s+/)) {
180+
word = "";
181+
} else {
182+
tokens.push(word + char);
183+
word += char;
184+
}
185+
});
186+
return tokens;
187+
}
173188
function getObjTokenizer(tokenizer) {
174189
return function setKey(keys) {
175190
keys = _.isArray(keys) ? keys : [].slice.call(arguments, 0);

dist/bloodhound.min.js

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/typeahead.bundle.js

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*!
22
* typeahead.js 0.11.1
33
* https://github.com/twitter/typeahead.js
4-
* Copyright 2013-2015 Twitter, Inc. and other contributors; Licensed MIT
4+
* Copyright 2013-2016 Twitter, Inc. and other contributors; Licensed MIT
55
*/
66

77
(function(root, factory) {
@@ -12,7 +12,7 @@
1212
} else if (typeof exports === "object") {
1313
module.exports = factory(require("jquery"));
1414
} else {
15-
root["Bloodhound"] = factory(jQuery);
15+
root["Bloodhound"] = factory(root["jQuery"]);
1616
}
1717
})(this, function($) {
1818
var _ = function() {
@@ -157,9 +157,11 @@
157157
return {
158158
nonword: nonword,
159159
whitespace: whitespace,
160+
ngram: ngram,
160161
obj: {
161162
nonword: getObjTokenizer(nonword),
162-
whitespace: getObjTokenizer(whitespace)
163+
whitespace: getObjTokenizer(whitespace),
164+
ngram: getObjTokenizer(ngram)
163165
}
164166
};
165167
function whitespace(str) {
@@ -170,6 +172,19 @@
170172
str = _.toStr(str);
171173
return str ? str.split(/\W+/) : [];
172174
}
175+
function ngram(str) {
176+
str = _.toStr(str);
177+
var tokens = [], word = "";
178+
_.each(str.split(""), function(char) {
179+
if (char.match(/\s+/)) {
180+
word = "";
181+
} else {
182+
tokens.push(word + char);
183+
word += char;
184+
}
185+
});
186+
return tokens;
187+
}
173188
function getObjTokenizer(tokenizer) {
174189
return function setKey(keys) {
175190
keys = _.isArray(keys) ? keys : [].slice.call(arguments, 0);
@@ -935,7 +950,7 @@
935950
} else if (typeof exports === "object") {
936951
module.exports = factory(require("jquery"));
937952
} else {
938-
factory(jQuery);
953+
factory(root["jQuery"]);
939954
}
940955
})(this, function($) {
941956
var _ = function() {
@@ -1730,8 +1745,9 @@
17301745
suggestions = suggestions || [];
17311746
if (!canceled && rendered < that.limit) {
17321747
that.cancel = $.noop;
1733-
that._append(query, suggestions.slice(0, that.limit - rendered));
1734-
rendered += suggestions.length;
1748+
var idx = Math.abs(rendered - that.limit);
1749+
rendered += idx;
1750+
that._append(query, suggestions.slice(0, idx));
17351751
that.async && that.trigger("asyncReceived", query);
17361752
}
17371753
}
@@ -1838,6 +1854,9 @@
18381854
this.$node.on("mouseover", this.selectors.selectable, function() {
18391855
that.setCursor($(this));
18401856
});
1857+
this.$node.on("mouseleave", function() {
1858+
that._removeCursor();
1859+
});
18411860
_.each(this.datasets, function(dataset) {
18421861
dataset.onSync("asyncRequested", that._propagate, that).onSync("asyncCanceled", that._propagate, that).onSync("asyncReceived", that._propagate, that).onSync("rendered", that._onRendered, that).onSync("cleared", that._onCleared, that);
18431862
});

dist/typeahead.bundle.min.js

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/typeahead.jquery.js

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*!
22
* typeahead.js 0.11.1
33
* https://github.com/twitter/typeahead.js
4-
* Copyright 2013-2015 Twitter, Inc. and other contributors; Licensed MIT
4+
* Copyright 2013-2016 Twitter, Inc. and other contributors; Licensed MIT
55
*/
66

77
(function(root, factory) {
@@ -12,7 +12,7 @@
1212
} else if (typeof exports === "object") {
1313
module.exports = factory(require("jquery"));
1414
} else {
15-
factory(jQuery);
15+
factory(root["jQuery"]);
1616
}
1717
})(this, function($) {
1818
var _ = function() {
@@ -807,8 +807,9 @@
807807
suggestions = suggestions || [];
808808
if (!canceled && rendered < that.limit) {
809809
that.cancel = $.noop;
810-
that._append(query, suggestions.slice(0, that.limit - rendered));
811-
rendered += suggestions.length;
810+
var idx = Math.abs(rendered - that.limit);
811+
rendered += idx;
812+
that._append(query, suggestions.slice(0, idx));
812813
that.async && that.trigger("asyncReceived", query);
813814
}
814815
}
@@ -915,6 +916,9 @@
915916
this.$node.on("mouseover", this.selectors.selectable, function() {
916917
that.setCursor($(this));
917918
});
919+
this.$node.on("mouseleave", function() {
920+
that._removeCursor();
921+
});
918922
_.each(this.datasets, function(dataset) {
919923
dataset.onSync("asyncRequested", that._propagate, that).onSync("asyncCanceled", that._propagate, that).onSync("asyncReceived", that._propagate, that).onSync("rendered", that._onRendered, that).onSync("cleared", that._onCleared, that);
920924
});

dist/typeahead.jquery.min.js

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

doc/bloodhound.md

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Table of Contents
1212
* [Usage](#usage)
1313
* [API](#api)
1414
* [Options](#options)
15+
* [Tokenizers](#tokenizers)
1516
* [Prefetch](#prefetch)
1617
* [Remote](#remote)
1718

@@ -164,10 +165,10 @@ When instantiating a Bloodhound suggestion engine, there are a number of
164165
options you can configure.
165166

166167
* `datumTokenizer` – A function with the signature `(datum)` that transforms a
167-
datum into an array of string tokens. **Required**.
168+
datum into an array of string tokens. See [Tokenizers](#tokenizers). **Required**.
168169

169170
* `queryTokenizer` – A function with the signature `(query)` that transforms a
170-
query into an array of string tokens. **Required**.
171+
query into an array of string tokens. See [Tokenizers](#tokenizers). **Required**.
171172

172173
* `matchAnyQueryToken` - By default a search result must match ALL query-tokens.
173174
Instead, this option returns results that match ANY query-tokens. Defaults to
@@ -204,6 +205,23 @@ options you can configure.
204205

205206
[compare function]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/sort
206207

208+
### Tokenizers
209+
210+
The Bloodhound suggestion engine is token-based, so how datums and queries are tokenized plays a vital role in the quality of search results.
211+
212+
A tokenizer is a function with the signature `(string)` that transforms a string (a datum value or a query) into an array of string tokens. When instantiating a Bloodhound suggestion engine, you can use your own tokenizer or one of the following included implementations:
213+
214+
```javascript
215+
// returns ['foo', 'bar', 'foo-bar']
216+
Bloodhound.tokenizers.whitespace('foo bar foo-bar');
217+
218+
// returns ['foo', 'bar', 'foo', 'bar']
219+
Bloodhound.tokenizers.nonword('foo bar foo-bar');
220+
221+
// returns ['f', 'fo', 'foo', 'b', 'ba', 'bar']
222+
Bloodhound.tokenizers.ngram('foo bar');
223+
```
224+
207225
### Prefetch
208226

209227
Prefetched data is fetched and processed on initialization. If the browser

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,6 @@
6666
"scripts": {
6767
"test": "bower install && ./node_modules/karma/bin/karma start --single-run --browsers PhantomJS"
6868
},
69-
"version": "0.11.1",
69+
"version": "0.11.2",
7070
"main": "dist/typeahead.bundle.js"
7171
}

src/bloodhound/tokenizers.js

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@ var tokenizers = (function() {
1010
return {
1111
nonword: nonword,
1212
whitespace: whitespace,
13+
ngram: ngram,
1314
obj: {
1415
nonword: getObjTokenizer(nonword),
15-
whitespace: getObjTokenizer(whitespace)
16+
whitespace: getObjTokenizer(whitespace),
17+
ngram: getObjTokenizer(ngram)
1618
}
1719
};
1820

@@ -26,6 +28,24 @@ var tokenizers = (function() {
2628
return str ? str.split(/\W+/) : [];
2729
}
2830

31+
function ngram(str) {
32+
str = _.toStr(str);
33+
34+
var tokens = [],
35+
word = '';
36+
37+
_.each(str.split(''), function(char) {
38+
if (char.match(/\s+/)) {
39+
word = '';
40+
} else {
41+
tokens.push(word+char);
42+
word += char;
43+
}
44+
});
45+
46+
return tokens;
47+
}
48+
2949
function getObjTokenizer(tokenizer) {
3050
return function setKey(keys) {
3151
keys = _.isArray(keys) ? keys : [].slice.call(arguments, 0);

0 commit comments

Comments
 (0)