2
2
from __future__ import absolute_import
3
3
import unittest
4
4
from webstruct import GateLoader , HtmlTokenizer , HtmlFeatureExtractor
5
- from webstruct .features import token_lower , token_identity , Pattern
5
+ from webstruct .features import token_lower , token_identity , looks_like_year , Pattern
6
6
7
7
8
8
class PatternTest (unittest .TestCase ):
@@ -17,19 +17,20 @@ def _load_document(self):
17
17
return html_tokens
18
18
19
19
def test_pattern (self ):
20
+ #, (0, 'looks_like_year')
20
21
featextractor = HtmlFeatureExtractor (
21
- token_features = [token_lower , token_identity ],
22
+ token_features = [token_lower , token_identity , looks_like_year ],
22
23
global_features = [
23
- Pattern ((- 2 , 'lower' ), (- 1 , 'lower' ))
24
+ Pattern ((- 2 , 'lower' ), (- 1 , 'lower' ), ( - 1 , 'looks_like_year' ) )
24
25
]
25
26
)
26
27
X = featextractor .transform_single (self .html_tokens )
27
-
28
- key = 'lower[-2]/lower[-1]'
28
+ key = 'lower[-2]/lower[-1]/looks_like_year[-1]'
29
29
self .assertNotIn (key , X [0 ])
30
30
self .assertListEqual (
31
31
[feat [key ] for feat in X [1 :]],
32
- ['?/hello' , 'hello/john' , 'john/doe' , 'doe/mary' ],
32
+ ['?/hello/False' , 'hello/john/False' , 'john/doe/False' ,
33
+ 'doe/mary/False' ],
33
34
)
34
35
35
36
def test_pattern_lookups (self ):
0 commit comments