Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add :matches selector #210

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,16 @@ $ cat robots.html | pup ':contains("History")'
</span>
```

```bash
$ cat robots.html | pup ':matches("Histor*")'
<span class="toctext">
History
</span>
<span class="mw-headline" id="History">
History
</span>
```

```bash
$ cat robots.html | pup ':parent-of([action="edit"])'
<span class="wb-langlinks-edit wb-langlinks-link">
Expand Down Expand Up @@ -211,6 +221,7 @@ pup ':last-of-type'
pup ':only-child'
pup ':only-of-type'
pup ':contains("text")'
pup ':matches("pattern")'
pup ':nth-child(n)'
pup ':nth-of-type(n)'
pup ':nth-last-child(n)'
Expand Down
54 changes: 54 additions & 0 deletions selector.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,11 @@ func ParsePseudo(selector *CSSSelector, s scanner.Scanner) error {
if err != nil {
return err
}
case strings.HasPrefix(cmd, "matches("):
selector.Pseudo, err = parseMatchesPseudo(cmd[len("matches("):])
if err != nil {
return err
}
case strings.HasPrefix(cmd, "nth-child("),
strings.HasPrefix(cmd, "nth-last-child("),
strings.HasPrefix(cmd, "nth-last-of-type("),
Expand Down Expand Up @@ -592,6 +597,55 @@ func parseContainsPseudo(cmd string) (PseudoClass, error) {
}
}

// parseMatchesPseudo parses the argument of a :matches("pattern") selector.
//
// cmd is everything after the opening parenthesis, e.g. for
// `matches("Ro.*")` the argument is `"Ro.*")`. The quoted text is compiled as
// a regular expression, and the returned PseudoClass reports whether any
// direct text-node child of the given node matches it.
//
// Inside the quotes, `\"` yields a literal double quote. Every other
// backslash sequence is handed to the regexp compiler unchanged, so escapes
// such as `\d`, `\.` or `\\` keep their regexp meaning.
//
// An error is returned when cmd does not have the form `"...")`, when
// anything follows the closing parenthesis, or when the pattern does not
// compile.
func parseMatchesPseudo(cmd string) (PseudoClass, error) {
	var s scanner.Scanner
	s.Init(strings.NewReader(cmd))
	if s.Next() != '"' {
		return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
	}

	// Writes to a bytes.Buffer never return a non-nil error, so the results
	// of WriteRune are intentionally not checked below.
	var pattern bytes.Buffer
	for {
		r := s.Next()
		switch r {
		case '"':
			// ')' then EOF must follow '"'
			if s.Next() != ')' {
				return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
			}
			if s.Next() != scanner.EOF {
				return nil, fmt.Errorf("'matches(\"\")' must end selector")
			}
			p, err := regexp.Compile(pattern.String())
			if err != nil {
				return nil, err
			}
			matches := func(node *html.Node) bool {
				// Only direct children are inspected; text nested deeper
				// in the subtree is not considered.
				for c := node.FirstChild; c != nil; c = c.NextSibling {
					if c.Type == html.TextNode && p.MatchString(c.Data) {
						return true
					}
				}
				return false
			}
			return matches, nil
		case '\\':
			escaped := s.Next()
			switch escaped {
			case scanner.EOF:
				// A trailing backslash leaves the string unterminated.
				return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
			case '"':
				// `\"` is the selector-level escape for a literal quote.
				pattern.WriteRune('"')
			default:
				// Keep the backslash so regexp escapes survive parsing.
				pattern.WriteRune('\\')
				pattern.WriteRune(escaped)
			}
		case scanner.EOF:
			return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
		default:
			pattern.WriteRune(r)
		}
	}
}

// Parse a :not(selector) selector
// expects the input to be everything after the open parenthesis
// e.g. for `not(div#id)` the argument would be `div#id)`
Expand Down
2 changes: 2 additions & 0 deletions tests/cmds.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,5 @@ link , a:parent-of(sup)
link , a:parent-of(sup) sup
li --number
li -n
p:contains("Rob")
p:matches("Ro*")
6 changes: 4 additions & 2 deletions tests/expected_output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ a92e50c09cd56970625ac3b74efbddb83b2731bb table li
66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type
0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"]
0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{}
ecb542a30fc75c71a0c6380692cbbc4266ccbce4 json{}
199188dc8f1522426a628e41d96264bffb8beb0f json{}
95ef88ded9dab22ee3206cca47b9c3a376274bda text{}
e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet
da39a3ee5e6b4b0d3255bfef95601890afd80709 .after
Expand All @@ -34,7 +34,7 @@ d314e83b059bb876b0e5ee76aa92d54987961f9a .navbox-list li:nth-last-child(1)
613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{}
97d170e1550eee4afc0af065b78cda302a97674c #toc li + a json{}
cd0d4cc32346750408f7d4f5e78ec9a6e5b79a0d #toc li + a json{}
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li
Expand All @@ -47,3 +47,5 @@ b6a3d6cccd305fcc3e8bf2743c443743bdaaa02b link , a:parent-of(sup)
0d1f66765d1632c70f8608947890524e78459362 link , a:parent-of(sup) sup
da39a3ee5e6b4b0d3255bfef95601890afd80709 li --number
da39a3ee5e6b4b0d3255bfef95601890afd80709 li -n
4c15ca8f190a4412469e487fab6f7ad2479f922f p:contains("Rob")
da39a3ee5e6b4b0d3255bfef95601890afd80709 p:matches("Ro*")
6 changes: 3 additions & 3 deletions tests/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

from __future__ import print_function
from hashlib import sha1
from subprocess import Popen, PIPE, STDOUT
from subprocess import Popen, PIPE

data = open("index.html", "r").read()
data = open("index.html", "rb").read()

for line in open("cmds.txt", "r"):
line = line.strip()
p = Popen(['pup', line], stdout=PIPE, stdin=PIPE, stderr=PIPE)
p = Popen(["pup", line], stdout=PIPE, stdin=PIPE, stderr=PIPE)
h = sha1()
h.update(p.communicate(input=data)[0])
print("%s %s" % (h.hexdigest(), line))