Skip to content

Commit 571adeb

Browse files
committed
Merge pull request #23 from EricChiang/json
json{} displayer added
2 parents dd9e318 + dfe4a38 commit 571adeb

File tree

4 files changed

+223
-79
lines changed

4 files changed

+223
-79
lines changed

README.md

+90-5
Original file line numberDiff line numberDiff line change
@@ -34,16 +34,22 @@ Ew, HTML. Let's run that through some pup selectors:
3434
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] attr{href}'
3535
```
3636

37+
Even better, let's grab the titles too:
38+
39+
```bash
40+
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] json{}'
41+
```
42+
3743
## Basic Usage
3844

3945
```bash
40-
$ cat index.html | pup [selectors and flags]
46+
$ cat index.html | pup [flags] [selectors] [optional display function]
4147
```
4248

4349
or
4450

4551
```bash
46-
$ pup < index.html [selectors and flags]
52+
$ pup < index.html [flags] [selectors] [optional display function]
4753
```
4854

4955
## Examples
@@ -185,7 +191,7 @@ You can mix and match selectors as you wish.
185191
cat index.html | pup element#id[attribute=value]
186192
```
187193

188-
## Functions
194+
## Display Functions
189195

190196
Non-HTML selectors which affect the output type are implemented as functions
191197
which can be provided as a final argument.
@@ -231,6 +237,85 @@ $ pup < robots.html a attr{href} | head
231237
//en.wikivoyage.org/wiki/
232238
```
233239

240+
#### `json{}`
241+
242+
Print HTML as JSON.
243+
244+
```bash
245+
$ cat robots.html | pup div#p-namespaces a
246+
<a href="/wiki/Robots_exclusion_standard" title="View the content page [c]" accesskey="c">
247+
Article
248+
</a>
249+
<a href="/wiki/Talk:Robots_exclusion_standard" title="Discussion about the content page [t]" accesskey="t">
250+
Talk
251+
</a>
252+
```
253+
254+
```bash
255+
$ cat robots.html | pup div#p-namespaces a json{}
256+
[
257+
{
258+
"attrs": {
259+
"accesskey": "c",
260+
"href": "/wiki/Robots_exclusion_standard",
261+
"title": "View the content page [c]"
262+
},
263+
"tag": "a",
264+
"text": "Article"
265+
},
266+
{
267+
"attrs": {
268+
"accesskey": "t",
269+
"href": "/wiki/Talk:Robots_exclusion_standard",
270+
"title": "Discussion about the content page [t]"
271+
},
272+
"tag": "a",
273+
"text": "Talk"
274+
}
275+
]
276+
```
277+
278+
Use the `-i` / `--indent` flag to control the indent level.
279+
280+
```bash
281+
$ cat robots.html | pup --indent 4 div#p-namespaces a json{}
282+
[
283+
{
284+
"attrs": {
285+
"accesskey": "c",
286+
"href": "/wiki/Robots_exclusion_standard",
287+
"title": "View the content page [c]"
288+
},
289+
"tag": "a",
290+
"text": "Article"
291+
},
292+
{
293+
"attrs": {
294+
"accesskey": "t",
295+
"href": "/wiki/Talk:Robots_exclusion_standard",
296+
"title": "Discussion about the content page [t]"
297+
},
298+
"tag": "a",
299+
"text": "Talk"
300+
}
301+
]
302+
```
303+
304+
If the selectors only return one element the results will be printed as a JSON
305+
object, not a list.
306+
307+
```bash
308+
$ cat robots.html | pup --indent 4 title json{}
309+
{
310+
"tag": "title",
311+
"text": "Robots exclusion standard - Wikipedia, the free encyclopedia"
312+
}
313+
```
314+
315+
Because there is no universal standard for converting HTML/XML to JSON, a
316+
method has been chosen which hopefully fits. The goal is simply to get the
317+
output of pup into a more consumable format.
318+
234319
## Flags
235320

236321
```bash
@@ -243,6 +328,6 @@ $ pup < robots.html a attr{href} | head
243328
--version display version
244329
```
245330

246-
## TODO:
331+
## TODO
247332

248-
* Print as json function `json{}`
333+
Add more tests!

display.go

+130
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
package main
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"regexp"
7+
"strings"
8+
9+
"code.google.com/p/go.net/html"
10+
)
11+
12+
type Displayer interface {
13+
Display(nodes []*html.Node)
14+
}
15+
16+
type TextDisplayer struct {
17+
}
18+
19+
func (t TextDisplayer) Display(nodes []*html.Node) {
20+
for _, node := range nodes {
21+
if node.Type == html.TextNode {
22+
fmt.Println(node.Data)
23+
}
24+
children := []*html.Node{}
25+
child := node.FirstChild
26+
for child != nil {
27+
children = append(children, child)
28+
child = child.NextSibling
29+
}
30+
t.Display(children)
31+
}
32+
}
33+
34+
type AttrDisplayer struct {
35+
Attr string
36+
}
37+
38+
func (a AttrDisplayer) Display(nodes []*html.Node) {
39+
for _, node := range nodes {
40+
attributes := node.Attr
41+
for _, attr := range attributes {
42+
if attr.Key == a.Attr {
43+
val := html.EscapeString(attr.Val)
44+
fmt.Printf("%s\n", val)
45+
}
46+
}
47+
}
48+
}
49+
50+
// JSONDisplayer prints nodes as JSON. It carries no configuration of its own.
type JSONDisplayer struct{}
52+
53+
// returns a jsonifiable struct
54+
func jsonify(node *html.Node) map[string]interface{} {
55+
vals := map[string]interface{}{}
56+
if len(node.Attr) > 0 {
57+
attrs := map[string]string{}
58+
for _, attr := range node.Attr {
59+
attrs[attr.Key] = html.EscapeString(attr.Val)
60+
}
61+
vals["attrs"] = attrs
62+
}
63+
vals["tag"] = node.DataAtom.String()
64+
children := []interface{}{}
65+
for child := node.FirstChild; child != nil; child = child.NextSibling {
66+
switch child.Type {
67+
case html.ElementNode:
68+
children = append(children, jsonify(child))
69+
case html.TextNode:
70+
text := strings.TrimSpace(child.Data)
71+
if text != "" {
72+
// if there is already text we'll append it
73+
currText, ok := vals["text"]
74+
if ok {
75+
text = fmt.Sprintf("%s %s", currText, text)
76+
}
77+
vals["text"] = text
78+
}
79+
}
80+
}
81+
return vals
82+
}
83+
84+
func (j JSONDisplayer) Display(nodes []*html.Node) {
85+
var data []byte
86+
var err error
87+
switch len(nodes) {
88+
case 1:
89+
jsonNode := jsonify(nodes[0])
90+
data, err = json.MarshalIndent(&jsonNode, "", indentString)
91+
default:
92+
jsonNodes := []map[string]interface{}{}
93+
for _, node := range nodes {
94+
jsonNodes = append(jsonNodes, jsonify(node))
95+
}
96+
data, err = json.MarshalIndent(&jsonNodes, "", indentString)
97+
}
98+
if err != nil {
99+
panic("Could not jsonify nodes")
100+
}
101+
fmt.Printf("%s\n", data)
102+
}
103+
104+
// Regular expressions used to recognise display functions.
var (
	// displayMatcher matches any text that ends with a `{...}` group.
	displayMatcher = regexp.MustCompile(`\{[^\}]*\}$`)

	// textFuncMatcher matches exactly `text{}`.
	textFuncMatcher = regexp.MustCompile(`^text\{\}$`)

	// attrFuncMatcher matches `attr{NAME}` and captures NAME (possibly empty).
	attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`)

	// jsonFuncMatcher matches `json{...}` and captures whatever is between
	// the braces.
	jsonFuncMatcher = regexp.MustCompile(`^json\{([^\}]*)\}$`)
)
111+
112+
func NewDisplayFunc(text string) (Displayer, error) {
113+
if !displayMatcher.MatchString(text) {
114+
return nil, fmt.Errorf("Not a display function")
115+
}
116+
switch {
117+
case textFuncMatcher.MatchString(text):
118+
return TextDisplayer{}, nil
119+
case attrFuncMatcher.MatchString(text):
120+
matches := attrFuncMatcher.FindStringSubmatch(text)
121+
if len(matches) != 2 {
122+
return nil, fmt.Errorf("")
123+
} else {
124+
return AttrDisplayer{matches[1]}, nil
125+
}
126+
case jsonFuncMatcher.MatchString(text):
127+
return JSONDisplayer{}, nil
128+
}
129+
return nil, fmt.Errorf("Not a display function")
130+
}

funcs/display.go

-70
This file was deleted.

main.go

+3-4
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,14 @@ import (
44
"code.google.com/p/go.net/html"
55
"code.google.com/p/go.net/html/charset"
66
"fmt"
7-
"github.com/ericchiang/pup/funcs"
87
"github.com/ericchiang/pup/selector"
98
"io"
109
"os"
1110
"strconv"
1211
"strings"
1312
)
1413

15-
const VERSION string = "0.3.0"
14+
const VERSION string = "0.3.1"
1615

1716
var (
1817
// Flags
@@ -22,7 +21,7 @@ var (
2221
maxPrintLevel int = -1
2322
printNumber bool = false
2423
printColor bool = false
25-
displayer funcs.Displayer = nil
24+
displayer Displayer = nil
2625
)
2726

2827
// Print to stderr and exit
@@ -177,7 +176,7 @@ func main() {
177176
// if this is the last element, check for a function like
178177
// text{} or attr{}
179178
if i+1 == len(cmds) {
180-
d, err := funcs.NewDisplayFunc(cmd)
179+
d, err := NewDisplayFunc(cmd)
181180
if err == nil {
182181
displayer = d
183182
selectors = selectors[0 : len(cmds)-1]

0 commit comments

Comments
 (0)