Skip to content

Commit 9e98bc2

Browse files
committed
Allow user to specify charset
1 parent d00d654 commit 9e98bc2

File tree

3 files changed

+34
-21
lines changed

3 files changed

+34
-21
lines changed

README.md

+1-10
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,6 @@ If you're on OS X, use [Brew](http://brew.sh/) to install (no Go required).
1919

2020
brew install https://raw.githubusercontent.com/EricChiang/pup/master/pup.rb
2121

22-
For Linux distributions, use the following commands to install under your `PATH`
23-
environment variable. You can set `ARCH` to `linux_386` for 32-bit infrastructures.
24-
25-
ARCH=linux_amd64
26-
cd /tmp
27-
wget https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_${ARCH}.zip
28-
unzip pup_${ARCH}.zip && rm pup_${ARCH}.zip
29-
sudo mv pup /usr/local/bin
30-
pup --version
31-
3222
## Quick start
3323

3424
```bash
@@ -353,5 +343,6 @@ output of pup into a more consumable format.
353343
-i --indent number of spaces to use for indent or character
354344
-n --number print number of elements selected
355345
-l --limit restrict number of levels printed
346+
--charset specify the charset for pup to use
356347
--version display version
357348
```

parse.go

+29
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,41 @@ import (
66
"os"
77
"strconv"
88
"strings"
9+
10+
"golang.org/x/net/html"
11+
"golang.org/x/net/html/charset"
12+
"golang.org/x/text/transform"
913
)
1014

1115
var (
1216
pupIn io.ReadCloser = os.Stdin // input handed to ParseHTML by main; defaults to standard input
17+
pupCharset string = "" // charset name taken from the --charset flag; empty means ParseHTML guesses it
1318
pupMaxPrintLevel int = -1
1419
pupPrintColor bool = false
1520
pupIndentString string = " "
1621
pupDisplayer Displayer = TreeDisplayer{}
1722
)
1823

24+
// ParseHTML parses the HTML document read from r and returns its root node.
// If cs is empty, the document's charset is guessed; otherwise cs names the
// charset to decode with, and an error is returned when that name is not
// recognized by charset.Lookup.
25+
func ParseHTML(r io.Reader, cs string) (*html.Node, error) {
26+
var err error
27+
if cs == "" {
28+
// No charset was given: let charset.NewReader guess it from the document.
29+
r, err = charset.NewReader(r, "")
30+
if err != nil {
31+
return nil, err
32+
}
33+
} else {
34+
// Use the charset the user asked for; an empty name from Lookup means it is unknown.
35+
e, name := charset.Lookup(cs)
36+
if name == "" {
37+
return nil, fmt.Errorf("'%s' is not a valid charset", cs)
38+
}
39+
r = transform.NewReader(r, e.NewDecoder())
40+
}
41+
return html.Parse(r)
42+
}
43+
1944
func PrintHelp(w io.Writer, exitCode int) {
2045
helpString := `Usage
2146
pup [flags] [selectors] [optional display function]
@@ -28,6 +53,7 @@ Flags
2853
-i --indent number of spaces to use for indent or character
2954
-n --number print number of elements selected
3055
-l --limit restrict number of levels printed
56+
--charset specify the charset for pup to use
3157
--version display version
3258
`
3359
fmt.Fprintf(w, helpString, VERSION)
@@ -81,6 +107,9 @@ func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) {
81107
return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd)
82108
}
83109
i++
110+
case "--charset":
111+
pupCharset = cmds[i+1]
112+
i++
84113
case "--version":
85114
fmt.Println(VERSION)
86115
os.Exit(0)

pup.go

+4-11
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import (
55
"os"
66

77
"golang.org/x/net/html"
8-
"golang.org/x/net/html/charset"
98
)
109

1110
// _=,_
@@ -17,7 +16,7 @@ import (
1716
// |/ \_( # |"
1817
// C/ ,--___/
1918

20-
var VERSION string = "0.3.6"
19+
var VERSION string = "0.3.7"
2120

2221
func main() {
2322
// process flags and arguments
@@ -27,19 +26,13 @@ func main() {
2726
os.Exit(2)
2827
}
2928

30-
// Determine the charset of the input
31-
cr, err := charset.NewReader(pupIn, "")
32-
if err != nil {
33-
fmt.Fprintf(os.Stderr, err.Error())
34-
os.Exit(2)
35-
}
36-
3729
// Parse the input and get the root node
38-
root, err := html.Parse(cr)
30+
root, err := ParseHTML(pupIn, pupCharset)
3931
if err != nil {
40-
fmt.Fprintf(os.Stderr, err.Error())
32+
fmt.Fprintf(os.Stderr, "%s\n", err.Error())
4133
os.Exit(2)
4234
}
35+
pupIn.Close()
4336

4437
// Parse the selectors
4538
selectorFuncs := []SelectorFunc{}

0 commit comments

Comments
 (0)