From 14e452d64192b61239b493be9a902d0726e6aa68 Mon Sep 17 00:00:00 2001 From: Andre Marianiello Date: Sat, 23 Feb 2019 15:36:49 -0500 Subject: [PATCH 1/3] Remove outdated section of README --- README.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/README.md b/README.md index a24ac88..12da41c 100644 --- a/README.md +++ b/README.md @@ -319,17 +319,6 @@ $ cat robots.html | pup -i 4 'div#p-namespaces a json{}' ] ``` -If the selectors only return one element the results will be printed as a JSON -object, not a list. - -```bash -$ cat robots.html | pup --indent 4 'title json{}' -{ - "tag": "title", - "text": "Robots exclusion standard - Wikipedia, the free encyclopedia" -} -``` - Because there is no universal standard for converting HTML/XML to JSON, a method has been chosen which hopefully fits. The goal is simply to get the output of pup into a more consumable format. From 2bb485903cf9a1eac418935a60336e7e7852ca13 Mon Sep 17 00:00:00 2001 From: Andre Marianiello Date: Sat, 23 Feb 2019 15:08:25 -0500 Subject: [PATCH 2/3] Preserving sibling relationship of all node types --- README.md | 28 ++++++++++++++++---- display.go | 54 +++++++++++++++++---------------------- tests/expected_output.txt | 4 +-- 3 files changed, 49 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 12da41c..512ba56 100644 --- a/README.md +++ b/README.md @@ -272,7 +272,7 @@ $ cat robots.html | pup 'div#p-namespaces a' Article - + Talk ``` @@ -282,16 +282,25 @@ $ cat robots.html | pup 'div#p-namespaces a json{}' [ { "accesskey": "c", + "children": [ + { + "text": "Article" + } + ], "href": "/wiki/Robots_exclusion_standard", "tag": "a", - "text": "Article", "title": "View the content page [c]" }, { "accesskey": "t", + "children": [ + { + "text": "Talk" + } + ], "href": "/wiki/Talk:Robots_exclusion_standard", + "rel": "discussion", "tag": "a", - "text": "Talk", "title": "Discussion about the content page [t]" } ] @@ -304,16 +313,25 @@ $ cat robots.html | pup -i 4 'div#p-namespaces a json{}' [ { "accesskey": "c", + "children": [ + { + "text": "Article" + } + ], "href": "/wiki/Robots_exclusion_standard", "tag": "a", - "text": "Article", "title": "View the content page [c]" }, { "accesskey": "t", + "children": [ + { + "text": "Talk" + } + ], "href": "/wiki/Talk:Robots_exclusion_standard", + "rel": "discussion", "tag": "a", - "text": "Talk", "title": "Discussion about the content page [t]" } ] diff --git a/display.go b/display.go index 0be946c..3b16f01 100644 --- a/display.go +++ b/display.go @@ -272,38 +272,32 @@ func jsonify(node *html.Node) map[string]interface{} { } } } - vals["tag"] = node.DataAtom.String() - children := []interface{}{} - for child := node.FirstChild; child != nil; child = child.NextSibling { - switch child.Type { - case html.ElementNode: - children = append(children, jsonify(child)) - case html.TextNode: - text := strings.TrimSpace(child.Data) - if text != "" { - if pupEscapeHTML { - // don't escape javascript - if node.DataAtom != atom.Script { - text = html.EscapeString(text) - } - } - // if there is already text we'll append it - currText, ok := vals["text"] - if ok { - text = fmt.Sprintf("%s %s", currText, text) - } - vals["text"] = text - } - case html.CommentNode: - comment := strings.TrimSpace(child.Data) + switch node.Type { + case html.ElementNode: + vals["tag"] = node.Data + case html.TextNode: + text := strings.TrimSpace(node.Data) + if text != "" { if pupEscapeHTML { - comment = html.EscapeString(comment) - } - currComment, ok := vals["comment"] - if ok { - comment = fmt.Sprintf("%s %s", currComment, comment) + // don't escape javascript + if node.DataAtom != atom.Script { + text = html.EscapeString(text) + } } - vals["comment"] = comment + vals["text"] = text + } + case html.CommentNode: + comment := strings.TrimSpace(node.Data) + if pupEscapeHTML { + comment = html.EscapeString(comment) + } + vals["comment"] = comment + } + children := []interface{}{} + for child := node.FirstChild; child != nil; child = child.NextSibling { + jChild := jsonify(child) + if len(jChild) > 0 { + children = append(children, jChild) } } if len(children) > 0 { diff --git a/tests/expected_output.txt b/tests/expected_output.txt index 7f06b47..3b08cee 100644 --- a/tests/expected_output.txt +++ b/tests/expected_output.txt @@ -10,7 +10,7 @@ a92e50c09cd56970625ac3b74efbddb83b2731bb table li 66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type 0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"] 0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{} -ecb542a30fc75c71a0c6380692cbbc4266ccbce4 json{} +199188dc8f1522426a628e41d96264bffb8beb0f json{} 95ef88ded9dab22ee3206cca47b9c3a376274bda text{} e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet da39a3ee5e6b4b0d3255bfef95601890afd80709 .after @@ -34,7 +34,7 @@ d314e83b059bb876b0e5ee76aa92d54987961f9a .navbox-list li:nth-last-child(1) 613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{} -97d170e1550eee4afc0af065b78cda302a97674c #toc li + a json{} +cd0d4cc32346750408f7d4f5e78ec9a6e5b79a0d #toc li + a json{} da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li From 5e4c14aae925876e4ee2a45670a86bd82eca7bba Mon Sep 17 00:00:00 2001 From: Andre Marianiello Date: Sun, 17 Mar 2019 18:26:57 -0400 Subject: [PATCH 3/3] Don't trim whitespace --- display.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/display.go b/display.go index 3b16f01..79e0e0e 100644 --- a/display.go +++ b/display.go @@ -276,7 +276,7 @@ func jsonify(node *html.Node) map[string]interface{} { case html.ElementNode: vals["tag"] = node.Data case html.TextNode: - text := strings.TrimSpace(node.Data) + text := node.Data if text != "" { if pupEscapeHTML { // don't escape javascript