Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fully implement AbstractTrees #99

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,8 @@ span
julia>
```

You can also use the additional functions provided by the `AbstractTrees` interface, e.g. `nextsibling`.

## TODOS

- support CDATA
Expand Down
14 changes: 8 additions & 6 deletions src/conversion.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,29 +48,31 @@ function elem_tag(ge::CGumbo.Element)
tag
end

# Convert the Gumbo element `ge` into an `HTMLElement{tag}` whose parent is `parent`.
# Each element/text child is converted through `gumbo_to_jl` and collected in `children`,
# the same vector that is handed to the `HTMLElement` constructor.
# NOTE(review): this span comes from a diff view, so changed lines appear to be shown as
# an old/new pair (signature, constructor call, `push!`) — confirm against the real file.
function gumbo_to_jl(parent::HTMLNode, ge::CGumbo.Element, preserve_whitespace, preserve_template)
function gumbo_to_jl(parent::HTMLNode, ge::CGumbo.Element, index::Integer, preserve_whitespace, preserve_template)
tag = elem_tag(ge)
attrs = attributes(gvector_to_jl(CGumbo.Attribute,ge.attributes))
children = HTMLNode[]
res = HTMLElement{tag}(children, parent, attrs)
res = HTMLElement{tag}(children, parent, index, attrs)
# whitespace preservation is switched on for tags in RELEVANT_WHITESPACE,
# otherwise the caller's setting is kept
preserve_whitespace = tag in RELEVANT_WHITESPACE || preserve_whitespace
# index handed to the next converted child; it advances only when a child is pushed,
# so it matches that child's position in `children`
child_index = 1
for childptr in gvector_to_jl(CGumbo.Node{Int},ge.children)
node = load_node(childptr, preserve_whitespace, preserve_template)
# only element and text nodes are converted; any other node kind is skipped
if in(typeof(node).parameters[1], [CGumbo.Element, CGumbo.Text])
push!(children, gumbo_to_jl(res, node.v, preserve_whitespace, preserve_template))
push!(children, gumbo_to_jl(res, node.v, child_index, preserve_whitespace, preserve_template))
child_index += 1
end
end
res
end


# Convert the Gumbo text node `gt` into an `HTMLText` under `parent`, copying the
# text out of `gt.text` with `unsafe_string`. The whitespace/template flags are accepted
# but not used in this method.
# NOTE(review): diff view — the signature and body lines appear as old/new pairs.
function gumbo_to_jl(parent::HTMLNode, gt::CGumbo.Text, preserve_whitespace, preserve_template)
HTMLText(parent, unsafe_string(gt.text))
function gumbo_to_jl(parent::HTMLNode, gt::CGumbo.Text, index::Integer, preserve_whitespace, preserve_template)
HTMLText(parent, index, unsafe_string(gt.text))
end

# Fallback method that should only be called to construct the root of a tree:
# the root is given `NullNode()` as its parent (and index 1 in the indexed form).
gumbo_to_jl(ge::CGumbo.Element, preserve_whitespace, preserve_template) = gumbo_to_jl(NullNode(), ge, preserve_whitespace, preserve_template)
gumbo_to_jl(ge::CGumbo.Element, preserve_whitespace, preserve_template) = gumbo_to_jl(NullNode(), ge, 1, preserve_whitespace, preserve_template)

# load a GumboNode struct into memory as the appropriate Julia type
# this involves loading it once as a CGumbo.Node{Int} in order to
Expand Down
49 changes: 47 additions & 2 deletions src/htmltypes.jl
Original file line number Diff line number Diff line change
@@ -1,23 +1,27 @@
import AbstractTrees

abstract type HTMLNode end

# Text node of an HTML tree.
# NOTE(review): the fields are abstractly typed (`Integer`, `AbstractString`);
# concrete field types would avoid boxing — confirm whether that matters here.
mutable struct HTMLText <: HTMLNode
# node this text belongs to
parent::HTMLNode
# index value supplied at construction; the sibling lookups use it to index
# into the parent's `children`
index_within_parent::Integer
# the text content itself
text::AbstractString
end

# Convenience method for defining a text node without a parent: `NullNode()` is used
# as the parent (and 1 as the index in the indexed form).
HTMLText(text::AbstractString) = HTMLText(NullNode(), text)
HTMLText(text::AbstractString) = HTMLText(NullNode(), 1, text)

# Field-less placeholder node, used as the `parent` of nodes that have no parent.
struct NullNode <: HTMLNode end

# Element node of an HTML tree; the type parameter `T` carries the tag
# (the converter builds it as `HTMLElement{tag}`).
# NOTE(review): `index_within_parent::Integer` is an abstractly typed field;
# a concrete integer type would avoid boxing — confirm whether that matters here.
mutable struct HTMLElement{T} <: HTMLNode
# child nodes, in order
children::Vector{HTMLNode}
# node this element belongs to
parent::HTMLNode
# index value supplied at construction; the sibling lookups use it to index
# into the parent's `children`
index_within_parent::Integer
# attribute name => attribute value
attributes::Dict{AbstractString,AbstractString}
end

# Convenience method for defining an empty element with tag `T`: no children,
# `NullNode()` as parent (index 1 in the indexed form) and no attributes.
HTMLElement(T::Symbol) = HTMLElement{T}(HTMLNode[],NullNode(),Dict{AbstractString,AbstractString}())
HTMLElement(T::Symbol) = HTMLElement{T}(HTMLNode[],NullNode(), 1, Dict{AbstractString,AbstractString}())

mutable struct HTMLDocument
doctype::AbstractString
Expand All @@ -27,3 +31,44 @@ end
# Exception type carrying a human-readable message in `msg`.
struct InvalidHTMLException <: Exception
msg::AbstractString
end

# AbstractTrees interface declarations

# HTML nodes declare the `StoredParents` trait: each node holds its parent in a field.
function AbstractTrees.ParentLinks(::Type{<:HTMLNode})
    return AbstractTrees.StoredParents()
end

# Return the node stored in `node.parent`, or `nothing` when that field holds the
# `NullNode` placeholder.
function AbstractTrees.parent(node::Union{HTMLElement,HTMLText})
    up = node.parent
    return up isa NullNode ? nothing : up
end

# The placeholder node itself has no parent.
function AbstractTrees.parent(node::NullNode)
    return nothing
end

# HTML nodes declare the `StoredSiblings` trait.
function AbstractTrees.SiblingLinks(::Type{<:HTMLNode})
    return AbstractTrees.StoredSiblings()
end

# Return the node that follows `node` in its parent's `children`, or `nothing` when
# `node` has no parent or is the last child. The lookup relies on
# `node.index_within_parent` matching the node's position in `children`.
function AbstractTrees.nextsibling(node::Union{HTMLElement,HTMLText})
    owner = node.parent
    owner isa NullNode && return nothing
    siblings = owner.children
    idx = node.index_within_parent
    return idx < length(siblings) ? siblings[idx + 1] : nothing
end

# The placeholder node has no siblings.
function AbstractTrees.nextsibling(node::NullNode)
    return nothing
end

# Return the node that precedes `node` in its parent's `children`, or `nothing` when
# `node` has no parent or is the first child. The lookup relies on
# `node.index_within_parent` matching the node's position in `children`.
function AbstractTrees.prevsibling(node::Union{HTMLElement,HTMLText})
    owner = node.parent
    owner isa NullNode && return nothing
    idx = node.index_within_parent
    # check the index before touching `children`, as the first child has no predecessor
    idx > 1 || return nothing
    return owner.children[idx - 1]
end

# The placeholder node has no siblings.
function AbstractTrees.prevsibling(node::NullNode)
    return nothing
end
3 changes: 2 additions & 1 deletion test/basics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@ import Gumbo: HTMLNode, NullNode
# convenience constructor works
@test HTMLElement(:body) == HTMLElement{:body}(HTMLNode[],
NullNode(),
1,
Dict{AbstractString,AbstractString}())

# accessing tags works
@test HTMLElement(:body) |> tag == :body

let
elem = HTMLElement{:body}(HTMLNode[], NullNode(), Dict("foo" => "bar"))
elem = HTMLElement{:body}(HTMLNode[], NullNode(), 1, Dict("foo" => "bar"))
@test getattr(elem, "foo") == "bar"
@test getattr(elem, "foo", "baz") == "bar"
@test getattr(elem, "bar", "qux") == "qux"
Expand Down
8 changes: 8 additions & 0 deletions test/traversal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ let res = Any[]
@assert tag(res[3]) == :body
@assert tag(res[4]) == :p
@assert text(last(res)) == "c"
@assert last(res).index_within_parent == 3
@assert AbstractTrees.parent(res[4]) === res[3]
@assert isnothing(AbstractTrees.parent(res[1]))
@assert tag(prevsibling(last(res))) == :strong
@assert nextsibling(prevsibling(last(res))) === last(res)
@assert isnothing(nextsibling(last(res)))
@assert isnothing(prevsibling(res[4]))
@assert AbstractTrees.isroot(res[1])
end

let res = Any[]
Expand Down
Loading