From 8c4828059783aa6c4db2458c5a8af6d62ed0b59a Mon Sep 17 00:00:00 2001 From: balping Date: Thu, 18 Jan 2024 01:25:01 +0100 Subject: [PATCH] Fully implement AbstractTrees --- README.md | 2 ++ src/conversion.jl | 14 ++++++++------ src/htmltypes.jl | 49 +++++++++++++++++++++++++++++++++++++++++++++-- test/basics.jl | 3 ++- test/traversal.jl | 8 ++++++++ 5 files changed, 67 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a95d432..cdecbaf 100644 --- a/README.md +++ b/README.md @@ -246,6 +246,8 @@ span julia> ``` +You can also use additional functions provided by the `AbstractTrees` interface, e.g. `nextsibling`. + ## TODOS - support CDATA diff --git a/src/conversion.jl b/src/conversion.jl index 21b4ba5..cae6cb3 100644 --- a/src/conversion.jl +++ b/src/conversion.jl @@ -48,29 +48,31 @@ function elem_tag(ge::CGumbo.Element) tag end -function gumbo_to_jl(parent::HTMLNode, ge::CGumbo.Element, preserve_whitespace, preserve_template) +function gumbo_to_jl(parent::HTMLNode, ge::CGumbo.Element, index::Integer, preserve_whitespace, preserve_template) tag = elem_tag(ge) attrs = attributes(gvector_to_jl(CGumbo.Attribute,ge.attributes)) children = HTMLNode[] - res = HTMLElement{tag}(children, parent, attrs) + res = HTMLElement{tag}(children, parent, index, attrs) preserve_whitespace = tag in RELEVANT_WHITESPACE || preserve_whitespace + child_index = 1 for childptr in gvector_to_jl(CGumbo.Node{Int},ge.children) node = load_node(childptr, preserve_whitespace, preserve_template) if in(typeof(node).parameters[1], [CGumbo.Element, CGumbo.Text]) - push!(children, gumbo_to_jl(res, node.v, preserve_whitespace, preserve_template)) + push!(children, gumbo_to_jl(res, node.v, child_index, preserve_whitespace, preserve_template)) + child_index += 1 end end res end -function gumbo_to_jl(parent::HTMLNode, gt::CGumbo.Text, preserve_whitespace, preserve_template) - HTMLText(parent, unsafe_string(gt.text)) +function gumbo_to_jl(parent::HTMLNode, gt::CGumbo.Text, index::Integer, 
preserve_whitespace, preserve_template) + HTMLText(parent, index, unsafe_string(gt.text)) end # this is a fallback method that should only be called to construct # the root of a tree -gumbo_to_jl(ge::CGumbo.Element, preserve_whitespace, preserve_template) = gumbo_to_jl(NullNode(), ge, preserve_whitespace, preserve_template) +gumbo_to_jl(ge::CGumbo.Element, preserve_whitespace, preserve_template) = gumbo_to_jl(NullNode(), ge, 1, preserve_whitespace, preserve_template) # load a GumboNode struct into memory as the appropriate Julia type # this involves loading it once as a CGumbo.Node{Int} in order to diff --git a/src/htmltypes.jl b/src/htmltypes.jl index 1dd9fc5..8ef1379 100644 --- a/src/htmltypes.jl +++ b/src/htmltypes.jl @@ -1,23 +1,27 @@ +import AbstractTrees + abstract type HTMLNode end mutable struct HTMLText <: HTMLNode parent::HTMLNode + index_within_parent::Integer text::AbstractString end # convenience method for defining without parent -HTMLText(text::AbstractString) = HTMLText(NullNode(), text) +HTMLText(text::AbstractString) = HTMLText(NullNode(), 1, text) struct NullNode <: HTMLNode end mutable struct HTMLElement{T} <: HTMLNode children::Vector{HTMLNode} parent::HTMLNode + index_within_parent::Integer attributes::Dict{AbstractString,AbstractString} end # convenience method for defining an empty element -HTMLElement(T::Symbol) = HTMLElement{T}(HTMLNode[],NullNode(),Dict{AbstractString,AbstractString}()) +HTMLElement(T::Symbol) = HTMLElement{T}(HTMLNode[],NullNode(), 1, Dict{AbstractString,AbstractString}()) mutable struct HTMLDocument doctype::AbstractString @@ -27,3 +31,44 @@ end struct InvalidHTMLException <: Exception msg::AbstractString end + +# AbstractTrees interface declarations + +AbstractTrees.ParentLinks(::Type{<:HTMLNode}) = AbstractTrees.StoredParents() +function AbstractTrees.parent(node::Union{HTMLElement,HTMLText}) + if node.parent isa NullNode + return nothing + else + return node.parent + end +end +AbstractTrees.parent(node::NullNode) = 
nothing + +AbstractTrees.SiblingLinks(::Type{<:HTMLNode}) = AbstractTrees.StoredSiblings() + +function AbstractTrees.nextsibling(node::Union{HTMLElement,HTMLText}) + if node.parent isa NullNode + return nothing + end + num_siblings = length(node.parent.children) + if node.index_within_parent < num_siblings + return node.parent.children[node.index_within_parent + 1] + else + return nothing + end +end + +AbstractTrees.nextsibling(node::NullNode) = nothing + +function AbstractTrees.prevsibling(node::Union{HTMLElement,HTMLText}) + if node.parent isa NullNode + return nothing + end + if node.index_within_parent > 1 + return node.parent.children[node.index_within_parent - 1] + else + return nothing + end +end + +AbstractTrees.prevsibling(node::NullNode) = nothing diff --git a/test/basics.jl b/test/basics.jl index 9e708cf..bdcaf61 100644 --- a/test/basics.jl +++ b/test/basics.jl @@ -5,13 +5,14 @@ import Gumbo: HTMLNode, NullNode # convenience constructor works @test HTMLElement(:body) == HTMLElement{:body}(HTMLNode[], NullNode(), + 1, Dict{AbstractString,AbstractString}()) # accessing tags works @test HTMLElement(:body) |> tag == :body let - elem = HTMLElement{:body}(HTMLNode[], NullNode(), Dict("foo" => "bar")) + elem = HTMLElement{:body}(HTMLNode[], NullNode(), 1, Dict("foo" => "bar")) @test getattr(elem, "foo") == "bar" @test getattr(elem, "foo", "baz") == "bar" @test getattr(elem, "bar", "qux") == "qux" diff --git a/test/traversal.jl b/test/traversal.jl index 065e1e5..8cc037a 100644 --- a/test/traversal.jl +++ b/test/traversal.jl @@ -31,6 +31,14 @@ let res = Any[] @assert tag(res[3]) == :body @assert tag(res[4]) == :p @assert text(last(res)) == "c" + @assert last(res).index_within_parent == 3 + @assert AbstractTrees.parent(res[4]) === res[3] + @assert isnothing(AbstractTrees.parent(res[1])) + @assert tag(prevsibling(last(res))) == :strong + @assert nextsibling(prevsibling(last(res))) === last(res) + @assert isnothing(nextsibling(last(res))) + @assert 
isnothing(prevsibling(res[4])) + @assert AbstractTrees.isroot(res[1]) end let res = Any[]