Add html5gum as alternative link extractor #480

Merged · 25 commits · Feb 7, 2022

Changes from all commits
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

17 changes: 7 additions & 10 deletions examples/collect_links/collect_links.rs
@@ -20,16 +20,13 @@ async fn main() -> Result<()> {
         },
     ];
 
-    let links = Collector::new(
-        None, // base
-        false, // don't skip missing inputs
-    )
-    .collect_links(
-        inputs, // base url or directory
-    )
-    .await
-    .collect::<Result<Vec<_>>>()
-    .await?;
+    let links = Collector::new(None) // base
+        .skip_missing_inputs(false) // don't skip missing inputs? (default=false)
+        .use_html5ever(false) // use html5ever for parsing? (default=false)
+        .collect_links(inputs) // base url or directory
+        .await
+        .collect::<Result<Vec<_>>>()
+        .await?;
 
     dbg!(links);
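
The updated example shows the new builder-style interface: `Collector::new` now takes only the optional base, and each remaining option is set through a chainable method whose default is noted inline.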

5 changes: 4 additions & 1 deletion lychee-bin/src/main.rs
@@ -200,7 +200,10 @@ fn run_main() -> Result<i32> {
 /// Run lychee on the given inputs
 async fn run(opts: &LycheeOptions) -> Result<i32> {
     let inputs = opts.inputs();
-    let requests = Collector::new(opts.config.base.clone(), opts.config.skip_missing)
+    let requests = Collector::new(opts.config.base.clone())
+        .skip_missing_inputs(opts.config.skip_missing)
+        // File a bug if you rely on this envvar! It's going to go away eventually.
+        .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
         .collect_links(inputs)
         .await;
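
Usage note: with the gate above, the html5ever code path can be re-enabled per run from the environment, e.g. `LYCHEE_USE_HTML5EVER=1 lychee <inputs>` (illustrative invocation; per the `map_or`, any value other than `1`, or an unset variable, keeps html5gum).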

3 changes: 2 additions & 1 deletion lychee-lib/Cargo.toml
@@ -20,7 +20,6 @@ version = "0.8.2"
 check-if-email-exists = "0.8.26"
 fast_chemail = "0.9.6"
 glob = "0.3.0"
-html5ever = "0.25.1"
 http = "0.2.6"
 hubcaps = "0.6.2"
 linkify = "0.8.0"
@@ -50,6 +49,8 @@ once_cell = "1.9.0"
 thiserror = "1.0.30"
 futures = "0.3.19"
 lazy_static = "1.4.0"
+html5ever = "0.25.1"
+html5gum = "0.4.0"
 
 [dependencies.par-stream]
 version = "0.10.0"
28 changes: 24 additions & 4 deletions lychee-lib/src/collector.rs
@@ -13,18 +13,34 @@ use par_stream::ParStreamExt;
 pub struct Collector {
     base: Option<Base>,
     skip_missing_inputs: bool,
+    use_html5ever: bool,
 }
 
 impl Collector {
     /// Create a new collector with an empty cache
     #[must_use]
-    pub const fn new(base: Option<Base>, skip_missing_inputs: bool) -> Self {
+    pub const fn new(base: Option<Base>) -> Self {
         Collector {
             base,
-            skip_missing_inputs,
+            skip_missing_inputs: false,
+            use_html5ever: false,
         }
     }
 
+    /// Skip missing input files (default is to error if they don't exist)
+    #[must_use]
+    pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
+        self.skip_missing_inputs = yes;
+        self
+    }
+
+    /// Use `html5ever` to parse HTML instead of `html5gum`.
+    #[must_use]
+    pub const fn use_html5ever(mut self, yes: bool) -> Self {
+        self.use_html5ever = yes;
+        self
+    }
+
     /// Fetch all unique links from inputs
     /// All relative URLs get prefixed with `base` (if given).
     /// (This can be a directory or a base URL)
@@ -47,7 +63,11 @@ impl Collector {
             let base = base.clone();
             async move {
                 let content = content?;
-                let uris: Vec<RawUri> = Extractor::extract(&content);
+                let uris: Vec<RawUri> = if self.use_html5ever {
+                    Extractor::extract_html5ever(&content)
+                } else {
+                    Extractor::extract(&content)
+                };
                 let requests = request::create(uris, &content, &base)?;
                 Result::Ok(stream::iter(requests.into_iter().map(Ok)))
             }
@@ -74,7 +94,7 @@ mod test {
 
     // Helper function to run the collector on the given inputs
     async fn collect(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
-        let responses = Collector::new(base, false).collect_links(inputs).await;
+        let responses = Collector::new(base).collect_links(inputs).await;
         responses.map(|r| r.unwrap().uri).collect().await
     }
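
Taken together, the new interface reads like this at a call site (a minimal sketch; `base: Option<Base>` and `inputs: Vec<Input>` are assumed to be in scope):

    // Hypothetical call site for the builder API introduced above.
    let responses = Collector::new(base)
        .skip_missing_inputs(true) // tolerate nonexistent input files
        .use_html5ever(false)      // keep the html5gum default
        .collect_links(inputs)
        .await;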

23 changes: 3 additions & 20 deletions lychee-lib/src/extract/html.rs
@@ -7,7 +7,7 @@ use html5ever::{
 use super::plaintext::extract_plaintext;
 use crate::types::raw_uri::RawUri;
 
-#[derive(Clone)]
+#[derive(Clone, Default)]
 struct LinkExtractor {
     links: Vec<RawUri>,
 }
@@ -61,8 +61,8 @@ impl TokenSink for LinkExtractor {
 }
 
 impl LinkExtractor {
-    pub(crate) const fn new() -> Self {
-        Self { links: Vec::new() }
+    pub(crate) fn new() -> Self {
+        LinkExtractor::default()
     }
 
     /// Extract all semantically known links from a given html attribute.
@@ -125,20 +125,3 @@ pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
 
     tokenizer.sink.links
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_extract_link_at_end_of_line() {
-        let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
-        let link = input.trim_end();
-
-        let uris: Vec<String> = extract_html(input)
-            .into_iter()
-            .map(|raw_uri| raw_uri.text)
-            .collect();
-        assert_eq!(vec![link.to_string()], uris);
-    }
-}
207 changes: 207 additions & 0 deletions lychee-lib/src/extract/html5gum.rs
@@ -0,0 +1,207 @@
use html5gum::{Emitter, Error, Tokenizer};

use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;

#[derive(Clone)]
struct LinkExtractor {
    // note: what html5gum calls a tag, lychee calls an element
    links: Vec<RawUri>,
    current_string: Vec<u8>,
    current_element_name: Vec<u8>,
    current_element_is_closing: bool,
    current_attribute_name: Vec<u8>,
    current_attribute_value: Vec<u8>,
    last_start_element: Vec<u8>,
}

/// this is the same as `std::str::from_utf8_unchecked`, but with extra debug assertions for ease
/// of debugging
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
    debug_assert!(std::str::from_utf8(s).is_ok());
    std::str::from_utf8_unchecked(s)
}

impl LinkExtractor {
    pub(crate) const fn new() -> Self {

[Review — Member]
I think we could #[derive(Default)] for LinkExtractor and this code goes away?

[Reply — untitaker (author)]
I did that now, and it makes `new` non-const, as `Default` is not const.

        LinkExtractor {
            links: Vec::new(),
            current_string: Vec::new(),
            current_element_name: Vec::new(),
            current_element_is_closing: false,
            current_attribute_name: Vec::new(),
            current_attribute_value: Vec::new(),
            last_start_element: Vec::new(),
        }
    }

    /// Extract all semantically known links from a given html attribute.
    #[allow(clippy::unnested_or_patterns)]
    pub(crate) fn extract_urls_from_elem_attr<'a>(
        attr_name: &str,
        elem_name: &str,
        attr_value: &'a str,
    ) -> Option<impl Iterator<Item = &'a str>> {
        // For a comprehensive list of elements that might contain URLs/URIs
        // see https://www.w3.org/TR/REC-html40/index/attributes.html
        // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
        match (elem_name, attr_name) {
            // Common element/attribute combinations for links
            (_, "href" | "src" | "cite" | "usemap")
            // Less common (but still valid!) combinations
            | ("applet", "codebase")
            | ("body", "background")
            | ("button", "formaction")
            | ("command", "icon")
            | ("form", "action")
            | ("frame", "longdesc")
            | ("head", "profile")
            | ("html", "manifest")
            | ("iframe", "longdesc")
            | ("img", "longdesc")
            | ("input", "formaction")
            | ("object", "classid")
            | ("object", "codebase")
            | ("object", "data")
            | ("video", "poster") => {
                Some(vec![attr_value].into_iter())
            }
            (_, "srcset") => {
                let mut urls = Vec::new();
                for image_candidate_string in attr_value.trim().split(',') {
                    for part in image_candidate_string.split_ascii_whitespace() {
                        if part.is_empty() {
                            continue;
                        }
                        urls.push(part);
                        break;
                    }
                }
                Some(urls.into_iter())
            }
            _ => None,
        }
    }
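    // Worked example (illustration, not part of the diff): for
    //     srcset="banner-480.png 480w, banner-800.png 800w"
    // the `srcset` arm above splits on commas, keeps the first
    // whitespace-separated token of each image candidate, and yields
    // "banner-480.png" and "banner-800.png", dropping the width descriptors.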

    fn flush_current_characters(&mut self) {
        // safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
        let raw = unsafe { from_utf8_unchecked(&self.current_string) };
        self.links.extend(extract_plaintext(raw));
        self.current_string.clear();
    }

    fn flush_old_attribute(&mut self) {
        {
            // safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
            let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
            let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
            let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };

            let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);

[Review — Member]
element is also called tag or name in this function. Maybe we should stick to one word?

    let elem = unsafe { from_utf8_unchecked(&self.current_element_name) };

[Reply — untitaker (author), Feb 4, 2022]
It's html5gum (HTML spec) naming clashing with lychee naming. Other candidates for a rename are `last_start_tag`, but they are not used by lychee. It's unclear to me where to draw the line, because you clearly can't pick different method names. I chose to do just the rename you proposed (as that's the attribute we use for link extraction), but now the discrepancy is visible in the struct definition. The best I can do is a mass rename, accept that the variable names in things like `set_last_start_tag` stay inconsistent, and document that elem and tag are the same thing.

            let new_urls = match urls {
                None => extract_plaintext(value),
                Some(urls) => urls
                    .into_iter()
                    .map(|url| RawUri {
                        text: url.to_string(),
                        element: Some(name.to_string()),
                        attribute: Some(attr.to_string()),
                    })
                    .collect::<Vec<_>>(),
            };

            self.links.extend(new_urls);
        }

        self.current_attribute_name.clear();
        self.current_attribute_value.clear();
    }
}

impl Emitter for &mut LinkExtractor {
    type Token = ();

    fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
        self.last_start_element.clear();
        self.last_start_element
            .extend(last_start_tag.unwrap_or_default());
    }

    fn emit_eof(&mut self) {
        self.flush_current_characters();
    }
    fn emit_error(&mut self, _: Error) {}
    fn pop_token(&mut self) -> Option<()> {
        None
    }

    fn emit_string(&mut self, c: &[u8]) {
        self.current_string.extend(c);
    }

    fn init_start_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = false;
    }

    fn init_end_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = true;
    }

    fn init_comment(&mut self) {
        self.flush_current_characters();
    }

    fn emit_current_tag(&mut self) {
        self.flush_old_attribute();
    }

    fn emit_current_doctype(&mut self) {}
    fn set_self_closing(&mut self) {
        self.current_element_is_closing = true;
    }
    fn set_force_quirks(&mut self) {}

    fn push_tag_name(&mut self, s: &[u8]) {
        self.current_element_name.extend(s);
    }

    fn push_comment(&mut self, _: &[u8]) {}
    fn push_doctype_name(&mut self, _: &[u8]) {}
    fn init_doctype(&mut self) {
        self.flush_current_characters();
    }
    fn init_attribute(&mut self) {
        self.flush_old_attribute();
    }
    fn push_attribute_name(&mut self, s: &[u8]) {
        self.current_attribute_name.extend(s);
    }
    fn push_attribute_value(&mut self, s: &[u8]) {
        self.current_attribute_value.extend(s);
    }

    fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_system_identifier(&mut self, _: &[u8]) {}
    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
        self.current_element_is_closing
            && !self.current_element_name.is_empty()
            && self.current_element_name == self.last_start_element
    }

    fn emit_current_comment(&mut self) {}
}

/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
    let mut extractor = LinkExtractor::new();
    let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
    assert!(tokenizer.next().is_none());
    extractor.links
}
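
To see the html5gum path end to end, here is a sketch of a test (hypothetical; `extract_html` and the `RawUri::text` field match the code above, everything else is illustrative):

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn extracts_href_and_srcset_candidates() {
            let html = r#"<a href="https://example.com">example</a>
                <img srcset="small.png 1x, large.png 2x">"#;
            let texts: Vec<String> = extract_html(html)
                .into_iter()
                .map(|raw_uri| raw_uri.text)
                .collect();
            // href is matched by the (_, "href") arm; srcset candidates are
            // split and reduced to their URL part.
            assert!(texts.contains(&"https://example.com".to_string()));
            assert!(texts.contains(&"small.png".to_string()));
            assert!(texts.contains(&"large.png".to_string()));
        }
    }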