diff --git a/NEWS.md b/NEWS.md index 351f7a6..9570cc2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,12 +13,10 @@ * `ragnar_read()` and `read_as_markdown()` now accept paths that begin with `~` (@topepo, #46, #48). -* Changes to `read_as_markdown()` HTML conversion (#40): +* Changes to `read_as_markdown()` HTML conversion (#40, #51): - * If a 'main' tag is present, content outside the 'main' tag is now excluded - by default. To restore the previous behavior and include the sidebar, header, - footer, and other navigational elements in the converted markdown, use - `read_as_markdown(x, main_only=FALSE)`. + * New arguments `html_extract_selectors` and `html_zap_selectors` provide a flexible way to + exclude some html page elements from being included in the converted markdown. * Fixed handling of nested code fences in markdown output. diff --git a/R/read-markdown.R b/R/read-markdown.R index 7df8447..c5e1c6e 100644 --- a/R/read-markdown.R +++ b/R/read-markdown.R @@ -1,37 +1,100 @@ #' Convert files to markdown #' -#' @param x A filepath or url. Accepts a wide variety of file types, including -#' PDF, PowerPoint, Word, Excel, Images (EXIF metadata and OCR), Audio (EXIF -#' metadata and speech transcription), HTML, Text-based formats (CSV, JSON, -#' XML), ZIP files (iterates over contents), Youtube URLs, and EPubs. -#' @param ... Passed on to `MarkItDown.convert()` -#' @param canonical logical, whether to postprocess the output from MarkItDown +#' @param x A filepath or URL. Accepts a wide variety of file types, including +#' PDF, PowerPoint, Word, Excel, images (EXIF metadata and OCR), audio (EXIF +#' metadata and speech transcription), HTML, text-based formats (CSV, JSON, +#' XML), ZIP files (iterates over contents), YouTube URLs, and EPUBs. +#' @param ... Passed on to `MarkItDown.convert()`. +#' @param canonical Logical. Whether to postprocess the output from MarkItDown #' with `commonmark::markdown_commonmark()`. -#' @param main_only logical. 
Applies only to HTML documents. If `TRUE` and a -#' `main` tag is present in the document, only the contents of the `main` tag -#' are returned. This is a convenient way to exclude navigational elements -#' typically found in sidebars, page headers, and footers. -#' -#' @returns A single string of markdown +#' @param html_extract_selectors Character vector of CSS selectors. If a match +#' for a selector is found in the document, only the matched node's contents +#' are converted. Unmatched extract selectors have no effect. +#' @param html_zap_selectors Character vector of CSS selectors. Elements +#' matching these selectors will be excluded ("zapped") from the HTML document +#' before conversion to markdown. This is useful for removing navigation bars, +#' sidebars, headers, footers, or other unwanted elements. By default, +#' navigation elements (`nav`) are excluded. +#' +#' @returns A single string of markdown. #' @export #' #' @examplesIf reticulate::py_available() -#' # convert html +#' # Convert HTML #' read_as_markdown("https://r4ds.hadley.nz/base-R.html") |> -#' substr(1, 1000) |> +#' substr(1, 500) |> #' cat() #' #' read_as_markdown("https://r4ds.hadley.nz/base-R.html", canonical = TRUE) |> -#' substr(1, 1000) |> +#' substr(1, 500) |> #' cat() #' -#' # convert pdf +#' # When converting HTML, you might want to omit certain elements, like +#' # sidebars, headers, footers, etc. You can pass CSS selector strings +#' # to either extract nodes or exclude nodes during conversion. +#' # +#' # The easiest way to make selectors is to use SelectorGadget: +#' # https://rvest.tidyverse.org/articles/selectorgadget.html +#' # +#' # You can also right-click on a page and select "Inspect Element" in a +#' # browser to better understand an HTML page's structure. 
+#' # +#' # For comprehensive or advanced usage of CSS selectors, consult: +#' # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors-through-the-css-property +#' # https://facelessuser.github.io/soupsieve/selectors/ +#' +#' url <- "https://duckdb.org/code_of_conduct" +#' # Includes the sidebar and other navigational elements +#' read_as_markdown(url) |> substr(1, 500) |> writeLines() +#' +#' # Extract the main content +#' read_as_markdown(url, html_extract_selectors = "#main_content_wrap") +#' +#' # Alternative approach: exclude nodes +#' read_as_markdown( +#' url, +#' html_zap_selectors = c( +#' "header", # node name +#' ".sidenavigation", # node class +#' ".searchoverlay", # node class +#' "#sidebar" # node ID +#' ) +#' ) |> substr(1, 500) |> writeLines() +#' +#' # Quarto example +#' url <- "https://quarto.org/docs/computations/python.html" +#' +#' # Include sidebar, footer, etc. +#' read_as_markdown( +#' url, +#' html_extract_selectors = NULL, +#' html_zap_selectors = NULL +#' ) |> substr(1, 500) |> writeLines() +#' +#' # Exclude content outside the 'main' tag
+#' read_as_markdown(url, html_extract_selectors = "main") |> +#' substr(1, 500) |> writeLines() +#' +#' # Exclude specific matching nodes +#' read_as_markdown( +#' url, +#' html_extract_selectors = NULL, +#' html_zap_selectors = c( +#' "#quarto-sidebar", +#' "#quarto-margin-sidebar", +#' "header", +#' "footer", +#' "nav" +#' ) +#' ) |> substr(1, 500) |> writeLines() +#' +#' # Convert PDF #' pdf <- file.path(R.home("doc"), "NEWS.pdf") #' read_as_markdown(pdf) |> substr(1, 1000) |> cat() -#' ## alternative: +#' ## Alternative: #' # pdftools::pdf_text(pdf) |> substr(1, 2000) |> cat() #' -#' # convert images to markdown descriptions using OpenAI +#' # Convert images to markdown descriptions using OpenAI #' jpg <- file.path(R.home("doc"), "html", "logo.jpg") #' if (Sys.getenv("OPENAI_API_KEY") != "") { #' # if (xfun::is_macos()) system("brew install ffmpeg") @@ -56,7 +119,13 @@ #' chat <- ellmer::chat_openai(echo = TRUE) #' chat$chat("Describe this image", ellmer::content_image_file(jpg)) #' } -read_as_markdown <- function(x, ..., canonical = FALSE, main_only = TRUE) { +read_as_markdown <- function( + x, + ..., + canonical = FALSE, + html_extract_selectors = c("main"), + html_zap_selectors = c("nav") +) { check_string(x) if (startsWith(x, "~")) { x <- path.expand(x) @@ -68,40 +137,18 @@ read_as_markdown <- function(x, ..., canonical = FALSE, main_only = TRUE) { # dependencies that conflict md <- ragnartools.markitdown$convert_to_markdown( x, + html_extract_selectors = html_extract_selectors, + html_zap_selectors = html_zap_selectors, ..., - main_only = main_only ) } else { - # use the markitdown cli API, (much) slower, but can be isolated from - # reticulated python. 
- # TODO: apply markitdown monkeypatches in cli interface too - - check_dots_empty() - outfile <- withr::local_tempfile(fileext = ".md") - exit_code <- cli_markitdown(c(shQuote(x), "-o", shQuote(outfile))) - if ( - !identical(exit_code, 0L) || - (no_outfile_produced <- !file.exists(outfile)) - ) { - # more useful output to stderr() should have been printed - # already by cli_markitdown() if we are here. - errmsg <- stri_flatten( - c( - paste("markitdown exit code: ", exit_code), - if (no_outfile_produced) "No output file produced." - ), - collapse = "\n" - ) - stop(errmsg) - } - - md <- stri_read_lines(outfile) + md <- read_as_markdown_cli(x, ...) } md <- stri_replace_all_fixed(md, "\f", "\n\n---\n\n") md <- unlist(stri_split_lines(md)) # normalize newlines md <- stri_trim_right(md) - if (canonical) + if (canonical) { md <- commonmark::markdown_commonmark( md, normalize = TRUE, @@ -109,6 +156,7 @@ read_as_markdown <- function(x, ..., canonical = FALSE, main_only = TRUE) { width = 72L, extensions = TRUE ) + } md <- stri_flatten(md, "\n") glue::as_glue(md) } @@ -118,8 +166,9 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) { lines <- text |> stri_split_lines() |> unlist() text <- lines |> stri_flatten("\n") - if (text == "") + if (text == "") { return(data_frame(tag = character(), start = integer(), end = integer())) + } doc <- text |> commonmark::markdown_html( @@ -256,7 +305,9 @@ markdown_segment <- function( sizes <- drop_first(boundaries) - drop_last(boundaries) splits <- vec_chop(bytes, sizes = sizes) |> vapply(rawToChar, "") - if (trim) splits <- stri_trim_both(splits) # drops names + if (trim) { + splits <- stri_trim_both(splits) + } # drops names # make names split_tags <- c("", sourcepos$tag[match(tag_boundaries, sourcepos$start)]) @@ -294,8 +345,9 @@ markdown_frame <- function( names = "tag", leaves = "text" ) - if (!length(segment_by) || base::setequal(segment_by, frame_by)) + if (!length(segment_by) || base::setequal(segment_by, 
frame_by)) { frame[["tag"]] <- NULL + } as_tibble(frame) } @@ -454,9 +506,39 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) { # ------ utils +read_as_markdown_cli <- function(x, ...) { + # use the markitdown cli API, (much) slower, but can be isolated from + # reticulated python. + # TODO: apply markitdown monkeypatches in cli interface too + + check_dots_empty() + outfile <- withr::local_tempfile(fileext = ".md") + exit_code <- cli_markitdown(c(shQuote(x), "-o", shQuote(outfile))) + if ( + !identical(exit_code, 0L) || + (no_outfile_produced <- !file.exists(outfile)) + ) { + # more useful output to stderr() should have been printed + # already by cli_markitdown() if we are here. + errmsg <- stri_flatten( + c( + paste("markitdown exit code: ", exit_code), + if (no_outfile_produced) "No output file produced." + ), + collapse = "\n" + ) + stop(errmsg) + } + + md <- stri_read_lines(outfile) + md +} + + cli_markitdown <- function(args, ...) { - if (is.na(Sys.getenv("PYTHONIOENCODING", NA))) - withr::local_envvar("PYTHONIOENCODING" = "utf-8") # needed on windows + if (is.na(Sys.getenv("PYTHONIOENCODING", NA))) { + withr::local_envvar("PYTHONIOENCODING" = "utf-8") + } # needed on windows reticulate::uv_run_tool( "markitdown", diff --git a/inst/python/_ragnartools/markitdown.py b/inst/python/_ragnartools/markitdown.py index 979026a..bf3cbab 100644 --- a/inst/python/_ragnartools/markitdown.py +++ b/inst/python/_ragnartools/markitdown.py @@ -1,11 +1,8 @@ -import functools import markitdown from markitdown.converters._markdownify import _CustomMarkdownify md = markitdown.MarkItDown() -MISSING = object() - def maybe_expand_outer_code_fence(text): # take a 'pre' string like this: @@ -45,63 +42,85 @@ def maybe_expand_outer_code_fence(text): return text -def fence_main(text): - return f"____RAGNAR_MAIN_START____{text}____RAGNAR_MAIN_END____" - - class patched_markitdown: - def __init__(self, patch_pre=True, patch_main=True): - self.patch_main = 
patch_main - self.patch_pre = patch_pre + def __init__( + self, + html_extract_selectors=None, + html_zap_selectors=None, + ): + self.html_extract_selectors = html_extract_selectors or [] + self.html_zap_selectors = html_zap_selectors or [] def __enter__(self): - if self.patch_pre: - self.og_convert_pre = og_convert_pre = _CustomMarkdownify.convert_pre + self.og_convert_soup = og_convert_soup = _CustomMarkdownify.convert_soup - def convert_pre(self, el, text, parent_tags): - text = og_convert_pre(self, el, text, parent_tags) - return maybe_expand_outer_code_fence(text) + def convert_soup(self_, soup): - _CustomMarkdownify.convert_pre = convert_pre + for selector in self.html_extract_selectors: + if (tag := soup.select_one(selector)) is not None: - if self.patch_main: - self.og_convert_main = og_convert_main = getattr( - _CustomMarkdownify, "convert_main", MISSING - ) - if og_convert_main is MISSING or None: + soup = tag.extract() - def convert_main(self, el, text, parent_tags): - return fence_main(text) + for selector in self.html_zap_selectors: + while (tag := soup.select_one(selector)) is not None: + tag.decompose() - else: + return og_convert_soup(self_, soup) - def convert_main(self, el, text, parent_tags): - text = og_convert_main(self, el, text, parent_tags) - return fence_main(text) + _CustomMarkdownify.convert_soup = convert_soup - _CustomMarkdownify.convert_main = convert_main + self.og_convert_pre = og_convert_pre = _CustomMarkdownify.convert_pre - def __exit__(self, exc_type, exc_val, exc_tb): - _CustomMarkdownify.convert_pre = self.og_convert_pre - if self.patch_main: - if self.og_convert_main is MISSING: - delattr(_CustomMarkdownify, "convert_main") - else: - _CustomMarkdownify.convert_main = self.og_convert_main + def convert_pre(self, el, text, parent_tags): + text = og_convert_pre(self, el, text, parent_tags) + return maybe_expand_outer_code_fence(text) + _CustomMarkdownify.convert_pre = convert_pre -def convert_to_markdown(x, *args, main_only=True, 
**kwargs): - with patched_markitdown(patch_main=main_only): - result = md.convert(x, *args, **kwargs) - text = result.markdown - + def __exit__(self, exc_type, exc_val, exc_tb): + _CustomMarkdownify.convert_pre = self.og_convert_pre + _CustomMarkdownify.convert_soup = self.og_convert_soup + + +def as_str_list(x): + if x is None: + return [] + if isinstance(x, str): + return [x] + return list(x) + + +def convert_to_markdown( + x, + *args, + html_extract_selectors=None, + html_zap_selectors=None, + **kwargs, +): + html_extract_selectors = as_str_list(html_extract_selectors) + html_zap_selectors = as_str_list(html_zap_selectors) + + # backcompat support for previous 'main_only' arg + main_only = kwargs.pop("main_only", None) + if main_only is not None: if main_only: - start = text.find("____RAGNAR_MAIN_START____") - end = text.rfind("____RAGNAR_MAIN_END____") - if start != -1 and end != -1: - text = text[start + len("____RAGNAR_MAIN_START____") : end] + if "main" not in html_extract_selectors: + html_extract_selectors.insert(0, "main") + else: + html_extract_selectors = [s for s in html_extract_selectors if s != "main"] + + with patched_markitdown( + html_extract_selectors=html_extract_selectors, + html_zap_selectors=html_zap_selectors, + ): + result = md.convert(x, *args, **kwargs) + text = result.markdown.strip() if result.title is not None: - text = f"# {result.title}\n\n{text}" + title = f"# {result.title}" + if not text.startswith(title): + text = f"{title}\n\n{text}" + + text = text.replace("\f", "\n\n---\n\n") return text diff --git a/man/read_as_markdown.Rd b/man/read_as_markdown.Rd index a996193..70141c8 100644 --- a/man/read_as_markdown.Rd +++ b/man/read_as_markdown.Rd @@ -4,48 +4,118 @@ \alias{read_as_markdown} \title{Convert files to markdown} \usage{ -read_as_markdown(x, ..., canonical = FALSE, main_only = TRUE) +read_as_markdown( + x, + ..., + canonical = FALSE, + html_extract_selectors = c("main"), + html_zap_selectors = c("nav") +) } \arguments{ 
-\item{x}{A filepath or url. Accepts a wide variety of file types, including -PDF, PowerPoint, Word, Excel, Images (EXIF metadata and OCR), Audio (EXIF -metadata and speech transcription), HTML, Text-based formats (CSV, JSON, -XML), ZIP files (iterates over contents), Youtube URLs, and EPubs.} +\item{x}{A filepath or URL. Accepts a wide variety of file types, including +PDF, PowerPoint, Word, Excel, images (EXIF metadata and OCR), audio (EXIF +metadata and speech transcription), HTML, text-based formats (CSV, JSON, +XML), ZIP files (iterates over contents), YouTube URLs, and EPUBs.} -\item{...}{Passed on to \code{MarkItDown.convert()}} +\item{...}{Passed on to \code{MarkItDown.convert()}.} -\item{canonical}{logical, whether to postprocess the output from MarkItDown +\item{canonical}{Logical. Whether to postprocess the output from MarkItDown with \code{commonmark::markdown_commonmark()}.} -\item{main_only}{logical. Applies only to HTML documents. If \code{TRUE} and a -\code{main} tag is present in the document, only the contents of the \code{main} tag -are returned. This is a convenient way to exclude navigational elements -typically found in sidebars, page headers, and footers.} +\item{html_extract_selectors}{Character vector of CSS selectors. If a match +for a selector is found in the document, only the matched node's contents +are converted. Unmatched extract selectors have no effect.} + +\item{html_zap_selectors}{Character vector of CSS selectors. Elements +matching these selectors will be excluded ("zapped") from the HTML document +before conversion to markdown. This is useful for removing navigation bars, +sidebars, headers, footers, or other unwanted elements. By default, +navigation elements (\code{nav}) are excluded.} } \value{ -A single string of markdown +A single string of markdown. 
} \description{ Convert files to markdown } \examples{ \dontshow{if (reticulate::py_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -# convert html +# Convert HTML read_as_markdown("https://r4ds.hadley.nz/base-R.html") |> - substr(1, 1000) |> + substr(1, 500) |> cat() read_as_markdown("https://r4ds.hadley.nz/base-R.html", canonical = TRUE) |> - substr(1, 1000) |> + substr(1, 500) |> cat() -# convert pdf +# When converting HTML, you might want to omit certain elements, like +# sidebars, headers, footers, etc. You can pass CSS selector strings +# to either extract nodes or exclude nodes during conversion. +# +# The easiest way to make selectors is to use SelectorGadget: +# https://rvest.tidyverse.org/articles/selectorgadget.html +# +# You can also right-click on a page and select "Inspect Element" in a +# browser to better understand an HTML page's structure. +# +# For comprehensive or advanced usage of CSS selectors, consult: +# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors-through-the-css-property +# https://facelessuser.github.io/soupsieve/selectors/ + +url <- "https://duckdb.org/code_of_conduct" +# Includes the sidebar and other navigational elements +read_as_markdown(url) |> substr(1, 500) |> writeLines() + +# Extract the main content +read_as_markdown(url, html_extract_selectors = "#main_content_wrap") + +# Alternative approach: exclude nodes +read_as_markdown( + url, + html_zap_selectors = c( + "header", # node name + ".sidenavigation", # node class + ".searchoverlay", # node class + "#sidebar" # node ID + ) +) |> substr(1, 500) |> writeLines() + +# Quarto example +url <- "https://quarto.org/docs/computations/python.html" + +# Include sidebar, footer, etc. +read_as_markdown( + url, + html_extract_selectors = NULL, + html_zap_selectors = NULL +) |> substr(1, 500) |> writeLines() + +# Exclude content outside the 'main' tag
+read_as_markdown(url, html_extract_selectors = "main") |> + substr(1, 500) |> writeLines() + +# Exclude specific matching nodes +read_as_markdown( + url, + html_extract_selectors = NULL, + html_zap_selectors = c( + "#quarto-sidebar", + "#quarto-margin-sidebar", + "header", + "footer", + "nav" + ) +) |> substr(1, 500) |> writeLines() + +# Convert PDF pdf <- file.path(R.home("doc"), "NEWS.pdf") read_as_markdown(pdf) |> substr(1, 1000) |> cat() -## alternative: +## Alternative: # pdftools::pdf_text(pdf) |> substr(1, 2000) |> cat() -# convert images to markdown descriptions using OpenAI +# Convert images to markdown descriptions using OpenAI jpg <- file.path(R.home("doc"), "html", "logo.jpg") if (Sys.getenv("OPENAI_API_KEY") != "") { # if (xfun::is_macos()) system("brew install ffmpeg")