tidyverse · t-kalinowski · Jun 18, 2025 · Jun 18, 2025 · Jun 18, 2025 · Jun 18, 2025
diff --git a/NEWS.md b/NEWS.md
@@ -13,12 +13,10 @@
 * `ragnar_read()` and `read_as_markdown()` now accept paths
   that begin with `~` (@topepo, #46, #48).
 
-* Changes to `read_as_markdown()` HTML conversion (#40):
+* Changes to `read_as_markdown()` HTML conversion (#40, #51):
 
-  * If a 'main' tag is present, content outside the 'main' tag is now excluded
-    by default. To restore the previous behavior and include the sidebar, header,
-    footer, and other navigational elements in the converted markdown, use
-    `read_as_markdown(x, main_only=FALSE)`.
+  * New arguments `html_extract_selectors` and `html_zap_selectors` provide a flexible way to
+    control which HTML page elements are included in the converted markdown.
 
   * Fixed handling of nested code fences in markdown output.
 

diff --git a/R/read-markdown.R b/R/read-markdown.R
@@ -1,37 +1,100 @@
 #' Convert files to markdown
 #'
-#' @param x A filepath or url. Accepts a wide variety of file types, including
-#'   PDF, PowerPoint, Word, Excel, Images (EXIF metadata and OCR), Audio (EXIF
-#'   metadata and speech transcription), HTML, Text-based formats (CSV, JSON,
-#'   XML), ZIP files (iterates over contents), Youtube URLs, and EPubs.
-#' @param ... Passed on to `MarkItDown.convert()`
-#' @param canonical logical, whether to postprocess the output from MarkItDown
+#' @param x A filepath or URL. Accepts a wide variety of file types, including
+#'   PDF, PowerPoint, Word, Excel, images (EXIF metadata and OCR), audio (EXIF
+#'   metadata and speech transcription), HTML, text-based formats (CSV, JSON,
+#'   XML), ZIP files (iterates over contents), YouTube URLs, and EPUBs.
+#' @param ... Passed on to `MarkItDown.convert()`.
+#' @param canonical Logical. Whether to postprocess the output from MarkItDown
 #'   with `commonmark::markdown_commonmark()`.
-#' @param main_only logical. Applies only to HTML documents. If `TRUE` and a
-#'   `main` tag is present in the document, only the contents of the `main` tag
-#'   are returned. This is a convenient way to exclude navigational elements
-#'   typically found in sidebars, page headers, and footers.
-#'
-#' @returns A single string of markdown
+#' @param html_extract_selectors Character vector of CSS selectors. If a match
+#'   for a selector is found in the document, only the matched node's contents
+#'   are converted. Unmatched extract selectors have no effect.
+#' @param html_zap_selectors Character vector of CSS selectors. Elements
+#'   matching these selectors will be excluded ("zapped") from the HTML document
+#'   before conversion to markdown. This is useful for removing navigation bars,
+#'   sidebars, headers, footers, or other unwanted elements. By default,
+#'   navigation elements (`nav`) are excluded.
+#'
+#' @returns A single string of markdown.
 #' @export
 #'
 #' @examplesIf reticulate::py_available()
-#' # convert html
+#' # Convert HTML
 #' read_as_markdown("https://r4ds.hadley.nz/base-R.html") |>
-#'   substr(1, 1000) |>
+#'   substr(1, 500) |>
 #'   cat()
 #'
 #' read_as_markdown("https://r4ds.hadley.nz/base-R.html", canonical = TRUE) |>
-#'   substr(1, 1000) |>
+#'   substr(1, 500) |>
 #'   cat()
 #'
-#' # convert pdf
+#' # When converting HTML, you might want to omit certain elements, like
+#' # sidebars, headers, footers, etc. You can pass CSS selector strings
+#' # to either extract nodes or exclude nodes during conversion.
+#' #
+#' # The easiest way to make selectors is to use SelectorGadget:
+#' # https://rvest.tidyverse.org/articles/selectorgadget.html
+#' #
+#' # You can also right-click on a page and select "Inspect Element" in a
+#' # browser to better understand an HTML page's structure.
+#' #
+#' # For comprehensive or advanced usage of CSS selectors, consult:
+#' # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors-through-the-css-property
+#' # https://facelessuser.github.io/soupsieve/selectors/
+#'
+#' url <- "https://duckdb.org/code_of_conduct"
+#' # Includes the sidebar and other navigational elements
+#' read_as_markdown(url) |> substr(1, 500) |> writeLines()
+#'
+#' # Extract the main content
+#' read_as_markdown(url, html_extract_selectors = "#main_content_wrap")
+#'
+#' # Alternative approach: exclude nodes
+#' read_as_markdown(
+#'   url,
+#'   html_zap_selectors = c(
+#'     "header",          # node name
+#'     ".sidenavigation", # node class
+#'     ".searchoverlay",  # node class
+#'     "#sidebar"         # node ID
+#'   )
+#' ) |> substr(1, 500) |> writeLines()
+#'
+#' # Quarto example
+#' url <- "https://quarto.org/docs/computations/python.html"
+#'
+#' # Include sidebar, footer, etc.
+#' read_as_markdown(
+#'   url,
+#'   html_extract_selectors = NULL,
+#'   html_zap_selectors = NULL
+#' ) |> substr(1, 500) |> writeLines()
+#'
+#' # Exclude content outside <main>
+#' read_as_markdown(url, html_extract_selectors = "main") |>
+#'   substr(1, 500) |> writeLines()
+#'
+#' # Exclude specific matching nodes
+#' read_as_markdown(
+#'   url,
+#'   html_extract_selectors = NULL,
+#'   html_zap_selectors = c(
+#'     "#quarto-sidebar",
+#'     "#quarto-margin-sidebar",
+#'     "header",
+#'     "footer",
+#'     "nav"
+#'   )
+#' ) |> substr(1, 500) |> writeLines()
+#'
+#' # Convert PDF
 #' pdf <- file.path(R.home("doc"), "NEWS.pdf")
 #' read_as_markdown(pdf) |> substr(1, 1000) |> cat()
-#' ## alternative:
+#' ## Alternative:
 #' # pdftools::pdf_text(pdf) |> substr(1, 2000) |> cat()
 #'
-#' # convert images to markdown descriptions using OpenAI
+#' # Convert images to markdown descriptions using OpenAI
 #' jpg <- file.path(R.home("doc"), "html", "logo.jpg")
 #' if (Sys.getenv("OPENAI_API_KEY") != "") {
 #'   # if (xfun::is_macos()) system("brew install ffmpeg")
@@ -56,7 +119,13 @@
 #'   chat <- ellmer::chat_openai(echo = TRUE)
 #'   chat$chat("Describe this image", ellmer::content_image_file(jpg))
 #' }
-read_as_markdown <- function(x, ..., canonical = FALSE, main_only = TRUE) {
+read_as_markdown <- function(
+  x,
+  ...,
+  canonical = FALSE,
+  html_extract_selectors = c("main"),
+  html_zap_selectors = c("nav")
+) {
   check_string(x)
   if (startsWith(x, "~")) {
     x <- path.expand(x)
@@ -68,47 +137,26 @@ read_as_markdown <- function(x, ..., canonical = FALSE, main_only = TRUE) {
     # dependencies that conflict
     md <- ragnartools.markitdown$convert_to_markdown(
       x,
+      html_extract_selectors = html_extract_selectors,
+      html_zap_selectors = html_zap_selectors,
       ...,
-      main_only = main_only
     )
   } else {
-    # use the markitdown cli API, (much) slower, but can be isolated from
-    # reticulated python.
-    # TODO: apply markitdown monkeypatches in cli interface too
-
-    check_dots_empty()
-    outfile <- withr::local_tempfile(fileext = ".md")
-    exit_code <- cli_markitdown(c(shQuote(x), "-o", shQuote(outfile)))
-    if (
-      !identical(exit_code, 0L) ||
-        (no_outfile_produced <- !file.exists(outfile))
-    ) {
-      # more useful output to stderr() should have been printed
-      # already by cli_markitdown() if we are here.
-      errmsg <- stri_flatten(
-        c(
-          paste("markitdown exit code: ", exit_code),
-          if (no_outfile_produced) "No output file produced."
-        ),
-        collapse = "\n"
-      )
-      stop(errmsg)
-    }
-
-    md <- stri_read_lines(outfile)
+    md <- read_as_markdown_cli(x, ...)
   }
 
   md <- stri_replace_all_fixed(md, "\f", "\n\n---\n\n")
   md <- unlist(stri_split_lines(md)) # normalize newlines
   md <- stri_trim_right(md)
-  if (canonical)
+  if (canonical) {
     md <- commonmark::markdown_commonmark(
       md,
       normalize = TRUE,
       footnotes = TRUE,
       width = 72L,
       extensions = TRUE
     )
+  }
   md <- stri_flatten(md, "\n")
   glue::as_glue(md)
 }
@@ -118,8 +166,9 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
   lines <- text |> stri_split_lines() |> unlist()
   text <- lines |> stri_flatten("\n")
 
-  if (text == "")
+  if (text == "") {
     return(data_frame(tag = character(), start = integer(), end = integer()))
+  }
 
   doc <- text |>
     commonmark::markdown_html(
@@ -256,7 +305,9 @@ markdown_segment <- function(
   sizes <- drop_first(boundaries) - drop_last(boundaries)
   splits <- vec_chop(bytes, sizes = sizes) |> vapply(rawToChar, "")
 
-  if (trim) splits <- stri_trim_both(splits) # drops names
+  if (trim) {
+    splits <- stri_trim_both(splits)
+  } # drops names
 
   # make names
   split_tags <- c("", sourcepos$tag[match(tag_boundaries, sourcepos$start)])
@@ -294,8 +345,9 @@ markdown_frame <- function(
     names = "tag",
     leaves = "text"
   )
-  if (!length(segment_by) || base::setequal(segment_by, frame_by))
+  if (!length(segment_by) || base::setequal(segment_by, frame_by)) {
     frame[["tag"]] <- NULL
+  }
   as_tibble(frame)
 }
 
@@ -454,9 +506,39 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
 
 # ------ utils
 
+read_as_markdown_cli <- function(x, ...) {
+  # Convert `x` via the markitdown cli API: (much) slower, but can be isolated
+  # from reticulated python. `...` must be empty. Returns the lines read back.
+  # TODO: apply markitdown monkeypatches in cli interface too
+
+  check_dots_empty()
+  outfile <- withr::local_tempfile(fileext = ".md")
+  exit_code <- cli_markitdown(c(shQuote(x), "-o", shQuote(outfile)))
+
+  # Compute this eagerly: assigning it on the right-hand side of `||` would
+  # leave it undefined whenever the exit-code check short-circuits.
+  no_outfile_produced <- !file.exists(outfile)
+  if (!identical(exit_code, 0L) || no_outfile_produced) {
+    # more useful output to stderr() should have been printed
+    # already by cli_markitdown() if we are here.
+    errmsg <- stri_flatten(
+      c(
+        paste("markitdown exit code: ", exit_code),
+        if (no_outfile_produced) "No output file produced."
+      ),
+      collapse = "\n"
+    )
+    stop(errmsg)
+  }
+
+  stri_read_lines(outfile)
+}
+
+
 cli_markitdown <- function(args, ...) {
-  if (is.na(Sys.getenv("PYTHONIOENCODING", NA)))
-    withr::local_envvar("PYTHONIOENCODING" = "utf-8") # needed on windows
+  if (is.na(Sys.getenv("PYTHONIOENCODING", NA))) {
+    withr::local_envvar("PYTHONIOENCODING" = "utf-8")
+  } # needed on windows
 
   reticulate::uv_run_tool(
     "markitdown",