Skip to content

Commit ca693e2

Browse files
Accept CSS selectors in read_as_markdown() (#51)
* accept css selectors in `read_as_markdown()` * redocument * add NEWS * tidy and copyedit * Update NEWS.md Co-authored-by: Daniel Falbel <[email protected]> --------- Co-authored-by: Daniel Falbel <[email protected]>
1 parent 1316ddc commit ca693e2

File tree

4 files changed

+287
-118
lines changed

4 files changed

+287
-118
lines changed

NEWS.md

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,10 @@
1313
* `ragnar_read()` and `read_as_markdown()` now accept paths
1414
that begin with `~` (@topepo, #46, #48).
1515

16-
* Changes to `read_as_markdown()` HTML conversion (#40):
16+
* Changes to `read_as_markdown()` HTML conversion (#40, #51):
1717

18-
* If a 'main' tag is present, content outside the 'main' tag is now excluded
19-
by default. To restore the previous behavior and include the sidebar, header,
20-
footer, and other navigational elements in the converted markdown, use
21-
`read_as_markdown(x, main_only=FALSE)`.
18+
* New arguments `html_extract_selectors` and `html_zap_selectors` provide a flexible way to
19+
exclude some HTML page elements from the converted markdown.
2220

2321
* Fixed handling of nested code fences in markdown output.
2422

R/read-markdown.R

Lines changed: 132 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,100 @@
11
#' Convert files to markdown
22
#'
3-
#' @param x A filepath or url. Accepts a wide variety of file types, including
4-
#' PDF, PowerPoint, Word, Excel, Images (EXIF metadata and OCR), Audio (EXIF
5-
#' metadata and speech transcription), HTML, Text-based formats (CSV, JSON,
6-
#' XML), ZIP files (iterates over contents), Youtube URLs, and EPubs.
7-
#' @param ... Passed on to `MarkItDown.convert()`
8-
#' @param canonical logical, whether to postprocess the output from MarkItDown
3+
#' @param x A filepath or URL. Accepts a wide variety of file types, including
4+
#' PDF, PowerPoint, Word, Excel, images (EXIF metadata and OCR), audio (EXIF
5+
#' metadata and speech transcription), HTML, text-based formats (CSV, JSON,
6+
#' XML), ZIP files (iterates over contents), YouTube URLs, and EPUBs.
7+
#' @param ... Passed on to `MarkItDown.convert()`.
8+
#' @param canonical Logical. Whether to postprocess the output from MarkItDown
99
#' with `commonmark::markdown_commonmark()`.
10-
#' @param main_only logical. Applies only to HTML documents. If `TRUE` and a
11-
#' `main` tag is present in the document, only the contents of the `main` tag
12-
#' are returned. This is a convenient way to exclude navigational elements
13-
#' typically found in sidebars, page headers, and footers.
14-
#'
15-
#' @returns A single string of markdown
10+
#' @param html_extract_selectors Character vector of CSS selectors. If a match
11+
#' for a selector is found in the document, only the matched node's contents
12+
#' are converted. Unmatched extract selectors have no effect.
13+
#' @param html_zap_selectors Character vector of CSS selectors. Elements
14+
#' matching these selectors will be excluded ("zapped") from the HTML document
15+
#' before conversion to markdown. This is useful for removing navigation bars,
16+
#' sidebars, headers, footers, or other unwanted elements. By default,
17+
#' navigation elements (`nav`) are excluded.
18+
#'
19+
#' @returns A single string of markdown.
1620
#' @export
1721
#'
1822
#' @examplesIf reticulate::py_available()
19-
#' # convert html
23+
#' # Convert HTML
2024
#' read_as_markdown("https://r4ds.hadley.nz/base-R.html") |>
21-
#' substr(1, 1000) |>
25+
#' substr(1, 500) |>
2226
#' cat()
2327
#'
2428
#' read_as_markdown("https://r4ds.hadley.nz/base-R.html", canonical = TRUE) |>
25-
#' substr(1, 1000) |>
29+
#' substr(1, 500) |>
2630
#' cat()
2731
#'
28-
#' # convert pdf
32+
#' # When converting HTML, you might want to omit certain elements, like
33+
#' # sidebars, headers, footers, etc. You can pass CSS selector strings
34+
#' # to either extract nodes or exclude nodes during conversion.
35+
#' #
36+
#' # The easiest way to make selectors is to use SelectorGadget:
37+
#' # https://rvest.tidyverse.org/articles/selectorgadget.html
38+
#' #
39+
#' # You can also right-click on a page and select "Inspect Element" in a
40+
#' # browser to better understand an HTML page's structure.
41+
#' #
42+
#' # For comprehensive or advanced usage of CSS selectors, consult:
43+
#' # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors-through-the-css-property
44+
#' # https://facelessuser.github.io/soupsieve/selectors/
45+
#'
46+
#' url <- "https://duckdb.org/code_of_conduct"
47+
#' # Includes the sidebar and other navigational elements
48+
#' read_as_markdown(url) |> substr(1, 500) |> writeLines()
49+
#'
50+
#' # Extract the main content
51+
#' read_as_markdown(url, html_extract_selectors = "#main_content_wrap")
52+
#'
53+
#' # Alternative approach: exclude nodes
54+
#' read_as_markdown(
55+
#' url,
56+
#' html_zap_selectors = c(
57+
#' "header", # node name
58+
#' ".sidenavigation", # node class
59+
#' ".searchoverlay", # node class
60+
#' "#sidebar" # node ID
61+
#' )
62+
#' ) |> substr(1, 500) |> writeLines()
63+
#'
64+
#' # Quarto example
65+
#' url <- "https://quarto.org/docs/computations/python.html"
66+
#'
67+
#' # Include sidebar, footer, etc.
68+
#' read_as_markdown(
69+
#' url,
70+
#' html_extract_selectors = NULL,
71+
#' html_zap_selectors = NULL
72+
#' ) |> substr(1, 500) |> writeLines()
73+
#'
74+
#' # Exclude content outside <main>
75+
#' read_as_markdown(url, html_extract_selectors = "main") |>
76+
#' substr(1, 500) |> writeLines()
77+
#'
78+
#' # Exclude specific matching nodes
79+
#' read_as_markdown(
80+
#' url,
81+
#' html_extract_selectors = NULL,
82+
#' html_zap_selectors = c(
83+
#' "#quarto-sidebar",
84+
#' "#quarto-margin-sidebar",
85+
#' "header",
86+
#' "footer",
87+
#' "nav"
88+
#' )
89+
#' ) |> substr(1, 500) |> writeLines()
90+
#'
91+
#' # Convert PDF
2992
#' pdf <- file.path(R.home("doc"), "NEWS.pdf")
3093
#' read_as_markdown(pdf) |> substr(1, 1000) |> cat()
31-
#' ## alternative:
94+
#' ## Alternative:
3295
#' # pdftools::pdf_text(pdf) |> substr(1, 2000) |> cat()
3396
#'
34-
#' # convert images to markdown descriptions using OpenAI
97+
#' # Convert images to markdown descriptions using OpenAI
3598
#' jpg <- file.path(R.home("doc"), "html", "logo.jpg")
3699
#' if (Sys.getenv("OPENAI_API_KEY") != "") {
37100
#' # if (xfun::is_macos()) system("brew install ffmpeg")
@@ -56,7 +119,13 @@
56119
#' chat <- ellmer::chat_openai(echo = TRUE)
57120
#' chat$chat("Describe this image", ellmer::content_image_file(jpg))
58121
#' }
59-
read_as_markdown <- function(x, ..., canonical = FALSE, main_only = TRUE) {
122+
read_as_markdown <- function(
123+
x,
124+
...,
125+
canonical = FALSE,
126+
html_extract_selectors = c("main"),
127+
html_zap_selectors = c("nav")
128+
) {
60129
check_string(x)
61130
if (startsWith(x, "~")) {
62131
x <- path.expand(x)
@@ -68,47 +137,26 @@ read_as_markdown <- function(x, ..., canonical = FALSE, main_only = TRUE) {
68137
# dependencies that conflict
69138
md <- ragnartools.markitdown$convert_to_markdown(
70139
x,
140+
html_extract_selectors = html_extract_selectors,
141+
html_zap_selectors = html_zap_selectors,
71142
...,
72-
main_only = main_only
73143
)
74144
} else {
75-
# use the markitdown cli API, (much) slower, but can be isolated from
76-
# reticulated python.
77-
# TODO: apply markitdown monkeypatches in cli interface too
78-
79-
check_dots_empty()
80-
outfile <- withr::local_tempfile(fileext = ".md")
81-
exit_code <- cli_markitdown(c(shQuote(x), "-o", shQuote(outfile)))
82-
if (
83-
!identical(exit_code, 0L) ||
84-
(no_outfile_produced <- !file.exists(outfile))
85-
) {
86-
# more useful output to stderr() should have been printed
87-
# already by cli_markitdown() if we are here.
88-
errmsg <- stri_flatten(
89-
c(
90-
paste("markitdown exit code: ", exit_code),
91-
if (no_outfile_produced) "No output file produced."
92-
),
93-
collapse = "\n"
94-
)
95-
stop(errmsg)
96-
}
97-
98-
md <- stri_read_lines(outfile)
145+
md <- read_as_markdown_cli(x, ...)
99146
}
100147

101148
md <- stri_replace_all_fixed(md, "\f", "\n\n---\n\n")
102149
md <- unlist(stri_split_lines(md)) # normalize newlines
103150
md <- stri_trim_right(md)
104-
if (canonical)
151+
if (canonical) {
105152
md <- commonmark::markdown_commonmark(
106153
md,
107154
normalize = TRUE,
108155
footnotes = TRUE,
109156
width = 72L,
110157
extensions = TRUE
111158
)
159+
}
112160
md <- stri_flatten(md, "\n")
113161
glue::as_glue(md)
114162
}
@@ -118,8 +166,9 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
118166
lines <- text |> stri_split_lines() |> unlist()
119167
text <- lines |> stri_flatten("\n")
120168

121-
if (text == "")
169+
if (text == "") {
122170
return(data_frame(tag = character(), start = integer(), end = integer()))
171+
}
123172

124173
doc <- text |>
125174
commonmark::markdown_html(
@@ -256,7 +305,9 @@ markdown_segment <- function(
256305
sizes <- drop_first(boundaries) - drop_last(boundaries)
257306
splits <- vec_chop(bytes, sizes = sizes) |> vapply(rawToChar, "")
258307

259-
if (trim) splits <- stri_trim_both(splits) # drops names
308+
if (trim) {
309+
splits <- stri_trim_both(splits) # drops names
310+
}
260311

261312
# make names
262313
split_tags <- c("", sourcepos$tag[match(tag_boundaries, sourcepos$start)])
@@ -294,8 +345,9 @@ markdown_frame <- function(
294345
names = "tag",
295346
leaves = "text"
296347
)
297-
if (!length(segment_by) || base::setequal(segment_by, frame_by))
348+
if (!length(segment_by) || base::setequal(segment_by, frame_by)) {
298349
frame[["tag"]] <- NULL
350+
}
299351
as_tibble(frame)
300352
}
301353

@@ -454,9 +506,39 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
454506

455507
# ------ utils
456508

509+
# Convert `x` to markdown using the markitdown command-line interface.
#
# This path is (much) slower than the in-process reticulate path, but it can
# be isolated from the reticulated Python session.
#
# Arguments:
#   x    Input handed to markitdown; it is shell-quoted here before the call.
#   ...  Must be empty (enforced by `check_dots_empty()`).
#
# Returns the lines of the markdown file written by markitdown, as read by
# `stri_read_lines()`.
#
# Signals an error when the exit code is not `0L` or when no output file
# exists after the call.
# NOTE(review): assumes `cli_markitdown()` returns the tool's integer exit
# status -- confirm against its definition.
read_as_markdown_cli <- function(x, ...) {
  # TODO: apply markitdown monkeypatches in cli interface too

  check_dots_empty()
  outfile <- withr::local_tempfile(fileext = ".md")
  exit_code <- cli_markitdown(c(shQuote(x), "-o", shQuote(outfile)))

  # Compute both failure conditions before branching. Assigning
  # `no_outfile_produced` inside the right-hand side of `||` leaves it
  # undefined whenever the exit-code check short-circuits, which turned the
  # intended error message into "object 'no_outfile_produced' not found".
  bad_exit <- !identical(exit_code, 0L)
  no_outfile_produced <- !file.exists(outfile)

  if (bad_exit || no_outfile_produced) {
    # more useful output to stderr() should have been printed
    # already by cli_markitdown() if we are here.
    errmsg <- stri_flatten(
      c(
        paste("markitdown exit code: ", exit_code),
        if (no_outfile_produced) "No output file produced."
      ),
      collapse = "\n"
    )
    stop(errmsg)
  }

  stri_read_lines(outfile)
}
536+
537+
457538
cli_markitdown <- function(args, ...) {
458-
if (is.na(Sys.getenv("PYTHONIOENCODING", NA)))
459-
withr::local_envvar("PYTHONIOENCODING" = "utf-8") # needed on windows
539+
if (is.na(Sys.getenv("PYTHONIOENCODING", NA))) {
540+
withr::local_envvar("PYTHONIOENCODING" = "utf-8") # needed on windows
540+
}
460542

461543
reticulate::uv_run_tool(
462544
"markitdown",

0 commit comments

Comments
 (0)