1
1
# ' Convert files to markdown
2
2
# '
3
- # ' @param x A filepath or url . Accepts a wide variety of file types, including
4
- # ' PDF, PowerPoint, Word, Excel, Images (EXIF metadata and OCR), Audio (EXIF
5
- # ' metadata and speech transcription), HTML, Text -based formats (CSV, JSON,
6
- # ' XML), ZIP files (iterates over contents), Youtube URLs, and EPubs .
7
- # ' @param ... Passed on to `MarkItDown.convert()`
8
- # ' @param canonical logical, whether to postprocess the output from MarkItDown
3
+ # ' @param x A filepath or URL . Accepts a wide variety of file types, including
4
+ # ' PDF, PowerPoint, Word, Excel, images (EXIF metadata and OCR), audio (EXIF
5
+ # ' metadata and speech transcription), HTML, text -based formats (CSV, JSON,
6
+ # ' XML), ZIP files (iterates over contents), YouTube URLs, and EPUBs .
7
+ # ' @param ... Passed on to `MarkItDown.convert()`.
8
+ # ' @param canonical Logical. Whether to postprocess the output from MarkItDown
9
9
# ' with `commonmark::markdown_commonmark()`.
10
- # ' @param main_only logical. Applies only to HTML documents. If `TRUE` and a
11
- # ' `main` tag is present in the document, only the contents of the `main` tag
12
- # ' are returned. This is a convenient way to exclude navigational elements
13
- # ' typically found in sidebars, page headers, and footers.
14
- # '
15
- # ' @returns A single string of markdown
10
+ # ' @param html_extract_selectors Character vector of CSS selectors. If a match
11
+ # ' for a selector is found in the document, only the matched node's contents
12
+ # ' are converted. Unmatched extract selectors have no effect.
13
+ # ' @param html_zap_selectors Character vector of CSS selectors. Elements
14
+ # ' matching these selectors will be excluded ("zapped") from the HTML document
15
+ # ' before conversion to markdown. This is useful for removing navigation bars,
16
+ # ' sidebars, headers, footers, or other unwanted elements. By default,
17
+ # ' navigation elements (`nav`) are excluded.
18
+ # '
19
+ # ' @returns A single string of markdown.
16
20
# ' @export
17
21
# '
18
22
# ' @examplesIf reticulate::py_available()
19
- # ' # convert html
23
+ # ' # Convert HTML
20
24
# ' read_as_markdown("https://r4ds.hadley.nz/base-R.html") |>
21
- # ' substr(1, 1000 ) |>
25
+ # ' substr(1, 500 ) |>
22
26
# ' cat()
23
27
# '
24
28
# ' read_as_markdown("https://r4ds.hadley.nz/base-R.html", canonical = TRUE) |>
25
- # ' substr(1, 1000 ) |>
29
+ # ' substr(1, 500 ) |>
26
30
# ' cat()
27
31
# '
28
- # ' # convert pdf
32
+ # ' # When converting HTML, you might want to omit certain elements, like
33
+ # ' # sidebars, headers, footers, etc. You can pass CSS selector strings
34
+ # ' # to either extract nodes or exclude nodes during conversion.
35
+ # ' #
36
+ # ' # The easiest way to make selectors is to use SelectorGadget:
37
+ # ' # https://rvest.tidyverse.org/articles/selectorgadget.html
38
+ # ' #
39
+ # ' # You can also right-click on a page and select "Inspect Element" in a
40
+ # ' # browser to better understand an HTML page's structure.
41
+ # ' #
42
+ # ' # For comprehensive or advanced usage of CSS selectors, consult:
43
+ # ' # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors-through-the-css-property
44
+ # ' # https://facelessuser.github.io/soupsieve/selectors/
45
+ # '
46
+ # ' url <- "https://duckdb.org/code_of_conduct"
47
+ # ' # Includes the sidebar and other navigational elements
48
+ # ' read_as_markdown(url) |> substr(1, 500) |> writeLines()
49
+ # '
50
+ # ' # Extract the main content
51
+ # ' read_as_markdown(url, html_extract_selectors = "#main_content_wrap")
52
+ # '
53
+ # ' # Alternative approach: exclude nodes
54
+ # ' read_as_markdown(
55
+ # ' url,
56
+ # ' html_zap_selectors = c(
57
+ # ' "header", # node name
58
+ # ' ".sidenavigation", # node class
59
+ # ' ".searchoverlay", # node class
60
+ # ' "#sidebar" # node ID
61
+ # ' )
62
+ # ' ) |> substr(1, 500) |> writeLines()
63
+ # '
64
+ # ' # Quarto example
65
+ # ' url <- "https://quarto.org/docs/computations/python.html"
66
+ # '
67
+ # ' # Include sidebar, footer, etc.
68
+ # ' read_as_markdown(
69
+ # ' url,
70
+ # ' html_extract_selectors = NULL,
71
+ # ' html_zap_selectors = NULL
72
+ # ' ) |> substr(1, 500) |> writeLines()
73
+ # '
74
+ # ' # Exclude content outside <main>
75
+ # ' read_as_markdown(url, html_extract_selectors = "main") |>
76
+ # ' substr(1, 500) |> writeLines()
77
+ # '
78
+ # ' # Exclude specific matching nodes
79
+ # ' read_as_markdown(
80
+ # ' url,
81
+ # ' html_extract_selectors = NULL,
82
+ # ' html_zap_selectors = c(
83
+ # ' "#quarto-sidebar",
84
+ # ' "#quarto-margin-sidebar",
85
+ # ' "header",
86
+ # ' "footer",
87
+ # ' "nav"
88
+ # ' )
89
+ # ' ) |> substr(1, 500) |> writeLines()
90
+ # '
91
+ # ' # Convert PDF
29
92
# ' pdf <- file.path(R.home("doc"), "NEWS.pdf")
30
93
# ' read_as_markdown(pdf) |> substr(1, 1000) |> cat()
31
- # ' ## alternative :
94
+ # ' ## Alternative :
32
95
# ' # pdftools::pdf_text(pdf) |> substr(1, 2000) |> cat()
33
96
# '
34
- # ' # convert images to markdown descriptions using OpenAI
97
+ # ' # Convert images to markdown descriptions using OpenAI
35
98
# ' jpg <- file.path(R.home("doc"), "html", "logo.jpg")
36
99
# ' if (Sys.getenv("OPENAI_API_KEY") != "") {
37
100
# ' # if (xfun::is_macos()) system("brew install ffmpeg")
56
119
# ' chat <- ellmer::chat_openai(echo = TRUE)
57
120
# ' chat$chat("Describe this image", ellmer::content_image_file(jpg))
58
121
# ' }
59
- read_as_markdown <- function (x , ... , canonical = FALSE , main_only = TRUE ) {
122
+ read_as_markdown <- function (
123
+ x ,
124
+ ... ,
125
+ canonical = FALSE ,
126
+ html_extract_selectors = c(" main" ),
127
+ html_zap_selectors = c(" nav" )
128
+ ) {
60
129
check_string(x )
61
130
if (startsWith(x , " ~" )) {
62
131
x <- path.expand(x )
@@ -68,47 +137,26 @@ read_as_markdown <- function(x, ..., canonical = FALSE, main_only = TRUE) {
68
137
# dependencies that conflict
69
138
md <- ragnartools.markitdown $ convert_to_markdown(
70
139
x ,
140
+ html_extract_selectors = html_extract_selectors ,
141
+ html_zap_selectors = html_zap_selectors ,
71
142
... ,
72
- main_only = main_only
73
143
)
74
144
} else {
75
- # use the markitdown cli API, (much) slower, but can be isolated from
76
- # reticulated python.
77
- # TODO: apply markitdown monkeypatches in cli interface too
78
-
79
- check_dots_empty()
80
- outfile <- withr :: local_tempfile(fileext = " .md" )
81
- exit_code <- cli_markitdown(c(shQuote(x ), " -o" , shQuote(outfile )))
82
- if (
83
- ! identical(exit_code , 0L ) ||
84
- (no_outfile_produced <- ! file.exists(outfile ))
85
- ) {
86
- # more useful output to stderr() should have been printed
87
- # already by cli_markitdown() if we are here.
88
- errmsg <- stri_flatten(
89
- c(
90
- paste(" markitdown exit code: " , exit_code ),
91
- if (no_outfile_produced ) " No output file produced."
92
- ),
93
- collapse = " \n "
94
- )
95
- stop(errmsg )
96
- }
97
-
98
- md <- stri_read_lines(outfile )
145
+ md <- read_as_markdown_cli(x , ... )
99
146
}
100
147
101
148
md <- stri_replace_all_fixed(md , " \f " , " \n\n ---\n\n " )
102
149
md <- unlist(stri_split_lines(md )) # normalize newlines
103
150
md <- stri_trim_right(md )
104
- if (canonical )
151
+ if (canonical ) {
105
152
md <- commonmark :: markdown_commonmark(
106
153
md ,
107
154
normalize = TRUE ,
108
155
footnotes = TRUE ,
109
156
width = 72L ,
110
157
extensions = TRUE
111
158
)
159
+ }
112
160
md <- stri_flatten(md , " \n " )
113
161
glue :: as_glue(md )
114
162
}
@@ -118,8 +166,9 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
118
166
lines <- text | > stri_split_lines() | > unlist()
119
167
text <- lines | > stri_flatten(" \n " )
120
168
121
- if (text == " " )
169
+ if (text == " " ) {
122
170
return (data_frame(tag = character (), start = integer(), end = integer()))
171
+ }
123
172
124
173
doc <- text | >
125
174
commonmark :: markdown_html(
@@ -256,7 +305,9 @@ markdown_segment <- function(
256
305
sizes <- drop_first(boundaries ) - drop_last(boundaries )
257
306
splits <- vec_chop(bytes , sizes = sizes ) | > vapply(rawToChar , " " )
258
307
259
- if (trim ) splits <- stri_trim_both(splits ) # drops names
308
+ if (trim ) {
309
+ splits <- stri_trim_both(splits )
310
+ } # drops names
260
311
261
312
# make names
262
313
split_tags <- c(" " , sourcepos $ tag [match(tag_boundaries , sourcepos $ start )])
@@ -294,8 +345,9 @@ markdown_frame <- function(
294
345
names = " tag" ,
295
346
leaves = " text"
296
347
)
297
- if (! length(segment_by ) || base :: setequal(segment_by , frame_by ))
348
+ if (! length(segment_by ) || base :: setequal(segment_by , frame_by )) {
298
349
frame [[" tag" ]] <- NULL
350
+ }
299
351
as_tibble(frame )
300
352
}
301
353
@@ -454,9 +506,39 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
454
506
455
507
# ------ utils
456
508
509
# Convert `x` to markdown by invoking the markitdown command line interface.
#
# This path is (much) slower than calling markitdown through reticulate, but
# it can be isolated from the reticulated Python session.
# TODO: apply markitdown monkeypatches in cli interface too
#
# Arguments:
#   x    A file path or URL; passed to the CLI as a shell-quoted argument.
#   ...  Checked with check_dots_empty(); no extra arguments are forwarded
#        to the CLI.
#
# Returns the lines of the markdown file written by the CLI, as read by
# stri_read_lines(). Signals an error if the CLI exit code is not 0L or if
# no output file was produced.
read_as_markdown_cli <- function(x, ...) {
  check_dots_empty()

  # The tempfile is registered with withr, so it is cleaned up when this
  # function exits.
  outfile <- withr::local_tempfile(fileext = ".md")
  exit_code <- cli_markitdown(c(shQuote(x), "-o", shQuote(outfile)))

  # Compute this *before* the condition below. Assigning it inside the
  # right-hand side of `||` leaves it undefined whenever the exit code check
  # short-circuits, which made building the error message itself fail with
  # "object 'no_outfile_produced' not found".
  no_outfile_produced <- !file.exists(outfile)

  if (!identical(exit_code, 0L) || no_outfile_produced) {
    # More useful output to stderr() should have been printed already by
    # cli_markitdown() if we are here.
    errmsg <- stri_flatten(
      c(
        paste("markitdown exit code: ", exit_code),
        if (no_outfile_produced) "No output file produced."
      ),
      collapse = "\n"
    )
    stop(errmsg)
  }

  stri_read_lines(outfile)
}
536
+
537
+
457
538
cli_markitdown <- function (args , ... ) {
458
- if (is.na(Sys.getenv(" PYTHONIOENCODING" , NA )))
459
- withr :: local_envvar(" PYTHONIOENCODING" = " utf-8" ) # needed on windows
539
+ if (is.na(Sys.getenv(" PYTHONIOENCODING" , NA ))) {
540
+ withr :: local_envvar(" PYTHONIOENCODING" = " utf-8" )
541
+ } # needed on windows
460
542
461
543
reticulate :: uv_run_tool(
462
544
" markitdown" ,
0 commit comments