diff --git a/.Rbuildignore b/.Rbuildignore index faefafc5a..6182dba99 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -14,6 +14,7 @@ ^_pkgdown\.yml$ ^docs$ ^logs$ +^dev$ ^man-roxygen$ ^staged_dependencies\.yaml$ ^temp$ diff --git a/NEWS.md b/NEWS.md index 0d51606ce..fe1202278 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,6 @@ ## formatters 0.5.11.9000 +* Optimized pagination sub-routines to avoid `matrix_form()` calls when not needed. +* Optimized pagination sub-routine `.compress_mat()` to reduce computing time for long listings. ## formatters 0.5.11 * Fixed a bug in `mform_handle_newlines` that caused string matrix column names to be removed. This prevented paginated listing key column info from being repeated when vertically spanning multiple pages. diff --git a/R/matrix_form.R b/R/matrix_form.R index 651ea95ec..3310c4a52 100644 --- a/R/matrix_form.R +++ b/R/matrix_form.R @@ -44,11 +44,13 @@ mform_handle_newlines <- function(matform) { # pre-proc in case of wrapping and \n line_grouping <- mf_lgrouping(matform) - strmat <- .compress_mat(strmat, line_grouping, "nl") - frmmat <- .compress_mat(frmmat, line_grouping, "unique") # never not unique - spamat <- .compress_mat(spamat, line_grouping, "unique") - alimat <- .compress_mat(alimat, line_grouping, "unique") - line_grouping <- unique(line_grouping) + if (any(duplicated(line_grouping))) { + strmat <- .compress_mat(strmat, line_grouping, "nl") + frmmat <- .compress_mat(frmmat, line_grouping, "unique") # never not unique + spamat <- .compress_mat(spamat, line_grouping, "unique") + alimat <- .compress_mat(alimat, line_grouping, "unique") + line_grouping <- unique(line_grouping) + } # nlines detects if there is a newline character # colwidths = NULL, max_width = NULL, fontspec = NULL @@ -143,32 +145,42 @@ mform_handle_newlines <- function(matform) { .quick_handle_nl <- function(str_v) { if (any(grepl("\n", str_v))) { - return(unlist(strsplit(str_v, "\n", fixed = TRUE))) + unlist(strsplit(str_v, "\n", fixed = TRUE)) } else { - return(str_v) + str_v } } -# Helper function to recompact the lines following line groupings to then have them expanded again +#### Helper function to recompact the lines following line groupings to then have them expanded again +# This version now ensures its output has the exact same row order as the original function. +# -> see file in `dev/benchmark_compress_mat.R` for a benchmark of the two versions(+). .compress_mat <- function(mat, line_grouping, collapse_method = c("nl", "unique")) { - list_compacted_mat <- lapply(unique(line_grouping), function(lg) { - apply(mat, 2, function(mat_cols) { - col_vec <- mat_cols[which(line_grouping == lg)] - if (collapse_method[1] == "nl") { - paste0(col_vec, collapse = "\n") - } else { - val <- unique(col_vec) - val <- val[nzchar(val)] + df <- as.data.frame(mat, stringsAsFactors = FALSE) + + # The original function processes groups in the order they appear in `unique(line_grouping)`. + # We create a factor with levels set to that specific order to force `split` to maintain it. + factor_grouping <- factor(line_grouping, levels = unique(line_grouping)) + list_of_dfs <- split(df, factor_grouping) + + if (collapse_method[1] == "nl") { + result_list <- lapply(list_of_dfs, function(sub_df) { + sapply(sub_df, paste, collapse = "\n") + }) + } else { # "unique" method + result_list <- lapply(list_of_dfs, function(sub_df) { + sapply(sub_df, function(col) { + val <- unique(col[nzchar(col)]) if (length(val) > 1) { - stop("Problem in linegroupings! Some do not have the same values.") # nocov - } else if (length(val) < 1) { - val <- "" # Case in which it is only "" + stop("Problem in linegroupings! Some do not have the same values.") + } else if (length(val) == 0) { + "" + } else { + val } - val[[1]] - } + }) }) - }) - do.call("rbind", list_compacted_mat) + } + do.call("rbind", result_list) } disp_from_spans <- function(spans) { diff --git a/R/pagination.R b/R/pagination.R index de48a3652..8cd04cae3 100644 --- a/R/pagination.R +++ b/R/pagination.R @@ -569,7 +569,7 @@ pag_indices_inner <- function(pagdf, #' @inheritParams pag_indices_inner #' @inheritParams open_font_dev #' @inheritParams format_value -#' @param obj (`ANY`)\cr object to be paginated. Must have a [matrix_form()] method. +#' @param mf (`MatrixPrintForm`)\cr object to be paginated. #' @param cpp (`numeric(1)`)\cr number of characters per page (width). #' @param colwidths (`numeric`)\cr vector of column widths (in characters) for use in vertical pagination. #' @param rep_cols (`numeric(1)`)\cr number of *columns* (not including row labels) to be repeated on every page. @@ -583,7 +583,7 @@ pag_indices_inner <- function(pagdf, #' lapply(colpaginds, function(j) mtcars[, j, drop = FALSE]) #' #' @export -vert_pag_indices <- function(obj, +vert_pag_indices <- function(mf, cpp = 40, colwidths = NULL, verbose = FALSE, @@ -594,7 +594,7 @@ vert_pag_indices <- function(obj, if (is.list(nosplitin)) { nosplitin <- nosplitin[["cols"]] } - mf <- matrix_form(obj, indent_rownames = TRUE, fontspec = fontspec, round_type = round_type) + # mf <- matrix_form(obj, indent_rownames = TRUE, fontspec = fontspec, round_type = round_type) clwds <- colwidths %||% propose_column_widths(mf, fontspec = fontspec) if (is.null(mf_cinfo(mf))) { ## like always, ugh. mf <- mpf_infer_cinfo(mf, colwidths = clwds, rep_cols = rep_cols, fontspec = fontspec) @@ -951,6 +951,7 @@ splice_idx_lists <- function(lsts) { #' @inheritParams page_lcpp #' @inheritParams toString #' @inheritParams propose_column_widths +#' @param obj (`ANY`)\cr object to be paginated. Must have a [matrix_form()] method. #' @param lpp (`numeric(1)` or `NULL`)\cr lines per page. If `NA` (the default), this is calculated automatically #' based on the specified page size). `NULL` indicates no vertical pagination should occur. #' @param cpp (`numeric(1)` or `NULL`)\cr width (in characters) per page. If `NA` (the default), this is calculated @@ -1014,6 +1015,8 @@ paginate_indices <- function(obj, if (newdev) { on.exit(close_font_dev()) } + + ## this MUST alsways return a list, inluding list(obj) when ## no forced pagination is needed! otherwise stuff breaks for things ## based on s3 classes that are lists underneath!!! @@ -1034,7 +1037,9 @@ paginate_indices <- function(obj, ## order is annoying here, since we won't actually need the mpf if ## we run into forced pagination, but life is short and this should work fine. + # this step can be very slow. No need to do it internally mpf <- matrix_form(obj, TRUE, TRUE, indent_size = indent_size, fontspec = fontspec, round_type = round_type) + if (is.null(colwidths)) { colwidths <- mf_col_widths(mpf) %||% propose_column_widths(mpf, fontspec = fontspec, round_type = round_type) } else { @@ -1042,7 +1047,7 @@ paginate_indices <- function(obj, } mf_colgap(mpf) <- col_gap - if (!is.null(rep_cols) && rep_cols != num_rep_cols(obj)) { + if (!is.null(rep_cols) && rep_cols != num_rep_cols(mpf)) { num_rep_cols(mpf) <- rep_cols } if (NROW(mf_cinfo(mpf)) == 0) { @@ -1078,27 +1083,29 @@ paginate_indices <- function(obj, ## this wraps the cell contents AND shoves referential footnote ## info into mf_rinfo(mpf) - mpf <- do_cell_fnotes_wrap(mpf, colwidths, max_width, tf_wrap = tf_wrap, fontspec = fontspec) + ori_mpf <- mpf # used for pulling right split labels + mpf <- do_cell_fnotes_wrap(ori_mpf, colwidths, max_width, tf_wrap = tf_wrap, fontspec = fontspec) # rlistings note: if there is a wrapping in a keycol, it is not calculated correctly # in the above call, so we need to keep this information in mf_rinfo # and use it here. mfri <- mf_rinfo(mpf) - keycols <- .get_keycols_from_listing(obj) + keycols <- .get_keycols_from_listing(mpf) + seq_nrh <- seq_len(mf_nrheader(mpf)) if (NROW(mfri) > 1 && .is_listing_mf(mpf) && length(keycols) > 0) { # Lets determine the groupings created by keycols keycols_grouping_df <- NULL for (i in seq_along(keycols)) { kcol <- keycols[i] - if (is(obj, "MatrixPrintForm")) { - # This makes the function work also in the case we have only matrix form (mainly for testing purposes) - kcolvec <- mf_strings(obj)[, mf_strings(obj)[1, , drop = TRUE] == kcol][-1] - while (any(kcolvec == "")) { - kcolvec[which(kcolvec == "")] <- kcolvec[which(kcolvec == "") - 1] + # This makes the function work also in the case we have only matrix form (mainly for testing purposes) + ori_mpf_colnames <- names(mf_strings(ori_mpf)[1, , drop = TRUE]) + kcolvec <- mf_strings(ori_mpf)[, ori_mpf_colnames == kcol][-seq_nrh] + + # if the keycol is not present, we just use the previous one + for (j in seq_along(kcolvec)) { + if (j > 1 && kcolvec[j] == "") { + kcolvec[j] <- kcolvec[j - 1] } - } else { - kcolvec <- obj[[kcol]] - kcolvec <- vapply(kcolvec, format_value, "", format = obj_format(kcolvec), na_str = obj_na_str(kcolvec)) } groupings <- as.numeric(factor(kcolvec, levels = unique(kcolvec))) where_they_start <- which(c(1, diff(groupings)) > 0) @@ -1310,19 +1317,18 @@ paginate_to_mpfs <- function(obj, return(deep_pag) } else if (has_page_title(fpags[[1]])) { obj <- fpags[[1]] + ## we run into forced pagination, but life is short and this should work fine. + mpf <- matrix_form(obj, TRUE, TRUE, indent_size = indent_size, fontspec = fontspec, round_type = round_type) + num_rep_cols(mpf) <- rep_cols + if (is.null(colwidths)) { + colwidths <- mf_col_widths(mpf) %||% propose_column_widths(mpf, fontspec = fontspec, round_type = round_type) + } + mf_col_widths(mpf) <- colwidths + mf_colgap(mpf) <- col_gap } - ## we run into forced pagination, but life is short and this should work fine. - mpf <- matrix_form(obj, TRUE, TRUE, indent_size = indent_size, fontspec = fontspec, round_type = round_type) - num_rep_cols(mpf) <- rep_cols - if (is.null(colwidths)) { - colwidths <- mf_col_widths(mpf) %||% propose_column_widths(mpf, fontspec = fontspec, round_type = round_type) - } - mf_col_widths(mpf) <- colwidths - mf_colgap(mpf) <- col_gap - page_indices <- paginate_indices( - obj = obj, + obj = mpf, ## page_type = page_type, ## font_family = font_family, ## font_size = font_size, diff --git a/dev/benchmark_compress_mat.R b/dev/benchmark_compress_mat.R new file mode 100644 index 000000000..f09811115 --- /dev/null +++ b/dev/benchmark_compress_mat.R @@ -0,0 +1,176 @@ +# Davide Garolini 12.06.2025 +# Dev script to benchmark the performance of different implementations of the +# .compress_mat function. + +# Ensure required packages are installed and loaded +if (!require("microbenchmark")) install.packages("microbenchmark") +if (!require("data.table")) install.packages("data.table") +if (!require("testthat")) install.packages("testthat") +library(microbenchmark) +library(data.table) +library(testthat) + +# Original function, restored to its initial, untouched version. +# NOTE: This version is not only slow but can be fragile with very large +# inputs due to its use of nested `apply`. It is kept for comparison. +.compress_mat_original <- function(mat, line_grouping, collapse_method = c("nl", "unique")) { + list_compacted_mat <- lapply(unique(line_grouping), function(lg) { + apply(mat, 2, function(mat_cols) { + col_vec <- mat_cols[which(line_grouping == lg)] + if (collapse_method[1] == "nl") { + paste0(col_vec, collapse = "\n") + } else { + val <- unique(col_vec) + val <- val[nzchar(val)] + if (length(val) > 1) { + stop("Problem in linegroupings! Some do not have the same values.") # nocov + } else if (length(val) < 1) { + val <- "" # Case in which it is only "" + } + val[[1]] + } + }) + }) + do.call("rbind", list_compacted_mat) +} + +# --- Optimized Versions --- + +# 1. Optimized Base R version +# This version now ensures its output has the exact same row order as the original function. +.compress_mat_base_R <- function(mat, line_grouping, collapse_method = c("nl", "unique")) { + df <- as.data.frame(mat, stringsAsFactors = FALSE) + + # The original function processes groups in the order they appear in `unique(line_grouping)`. + # We create a factor with levels set to that specific order to force `split` to maintain it. + factor_grouping <- factor(line_grouping, levels = unique(line_grouping)) + list_of_dfs <- split(df, factor_grouping) + + if (collapse_method[1] == "nl") { + result_list <- lapply(list_of_dfs, function(sub_df) { + sapply(sub_df, paste, collapse = "\n") + }) + } else { # "unique" method + result_list <- lapply(list_of_dfs, function(sub_df) { + sapply(sub_df, function(col) { + val <- unique(col[nzchar(col)]) + if (length(val) > 1) { + stop("Problem in linegroupings! Some do not have the same values.") + } else if (length(val) == 0) { + "" + } else { + val + } + }) + }) + } + do.call("rbind", result_list) +} + + +# 2. Optimized data.table version +# This version now ensures its output has the exact same row order as the original function. +.compress_mat_datatable <- function(mat, line_grouping, collapse_method = c("nl", "unique")) { + dt <- as.data.table(mat) + dt[, `:=`(group_col = line_grouping)] + + # The `by` operation sorts the groups by default. + # We will reorder the result later to match the original function's output order. + if (collapse_method[1] == "nl") { + result_dt <- dt[, lapply(.SD, paste, collapse = "\n"), by = group_col] + } else { # "unique" method + unique_func <- function(col) { + val <- unique(col[nzchar(col)]) + if (length(val) > 1) stop("Problem in linegroupings! Some do not have the same values.") + if (length(val) == 0) "" else val + } + result_dt <- dt[, lapply(.SD, unique_func), by = group_col] + } + + # Reorder the results to match the original function's group order. + original_order <- unique(line_grouping) + setkeyv(result_dt, "group_col") # Set a key for fast matching + final_dt <- result_dt[J(original_order)] # Reorder using a join-like operation + + final_dt[, group_col := NULL] + as.matrix(final_dt) +} + + +# --- Data Generation --- + +# Using a slightly smaller dataset to prevent the fragile original function from erroring. +set.seed(42) # for reproducibility +n_rows <- 1000 +n_cols <- 10 +n_groups <- 100 + +line_grouping <- sample(1:n_groups, n_rows, replace = TRUE) + +strmat <- matrix( + paste0("s_", sample(1:(n_rows / 5), n_rows * n_cols, replace = TRUE)), + nrow = n_rows +) + +group_vals <- matrix( + paste0("f_", 1:(n_groups * n_cols)), + nrow = n_groups, + ncol = n_cols +) +frmat <- group_vals[line_grouping, ] +frmat[sample(length(frmat), size = n_rows * 0.1)] <- "" + + +# --- Verification --- +# We verify that the outputs of the optimized functions are identical to the original. +# We set the attributes to NULL to ensure we are only comparing the matrix data values. +cat("--- Verifying 'nl' method results are identical ---\n") +res_orig_nl <- .compress_mat_original(strmat, line_grouping, "nl") +res_base_nl <- .compress_mat_base_R(strmat, line_grouping, "nl") +res_dt_nl <- .compress_mat_datatable(strmat, line_grouping, "nl") + +test_that("Optimized 'nl' methods match original", { + # Set attributes to NULL as they are not needed and can differ. + attributes(res_orig_nl) <- NULL + attributes(res_base_nl) <- NULL + attributes(res_dt_nl) <- NULL + expect_equal(res_base_nl, res_orig_nl) + expect_equal(res_dt_nl, res_orig_nl) +}) +cat("Verification successful.\n\n") + + +cat("--- Verifying 'unique' method results are identical ---\n") +res_orig_unique <- .compress_mat_original(frmat, line_grouping, "unique") +res_base_unique <- .compress_mat_base_R(frmat, line_grouping, "unique") +res_dt_unique <- .compress_mat_datatable(frmat, line_grouping, "unique") + +test_that("Optimized 'unique' methods match original", { + attributes(res_orig_unique) <- NULL + attributes(res_base_unique) <- NULL + attributes(res_dt_unique) <- NULL + expect_equal(res_base_unique, res_orig_unique) + expect_equal(res_dt_unique, res_orig_unique) +}) +cat("Verification successful.\n\n") + + +# --- Benchmark --- + +cat("--- Benchmarking 'nl' (newline) method ---\n") +bm_nl <- microbenchmark( + original = .compress_mat_original(strmat, line_grouping, "nl"), + base_R = .compress_mat_base_R(strmat, line_grouping, "nl"), + datatable = .compress_mat_datatable(strmat, line_grouping, "nl"), + times = 10L +) +print(bm_nl) + +cat("\n\n--- Benchmarking 'unique' method ---\n") +bm_unique <- microbenchmark( + original = .compress_mat_original(frmat, line_grouping, "unique"), + base_R = .compress_mat_base_R(frmat, line_grouping, "unique"), + datatable = .compress_mat_datatable(frmat, line_grouping, "unique"), + times = 10L +) +print(bm_unique) diff --git a/man/vert_pag_indices.Rd b/man/vert_pag_indices.Rd index 4f5ad60f4..276bec98d 100644 --- a/man/vert_pag_indices.Rd +++ b/man/vert_pag_indices.Rd @@ -5,7 +5,7 @@ \title{Find column indices for vertical pagination} \usage{ vert_pag_indices( - obj, + mf, cpp = 40, colwidths = NULL, verbose = FALSE, @@ -16,7 +16,7 @@ vert_pag_indices( ) } \arguments{ -\item{obj}{(\code{ANY})\cr object to be paginated. Must have a \code{\link[=matrix_form]{matrix_form()}} method.} +\item{mf}{(\code{MatrixPrintForm})\cr object to be paginated.} \item{cpp}{(\code{numeric(1)})\cr number of characters per page (width).}