From da0b844c22a11f9bd99eb41b3be55c5346919456 Mon Sep 17 00:00:00 2001
From: Bernhard Meindl
Date: Thu, 22 Jan 2026 10:26:03 +0100
Subject: [PATCH] Setup Pkgdown; Standardize exported function names

---
 .Rbuildignore                        |  3 ++
 .github/workflows/check.yaml         |  2 +
 .github/workflows/pkgdown.yaml       | 35 ++++++++++++++
 .gitignore                           |  1 +
 DESCRIPTION                          |  2 +
 NAMESPACE                            |  8 ++--
 NEWS.md                              |  4 +-
 R/UrlScraper.R                       | 10 ++--
 R/UrlScraper_utils_html.R            | 72 ++++++++++++++--------------
 R/UrlScraper_utils_scrape.R          | 12 ++---
 R/parameter_manager.R                | 28 +++++------
 R/search_url.R                       | 24 +++++-----
 README.md                            |  8 ++--
 _pkgdown.yml                         | 47 ++++++++++++++++++
 tests/testthat/test-scraper-config.R | 12 ++---
 vignettes/Config.Rmd                 | 12 ++---
 vignettes/Intro.Rmd                  | 14 +++---
 17 files changed, 192 insertions(+), 102 deletions(-)
 create mode 100644 .github/workflows/pkgdown.yaml
 create mode 100644 _pkgdown.yml

diff --git a/.Rbuildignore b/.Rbuildignore
index a5a2e58..2b79998 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -4,3 +4,6 @@
 ^doc$
 ^Meta$
 ^\.github$
+^_pkgdown\.yml$
+^docs$
+^pkgdown$
diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml
index e5e346f..9aa1e1e 100644
--- a/.github/workflows/check.yaml
+++ b/.github/workflows/check.yaml
@@ -40,3 +40,5 @@ jobs:
         with:
           error-on: '"error"'
           check-dir: '"check"'
+        env:
+          _R_CHECK_VIGNETTES_CHECK_TITLE_: FALSE
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
new file mode 100644
index 0000000..3d63e9e
--- /dev/null
+++ b/.github/workflows/pkgdown.yaml
@@ -0,0 +1,35 @@
+name: pkgdown
+
+on:
+  workflow_run:
+    workflows: ["R-CMD-check"]
+    types:
+      - completed
+  workflow_dispatch:
+
+jobs:
+  pkgdown:
+    runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.conclusion == 'success' }}
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::pkgdown, local::.
+
+      - name: Build and Deploy Site
+        run: |
+          git config --local user.name "$GITHUB_ACTOR"
+          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
+          Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
diff --git a/.gitignore b/.gitignore
index 8bf50dd..fc742c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ man
 debug
 /doc/
 /Meta/
+docs
diff --git a/DESCRIPTION b/DESCRIPTION
index 87b32e7..e54d5ed 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -45,5 +45,7 @@ Suggests:
     yaml,
     devtools
 Roxygen: list(markdown = TRUE)
+URL: https://statistikat.github.io/taRantula, https://github.com/statistikat/taRantula
+BugReports: https://github.com/statistikat/taRantula/issues
 VignetteBuilder: knitr
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
index 65d0db3..de42aa9 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -5,10 +5,10 @@ export(buildQuery)
 export(cfg_googlesearch)
 export(cfg_scraper)
 export(check_links)
-export(extract_links)
-export(get_google_creds)
-export(params_googlesearch)
-export(params_scraper)
+export(extractLinks)
+export(getGoogleCreds)
+export(paramsGoogleSearch)
+export(paramsScraper)
 export(runGoogleSearch)
 export(searchURL)
 import(data.table)
diff --git a/NEWS.md b/NEWS.md
index 01806dd..6f79d47 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,8 +9,8 @@
 
 ### Configuration (`params_manager`)
 * **R6-based Config System**: Introduced a robust, hierarchical configuration system with strict validation logic.
-  * `params_scraper()`: Dedicated configuration for generic web crawling and JS rendering.
-  * `params_googlesearch()`: Tailored configuration for Google Search API interactions including rate-limit management.
+  * `paramsScraper()`: Dedicated configuration for generic web crawling and JS rendering.
+  * `paramsGoogleSearch()`: Tailored configuration for Google Search API interactions including rate-limit management.
 * **Deep Merging**: Configuration methods now support nested path updates (e.g., `cfg$set("selenium$host", ...)`).
 * **Validation**: Built-in defensive programming with type-checking for integers, booleans, character vectors, and directory paths.
 * **Export/Import functionality**: Added `$export()` and `$write_defaults()` methods to support YAML-based configuration round-trips.
diff --git a/R/UrlScraper.R b/R/UrlScraper.R
index a92e2ef..d4731e4 100644
--- a/R/UrlScraper.R
+++ b/R/UrlScraper.R
@@ -20,7 +20,7 @@
 #' * Regex‑based extraction of text from previously scraped HTML
 #'
 #' @section Configuration:
-#' A configuration object (typically created via `params_scraper()`) is
+#' A configuration object (typically created via [paramsScraper]) is
 #' expected to contain at least the following entries:
 #'
 #' * `db_file` – path to the DuckDB database file
@@ -41,7 +41,7 @@
 #'   - `robots_user_agent` – user agent string used for robots queries
 #' * `exclude_social_links` – logical, whether to exclude social media links
 #'
-#' The exact structure depends on `params_scraper()` and related helpers.
+#' The exact structure depends on [paramsScraper] and related helpers.
 #'
 #' @section Methods:
 #' * `initialize(config)` – create a new `UrlScraper` instance
@@ -64,7 +64,7 @@
 #' @examples
 #' \dontrun{
 #' # Create a default configuration object
-#' cfg <- params_scraper()
+#' cfg <- paramsScraper()
 #'
 #' # Example Selenium settings
 #' cfg$set("selenium$host", "localhost")
@@ -115,7 +115,7 @@ UrlScraper <- R6::R6Class(
     #' if present, and configures progress handlers.
     #'
     #' @param config A list (or configuration object) of settings, typically
-    #'   created by `params_scraper()`. It should include:
+    #'   created by [paramsScraper()]. It should include:
     #'   * `db_file` – path to the DuckDB database file.
     #'   * `snapshot_dir` – directory for snapshot files.
     #'   * `progress_dir` – directory for progress/log files.
@@ -320,7 +320,7 @@ UrlScraper <- R6::R6Class(
         ".write_snapshot",
         ".worker_scrape",
         ".scrape_single_url",
-        "extract_links",
+        "extractLinks",
         "check_links",
         "check_robotsdata",
         "query_robotsdata",
diff --git a/R/UrlScraper_utils_html.R b/R/UrlScraper_utils_html.R
index 6e4835a..0007223 100644
--- a/R/UrlScraper_utils_html.R
+++ b/R/UrlScraper_utils_html.R
@@ -2,22 +2,22 @@
 #'
 #' @description
 #' Extracts all valid hyperlinks from an HTML document and returns them as a
-#' cleaned and normalized `data.table`. 
+#' cleaned and normalized `data.table`.
 #' The function parses ``, ``, ``, and `` elements,
 #' resolves relative URLs, removes invalid or unwanted links, and enriches the
 #' output with metadata such as the source URL, extraction level, and timestamp.
 #'
 #' @details
 #' This extractor is designed for web‑scraping pipelines where only meaningful,
-#' navigable hyperlinks are desired. 
+#' navigable hyperlinks are desired.
 #' The function:
 #'
-#' * Converts inputs to an XML document when necessary 
-#' * Extracts link text and normalizes whitespace 
-#' * Resolves relative URLs against the provided `baseurl` 
-#' * Forces all URLs to use `https://` 
-#' * Removes invalid links using [`check_links()`] 
-#' * Ensures uniqueness of extracted links 
+#' * Converts inputs to an XML document when necessary
+#' * Extracts link text and normalizes whitespace
+#' * Resolves relative URLs against the provided `baseurl`
+#' * Forces all URLs to use `https://`
+#' * Removes invalid links using [`check_links()`]
+#' * Ensures uniqueness of extracted links
 #'
 #' @param doc A character string containing HTML or an `xml_document` object.
 #' @param baseurl Character string representing the URL from which the document
@@ -25,11 +25,11 @@
 #'
 #' @return
 #' A `data.table` containing the following columns:
-#' * `href` – Cleaned and validated absolute URLs 
-#' * `label` – Link text extracted from the anchor element 
-#' * `source_url` – The originating page from which links were extracted 
-#' * `level` – Extraction depth (always 0 for this function) 
-#' * `scraped_at` – Timestamp of extraction 
+#' * `href` – Cleaned and validated absolute URLs
+#' * `label` – Link text extracted from the anchor element
+#' * `source_url` – The originating page from which links were extracted
+#' * `level` – Extraction depth (always 0 for this function)
+#' * `scraped_at` – Timestamp of extraction
 #'
 #' Duplicate URLs are automatically removed.
 #'
 #' @examples
 #' html <- "About"
-#' extract_links(html, baseurl = "https://example.com")
-extract_links <- function(doc, baseurl) {
+#' extractLinks(html, baseurl = "https://example.com")
+extractLinks <- function(doc, baseurl) {
   href <- NULL
   if (!inherits(doc, "xml_document")) {
     doc <- rvest::read_html(doc)
   }
@@ -91,13 +91,13 @@
 #'
 #' @description
 #' Evaluates extracted URLs and determines which of them should be retained
-#' for further processing. 
+#' for further processing.
 #' The function filters out links that:
 #'
-#' * Do not belong to the same domain as `baseurl` 
-#' * Point to files such as images, audio, video, archives, executables, etc. 
-#' * Refer to fragments or anchor points 
-#' * Refer back to the same path as the main page 
+#' * Do not belong to the same domain as `baseurl`
+#' * Point to files such as images, audio, video, archives, executables, etc.
+#' * Refer to fragments or anchor points
+#' * Refer back to the same path as the main page
 #'
 #' @param hrefs Character vector of URLs to check.
 #' @param baseurl Character string giving the original page URL for domain and
@@ -190,7 +190,7 @@ check_links <- function(hrefs, baseurl) {
 #'
 #' @description
 #' Extracts the domain portion of URLs and optionally includes the scheme
-#' (`http://` or `https://`). 
+#' (`http://` or `https://`).
 #' The function removes common subdomains such as `www.` for consistency.
 #'
 #' @param x Character vector of URLs.
@@ -226,16 +226,16 @@ get_domain <- function(x, include_scheme = FALSE) {
 #'
 #' @description
 #' Converts an HTML document into a cleaned representation where scripts,
-#' styles, and similar elements are removed. 
+#' styles, and similar elements are removed.
 #' If `keep_only_text = TRUE`, the function returns only the visible text of
 #' the page.
 #'
 #' @details
 #' This helper is used to prepare HTML content for downstream text extraction.
 #' It:
-#' * Removes `