Merged
Changes from all commits
3 changes: 3 additions & 0 deletions .Rbuildignore
@@ -4,3 +4,6 @@
^doc$
^Meta$
^\.github$
^_pkgdown\.yml$
^docs$
^pkgdown$
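These ignore entries match what the usual pkgdown scaffolding adds, so a hedged one-line equivalent of editing `.Rbuildignore` (and `.gitignore` below) by hand would be:

```r
# Assumed equivalent of the manual edits in this PR: usethis scaffolding
# typically adds the pkgdown-related ignores and creates _pkgdown.yml.
usethis::use_pkgdown()
```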
2 changes: 2 additions & 0 deletions .github/workflows/check.yaml
@@ -40,3 +40,5 @@ jobs:
        with:
          error-on: '"error"'
          check-dir: '"check"'
        env:
          _R_CHECK_VIGNETTES_CHECK_TITLE_: FALSE
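For reference, a rough local equivalent of this CI tweak — assuming the variable is meant to silence the vignette-title check during `R CMD check` — would be:

```r
# Sketch only: set the same environment variable locally before checking.
Sys.setenv("_R_CHECK_VIGNETTES_CHECK_TITLE_" = "FALSE")
devtools::check()
```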
35 changes: 35 additions & 0 deletions .github/workflows/pkgdown.yaml
@@ -0,0 +1,35 @@
name: pkgdown

on:
  workflow_run:
    workflows: ["R-CMD-check"]
    types:
      - completed
  workflow_dispatch:

jobs:
  pkgdown:
    runs-on: ubuntu-latest
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::pkgdown, local::.

      - name: Build and Deploy Site
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
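The workflow deploys the rendered site with `pkgdown::deploy_to_branch()`; a local preview of what it publishes can be built with (a sketch, assuming the default `docs/` destination):

```r
# Builds the site locally into docs/, which this PR adds to .gitignore
# and .Rbuildignore; deployment to gh-pages stays on CI.
pkgdown::build_site()
```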
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ man
debug
/doc/
/Meta/
docs
2 changes: 2 additions & 0 deletions DESCRIPTION
@@ -45,5 +45,7 @@ Suggests:
yaml,
devtools
Roxygen: list(markdown = TRUE)
URL: https://statistikat.github.io/taRantula, https://github.com/statistikat/taRantula
BugReports: https://github.com/statistikat/taRantula/issues
VignetteBuilder: knitr
Config/testthat/edition: 3
8 changes: 4 additions & 4 deletions NAMESPACE
@@ -5,10 +5,10 @@ export(buildQuery)
export(cfg_googlesearch)
export(cfg_scraper)
export(check_links)
export(extract_links)
export(get_google_creds)
export(params_googlesearch)
export(params_scraper)
export(extractLinks)
export(getGoogleCreds)
export(paramsGoogleSearch)
export(paramsScraper)
export(runGoogleSearch)
export(searchURL)
import(data.table)
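Since DESCRIPTION declares `Roxygen: list(markdown = TRUE)`, these export changes are presumably regenerated rather than hand-edited; after renaming the functions in `R/`, the NAMESPACE would be refreshed with:

```r
# Regenerates NAMESPACE (and Rd files) from the roxygen @export tags.
devtools::document()
```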
4 changes: 2 additions & 2 deletions NEWS.md
@@ -9,8 +9,8 @@

### Configuration (`params_manager`)
* **R6-based Config System**: Introduced a robust, hierarchical configuration system with strict validation logic.
* `params_scraper()`: Dedicated configuration for generic web crawling and JS rendering.
* `params_googlesearch()`: Tailored configuration for Google Search API interactions including rate-limit management.
* `paramsScraper()`: Dedicated configuration for generic web crawling and JS rendering.
* `paramsGoogleSearch()`: Tailored configuration for Google Search API interactions including rate-limit management.
* **Deep Merging**: Configuration methods now support nested path updates (e.g., `cfg$set("selenium$host", ...)`).
* **Validation**: Built-in defensive programming with type-checking for integers, booleans, character vectors, and directory paths.
* **Export/Import functionality**: Added `$export()` and `$write_defaults()` methods to support YAML-based configuration round-trips.
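A minimal sketch of the configuration round-trip the changelog describes; the `$export()` file argument is an assumption, not the documented signature:

```r
cfg <- paramsScraper()
cfg$set("selenium$host", "localhost")   # nested path update via deep merging
cfg$export("scraper-config.yaml")       # hypothetical argument; YAML round-trip per the changelog
```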
10 changes: 5 additions & 5 deletions R/UrlScraper.R
@@ -20,7 +20,7 @@
#' * Regex‑based extraction of text from previously scraped HTML
#'
#' @section Configuration:
#' A configuration object (typically created via `params_scraper()`) is
#' A configuration object (typically created via [paramsScraper]) is
#' expected to contain at least the following entries:
#'
#' * `db_file` – path to the DuckDB database file
@@ -41,7 +41,7 @@
#' - `robots_user_agent` – user agent string used for robots queries
#' * `exclude_social_links` – logical, whether to exclude social media links
#'
#' The exact structure depends on `params_scraper()` and related helpers.
#' The exact structure depends on [paramsScraper] and related helpers.
#'
#' @section Methods:
#' * `initialize(config)` – create a new `UrlScraper` instance
@@ -64,7 +64,7 @@
#' @examples
#' \dontrun{
#' # Create a default configuration object
#' cfg <- params_scraper()
#' cfg <- paramsScraper()
#'
#' # Example Selenium settings
#' cfg$set("selenium$host", "localhost")
@@ -115,7 +115,7 @@ UrlScraper <- R6::R6Class(
#' if present, and configures progress handlers.
#'
#' @param config A list (or configuration object) of settings, typically
#' created by `params_scraper()`. It should include:
#' created by [paramsScraper()]. It should include:
#' * `db_file` – path to the DuckDB database file.
#' * `snapshot_dir` – directory for snapshot files.
#' * `progress_dir` – directory for progress/log files.
@@ -320,7 +320,7 @@ UrlScraper <- R6::R6Class(
".write_snapshot",
".worker_scrape",
".scrape_single_url",
"extract_links",
"extractLinks",
"check_links",
"check_robotsdata",
"query_robotsdata",
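Tying the documented configuration keys and the `initialize(config)` method together, a hedged construction sketch (values illustrative, not package defaults):

```r
cfg <- paramsScraper()
cfg$set("selenium$host", "localhost")
scraper <- UrlScraper$new(config = cfg)  # R6 construction via the documented initialize(config)
```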
72 changes: 36 additions & 36 deletions R/UrlScraper_utils_html.R
@@ -2,43 +2,43 @@
#'
#' @description
#' Extracts all valid hyperlinks from an HTML document and returns them as a
#' cleaned and normalized `data.table`.
#' The function parses `<a>`, `<area>`, `<base>`, and `<link>` elements,
#' resolves relative URLs, removes invalid or unwanted links, and enriches the
#' output with metadata such as the source URL, extraction level, and timestamp.
#'
#' @details
#' This extractor is designed for web‑scraping pipelines where only meaningful,
#' navigable hyperlinks are desired.
#' The function:
#'
#' * Converts inputs to an XML document when necessary
#' * Extracts link text and normalizes whitespace
#' * Resolves relative URLs against the provided `baseurl`
#' * Forces all URLs to use `https://`
#' * Removes invalid links using [`check_links()`]
#' * Ensures uniqueness of extracted links
#'
#' @param doc A character string containing HTML or an `xml_document` object.
#' @param baseurl Character string representing the URL from which the document
#' originated. Used to resolve relative links and filter domains.
#'
#' @return
#' A `data.table` containing the following columns:
#' * `href` – Cleaned and validated absolute URLs
#' * `label` – Link text extracted from the anchor element
#' * `source_url` – The originating page from which links were extracted
#' * `level` – Extraction depth (always 0 for this function)
#' * `scraped_at` – Timestamp of extraction
#'
#' Duplicate URLs are automatically removed.
#'
#' @export
#'
#' @examples
#' html <- "<html><body><a href='/about'>About</a></body></html>"
#' extract_links(html, baseurl = "https://example.com")
extract_links <- function(doc, baseurl) {
#' extractLinks(html, baseurl = "https://example.com")
extractLinks <- function(doc, baseurl) {
href <- NULL
if (!inherits(doc, "xml_document")) {
doc <- rvest::read_html(doc)
@@ -91,13 +91,13 @@ extract_links <- function(doc, baseurl) {
#'
#' @description
#' Evaluates extracted URLs and determines which of them should be retained
#' for further processing.
#' The function filters out links that:
#'
#' * Do not belong to the same domain as `baseurl`
#' * Point to files such as images, audio, video, archives, executables, etc.
#' * Refer to fragments or anchor points
#' * Refer back to the same path as the main page
#'
#' @param hrefs Character vector of URLs to check.
#' @param baseurl Character string giving the original page URL for domain and
@@ -190,7 +190,7 @@ check_links <- function(hrefs, baseurl) {
#'
#' @description
#' Extracts the domain portion of URLs and optionally includes the scheme
#' (`http://` or `https://`).
#' The function removes common subdomains such as `www.` for consistency.
#'
#' @param x Character vector of URLs.
@@ -226,16 +226,16 @@ get_domain <- function(x, include_scheme = FALSE) {
#'
#' @description
#' Converts an HTML document into a cleaned representation where scripts,
#' styles, and similar elements are removed.
#' If `keep_only_text = TRUE`, the function returns only the visible text of
#' the page.
#'
#' @details
#' This helper is used to prepare HTML content for downstream text extraction.
#' It:
#' * Removes `<script>`, `<style>`, and `<noscript>` nodes
#' * Optionally extracts only visible text
#' * Supports both raw HTML input and already parsed XML documents
#'
#' @param doc Either HTML content as a character string or an
#' `xml_document`. `NA` inputs are returned unchanged.
@@ -291,37 +291,37 @@ parse_HTML <- function(doc, keep_only_text = FALSE) {
#'
#' @description
#' Applies a regular expression to previously scraped HTML documents, optionally
#' restricted to a specific capture group.
#' Each document is first cleaned using [`parse_HTML()`] to remove non‑text
#' content, ensuring reliable pattern extraction.
#'
#' @details
#' The function:
#'
#' * Cleans and normalizes each HTML document
#' * Converts text to lowercase when `ignore_cases = TRUE`
#' * Extracts all regex matches using `stringr::str_match_all()`
#' * Supports named or numbered capture groups
#' * Returns a unified `data.table` indexed by URL
#'
#' Named groups allow meaningful column labeling in the result.
#'
#' @param docs Character vector or list of HTML source documents.
#' @param urls Character vector of URLs corresponding to `docs`.
#' @param pattern A regular expression to search for.
#' @param group Optional capture group name or index to extract.
#' If `NULL`, the full match is returned.
#' @param ignore_cases Logical; if `TRUE`, performs case‑insensitive matching.
#'
#' @return
#' A `data.table` where each row corresponds to a match and includes:
#' * `url` – The originating document URL
#' * `pattern` (or the given group name) – Extracted values
#'
#' Missing matches are returned as `NA_character_`.
#'
#' @keywords internal
#'
#' @examples
#' \dontrun{
#' ## Extract email-like patterns:
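A slightly larger usage sketch of the exported helpers documented above (`extractLinks()` plus the filtering it delegates to `check_links()`); the expected output follows the documentation, not verified results:

```r
html <- "<html><body>
  <a href='/about'>About us</a>
  <a href='https://example.com/logo.png'>Logo</a>
  <a href='https://other.org/'>Elsewhere</a>
</body></html>"
links <- extractLinks(html, baseurl = "https://example.com")
links$href  # per the docs: absolute https:// URLs, with the image and off-domain links dropped
```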
12 changes: 6 additions & 6 deletions R/UrlScraper_utils_scrape.R
@@ -24,11 +24,11 @@
#' }
#'
#' @details
#' The function first checks robots.txt rules using `check_robotsdata()`.
#' If scraping is disallowed, a standardized record is returned.
#' When using Selenium, the browser is navigated to the URL and the potentially
#' redirected final URL is captured. For non-Selenium inputs, an HTTP GET request
#' is performed.
#' Errors during scraping are caught and converted into structured output.
#'
#'
@@ -84,7 +84,7 @@
url_redirect <- NA_character_
}

dt_links <- extract_links(
dt_links <- extractLinks(
doc = html_source,
baseurl = url
)
@@ -142,8 +142,8 @@
#' @details
#' The function iterates over provided URLs, invoking `.scrape_single_url()` for each.
#' Progress is logged to file, and optional snapshot files store intermediate results to
#' safeguard against worker interruptions.
#' When the stop file is detected, the worker terminates early.
#' Any remaining un-snapshotted results are written at the end of execution.
#'
#' @examples
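The worker described here is internal to `UrlScraper`, so the following is a conceptual sketch only — every name in it is illustrative — of the documented loop: iterate over URLs, convert errors into structured records, snapshot intermediate results, and stop early when a stop file appears.

```r
scrape_worker_sketch <- function(urls, scrape_one, snapshot_path, stop_file) {
  results <- list()
  for (u in urls) {
    if (file.exists(stop_file)) break                     # stop file detected: terminate early
    results[[u]] <- tryCatch(
      scrape_one(u),
      error = function(e) list(url = u, error = conditionMessage(e))  # structured error record
    )
    if (length(results) %% 10 == 0) saveRDS(results, snapshot_path)   # periodic snapshot
  }
  saveRDS(results, snapshot_path)                          # write remaining results at the end
  results
}
```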