From 46c4317d48239bce2ba503636925e87fabd16637 Mon Sep 17 00:00:00 2001 From: Jack Penzer Date: Mon, 9 Dec 2024 11:00:02 +0000 Subject: [PATCH] Update the R-CMD-check.yaml to run on releases also - also need to check that this actually triggers in the CI/CD. Add the .Rds to the repo too --- .github/workflows/R-CMD-check.yaml | 2 ++ R/limpiar_pp.R | 17 ------------ _pkgdown.yml | 1 + man/data_param.Rd | 12 +++++++++ man/limpiar_alphanumeric.Rd | 33 +++++++++++++++++++++++ man/limpiar_emojis.Rd | 27 ------------------- man/limpiar_inspect.Rd | 19 +++++++++----- man/limpiar_link_click.Rd | 11 ++++---- man/limpiar_link_click_reverse.Rd | 1 - man/limpiar_non_ascii.Rd | 33 +++++++++++++++++++++++ man/limpiar_pp_companies.Rd | 8 ------ man/limpiar_pp_products.Rd | 9 ------- man/limpiar_recode_emojis.Rd | 42 ++++++++++++++++++++++++++++++ man/limpiar_remove_emojis.Rd | 34 ++++++++++++++++++++++++ man/limpiar_spam_grams.Rd | 4 +-- man/limpiar_stopwords.Rd | 6 ++--- man/limpiar_wrap.Rd | 32 +++++++++++++++++++++++ man/text_var.Rd | 12 +++++++++ 18 files changed, 224 insertions(+), 79 deletions(-) create mode 100644 man/data_param.Rd create mode 100644 man/limpiar_alphanumeric.Rd delete mode 100644 man/limpiar_emojis.Rd create mode 100644 man/limpiar_non_ascii.Rd create mode 100644 man/limpiar_recode_emojis.Rd create mode 100644 man/limpiar_remove_emojis.Rd create mode 100644 man/limpiar_wrap.Rd create mode 100644 man/text_var.Rd diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index d2edacb..b14cee8 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -5,6 +5,8 @@ on: branches: [main, master] pull_request: branches: [main, master] + release: + types: [published] schedule: # Runs at 15:20 PM UTC every Thursday - cron: '00 09 * * 0' diff --git a/R/limpiar_pp.R b/R/limpiar_pp.R index 83bde6d..7e89a1f 100644 --- a/R/limpiar_pp.R +++ b/R/limpiar_pp.R @@ -5,15 +5,6 @@ #' #' @return Data Frame or Tibble 
object with text variable edited inline #' @export -#' -#' @examples -#' \dontrun{ -#' Example 1 -#'df %>% limpiar_pp_products(message) -#' Example 2 -#'limpiar_pp_products(df, message) -#' } -#' limpiar_pp_products <- function(df, text_var){ entities <- LimpiaR::entities @@ -40,14 +31,6 @@ limpiar_pp_products <- function(df, text_var){ #' #' @return Data Frame or Tibble object with text variable edited inline #' @export -#' -#' @examples -#' \dontrun{ -#' Example 1 -#'df %>% limpiar_pp_companies(message) -#' Example 2 -#'limpiar_pp_companies(df, message) -#' } limpiar_pp_companies <- function(df, text_var){ companies <- c("\\bapple\\b", "\\bmicrosoft\\b", "\\bmsft\\b", "\\bnvidia\\b", "\\bsony\\b", "\\binstagram\\b", "\\bwhatsapp\\b", "duckduckgo", "\\bddg\\b", "\\bduck duck go\\b", "\\bsamsung\\b", "\\blenovo\\b", diff --git a/_pkgdown.yml b/_pkgdown.yml index 96c986e..c5f55ea 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -58,6 +58,7 @@ reference: - limpiar_na_cols - contains("limpiar_link_click") - limpiar_ex_subreddits + - limpiar_wrap - title: Processing Parts of Speech desc: > diff --git a/man/data_param.Rd b/man/data_param.Rd new file mode 100644 index 0000000..ff7377a --- /dev/null +++ b/man/data_param.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/params.R +\name{data_param} +\alias{data_param} +\title{Helper for consistent documentation of \code{data}} +\arguments{ +\item{data}{Name of your Data Frame or Tibble object} +} +\description{ +Use \verb{@inheritParams data_param} to consistently document \code{data}. 
+} +\keyword{internal} diff --git a/man/limpiar_alphanumeric.Rd b/man/limpiar_alphanumeric.Rd new file mode 100644 index 0000000..1cbaf33 --- /dev/null +++ b/man/limpiar_alphanumeric.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/limpiar_alphanumeric.R +\name{limpiar_alphanumeric} +\alias{limpiar_alphanumeric} +\title{Remove everything except letters, numbers, and spaces} +\usage{ +limpiar_alphanumeric(data, text_var = mention_content) +} +\arguments{ +\item{data}{Name of your Data Frame or Tibble object} + +\item{text_var}{Name of your text variable. Can be given as a 'string' or a symbol - should refer to a column inside \code{data}} +} +\value{ +Data frame with the text variable changed in place +} +\description{ +A simple regex for retaining only a-z, A-Z and 0-9 as well as white space characters, including new lines. This function \emph{will} remove accented characters, and any non-English characters, punctuation, etc. so it is a heavy-duty approach to cleaning and should be used prudently. If you know that you need to keep accents, try \code{limpiar_non_ascii} first, before avoiding these functions altogether. +} +\examples{ +test_df <- data.frame( +text = c( + "Simple text 123", # Basic ASCII only + "Hello! How are you? 
😊 🌟", # ASCII + punctuation + emojis + "café München niño", # Latin-1 accented characters + "#special@chars&(~)|[$]", # Special characters and symbols + "混合汉字と日本語 → ⌘ £€¥" # CJK characters + symbols + arrows +) +) + +limpiar_alphanumeric(test_df, text) + +} diff --git a/man/limpiar_emojis.Rd b/man/limpiar_emojis.Rd deleted file mode 100644 index 87d016f..0000000 --- a/man/limpiar_emojis.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/limpiar_emojis.R -\name{limpiar_emojis} -\alias{limpiar_emojis} -\title{Replace emojis with a textual description} -\usage{ -limpiar_emojis(df, text_var = mention_content, with_emoji_tag = FALSE) -} -\arguments{ -\item{df}{Name of Data Frame or Tibble Object} - -\item{text_var}{Name of text variable} - -\item{with_emoji_tag}{Whether to replace with snakecase linked words or not} -} -\value{ -The Data Frame or Tibble object with most emojis cleaned from the text variable -} -\description{ -Main usage is for pre-processing the text variable as part of Deep Learning pipeline. -The most important argument is whether or not to add the emoji tag, which will also print in snake case. -} -\examples{ -limpiar_examples \%>\% dplyr::select(mention_content) - -limpiar_examples \%>\% limpiar_emojis() \%>\% dplyr::select(mention_content) -} diff --git a/man/limpiar_inspect.Rd b/man/limpiar_inspect.Rd index c561d0b..b6bfad0 100644 --- a/man/limpiar_inspect.Rd +++ b/man/limpiar_inspect.Rd @@ -9,24 +9,31 @@ limpiar_inspect( pattern, text_var = mention_content, url_var = mention_url, - title = "inspect" + title = "inspect", + open_view = TRUE, + ignore_case = TRUE ) } \arguments{ -\item{data}{Data frame or tibble object} +\item{data}{Name of your Data Frame or Tibble object} \item{pattern}{Pattern you wish to inspect e.g. "link bio"} -\item{text_var}{Name of the text variable/character vector} +\item{text_var}{Name of your text variable. 
Can be given as a 'string' or a symbol - should refer to a column inside \code{data}} \item{url_var}{Name of the data frame's URL-column} \item{title}{Name of the viewable pane} + +\item{open_view}{For testing purposes, default is set to TRUE} + +\item{ignore_case}{Whether the pattern should ignore the upper case/lower case distinction} } \description{ -Produces a viewable data frame with posts matching a regular expression and their url. -Useful for investigating suspected spam posts, or other patterns of interest. -Set the name of the title to avoid new frames overwriting old ones. +Produces a viewable data frame with posts matching a regular expression and their url. Useful for investigating suspected spam posts, or other patterns of interest. Set the name of the title to avoid new frames overwriting old ones. +} +\details{ +Add boundary tags e.g. \verb{\\\\b} to either side of your pattern if you wish to only match words rather than parts of words. For example, \code{pattern="cats"} will match '#cats', but also 'catch up'. If we add a word boundary: \verb{pattern = \\\\bcats\\\\b} we won't match either '#cats' or 'catch up'. } \examples{ df <- data.frame( diff --git a/man/limpiar_link_click.Rd b/man/limpiar_link_click.Rd index 2953372..ae791a2 100644 --- a/man/limpiar_link_click.Rd +++ b/man/limpiar_link_click.Rd @@ -23,11 +23,10 @@ Make sure that DataTable is rendered with the argument 'escape = FALSE' or colum The function now checks if your url_var was a clickable link, and if it is then it won't add any new formatting. 
} \examples{ -\dontrun{ -Example 1: -df \%>\% limpiar_link_click(permalink) +df <- LimpiaR::limpiar_examples[1, ] +df["mention_url"] + +df <- df \%>\% limpiar_link_click(mention_url) +df["mention_url"] -Example 2: -limpiar_link_click(data, mention_url) -} } diff --git a/man/limpiar_link_click_reverse.Rd b/man/limpiar_link_click_reverse.Rd index f2e7712..5277c00 100644 --- a/man/limpiar_link_click_reverse.Rd +++ b/man/limpiar_link_click_reverse.Rd @@ -21,7 +21,6 @@ Undoes the effects of the limpiar_link_click function, giving you the original u df <- LimpiaR::limpiar_examples[1, ] df <- df \%>\% limpiar_link_click(mention_url) -df$mention_url df \%>\% limpiar_link_click_reverse(mention_url) } diff --git a/man/limpiar_non_ascii.Rd b/man/limpiar_non_ascii.Rd new file mode 100644 index 0000000..644f368 --- /dev/null +++ b/man/limpiar_non_ascii.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/limpiar_non_ascii.R +\name{limpiar_non_ascii} +\alias{limpiar_non_ascii} +\title{Remove non-ASCII characters except those with latin accents} +\usage{ +limpiar_non_ascii(data, text_var = mention_content) +} +\arguments{ +\item{data}{Name of your Data Frame or Tibble object} + +\item{text_var}{Name of your text variable. Can be given as a 'string' or a symbol - should refer to a column inside \code{data}} +} +\value{ +Data frame with the text variable changed in place +} +\description{ +Function uses a simple RegEx to retain only basic ASCII characters plus attempts to retain characters with latin accents. If you know that you want to remove everything including accented characters then you should use \code{limpiar_alphanumeric}. +} +\examples{ +test_df <- data.frame( +text = c( + "Simple text 123", # Basic ASCII only + "Hello! How are you? 
😊 🌟", # ASCII + punctuation + emojis + "café München niño", # Latin-1 accented characters + "#special@chars&(~)|[$]", # Special characters and symbols + "混合汉字と日本語 → ⌘ £€¥" # CJK characters + symbols + arrows +) +) + +limpiar_non_ascii(test_df, text) + +} diff --git a/man/limpiar_pp_companies.Rd b/man/limpiar_pp_companies.Rd index 2114607..ef0feb3 100644 --- a/man/limpiar_pp_companies.Rd +++ b/man/limpiar_pp_companies.Rd @@ -17,11 +17,3 @@ Data Frame or Tibble object with text variable edited inline \description{ Remove known companies for pits & peaks } -\examples{ -\dontrun{ -Example 1 -df \%>\% limpiar_pp_companies(message) -Example 2 -limpiar_pp_companies(df, message) -} -} diff --git a/man/limpiar_pp_products.Rd b/man/limpiar_pp_products.Rd index 6d10ee5..c1546f2 100644 --- a/man/limpiar_pp_products.Rd +++ b/man/limpiar_pp_products.Rd @@ -17,12 +17,3 @@ Data Frame or Tibble object with text variable edited inline \description{ Replace entities for the Peaks&Pit classifier } -\examples{ -\dontrun{ -Example 1 -df \%>\% limpiar_pp_products(message) -Example 2 -limpiar_pp_products(df, message) -} - -} diff --git a/man/limpiar_recode_emojis.Rd b/man/limpiar_recode_emojis.Rd new file mode 100644 index 0000000..3629dce --- /dev/null +++ b/man/limpiar_recode_emojis.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/limpiar_emojis.R +\name{limpiar_recode_emojis} +\alias{limpiar_recode_emojis} +\title{Recode emojis with a textual description} +\usage{ +limpiar_recode_emojis(data, text_var = mention_content, with_emoji_tag = FALSE) +} +\arguments{ +\item{data}{Name of your Data Frame or Tibble object} + +\item{text_var}{Name of your text variable. 
Can be given as a 'string' or a symbol - should refer to a column inside \code{data}} + +\item{with_emoji_tag}{Whether to replace with snakecase linked words or not} +} +\value{ +The Data Frame or Tibble object with most emojis cleaned from the text variable +} +\description{ +Main usage is for pre-processing the text variable as part of Deep Learning pipeline. The most important argument is whether or not to add the emoji tag, which will also print in snake case. +} +\examples{ + emojis <- data.frame( + text = c("Hello 👋 World", + "Family: 👨‍👩‍👧‍👦", + "Coding 👨🏽‍💻", + "Flags 🏳️‍🌈 🇺🇸", + "Weather ☀️ ⛈️ ❄️") +) + +emojis + +# Without tagging and combining: +limpiar_recode_emojis(emojis, text) + +# With tagging and combining: +limpiar_recode_emojis(emojis, text, TRUE) + +# using limpiar_remove_emojis() to remove them entirely: +limpiar_remove_emojis(emojis, text) + +} diff --git a/man/limpiar_remove_emojis.Rd b/man/limpiar_remove_emojis.Rd new file mode 100644 index 0000000..79d5a19 --- /dev/null +++ b/man/limpiar_remove_emojis.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/limpiar_emojis.R +\name{limpiar_remove_emojis} +\alias{limpiar_remove_emojis} +\title{Completely Remove \emph{Most} Emojis from Text} +\usage{ +limpiar_remove_emojis(data, text_var = mention_content) +} +\arguments{ +\item{data}{Name of your Data Frame or Tibble object} + +\item{text_var}{Name of your text variable. Can be given as a 'string' or a symbol - should refer to a column inside \code{data}} +} +\value{ +Data Frame with the text variable cleaned in place +} +\description{ +Uses a simple Regular Expression (RegEx) to clear most emojis from the text variable. Attempts to handle emojis which are joined together - like family emojis, and 'edited emojis' like those with skin tones etc. 
+} +\examples{ + + emojis <- data.frame( + text = c("Hello 👋 World", + "Family: 👨‍👩‍👧‍👦", + "Coding 👨🏽‍💻", + "Flags 🏳️‍🌈 🇺🇸", + "Weather ☀️ ⛈️ ❄️") +) + +emojis + +# using limpiar_remove_emojis() to remove them entirely: +limpiar_remove_emojis(emojis, text) +} diff --git a/man/limpiar_spam_grams.Rd b/man/limpiar_spam_grams.Rd index 90c672a..aaa3ef4 100644 --- a/man/limpiar_spam_grams.Rd +++ b/man/limpiar_spam_grams.Rd @@ -7,9 +7,9 @@ limpiar_spam_grams(data, text_var, n_gram, min_freq) } \arguments{ -\item{data}{Data frame or tibble object} +\item{data}{Name of your Data Frame or Tibble object} -\item{text_var}{Name of the text variable} +\item{text_var}{Name of your text variable. Can be given as a 'string' or a symbol - should refer to a column inside \code{data}} \item{n_gram}{Number of words in the n-gram i.e. n = 2 = bigram} diff --git a/man/limpiar_stopwords.Rd b/man/limpiar_stopwords.Rd index 3c5f2ad..4ed9177 100644 --- a/man/limpiar_stopwords.Rd +++ b/man/limpiar_stopwords.Rd @@ -4,12 +4,12 @@ \alias{limpiar_stopwords} \title{Clean stop words for visualisations} \usage{ -limpiar_stopwords(df, text_var = mention_content, stop_words) +limpiar_stopwords(data, text_var = mention_content, stop_words) } \arguments{ -\item{df}{Name of Data Frame or Tibble object} +\item{data}{Name of your Data Frame or Tibble object} -\item{text_var}{name of the text variable} +\item{text_var}{Name of your text variable. 
Can be given as a 'string' or a symbol - should refer to a column inside \code{data}} \item{stop_words}{"sentiment" or "topics" - sentiment retains negation cues} } diff --git a/man/limpiar_wrap.Rd b/man/limpiar_wrap.Rd new file mode 100644 index 0000000..40b051e --- /dev/null +++ b/man/limpiar_wrap.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/limpiar_wrap_strings.R +\name{limpiar_wrap} +\alias{limpiar_wrap} +\title{Wrap strings for visual ease} +\usage{ +limpiar_wrap( + data, + text_var = mention_content, + n = 15, + newline_char = "

" +) +} +\arguments{ +\item{data}{Name of your Data Frame or Tibble object} + +\item{text_var}{Name of your text variable. Can be given as a 'string' or a symbol - should refer to a column inside \code{data}} + +\item{n}{number of words} + +\item{newline_char}{the specific delimiter to wrap the texts with} +} +\value{ +Data Frame with text variable edited in place +} +\description{ +Useful for pre-processing a dataset in which you need to read many documents, or scan over a lot of documents, e.g. when rendering an interactive scatter plot and using plotly's hover, or when using \code{DT::datatable(escape = FALSE)}. +} +\examples{ + +limpiar_examples \%>\% limpiar_wrap(mention_content, n = 5, newline_char = "
") +} diff --git a/man/text_var.Rd b/man/text_var.Rd new file mode 100644 index 0000000..a3a78da --- /dev/null +++ b/man/text_var.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/params.R +\name{text_var} +\alias{text_var} +\title{Helper for consistent documentation of \code{text_var}} +\arguments{ +\item{text_var}{Name of your text variable. Can be given as a 'string' or a symbol - should refer to a column inside \code{data}} +} +\description{ +Use \verb{@inheritParams text_var} to consistently document \code{text_var}. +} +\keyword{internal}