diff --git a/NAMESPACE b/NAMESPACE index 181d46c..f11815a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,13 @@ export(pm_city_detect) export(pm_city_none) export(pm_city_parse) export(pm_city_std) +export(pm_country_all) +export(pm_country_any) +export(pm_country_detect) +export(pm_country_none) +export(pm_country_parse) +export(pm_country_std) +export(pm_country_trim) export(pm_dictionary) export(pm_has_address) export(pm_has_uid) diff --git a/R/build.R b/R/build.R index a7c4457..36afccd 100644 --- a/R/build.R +++ b/R/build.R @@ -162,7 +162,9 @@ pm_rebuild <- function(.data, start, end, new_address, include_commas = FALSE, endQN <- rlang::quo_name(rlang::enquo(end)) if (endQN == "end"){ - if ("pm.zip4" %in% names(.data) == TRUE){ + if ("pm.country" %in% names(.data) == TRUE) { + endQ <- rlang::quo(!! rlang::sym("pm.country")) + } else if ("pm.zip4" %in% names(.data) == TRUE){ endQ <- rlang::quo(!! rlang::sym("pm.zip4")) } else if ("pm.zip4" %in% names(.data) == FALSE){ endQ <- rlang::quo(!! rlang::sym("pm.zip")) @@ -239,7 +241,7 @@ pm_rebuild <- function(.data, start, end, new_address, include_commas = FALSE, "pm.houseFrac", "pm.houseSuf", "pm.preDir", "pm.street", "pm.streetSuf", "pm.sufDir", "pm.unitType", "pm.unitNum", "pm.city", - "pm.state", "pm.zip", "pm.zip4"), + "pm.state", "pm.zip", "pm.zip4", "pm.country"), stringsAsFactors = FALSE ) diff --git a/R/country.R b/R/country.R new file mode 100644 index 0000000..8298931 --- /dev/null +++ b/R/country.R @@ -0,0 +1,381 @@ +#' Does Country Dictionary Return Any Matches +#' +#' @description Determine whether the country dictionary returns any matches. +#' +#' @usage pm_country_any(.data, dictionary) +#' +#' @param .data A postmastr object created with \link{pm_prep} +#' @param dictionary Optional; a tbl created with \code{pm_dictionary} to be used +#' as a master list for countries. If none is specified, the full default +#' country dictionary will be used. 
+#' +#' @return A logical scalar is returned that is \code{TRUE} if the data contains at +#' least one country name or abbreviation in the given dictionary and \code{FALSE} +#' if it does not. +#' +#' @export +pm_country_any <- function(.data, dictionary){ + + # check for object and key variables + if (pm_has_uid(.data) == FALSE){ + stop("The variable 'pm.uid' is missing from the given object. Create a postmastr object with pm_identify and pm_prep before proceeding.") + } + + if (pm_has_address(.data) == FALSE){ + stop("The variable 'pm.address' is missing from the given object. Create a postmastr object with pm_prep before proceeding.") + } + + # detect countries, passing the custom dictionary along when one is supplied + if (missing(dictionary) == TRUE){ + .data <- pm_country_detect(.data) + } else if (missing(dictionary) == FALSE){ + .data <- pm_country_detect(.data, dictionary = dictionary) + } + + # create output + out <- any(.data$pm.hasCountry) + + # return output + return(out) + +} + +#' Does Country Dictionary Return a Match for All Observations +#' +#' @description Determine whether the country dictionary returns matches for all observations. +#' +#' @usage pm_country_all(.data, dictionary) +#' +#' @param .data A postmastr object created with \link{pm_prep} +#' @param dictionary Optional; a tbl created with \code{pm_dictionary} to be used +#' as a master list for countries. If none is specified, the full default +#' country dictionary will be used. +#' +#' @return A logical scalar is returned that is \code{TRUE} if the data contains a country +#' name or abbreviation for every observation in the data set and \code{FALSE} otherwise. +#' +#' @export +pm_country_all <- function(.data, dictionary){ + + # check for object and key variables + if (pm_has_uid(.data) == FALSE){ + stop("The variable 'pm.uid' is missing from the given object. Create a postmastr object with pm_identify and pm_prep before proceeding.") + } + + if (pm_has_address(.data) == FALSE){ + stop("The variable 'pm.address' is missing from the given object. 
Create a postmastr object with pm_prep before proceeding.") + } + + # detect countries, passing the custom dictionary along when one is supplied + if (missing(dictionary) == TRUE){ + .data <- pm_country_detect(.data) + } else if (missing(dictionary) == FALSE){ + .data <- pm_country_detect(.data, dictionary = dictionary) + } + + # create output + out <- all(.data$pm.hasCountry) + + # return output + return(out) + +} + +#' Detect Presence of Country +#' +#' @description Determine the presence of country names or abbreviations +#' at the end of a string. +#' +#' @usage pm_country_detect(.data, dictionary) +#' +#' @param .data A postmastr object created with \link{pm_prep} +#' @param dictionary Optional; a tbl created with \code{pm_dictionary} to be used +#' as a master list for countries. If none is specified, the full default +#' country dictionary will be used. +#' +#' @return A tibble with a new logical variable \code{pm.hasCountry} that is +#' \code{TRUE} if a country name or abbreviation from the given dictionary is +#' found at the end of the address and \code{FALSE} otherwise. +#' +#' @importFrom dplyr %>% +#' @importFrom dplyr mutate +#' @importFrom stringr str_c +#' @importFrom stringr str_detect +#' +#' @export +pm_country_detect <- function(.data, dictionary){ + + # create bindings for global variables + pm.address = NULL + + # check for object and key variables + if (pm_has_uid(.data) == FALSE){ + stop("The variable 'pm.uid' is missing from the given object. Create a postmastr object with pm_identify and pm_prep before proceeding.") + } + + if (pm_has_address(.data) == FALSE){ + stop("The variable 'pm.address' is missing from the given object. 
Create a postmastr object with pm_prep before proceeding.") + } + + # load dictionary if not specified + if (missing(dictionary) == TRUE){ + dictionary <- pm_dictionary(type = "country") + } + + # minimize dictionary + dict <- paste(dictionary$con.input, collapse = "|") + + # check observations + .data <- dplyr::mutate(.data, pm.hasCountry = stringr::str_detect(pm.address, + pattern = stringr::str_c("\\b(", dict, ")\\b$"))) + + # return output + return(.data) + +} + + +#' Return Only Unmatched Observations From pm_country_detect +#' +#' @description Automatically subset the results of \link{pm_country_detect} to +#' return only observations that were not found in the dictionary. +#' +#' @usage pm_country_none(.data, dictionary) +#' +#' @param .data A postmastr object created with \link{pm_prep} +#' @param dictionary Optional; a tbl created with \code{pm_dictionary} to be used +#' as a master list for countries. If none is specified, the full default +#' country dictionary will be used. +#' +#' @importFrom dplyr %>% +#' @importFrom dplyr filter +#' @importFrom dplyr select +#' +#' @export +pm_country_none <- function(.data, dictionary){ + + # global bindings + pm.hasCountry = NULL + + # check for object and key variables + if (pm_has_uid(.data) == FALSE){ + stop("The variable 'pm.uid' is missing from the given object. Create a postmastr object with pm_identify and pm_prep before proceeding.") + } + + if (pm_has_address(.data) == FALSE){ + stop("The variable 'pm.address' is missing from the given object. Create a postmastr object with pm_prep before proceeding.") + } + + # load dictionary if not specified + if (missing(dictionary) == TRUE){ + dictionary <- pm_dictionary(type = "country") + } + + # create output + .data %>% + pm_country_detect(dictionary = dictionary) %>% + dplyr::filter(pm.hasCountry == FALSE) %>% + dplyr::select(-pm.hasCountry) -> .data + + # return output + return(.data) + +} + +#' Parse Country +#' +#' @description Parse a country from a string. 
These data +#' should be at the end of the string (i.e. the last word or words). +#' +#' @usage pm_country_parse(.data, dictionary) +#' +#' @param .data A postmastr object created with \link{pm_prep} +#' @param dictionary Optional; a tbl created with \code{pm_dictionary} to be used +#' as a master list for countries. If none is specified, the full default +#' country dictionary will be used. +#' +#' @importFrom dplyr %>% +#' @importFrom dplyr mutate +#' @importFrom dplyr select +#' @importFrom stringr str_c +#' @importFrom stringr str_count +#' @importFrom stringr str_replace +#' @importFrom stringr word +#' +#' @export +pm_country_parse <- function(.data, dictionary){ + + # create bindings for global variables + pm.address = pm.country = NULL + + # check for object and key variables + if (pm_has_uid(.data) == FALSE){ + stop("The variable 'pm.uid' is missing from the given object. Create a postmastr object with pm_identify and pm_prep before proceeding.") + } + + if (pm_has_address(.data) == FALSE){ + stop("The variable 'pm.address' is missing from the given object. 
Create a postmastr object with pm_prep before proceeding.") + } + + # load dictionary if not specified + if (missing(dictionary) == TRUE){ + dictionary <- pm_dictionary(type = "country") + } + + # load dictionary if NULL + if (is.null(dictionary) == TRUE){ + dictionary <- pm_dictionary(type = "country") + } + + # minimize dictionary + dict <- paste(dictionary$con.input, collapse = "|") + + # parse countries + ## parse + .data <- dplyr::mutate(.data, pm.country = + stringr::str_extract(pm.address, + pattern = stringr::str_c("\\b(", dict, ")\\b$"))) + + ## clean address data + .data %>% + dplyr::mutate(pm.address = ifelse(is.na(pm.country) == FALSE, + stringr::word(pm.address, start = 1, end = -1-stringr::str_count(pm.country, pattern = "\\w+")), + pm.address)) %>% + pm_country_std(var = pm.country, dictionary = dictionary) -> .data + + # re-order data + vars <- pm_reorder(.data) + .data <- dplyr::select(.data, vars) + + # return output + return(.data) + +} + +#' Standardize Parsed Countries +#' +#' @description Convert countries to USPS preferred two-letter abbreviation. +#' +#' @usage pm_country_std(.data, var, dictionary) +#' +#' @param .data A postmastr object created with \link{pm_prep} +#' @param var A character variable that may contain countries +#' @param dictionary Optional; a tbl created with \code{pm_dictionary} to be used +#' as a master list for countries. If none is specified, the full default +#' country dictionary will be used. +#' +#' @importFrom dplyr %>% +#' @importFrom dplyr left_join +#' @importFrom dplyr mutate +#' @importFrom dplyr select +#' @importFrom dplyr rename +#' @importFrom rlang := +#' @importFrom rlang enquo +#' @importFrom rlang quo +#' @importFrom rlang sym +#' +#' @export +pm_country_std <- function(.data, var, dictionary){ + + # global variables + . 
= con.input = con.output = NULL + + # save parameters to list + paramList <- as.list(match.call()) + + # unquote + if (!is.character(paramList$var)) { + varQ <- rlang::enquo(var) + } else if (is.character(paramList$var)) { + varQ <- rlang::quo(!! rlang::sym(var)) + } + + varQN <- rlang::quo_name(rlang::enquo(var)) + + # load dictionary if not specified + if (missing(dictionary) == TRUE){ + dictionary <- pm_dictionary(type = "country") + } + + # modify dictionary + dictionary %>% + dplyr::rename(!!varQ := con.input) -> dictionary + + # standardize country names + .data %>% + dplyr::left_join(., dictionary, by = varQN) %>% + dplyr::mutate(!!varQ := ifelse(is.na(con.output) == FALSE, con.output, !!varQ)) %>% + dplyr::select(-con.output) -> out + + # return output + return(out) + +} + +#' Trim Country +#' +#' @description Remove a country from an address without parsing. These data +#' should be at the end of the string (i.e. the last word or words). +#' +#' @usage pm_country_trim(.data, dictionary) +#' +#' @param .data A postmastr object created with \link{pm_prep} +#' @param dictionary Optional; a tbl created with \code{pm_dictionary} to be used +#' as a master list for countries. If none is specified, the full default +#' country dictionary will be used. +#' +#' @importFrom dplyr %>% +#' @importFrom dplyr mutate +#' @importFrom dplyr select +#' @importFrom stringr str_c +#' @importFrom stringr str_count +#' @importFrom stringr str_replace +#' @importFrom stringr word +#' +#' @export +pm_country_trim <- function(.data, dictionary){ + + # create bindings for global variables + pm.address = pm.country = NULL + + # check for object and key variables + if (pm_has_uid(.data) == FALSE){ + stop("The variable 'pm.uid' is missing from the given object. Create a postmastr object with pm_identify and pm_prep before proceeding.") + } + + if (pm_has_address(.data) == FALSE){ + stop("The variable 'pm.address' is missing from the given object. 
Create a postmastr object with pm_prep before proceeding.") + } + + # load dictionary if not specified + if (missing(dictionary) == TRUE){ + dictionary <- pm_dictionary(type = "country") + } + + # load dictionary if NULL + if (is.null(dictionary) == TRUE){ + dictionary <- pm_dictionary(type = "country") + } + + # minimize dictionary + dict <- paste(dictionary$con.input, collapse = "|") + + # parse countries + ## parse + .data <- dplyr::mutate(.data, pm.country = + stringr::str_extract(pm.address, + pattern = stringr::str_c("\\b(", dict, ")\\b$"))) + + ## clean address data + .data %>% + dplyr::mutate(pm.address = ifelse(is.na(pm.country) == FALSE, + stringr::word(pm.address, start = 1, end = -1-stringr::str_count(pm.country, pattern = "\\w+")), + pm.address)) -> .data + + # re-order data + .data <- dplyr::select(.data, -pm.country) + + # return output + return(.data) + +} diff --git a/R/dictionary.R b/R/dictionary.R index 8199089..472b55d 100644 --- a/R/dictionary.R +++ b/R/dictionary.R @@ -21,7 +21,7 @@ #' @usage pm_dictionary(type, append, filter, case = c("title", "lower", "upper"), locale = "us") #' #' @param type A string indicating the grammatical address element the dictionary -#' should represent. Current options are \code{"state"}, \code{"city"}, +#' should represent. Current options are \code{"country"}, \code{"state"}, \code{"city"}, #' \code{"directional"}, and \code{"suffix"}. 
#' @param append An optional dictionary appendix object created with \code{\link{pm_append}} #' @param filter An optional character scalar or vector with output elements that should @@ -139,6 +139,20 @@ pm_dictionary <- function(type, append, filter, case = c("title", "lower", "uppe out <- pm_case(working, locale = locale, type = type, case = case) + } else if (type == "country"){ + + if (missing(append) == FALSE & missing(filter) == FALSE){ + working <- pm_dictionary_country(append = append, filter = filter) + } else if (missing(append) == FALSE & missing(filter) == TRUE){ + working <- pm_dictionary_country(append = append) + } else if (missing(append) == TRUE & missing(filter) == FALSE){ + working <- pm_dictionary_country(filter = filter) + } else if (missing(append) == TRUE & missing(filter) == TRUE){ + working <- pm_dictionary_country() + } + + out <- pm_case(working, locale = locale, type = type, case = case) + } } @@ -333,6 +347,36 @@ pm_dictionary_us_suffix <- function(append, filter){ } +# country names +pm_dictionary_country <- function(append, filter){ + + # global bindings + con.output = NULL + + # load data + out <- postmastr::dic_country + + # optionally append + if (missing(append) == FALSE){ + + # bind rows + out <- dplyr::bind_rows(out, append) + + # re-order observations + out <- out[order(out$con.output),] + + } + + # optionally filter + if (missing(filter) == FALSE){ + out <- dplyr::filter(out, con.output %in% filter) + } + + # return output + return(out) + +} + # Dictionary Case pm_case <- function(.data, locale, type, case){ @@ -344,6 +388,8 @@ pm_case <- function(.data, locale, type, case){ out <- pm_convert_case(.data, var = "dir.input", orderVar = "dir.output", case = case) } else if (type == "suffix"){ out <- pm_convert_case(.data, var = "suf.input", orderVar = "suf.output", case = case) + } else if (type == "country"){ + out <- pm_convert_case(.data, var = "con.input", orderVar = "con.output", case = case) } } @@ -405,7 +451,7 @@ 
pm_convert_case <- function(.data, var, orderVar, case){ #' @usage pm_append(type, input, output, locale = "us") #' #' @param type A string indicating the grammatical address element the dictionary -#' should represent. Current options are \code{"state"}, \code{"city"}, +#' should represent. Current options are \code{"country"}, \code{"state"}, \code{"city"}, #' \code{"street"}, \code{"house suffix"}, \code{"directional"}, and \code{"suffix"}. #' @param input A character scalar or vector containing possible terms existing in #' the data. This should be the same length as \code{output}. @@ -533,6 +579,15 @@ pm_append <- function(type, input, output, locale = "us"){ # re-order observations out <- out[order(out$houseSuf.input),] + } else if (type == "country"){ + + out <- dplyr::tibble( + con.output = c(output), + con.input = c(input)) + + # re-order observations by the country input column + out <- out[order(out$con.input),] + } } @@ -610,3 +665,22 @@ pm_append <- function(type, input, output, locale = "us"){ #' head(dic_us_suffix) #' "dic_us_suffix" + +#' Country Dictionary +#' +#' @description A list of abbreviations for the United States. +#' +#' @docType data +#' +#' @usage data(dic_country) +#' +#' @format A tibble with 502 rows and 2 variables: +#' \describe{ +#' \item{con.output}{standard output} +#' \item{con.input}{full names and abbreviations} +#' } +#' +#' @examples +#' head(dic_country) +#' +"dic_country" diff --git a/R/parse.R b/R/parse.R index 3262f5e..ed3ab70 100644 --- a/R/parse.R +++ b/R/parse.R @@ -2,7 +2,8 @@ #' #' @description A wrapper around the parse functions that can be used to shorten all #' of \code{postmastr}'s core code down to a single function call once dictionaries -#' have been created and tested against the data. +#' have been created and tested against the data. By default, any country indicators +#' like "US" or "USA" will be removed from output addresses. 
#' #' @usage pm_parse(.data, input, address, output, new_address, ordinal = TRUE, #' unnest = FALSE, include_commas = FALSE, include_unit = TRUE, @@ -205,6 +206,7 @@ pm_parse <- function(.data, input, address, output, new_address, ordinal = TRUE, source %>% pm_prep(var = "address") %>% + pm_country_trim() %>% pm_postal_parse(locale = locale) %>% pm_state_parse(dictionary = state_dict, locale = locale) %>% pm_city_parse(dictionary = city_dict, locale = locale) %>% @@ -224,6 +226,7 @@ pm_parse <- function(.data, input, address, output, new_address, ordinal = TRUE, source %>% pm_prep(var = "address") %>% + pm_country_trim() %>% pm_postal_parse(locale = locale) %>% pm_state_parse(dictionary = state_dict, locale = locale) %>% pm_city_parse(dictionary = city_dict, locale = locale) %>% diff --git a/R/utils.R b/R/utils.R index baff4ed..bb96dcf 100644 --- a/R/utils.R +++ b/R/utils.R @@ -12,7 +12,7 @@ pm_reorder <- function(.data, locale = "us"){ "pm.hasHouseFrac", "pm.houseFrac", "pm.hasAlpha", "pm.hasHouseSuf", "pm.houseSuf", "pm.hasDir", "pm.preDir", "pm.street", "pm.hasStreetSuf", "pm.streetSuf", "pm.sufDir", "pm.hasUnit", "pm.unitType", "pm.unitNum", "pm.hasCity", "pm.city", - "pm.hasState", "pm.state", "pm.hasZip", "pm.zip", "pm.zip4"), + "pm.hasState", "pm.state", "pm.hasZip", "pm.zip", "pm.zip4", "pm.hasCountry", "pm.country"), stringsAsFactors = FALSE ) diff --git a/_pkgdown.yml b/_pkgdown.yml index 55296e3..096635e 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -27,6 +27,17 @@ reference: - pm_has_address - pm_has_uid + - title: "Countries" + desc: "Work with Country Names and Abbreviations" + contents: + - pm_country_any + - pm_country_all + - pm_country_detect + - pm_country_none + - pm_country_parse + - pm_country_trim + - pm_country_std + - title: "Postal Codes" desc: "Work with Postal (Zip) Codes" contents: diff --git a/data/dic_country.rda b/data/dic_country.rda new file mode 100644 index 0000000..6d80a5b Binary files /dev/null and b/data/dic_country.rda 
differ diff --git a/docs/articles/postmastr.html b/docs/articles/postmastr.html index d1c1f3e..97b0dd8 100644 --- a/docs/articles/postmastr.html +++ b/docs/articles/postmastr.html @@ -75,7 +75,7 @@
vignettes/postmastr.Rmd
postmastr.RmdA list of abbreviations for the United States.
+ +data(dic_country)+ +
A tibble with 502 rows and 2 variables:
standard output
full names and abbreviations
+head(dic_country)#> # A tibble: 6 x 2 +#> con.output con.input +#> <chr> <chr> +#> 1 US US +#> 2 US USA +#> 3 US United States +#> 4 US United States of America +#> 5 US The United States of America +#> 6 US America+
Validate postmastr pm.uid Variable
Work with Country Names and Abbreviations
+Does Country Dictionary Return Any Matches
Does Country Dictionary Return a Match for All Observations
Detect Presence of Country
Return Only Unmatched Observations From pm_country_detect
Parse Country
Trim Country
Standardize Parsed Countries
A string indicating the grammatical address element the dictionary
-should represent. Current options are "state", "city",
+should represent. Current options are "country", "state", "city",
"street", "house suffix", "directional", and "suffix".
R/country.R
+ pm_country_all.RdDetermine whether the country dictionary returns matches for all observations.
+ +pm_country_all(.data, dictionary)+ +
| .data | +A postmastr object created with pm_prep |
+
|---|---|
| dictionary | +Optional; a tbl created with |
+
A logical scalar is returned that is TRUE if the data contains a country
+ name or abbreviation for every observation in the data set and FALSE otherwise.
Determine whether the country dictionary returns any matches.
+ +pm_country_any(.data, dictionary)+ +
| .data | +A postmastr object created with pm_prep |
+
|---|---|
| dictionary | +Optional; a tbl created with |
+
A logical scalar is returned that is TRUE if the data contains at
+ least one country name or abbreviation in the given dictionary and FALSE
+ if they do not.
Determine the presence of country names or abbreviations + at the end of a string.
+ +pm_country_detect(.data, dictionary)+ +
| .data | +A postmastr object created with pm_prep |
+
|---|---|
| dictionary | +Optional; a tbl created with |
+
A tibble with a new logical variable pm.hasCountry that is
+ TRUE if a country name or abbreviation from the given dictionary is
+ found at the end of the address and FALSE otherwise.
R/country.R
+ pm_country_none.RdAutomatically subset the results of pm_country_detect to + return only observations that were not found in the dictionary.
+ +pm_country_none(.data, dictionary)+ +
| .data | +A postmastr object created with pm_prep |
+
|---|---|
| dictionary | +Optional; a tbl created with |
+
Parse a country from a string. These data + should be at the end of the string (i.e. the last word or words).
+ +pm_country_parse(.data, dictionary)+ +
| .data | +A postmastr object created with pm_prep |
+
|---|---|
| dictionary | +Optional; a tbl created with |
+
Convert countries to USPS preferred two-letter abbreviation.
+ +pm_country_std(.data, var, dictionary)+ +
| .data | +A postmastr object created with pm_prep |
+
|---|---|
| var | +A character variable that may contain countries |
+
| dictionary | +Optional; a tbl created with |
+
Remove a country from an address without parsing. These data + should be at the end of the string (i.e. the last word or words).
+ +pm_country_trim(.data, dictionary)+ +
| .data | +A postmastr object created with pm_prep |
+
|---|---|
| dictionary | +Optional; a tbl created with |
+
A string indicating the grammatical address element the dictionary
-should represent. Current options are "state", "city",
+should represent. Current options are "country", "state", "city",
"directional", and "suffix".
A wrapper around the parse functions that can be used to shorten all
of postmastr's core code down to a single function call once dictionaries
- have been created and tested against the data.