From 9a668ace622a08237886897933a9d3cf4ee75946 Mon Sep 17 00:00:00 2001 From: Stefan Vriend Date: Fri, 25 Oct 2024 14:55:09 +0200 Subject: [PATCH 01/10] Write functions to get English common name from Wikidata --- R/get-common-name.R | 65 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 R/get-common-name.R diff --git a/R/get-common-name.R b/R/get-common-name.R new file mode 100644 index 0000000..f08c2b0 --- /dev/null +++ b/R/get-common-name.R @@ -0,0 +1,65 @@ +# Function to retrieve a species' English common name from Wikidata + +# Author: Stefan Vriend +# Created: 2024-10-24 +# Last updated: 2024-10-25 + + +# Load packages ----------------------------------------------------------- + +library(WikidataQueryServiceR) +library(WikidataR) + + +# Function ---------------------------------------------------------------- + +# Arguments +# sci_name: Character specifying one or more scientific names + +get_common_name <- function(sci_name) { + + # Query common name from Wikidata + query <- paste0(' + SELECT + ?item ?common_name + WHERE { + ?item wdt:P225 ?scientific_name; + wdt:P1843 ?common_name. + + FILTER(LANGMATCHES(LANG(?common_name), "en")) + + FILTER(lcase(str(?scientific_name)) IN (', + '"', tolower(sci_name), '"', + ')) + + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
}} + ') + + common_name <- WikidataQueryServiceR::query_wikidata(sparql_query = query, + format = "smart") %>% + dplyr::pull("common_name") |> + stringr::str_to_sentence() |> + unique() + + return(common_name) + +} + +# Alternative ------------------------------------------------------------- + +get_common_name2 <- function(sci_name) { + + # Query common name from Wikidata + item <- WikidataR::find_item(sci_name) |> + purrr::flatten() + + wiki <- WikidataR::get_item(item$id) + + wiki[[1]]$claims$P1843 |> + purrr::pluck("mainsnak", "datavalue", "value") |> + dplyr::filter(language == "en") |> + dplyr::pull(text) |> + stringr::str_to_sentence() |> + unique() + +} From 923742354d9058a64f2d8102fce4ff27413103cc Mon Sep 17 00:00:00 2001 From: Stefan Vriend Date: Fri, 25 Oct 2024 15:31:24 +0200 Subject: [PATCH 02/10] Update functions to work on vectors of scientific names and update speed of SPARQL query --- R/get-common-name.R | 56 ++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/R/get-common-name.R b/R/get-common-name.R index f08c2b0..5d888df 100644 --- a/R/get-common-name.R +++ b/R/get-common-name.R @@ -10,34 +10,33 @@ library(WikidataQueryServiceR) library(WikidataR) +# Get English common name through {WikidataQueryServiceR} ----------------- -# Function ---------------------------------------------------------------- +# Note: this code uses SPARQL (an RDF-query language) # Arguments # sci_name: Character specifying one or more scientific names get_common_name <- function(sci_name) { - # Query common name from Wikidata + # SPARQL query to select common name from Wikidata query <- paste0(' SELECT ?item ?common_name WHERE { - ?item wdt:P225 ?scientific_name; + ?item wdt:P225', '"', sci_name, '"', '; wdt:P1843 ?common_name. 
FILTER(LANGMATCHES(LANG(?common_name), "en")) - FILTER(lcase(str(?scientific_name)) IN (', - '"', tolower(sci_name), '"', - ')) - SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} ') + # Send query to Wikidata query service and extract English common name common_name <- WikidataQueryServiceR::query_wikidata(sparql_query = query, - format = "smart") %>% - dplyr::pull("common_name") |> + format = "smart") |> + purrr::map("common_name") |> + purrr::as_vector() |> stringr::str_to_sentence() |> unique() @@ -45,21 +44,32 @@ get_common_name <- function(sci_name) { } -# Alternative ------------------------------------------------------------- +# Get English common name through {WikidataR} ----------------------------- + +# Arguments +# sci_name: Character specifying one or more scientific names get_common_name2 <- function(sci_name) { - # Query common name from Wikidata - item <- WikidataR::find_item(sci_name) |> - purrr::flatten() - - wiki <- WikidataR::get_item(item$id) + purrr::map_chr(.x = sci_name, + .f = ~{ + + # Search Wikidata item corresponding to scientific name + item <- WikidataR::find_item(.x) |> + purrr::flatten() + + # Retrieve properties from Wikidata item + wiki <- WikidataR::get_item(item$id) + + # Extract English common name + wiki[[1]]$claims$P1843 |> + purrr::pluck("mainsnak", "datavalue", "value") |> + dplyr::filter(language == "en") |> + dplyr::pull("text") |> + purrr::as_vector() |> + stringr::str_to_sentence() |> + unique() + + }) - wiki[[1]]$claims$P1843 |> - purrr::pluck("mainsnak", "datavalue", "value") |> - dplyr::filter(language == "en") |> - dplyr::pull(text) |> - stringr::str_to_sentence() |> - unique() - -} +} \ No newline at end of file From 5d8e415a8372be94c0e8ab1961bf8dd6b672da5f Mon Sep 17 00:00:00 2001 From: Stefan Vriend Date: Fri, 25 Oct 2024 15:44:48 +0200 Subject: [PATCH 03/10] Comment-out alt version of function This version of the function was created because the SPARQL-version was slow, but now 
that its performance is increased, the alt version becomes obsolete --- R/get-common-name.R | 53 +++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/R/get-common-name.R b/R/get-common-name.R index 5d888df..aa57747 100644 --- a/R/get-common-name.R +++ b/R/get-common-name.R @@ -8,7 +8,7 @@ # Load packages ----------------------------------------------------------- library(WikidataQueryServiceR) -library(WikidataR) +#library(WikidataR) # Get English common name through {WikidataQueryServiceR} ----------------- @@ -44,32 +44,33 @@ get_common_name <- function(sci_name) { } -# Get English common name through {WikidataR} ----------------------------- + +# (alternative) Get English common name through {WikidataR} --------------- # Arguments # sci_name: Character specifying one or more scientific names -get_common_name2 <- function(sci_name) { - - purrr::map_chr(.x = sci_name, - .f = ~{ - - # Search Wikidata item corresponding to scientific name - item <- WikidataR::find_item(.x) |> - purrr::flatten() - - # Retrieve properties from Wikidata item - wiki <- WikidataR::get_item(item$id) - - # Extract English common name - wiki[[1]]$claims$P1843 |> - purrr::pluck("mainsnak", "datavalue", "value") |> - dplyr::filter(language == "en") |> - dplyr::pull("text") |> - purrr::as_vector() |> - stringr::str_to_sentence() |> - unique() - - }) - -} \ No newline at end of file +# get_common_name2 <- function(sci_name) { +# +# purrr::map_chr(.x = sci_name, +# .f = ~{ +# +# # Search Wikidata item corresponding to scientific name +# item <- WikidataR::find_item(.x) |> +# purrr::flatten() +# +# # Retrieve properties from Wikidata item +# wiki <- WikidataR::get_item(item$id) +# +# # Extract English common name +# wiki[[1]]$claims$P1843 |> +# purrr::pluck("mainsnak", "datavalue", "value") |> +# dplyr::filter(language == "en") |> +# dplyr::pull("text") |> +# purrr::as_vector() |> +# stringr::str_to_sentence() |> +# unique() +# +# }) +# +# } 
\ No newline at end of file From b58c23f90b3bf87b1695c5a60101f87e8c62d50c Mon Sep 17 00:00:00 2001 From: Stefan Vriend Date: Fri, 25 Oct 2024 15:50:55 +0200 Subject: [PATCH 04/10] Add missing packages --- R/get-common-name.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/get-common-name.R b/R/get-common-name.R index aa57747..0f674b5 100644 --- a/R/get-common-name.R +++ b/R/get-common-name.R @@ -8,6 +8,8 @@ # Load packages ----------------------------------------------------------- library(WikidataQueryServiceR) +library(purrr) +library(stringr) #library(WikidataR) # Get English common name through {WikidataQueryServiceR} ----------------- From 01850bc96b0d27122ec5820f3c35741d129f443d Mon Sep 17 00:00:00 2001 From: Stefan Vriend Date: Fri, 25 Oct 2024 17:49:34 +0200 Subject: [PATCH 05/10] Update get-common-name to run both on a single value and a vector of values --- R/get-common-name.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/get-common-name.R b/R/get-common-name.R index 0f674b5..45594d1 100644 --- a/R/get-common-name.R +++ b/R/get-common-name.R @@ -8,7 +8,7 @@ # Load packages ----------------------------------------------------------- library(WikidataQueryServiceR) -library(purrr) +library(dplyr) library(stringr) #library(WikidataR) @@ -37,8 +37,8 @@ get_common_name <- function(sci_name) { # Send query to Wikidata query service and extract English common name common_name <- WikidataQueryServiceR::query_wikidata(sparql_query = query, format = "smart") |> - purrr::map("common_name") |> - purrr::as_vector() |> + dplyr::bind_rows() |> + dplyr::pull("common_name") |> stringr::str_to_sentence() |> unique() From b6d3037f243e79c40862afb0776e39dd2e03ad69 Mon Sep 17 00:00:00 2001 From: Stefan Vriend Date: Mon, 28 Oct 2024 17:42:39 +0100 Subject: [PATCH 06/10] Add option select common name in different languages --- R/get-common-name.R | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git 
a/R/get-common-name.R b/R/get-common-name.R index 45594d1..9fefa25 100644 --- a/R/get-common-name.R +++ b/R/get-common-name.R @@ -2,7 +2,7 @@ # Author: Stefan Vriend # Created: 2024-10-24 -# Last updated: 2024-10-25 +# Last updated: 2024-10-28 # Load packages ----------------------------------------------------------- @@ -12,35 +12,36 @@ library(dplyr) library(stringr) #library(WikidataR) -# Get English common name through {WikidataQueryServiceR} ----------------- +# Get common name through {WikidataQueryServiceR} ----------------- # Note: this code uses SPARQL (an RDF-query language) # Arguments # sci_name: Character specifying one or more scientific names +# lang: Character specifying one or more languages the common name -get_common_name <- function(sci_name) { +get_common_name <- function(sci_name, + lang = "en") { # SPARQL query to select common name from Wikidata query <- paste0(' - SELECT - ?item ?common_name + SELECT DISTINCT + ?item ?common_name (LANG(?common_name) AS ?lang) WHERE { ?item wdt:P225', '"', sci_name, '"', '; wdt:P1843 ?common_name. - FILTER(LANGMATCHES(LANG(?common_name), "en")) + FILTER(LANGMATCHES(LANG(?common_name), ', '"', lang, '")) SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
}} ') - # Send query to Wikidata query service and extract English common name + # Send query to Wikidata query service and extract common name common_name <- WikidataQueryServiceR::query_wikidata(sparql_query = query, format = "smart") |> dplyr::bind_rows() |> - dplyr::pull("common_name") |> - stringr::str_to_sentence() |> - unique() + dplyr::mutate(common_name = stringr::str_to_title(common_name)) |> + dplyr::distinct() return(common_name) From 489fe793105548c950943e3eb2fb2aa14b8d6782 Mon Sep 17 00:00:00 2001 From: Cherine Jantzen Date: Mon, 11 Nov 2024 14:31:48 +0100 Subject: [PATCH 07/10] Fix typo --- R/get-common-name.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get-common-name.R b/R/get-common-name.R index 9fefa25..e360733 100644 --- a/R/get-common-name.R +++ b/R/get-common-name.R @@ -18,7 +18,7 @@ library(stringr) # Arguments # sci_name: Character specifying one or more scientific names -# lang: Character specifying one or more languages the common name +# lang: Character specifying one or more languages of the common name get_common_name <- function(sci_name, lang = "en") { From 8a97a12e86521eedf4a0a9ce9a7fda233452139a Mon Sep 17 00:00:00 2001 From: Stefan Vriend Date: Mon, 11 Nov 2024 17:44:27 +0100 Subject: [PATCH 08/10] Fix get_common_name() for those species where the common names are stored under label instead of taxon common name keep multiple records if multiple common names are used in a language --- R/get-common-name.R | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/R/get-common-name.R b/R/get-common-name.R index e360733..22dea74 100644 --- a/R/get-common-name.R +++ b/R/get-common-name.R @@ -41,7 +41,36 @@ get_common_name <- function(sci_name, format = "smart") |> dplyr::bind_rows() |> dplyr::mutate(common_name = stringr::str_to_title(common_name)) |> - dplyr::distinct() + dplyr::distinct() |> + dplyr::select(-"item") + + if(nrow(common_name) != length(lang)) { + + query <- 
paste0(' + SELECT DISTINCT + ?item ?label ?common_name (LANG(?label) AS ?lang) (LANG(?common_name) AS ?lang_cn) + WHERE { + ?item wdt:P225', '"', sci_name, '"', '; + wdt:P1843 ?common_name; + rdfs:label ?label. + + FILTER(LANG(?label) = ', '"', lang, '") + + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} + ') + + common_name <- WikidataQueryServiceR::query_wikidata(sparql_query = query, + format = "smart") |> + dplyr::bind_rows() |> + dplyr::mutate(common_name = dplyr::case_when(lang == "en" & lang_cn == "en" ~ stringr::str_to_title(common_name), + lang == "en" & lang_cn != "en" ~ NA_character_, + lang != "en" ~ stringr::str_to_title(label))) |> + dplyr::select(-"lang_cn") |> + dplyr::distinct() |> + dplyr::select(-"item", -"label") |> + dplyr::filter(if_all(common_name, ~!is.na(.)), .by = "lang") + + } return(common_name) From ebc9155e9a15b46a594c5e8fd00bbe4bbabd22db Mon Sep 17 00:00:00 2001 From: Stefan Vriend Date: Mon, 11 Nov 2024 20:54:42 +0100 Subject: [PATCH 09/10] Fix bug in get_common_names(), NA for common names Keep NA for a language that does not have a common name for the desired taxon --- R/get-common-name.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/R/get-common-name.R b/R/get-common-name.R index 22dea74..7a30bfe 100644 --- a/R/get-common-name.R +++ b/R/get-common-name.R @@ -2,7 +2,7 @@ # Author: Stefan Vriend # Created: 2024-10-24 -# Last updated: 2024-10-28 +# Last updated: 2024-11-11 # Load packages ----------------------------------------------------------- @@ -44,6 +44,8 @@ get_common_name <- function(sci_name, dplyr::distinct() |> dplyr::select(-"item") + # For some taxa (e.g., plants and fungi), non-English common names are stored as + # rdfs:label properties rather than wdt:P1843 properties if(nrow(common_name) != length(lang)) { query <- paste0(' @@ -68,7 +70,10 @@ get_common_name <- function(sci_name, dplyr::select(-"lang_cn") |> dplyr::distinct() |> dplyr::select(-"item", 
-"label") |> - dplyr::filter(if_all(common_name, ~!is.na(.)), .by = "lang") + # For chosen languages that do not have a common name for the taxon, + # (e.g., Wikidata does not provide an English common name for the fungus Sparassis crispa) + # return NA. + dplyr::filter(!(is.na(common_name) & dplyr::n() > 1), .by = "lang") } From a3f8ebeb0ddf94b4b567634107360ce7f8c8d965 Mon Sep 17 00:00:00 2001 From: Stefan Vriend Date: Mon, 11 Nov 2024 20:58:34 +0100 Subject: [PATCH 10/10] Update script title --- R/get-common-name.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get-common-name.R b/R/get-common-name.R index 7a30bfe..826f925 100644 --- a/R/get-common-name.R +++ b/R/get-common-name.R @@ -1,4 +1,4 @@ -# Function to retrieve a species' English common name from Wikidata +# Function to retrieve a taxon's common names from Wikidata # Author: Stefan Vriend # Created: 2024-10-24