-
Notifications
You must be signed in to change notification settings - Fork 2
Open
Description
This is an example I developed for building a CWB-indexed MARPOR corpus. Maybe incorporate it into a vignette, or include it as an R Markdown template?
library(cwbtools)
library(polmineR)
library(RcppCWB)
library(manifestoR)
library(dplyr)
library(purrr)
library(tidytext)
library(data.table)
party_names <- c(
"41112" = "GRUENE",
"41113" = "GRUENE",
"41221" = "PDS",
"41222" = "LINKE",
"41223" = "LINKE",
"41320" = "SPD",
"41420" = "FDP",
"41521" = "CDU/CSU",
"41952" = "Piraten",
"41953" = "AfD",
"41912" = "SSW"
)
mp_setapikey("~/.credentials/manifesto_apikey.txt")
tokstream <- mp_corpus(countryname == "Germany" & edate > as.Date("2017-10-01")) %>%
lapply(as.data.frame) %>%
lapply(as_tibble) %>%
imap(function(tbl, id) mutate(tbl, id = id)) %>%
bind_rows() %>%
mutate(yearmon = gsub("^.*_(\\d{4})(\\d{2})$", "\\1-\\2", id)) %>%
mutate(party = party_names[gsub("^(.*)_.*$", "\\1", id)]) %>%
unnest_tokens(word, text, to_lower = FALSE, strip_punct = FALSE) %>%
mutate(cpos = 0L:(nrow(.) - 1L))
# Derive the region table for the structural attributes: one row per
# quasi-sentence (`pos`) of each document, with the corpus positions of its
# first and last token.
mdata <- tokstream %>%
  # Include the document id in the grouping so that regions can never span
  # two documents that happen to share pos, yearmon and party.
  group_by(id, pos, yearmon, party) %>%
  summarise(
    cpos_left = min(cpos),
    cpos_right = max(cpos),
    # Return a plain, ungrouped tibble so later verbs run vectorised.
    .groups = "drop"
  ) %>%
  mutate(pos = as.character(pos)) %>%
  # Region boundaries first, then the attribute columns; `id` is dropped.
  select(cpos_left, cpos_right, pos, yearmon, party) %>%
  arrange(cpos_left)
# Assemble the CorpusData object that is encoded further below.
MARPOR <- CorpusData$new()
# Token stream: corpus position plus the token itself (the "word" attribute).
MARPOR$tokenstream <- tokstream %>% select(cpos, word) %>% as.data.table()
# Region table for the structural attributes, handed over as a data.table
# like the token stream (this also strips any dplyr grouping class).
MARPOR$metadata <- as.data.table(mdata)
# Temporary locations for the binary corpus data and the registry file.
data_dir_tmp <- fs::path(tempdir(), "data_dir")
registry_tmp <- fs::path(tempdir(), "registry")
# tempdir() is stable within a session, so guard against the directories
# already existing when the script is re-run (dir.create() would warn).
if (!dir.exists(data_dir_tmp)) dir.create(data_dir_tmp)
if (!dir.exists(registry_tmp)) dir.create(registry_tmp)
# Encode the corpus "MARPOR" into the temporary data directory and registry,
# with "word" as positional attribute and pos/yearmon/party as structural
# attributes.
MARPOR$encode(
  corpus = "MARPOR",
  registry_dir = registry_tmp,
  data_dir = data_dir_tmp,
  encoding = "utf8",
  p_attributes = "word",
  s_attributes = c("pos", "yearmon", "party"),
  method = "R",
  compress = FALSE
)
# NOTE(review): presumably discards any already-loaded representation of
# "MARPOR" so that the freshly encoded version is used -- confirm against the
# RcppCWB documentation.
RcppCWB::cl_delete_corpus("MARPOR", registry = registry_tmp)
# Reset CQP's registry to the temporary registry directory created above.
RcppCWB::cqp_reset_registry(registry_tmp)
# Smoke test: open the encoded corpus and request keyword-in-context lines
# for "Zuwanderung", with the party s-attribute attached.
polmineR::corpus("MARPOR") %>%
  kwic(query = "Zuwanderung", s_attributes = "party")
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels