Skip to content

Fix for germaparl_encode_lda_topics() #8

@Studentenfutter

Description

@Studentenfutter

The germaparl_encode_lda_topics() returns an error because the decode() function of polmineR is deprecated. I found a solution by using the s_attribute_decode() function from RcppCWB:

germaparl_encode_lda_topics() <- 
function (k = 450, n = 5) {
germaparl_data_dir <- registry_file_parse(corpus = "GERMAPARL", 
        registry_dir = germaparl_regdir())[["home"]]
    corpus_charset <- registry_file_parse(corpus = "GERMAPARL")[["properties"]][["charset"]]
    model <- germaparl_load_topicmodel(k = 250)
    message("... getting topic matrix")
    topic_matrix <- topicmodels::topics(model, k = 5)
    topic_dt <- data.table(speech = colnames(topic_matrix), topics = apply(topic_matrix, 
        2, function(x) sprintf("|%s|", paste(x, collapse = "|"))), 
        key = "speech")
    message("... decoding s-attribute speech")
    if (!"speech" %in% s_attributes("GERMAPARL")) {
        stop("The s-attributes 'speech' is not yet present.", 
            "Use the function germaparl_add_s_attribute_speech to generate it.")
    }
    cpos_dt <- as.data.table(RcppCWB::s_attribute_decode("GERMAPARL", data_dir = germaparl_data_dir, s_attribute = "speech")) # Returns a data frame but setkeyv requires a data.table - converting
    names(cpos_dt)[names(cpos_dt) == "value"] <- "speech" # With the new function, speech gets renamed to "value" - changing it back to speech
    setkeyv(cpos_dt, "speech")
    cpos_dt2 <- topic_dt[cpos_dt]
    setorderv(cpos_dt2, cols = "cpos_left", order = 1L)
    cpos_dt2[["speech"]] <- NULL
    cpos_dt2[["id"]] <- NULL
    cpos_dt2[, `:=`(topics, ifelse(is.na(topics), "||", topics))]
    setcolorder(cpos_dt2, c("cpos_left", "cpos_right", "topics"))
    message("... running some sanity checks")
    coverage <- sum(cpos_dt2[["cpos_right"]] - cpos_dt2[["cpos_left"]]) + 
        nrow(cpos_dt2)
    if (coverage != size("GERMAPARL")) 
        stop()
    P <- partition("GERMAPARL", speech = ".*", regex = TRUE)
    if (sum(cpos_dt2[["cpos_left"]] - P@cpos[, 1]) != 0) 
        stop()
    if (sum(cpos_dt2[["cpos_right"]] - P@cpos[, 2]) != 0) 
        stop()
    if (length(sAttributes("GERMAPARL", "speech", unique = FALSE)) != 
        nrow(cpos_dt2)) 
        stop()
    message("... encoding s-attribute 'topics'")
    s_attribute_encode(values = cpos_dt2[["topics"]], data_dir = germaparl_data_dir, 
        s_attribute = "topics", corpus = "GERMAPARL", region_matrix = as.matrix(cpos_dt2[, 
            c("cpos_left", "cpos_right")]), registry_dir = germaparl_regdir(), 
        encoding = corpus_charset, method = "R", verbose = TRUE)
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions