diff --git a/NAMESPACE b/NAMESPACE index 077d91fe..83f1c915 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,8 +23,10 @@ export(extract_pairwise_interactions) export(extract_pairwise_thresholds) export(extract_posterior_inclusion_probabilities) export(mrfSampler) +export(summary_SBM) importFrom(Rcpp,evalCpp) importFrom(Rdpack,reprompt) importFrom(methods,hasArg) +importFrom(stats,dpois) importFrom(utils,packageVersion) useDynLib(bgms, .registration=TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R index e1dcd9d6..ff281bfa 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,11 +9,15 @@ sample_bcomrf_gibbs <- function(no_states, no_variables, no_categories, interact .Call(`_bgms_sample_bcomrf_gibbs`, no_states, no_variables, no_categories, interactions, thresholds, variable_type, reference_category, iter) } -gibbs_sampler <- function(observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save = FALSE, display_progress = FALSE, edge_selection = TRUE) { - .Call(`_bgms_gibbs_sampler`, observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save, display_progress, edge_selection) +gibbs_sampler <- function(observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, lambda, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save = FALSE, display_progress = FALSE, edge_selection = TRUE) { + .Call(`_bgms_gibbs_sampler`, observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, lambda, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save, display_progress, edge_selection) } compare_gibbs_sampler <- function(observations_gr1, observations_gr2, no_categories_gr1, no_categories_gr2, interaction_scale, pairwise_difference_scale, main_difference_scale, pairwise_difference_prior, main_difference_prior, inclusion_probability_difference, pairwise_beta_bernoulli_alpha, pairwise_beta_bernoulli_beta, main_beta_bernoulli_alpha, main_beta_bernoulli_beta, Index, iter, burnin, n_cat_obs_gr1, n_cat_obs_gr2, sufficient_blume_capel_gr1, sufficient_blume_capel_gr2, threshold_alpha, threshold_beta, na_impute, missing_index_gr1, missing_index_gr2, ordinal_variable, reference_category, independent_thresholds, save = FALSE, display_progress = FALSE, difference_selection = TRUE) { .Call(`_bgms_compare_gibbs_sampler`, observations_gr1, observations_gr2, no_categories_gr1, no_categories_gr2, interaction_scale, pairwise_difference_scale, main_difference_scale, pairwise_difference_prior, main_difference_prior, inclusion_probability_difference, pairwise_beta_bernoulli_alpha, pairwise_beta_bernoulli_beta, main_beta_bernoulli_alpha, main_beta_bernoulli_beta, Index, iter, burnin, n_cat_obs_gr1, n_cat_obs_gr2, sufficient_blume_capel_gr1, sufficient_blume_capel_gr2, threshold_alpha, threshold_beta, na_impute, missing_index_gr1, missing_index_gr2, ordinal_variable, reference_category, independent_thresholds, save, display_progress, difference_selection) } +compute_Vn_mfm_sbm <- function(no_variables, dirichlet_alpha, t_max, lambda) { + .Call(`_bgms_compute_Vn_mfm_sbm`, no_variables, dirichlet_alpha, t_max, lambda) +} + diff --git a/R/bgm.R b/R/bgm.R index 799eb625..4af346a9 100644 --- a/R/bgm.R +++ b/R/bgm.R @@ -116,9 +116,12 @@ #' when nodes are expected to fall into distinct clusters. The inclusion #' probabilities for the edges are defined at the level of the clusters, with a #' beta prior for the unknown inclusion probability with shape parameters -#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, and a Dirichlet +#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, a Dirichlet #' prior on the cluster assignment probabilities with a common concentration -#' parameter \code{dirichlet_alpha}. The default is +#' parameter \code{dirichlet_alpha} and a zero-truncated Poisson prior on the +#' number of clusters with a rate parameter \code{lambda}, indicating the +#' expected number of clusters. +#' The default is #' \code{edge_prior = "Bernoulli"}. #' @param inclusion_probability The prior edge inclusion probability for the #' Bernoulli model. Can be a single probability, or a matrix of \code{p} rows @@ -130,6 +133,8 @@ #' \code{beta_bernoulli_beta = 1}. #' @param dirichlet_alpha The shape of the Dirichlet prior on the node-to-block #' allocation probabilities for the Stochastic Block model. +#' @param lambda The rate parameter of the zero-truncated Poisson prior on the +#' number of cluster for the Stochastic Block model. #' @param na.action How do you want the function to handle missing data? If #' \code{na.action = "listwise"}, listwise deletion is used. If #' \code{na.action = "impute"}, missing data are imputed iteratively during the @@ -147,7 +152,7 @@ #' The default is \code{TRUE}. #' #' @return If \code{save = FALSE} (the default), the result is a list of class -#' ``bgms'' containing the following matrices with model-averaged quantities: +#' \code{"bgms"} containing the following matrices with model-averaged quantities: #' \itemize{ #' \item \code{indicator}: A matrix with \code{p} rows and \code{p} columns, #' containing the posterior inclusion probabilities of individual edges. @@ -155,16 +160,23 @@ #' containing model-averaged posterior means of the pairwise associations. #' \item \code{thresholds}: A matrix with \code{p} rows and \code{max(m)} #' columns, containing model-averaged category thresholds. In the case of -#' ``blume-capel'' variables, the first entry is the parameter for the linear +#' \code{"blume-capel"} variables, the first entry is the parameter for the linear #' effect and the second entry is the parameter for the quadratic effect, which #' models the offset to the reference category. -#' \item In the case of -#' \code{edge_prior = "Stochastic-Block"} two additional elements are returned: -#' a vector \code{allocations} with the estimated cluster assignments of the nodes and an -#' table \code{clusters} with the estimated posterior probability of the number -#' of clusters in the network. The vector of node allocations is calculated using -#' a method proposed by \insertCite{Dahl2009}{bgms} and also used by -#' \insertCite{GengEtAl_2019}{bgms}. +#' \item In the case of \code{edge_prior = "Stochastic-Block"}, two additional +#' elements are returned: +#' \itemize{ +#' \item A vector \code{allocations} with the estimated cluster assignments +#' of the nodes, calculated using a method proposed by \code{Dahl (2009)} and +#' also used by \code{Geng et al. (2019)}. +#' \item A matrix \code{components} with the estimated posterior probability +#' of the number of components (clusters) in the network. These probabilities +#' are calculated based on Equation 3.7 in \code{Miller (2018)}, which computes +#' the conditional probability of the number of components given the +#' number of clusters. The number of clusters is derived from the +#' cardinality of the sampled \code{allocations} vector for each iteration of +#' the MCMC sampler (see \code{save = TRUE}). +#' } #' } #' If \code{save = TRUE}, the result is a list of class ``bgms'' containing: #' \itemize{ @@ -180,8 +192,9 @@ #' \item In the case of #' \code{edge_prior = "Stochastic-Block"} a matrix \code{allocations} with the #' cluster assignments of the nodes from each iteration is returned. This matrix -#' can be used to calculate the posterior probability of the number of clusters -#' by utilizing the \code{summary_SBM(bgm_output[["allocations"]])} function. +#' can be used to calculate the posterior probability of the number of components +#' and estimated cluster assignments of the nodes, by and utilizing the +#' \code{summary_SBM()} function. #' } #' Column averages of these matrices provide the model-averaged posterior means. #' Except for the \code{allocations} matrix, for which the \code{summary_SBM} @@ -292,6 +305,7 @@ bgm = function(x, beta_bernoulli_alpha = 1, beta_bernoulli_beta = 1, dirichlet_alpha = 1, + lambda = 1, na.action = c("listwise", "impute"), save = FALSE, display_progress = TRUE) { @@ -318,7 +332,8 @@ bgm = function(x, inclusion_probability = inclusion_probability, beta_bernoulli_alpha = beta_bernoulli_alpha, beta_bernoulli_beta = beta_bernoulli_beta, - dirichlet_alpha = dirichlet_alpha) + dirichlet_alpha = dirichlet_alpha, + lambda = lambda) # ---------------------------------------------------------------------------- # The vector variable_type is now coded as boolean. @@ -444,6 +459,7 @@ bgm = function(x, beta_bernoulli_alpha = beta_bernoulli_alpha, beta_bernoulli_beta = beta_bernoulli_beta, dirichlet_alpha = dirichlet_alpha, + lambda = lambda, Index = Index, iter = iter, burnin = burnin, @@ -477,6 +493,7 @@ bgm = function(x, beta_bernoulli_alpha = beta_bernoulli_alpha , beta_bernoulli_beta = beta_bernoulli_beta, dirichlet_alpha = dirichlet_alpha, + lambda = lambda, na.action = na.action, save = save, version = packageVersion("bgms") @@ -521,13 +538,15 @@ bgm = function(x, if(edge_selection == TRUE) { if(edge_prior == "Stochastic-Block"){ - summarySbm <- summary_SBM(cluster_allocations = out$allocations) + summarySbm <- summary_SBM(cluster_allocations = out$allocations, + dirichlet_alpha = dirichlet_alpha, + lambda = lambda) output = list(indicator = indicator, interactions = interactions, thresholds = thresholds, + components = summarySbm$components, allocations = summarySbm$allocations, - clusters = summarySbm$no_clusters, arguments = arguments) } else { diff --git a/R/utility_functions.R b/R/utility_functions.R index 4c12ba61..11047669 100644 --- a/R/utility_functions.R +++ b/R/utility_functions.R @@ -1,6 +1,7 @@ #' @importFrom Rcpp evalCpp #' @importFrom Rdpack reprompt #' @importFrom methods hasArg +#' @importFrom stats dpois check_model = function(x, variable_type, @@ -13,7 +14,8 @@ check_model = function(x, inclusion_probability = 0.5, beta_bernoulli_alpha = 1, beta_bernoulli_beta = 1, - dirichlet_alpha = dirichlet_alpha) { + dirichlet_alpha = dirichlet_alpha, + lambda = lambda) { #Check variable type input --------------------------------------------------- if(length(variable_type) == 1) { @@ -187,14 +189,14 @@ check_model = function(x, } if(edge_prior == "Stochastic-Block") { theta = matrix(0.5, nrow = ncol(x), ncol = ncol(x)) - if(beta_bernoulli_alpha <= 0 || beta_bernoulli_beta <= 0 || dirichlet_alpha <= 0) - stop("The scale parameters of the beta and Dirichlet distribution need to be positive.") - if(!is.finite(beta_bernoulli_alpha) || !is.finite(beta_bernoulli_beta) || !is.finite(dirichlet_alpha)) - stop("The scale parameters of the beta and Dirichlet distribution need to be finite.") + if(beta_bernoulli_alpha <= 0 || beta_bernoulli_beta <= 0 || dirichlet_alpha <= 0 || lambda <= 0) + stop("The scale parameters the beta distrubution, the concentration parameter of the Dirichlet distribution and the rate parameter of the Poisson distribution need to be positive.") + if(!is.finite(beta_bernoulli_alpha) || !is.finite(beta_bernoulli_beta) || !is.finite(dirichlet_alpha) || !is.finite(lambda)) + stop("The scale parameters of the beta distrubution, the concentration parameter of the Dirichlet distribution and the rate parameter of the Poisson distribution need to be finite.") if(is.na(beta_bernoulli_alpha) || is.na(beta_bernoulli_beta) || is.null(beta_bernoulli_alpha) || is.null(beta_bernoulli_beta) || - is.null(dirichlet_alpha) || is.null(dirichlet_alpha)) - stop("Values for both scale parameters of the beta and Dirichlet distribution need to be specified.") + is.null(dirichlet_alpha) || is.null(dirichlet_alpha) || is.null(lambda) || is.null(lambda)) + stop("Values for both scale parameters of the betathe concentration parameter of the Dirichlet distribution and the rate parameter of the Poisson distribution need to be specified.") } } else { theta = matrix(0.5, nrow = 1, ncol = 1) @@ -1013,7 +1015,6 @@ compare_reformat_data = function(x, # Geng, J., Bhattacharya, A., & Pati, D. (2019). Probabilistic Community # Detection With Unknown Number of Communities, Journal of the American # Statistical Association, 114:526, 893-905, DOI:10.1080/01621459.2018.1458618 - getDahl <- function(cluster_allocations) { # Dimensions of the input matrix niters <- nrow(cluster_allocations) # Number of iterations @@ -1045,21 +1046,114 @@ getDahl <- function(cluster_allocations) { return(DahlAns) } +# Calculate the conditional probability of the number of +# components given the cardinality of a sampled +# allocation vector based on Equation (3.7) from +# Miller & Harrison (2018). Mixture Models With a Prior on the Number of +# Components, Journal of the American Statistical Association, 113:521, 340-356, +# DOI:10.1080/01621459.2016.1255636 + +compute_p_k_given_t <- function(t, + log_Vn, + dirichlet_alpha, + no_variables, + lambda) { + # Define the K_values + K_values <- as.numeric(1:no_variables) + + # Initialize vector for probabilities + p_k_given_t <- numeric(length(K_values)) + + # Normalization constant for t + log_vn_t <- log_Vn[t] + + # Normalizing factor for the truncated Poisson distribution + norm_factor <- 1 - dpois(0, lambda) + truncated_poisson_pmf <- dpois(K_values, lambda) / norm_factor + + # Loop through each value of K + for (i in seq_along(K_values)) { + K <- K_values[i] + if (K >= t) { + # Falling factorial + falling_factorial <- prod(K:(K - t + 1)) + # Rising factorial + rising_factorial <- prod((dirichlet_alpha * K) + 0:(no_variables - 1)) + # Compute log probability + log_p_k <- log(falling_factorial) - log(rising_factorial) + log(truncated_poisson_pmf[i]) - log_vn_t + # Convert log probability to probability + p_k_given_t[i] <- exp(log_p_k) + } else { + p_k_given_t[i] <- 0 + } + } + # Normalize probabilities + p_k_given_t <- p_k_given_t / sum(p_k_given_t) -# A function that computes the cluster posterior probabilities and allocations + return(p_k_given_t) +} -summary_SBM <- function(cluster_allocations) { - # Compute the number of unique clusters for each iteration +#' Function for summarizing the block allocation vectors +#' +#' This function summarizes the sampled allocation vectors from each iteration +#' of the Gibbs sampler from the output of \code{bgm} +#' when \code{edge_prior = "Stochastic-Block"} and \code{save = TRUE}. +#' +#' @param cluster_allocations a matrix of \code{iter} rows and \code{no_variables} +#' columns containing the sampled cluster allocation vectors from each iteration. +#' @param dirichlet_alpha the concentration parameter of the Dirichlet prior +#' @param lambda the Poisson rate parameter +#' @return Returns a list of two elements: \code{components} and \code{allocations}, +#' containing the posterior probabilties for the number of components (clusters) +#' and the estimated cluster allocation of the nodes using Dahl's method. +#' @examples +#' +#' # fit a model with the SBM prior +#' bgm_object <- bgm(Wenchuan[, c(1:5)], +#' edge_prior = "Stochastic-Block", +#' save = TRUE, +#' iter = 1e3) +#' +#' s <- summary_SBM(bgm_object$allocations, +#' bgm_object$arguments$dirichlet_alpha, +#' bgm_object$arguments$lambda) +#' +#' @export +summary_SBM <- function(cluster_allocations, + dirichlet_alpha, + lambda) { + + # precompute things + no_variables <- ncol(cluster_allocations) + log_Vn <- compute_Vn_mfm_sbm(no_variables, + dirichlet_alpha, + no_variables, + lambda) + # Compute the number of unique clusters (t) for each iteration + # i.e., the cardinality of the partition z clusters <- apply(cluster_allocations, 1, function(row) length(unique(row))) + # Compute the conditional probabilities of the number of clusters + # for each row in clusters + p_k_given_t <- matrix(NA, nrow = length(clusters), ncol = no_variables) + + for (i in 1:length(clusters)) { + p_k_given_t[i, ] <- compute_p_k_given_t(clusters[i], + log_Vn, + dirichlet_alpha, + no_variables, + lambda) + } + # average across all iterations + p_k_given_t <- colMeans(p_k_given_t) - # Compute the posterior probabilities of the actual unique clusters - no_clusters <- table(clusters) / length(clusters) + # make it as a matrix + no_components <- 1:no_variables + components <- cbind(no_components, p_k_given_t) + colnames(components) <- c("no_components", "probability") # Compute the allocations of the nodes based on Dahl's method allocations <- getDahl(cluster_allocations) - # Return the results - return(list(no_clusters = no_clusters, + return(list(components = components, allocations = allocations)) } - diff --git a/bgms.Rproj b/bgms.Rproj index 6291372e..1c78b1f3 100644 --- a/bgms.Rproj +++ b/bgms.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 2fb4a59f-6132-41a6-9659-f1d3297dc9d0 RestoreWorkspace: Default SaveWorkspace: Default diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 18afffc3..ee85e281 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -1,3 +1,15 @@ +@ARTICLE{miller2018mixture, + title={Mixture models with a prior on the number of components}, + author={Miller, Jeffrey W and Harrison, Matthew T}, + journal={Journal of the American Statistical Association}, + volume={113}, + number={521}, + pages={340--356}, + year={2018}, + doi = {10.1080/01621459.2016.1255636}, + publisher={Taylor \& Francis} +} + @article{Dahl2009, author = {David B. Dahl}, title = {Modal clustering in a class of product partition models}, diff --git a/man/bgm.Rd b/man/bgm.Rd index d091164b..9f306879 100644 --- a/man/bgm.Rd +++ b/man/bgm.Rd @@ -20,6 +20,7 @@ bgm( beta_bernoulli_alpha = 1, beta_bernoulli_beta = 1, dirichlet_alpha = 1, + lambda = 1, na.action = c("listwise", "impute"), save = FALSE, display_progress = TRUE @@ -106,9 +107,12 @@ of allocations as described by @GengEtAl_2019}{bgms}. This model is advantageous when nodes are expected to fall into distinct clusters. The inclusion probabilities for the edges are defined at the level of the clusters, with a beta prior for the unknown inclusion probability with shape parameters -\code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, and a Dirichlet +\code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, a Dirichlet prior on the cluster assignment probabilities with a common concentration -parameter \code{dirichlet_alpha}. The default is +parameter \code{dirichlet_alpha} and a zero-truncated Poisson prior on the +number of clusters with a rate parameter \code{lambda}, indicating the +expected number of clusters. +The default is \code{edge_prior = "Bernoulli"}.} \item{inclusion_probability}{The prior edge inclusion probability for the @@ -124,6 +128,9 @@ positive numbers. Defaults to \code{beta_bernoulli_alpha = 1} and \item{dirichlet_alpha}{The shape of the Dirichlet prior on the node-to-block allocation probabilities for the Stochastic Block model.} +\item{lambda}{The rate parameter of the zero-truncated Poisson prior on the +number of cluster for the Stochastic Block model.} + \item{na.action}{How do you want the function to handle missing data? If \code{na.action = "listwise"}, listwise deletion is used. If \code{na.action = "impute"}, missing data are imputed iteratively during the @@ -144,7 +151,7 @@ The default is \code{TRUE}.} } \value{ If \code{save = FALSE} (the default), the result is a list of class -``bgms'' containing the following matrices with model-averaged quantities: +\code{"bgms"} containing the following matrices with model-averaged quantities: \itemize{ \item \code{indicator}: A matrix with \code{p} rows and \code{p} columns, containing the posterior inclusion probabilities of individual edges. @@ -152,16 +159,23 @@ containing the posterior inclusion probabilities of individual edges. containing model-averaged posterior means of the pairwise associations. \item \code{thresholds}: A matrix with \code{p} rows and \code{max(m)} columns, containing model-averaged category thresholds. In the case of -``blume-capel'' variables, the first entry is the parameter for the linear +\code{"blume-capel"} variables, the first entry is the parameter for the linear effect and the second entry is the parameter for the quadratic effect, which models the offset to the reference category. -\item In the case of -\code{edge_prior = "Stochastic-Block"} two additional elements are returned: -a vector \code{allocations} with the estimated cluster assignments of the nodes and an -table \code{clusters} with the estimated posterior probability of the number -of clusters in the network. The vector of node allocations is calculated using -a method proposed by \insertCite{Dahl2009}{bgms} and also used by -\insertCite{GengEtAl_2019}{bgms}. +\item In the case of \code{edge_prior = "Stochastic-Block"}, two additional +elements are returned: + \itemize{ + \item A vector \code{allocations} with the estimated cluster assignments + of the nodes, calculated using a method proposed by \code{Dahl (2009)} and + also used by \code{Geng et al. (2019)}. + \item A matrix \code{components} with the estimated posterior probability + of the number of components (clusters) in the network. These probabilities + are calculated based on Equation 3.7 in \code{Miller (2018)}, which computes + the conditional probability of the number of components given the + number of clusters. The number of clusters is derived from the + cardinality of the sampled \code{allocations} vector for each iteration of + the MCMC sampler (see \code{save = TRUE}). + } } If \code{save = TRUE}, the result is a list of class ``bgms'' containing: \itemize{ @@ -177,8 +191,9 @@ the Gibbs sampler for the category thresholds. \item In the case of \code{edge_prior = "Stochastic-Block"} a matrix \code{allocations} with the cluster assignments of the nodes from each iteration is returned. This matrix -can be used to calculate the posterior probability of the number of clusters -by utilizing the \code{summary_SBM(bgm_output[["allocations"]])} function. +can be used to calculate the posterior probability of the number of components +and estimated cluster assignments of the nodes, by and utilizing the +\code{summary_SBM()} function. } Column averages of these matrices provide the model-averaged posterior means. Except for the \code{allocations} matrix, for which the \code{summary_SBM} diff --git a/man/summary_SBM.Rd b/man/summary_SBM.Rd new file mode 100644 index 00000000..fde02886 --- /dev/null +++ b/man/summary_SBM.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility_functions.R +\name{summary_SBM} +\alias{summary_SBM} +\title{Function for summarizing the block allocation vectors} +\usage{ +summary_SBM(cluster_allocations, dirichlet_alpha, lambda) +} +\arguments{ +\item{cluster_allocations}{a matrix of \code{iter} rows and \code{no_variables} +columns containing the sampled cluster allocation vectors from each iteration.} + +\item{dirichlet_alpha}{the concentration parameter of the Dirichlet prior} + +\item{lambda}{the Poisson rate parameter} +} +\value{ +Returns a list of two elements: \code{components} and \code{allocations}, +containing the posterior probabilties for the number of components (clusters) +and the estimated cluster allocation of the nodes using Dahl's method. +} +\description{ +This function summarizes the sampled allocation vectors from each iteration +of the Gibbs sampler from the output of \code{bgm} +when \code{edge_prior = "Stochastic-Block"} and \code{save = TRUE}. +} +\examples{ + +# fit a model with the SBM prior +bgm_object <- bgm(Wenchuan[, c(1:5)], + edge_prior = "Stochastic-Block", + save = TRUE, + iter = 1e3) + +s <- summary_SBM(bgm_object$allocations, + bgm_object$arguments$dirichlet_alpha, + bgm_object$arguments$lambda) + +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d1858a8f..e777dff1 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -45,8 +45,8 @@ BEGIN_RCPP END_RCPP } // gibbs_sampler -List gibbs_sampler(IntegerMatrix observations, IntegerMatrix indicator, NumericMatrix interactions, NumericMatrix thresholds, IntegerVector no_categories, double interaction_scale, NumericMatrix proposal_sd, NumericMatrix proposal_sd_blumecapel, String edge_prior, NumericMatrix theta, double beta_bernoulli_alpha, double beta_bernoulli_beta, double dirichlet_alpha, IntegerMatrix Index, int iter, int burnin, IntegerMatrix n_cat_obs, IntegerMatrix sufficient_blume_capel, double threshold_alpha, double threshold_beta, bool na_impute, IntegerMatrix missing_index, LogicalVector variable_bool, IntegerVector reference_category, bool save, bool display_progress, bool edge_selection); -RcppExport SEXP _bgms_gibbs_sampler(SEXP observationsSEXP, SEXP indicatorSEXP, SEXP interactionsSEXP, SEXP thresholdsSEXP, SEXP no_categoriesSEXP, SEXP interaction_scaleSEXP, SEXP proposal_sdSEXP, SEXP proposal_sd_blumecapelSEXP, SEXP edge_priorSEXP, SEXP thetaSEXP, SEXP beta_bernoulli_alphaSEXP, SEXP beta_bernoulli_betaSEXP, SEXP dirichlet_alphaSEXP, SEXP IndexSEXP, SEXP iterSEXP, SEXP burninSEXP, SEXP n_cat_obsSEXP, SEXP sufficient_blume_capelSEXP, SEXP threshold_alphaSEXP, SEXP threshold_betaSEXP, SEXP na_imputeSEXP, SEXP missing_indexSEXP, SEXP variable_boolSEXP, SEXP reference_categorySEXP, SEXP saveSEXP, SEXP display_progressSEXP, SEXP edge_selectionSEXP) { +List gibbs_sampler(IntegerMatrix observations, IntegerMatrix indicator, NumericMatrix interactions, NumericMatrix thresholds, IntegerVector no_categories, double interaction_scale, NumericMatrix proposal_sd, NumericMatrix proposal_sd_blumecapel, String edge_prior, NumericMatrix theta, double beta_bernoulli_alpha, double beta_bernoulli_beta, double dirichlet_alpha, double lambda, IntegerMatrix Index, int iter, int burnin, IntegerMatrix n_cat_obs, IntegerMatrix sufficient_blume_capel, double threshold_alpha, double threshold_beta, bool na_impute, IntegerMatrix missing_index, LogicalVector variable_bool, IntegerVector reference_category, bool save, bool display_progress, bool edge_selection); +RcppExport SEXP _bgms_gibbs_sampler(SEXP observationsSEXP, SEXP indicatorSEXP, SEXP interactionsSEXP, SEXP thresholdsSEXP, SEXP no_categoriesSEXP, SEXP interaction_scaleSEXP, SEXP proposal_sdSEXP, SEXP proposal_sd_blumecapelSEXP, SEXP edge_priorSEXP, SEXP thetaSEXP, SEXP beta_bernoulli_alphaSEXP, SEXP beta_bernoulli_betaSEXP, SEXP dirichlet_alphaSEXP, SEXP lambdaSEXP, SEXP IndexSEXP, SEXP iterSEXP, SEXP burninSEXP, SEXP n_cat_obsSEXP, SEXP sufficient_blume_capelSEXP, SEXP threshold_alphaSEXP, SEXP threshold_betaSEXP, SEXP na_imputeSEXP, SEXP missing_indexSEXP, SEXP variable_boolSEXP, SEXP reference_categorySEXP, SEXP saveSEXP, SEXP display_progressSEXP, SEXP edge_selectionSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -63,6 +63,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< double >::type beta_bernoulli_alpha(beta_bernoulli_alphaSEXP); Rcpp::traits::input_parameter< double >::type beta_bernoulli_beta(beta_bernoulli_betaSEXP); Rcpp::traits::input_parameter< double >::type dirichlet_alpha(dirichlet_alphaSEXP); + Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); Rcpp::traits::input_parameter< IntegerMatrix >::type Index(IndexSEXP); Rcpp::traits::input_parameter< int >::type iter(iterSEXP); Rcpp::traits::input_parameter< int >::type burnin(burninSEXP); @@ -77,7 +78,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< bool >::type save(saveSEXP); Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); Rcpp::traits::input_parameter< bool >::type edge_selection(edge_selectionSEXP); - rcpp_result_gen = Rcpp::wrap(gibbs_sampler(observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save, display_progress, edge_selection)); + rcpp_result_gen = Rcpp::wrap(gibbs_sampler(observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, lambda, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save, display_progress, edge_selection)); return rcpp_result_gen; END_RCPP } @@ -123,12 +124,27 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// compute_Vn_mfm_sbm +NumericVector compute_Vn_mfm_sbm(int no_variables, double dirichlet_alpha, int t_max, double lambda); +RcppExport SEXP _bgms_compute_Vn_mfm_sbm(SEXP no_variablesSEXP, SEXP dirichlet_alphaSEXP, SEXP t_maxSEXP, SEXP lambdaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< int >::type no_variables(no_variablesSEXP); + Rcpp::traits::input_parameter< double >::type dirichlet_alpha(dirichlet_alphaSEXP); + Rcpp::traits::input_parameter< int >::type t_max(t_maxSEXP); + Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); + rcpp_result_gen = Rcpp::wrap(compute_Vn_mfm_sbm(no_variables, dirichlet_alpha, t_max, lambda)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_bgms_sample_omrf_gibbs", (DL_FUNC) &_bgms_sample_omrf_gibbs, 6}, {"_bgms_sample_bcomrf_gibbs", (DL_FUNC) &_bgms_sample_bcomrf_gibbs, 8}, - {"_bgms_gibbs_sampler", (DL_FUNC) &_bgms_gibbs_sampler, 27}, + {"_bgms_gibbs_sampler", (DL_FUNC) &_bgms_gibbs_sampler, 28}, {"_bgms_compare_gibbs_sampler", (DL_FUNC) &_bgms_compare_gibbs_sampler, 32}, + {"_bgms_compute_Vn_mfm_sbm", (DL_FUNC) &_bgms_compute_Vn_mfm_sbm, 4}, {NULL, NULL, 0} }; diff --git a/src/gibbs_functions.cpp b/src/gibbs_functions.cpp index 56e06236..6a8512e6 100644 --- a/src/gibbs_functions.cpp +++ b/src/gibbs_functions.cpp @@ -792,6 +792,7 @@ List gibbs_sampler(IntegerMatrix observations, double beta_bernoulli_alpha, double beta_bernoulli_beta, double dirichlet_alpha, + double lambda, IntegerMatrix Index, int iter, int burnin, @@ -812,7 +813,6 @@ List gibbs_sampler(IntegerMatrix observations, int no_interactions = Index.nrow(); int no_thresholds = sum(no_categories); int max_no_categories = max(no_categories); - IntegerVector K_values; // To store sampled K values IntegerVector v = seq(0, no_interactions - 1); @@ -870,15 +870,14 @@ List gibbs_sampler(IntegerMatrix observations, NumericMatrix out_allocations(iter, no_variables); if(edge_prior == "Stochastic-Block") { // Initial Configuration of the cluster allocations - cluster_allocations[0] = 0; - cluster_allocations[1] = 1; - for(int i = 2; i < no_variables; i++) { + // Randomly choose the number of clusters (k) between 1 and no_variables + int k = static_cast(R::unif_rand() * no_variables) + 1; + + // Assign each variable to a random cluster + for (int i = 0; i < no_variables; i++) { double U = R::unif_rand(); - if(U > 0.5){ - cluster_allocations[i] = 1; - } else { - cluster_allocations[i] = 0; - } + int cluster = static_cast(U * k); // Randomly select a cluster (0 to k-1) + cluster_allocations[i] = cluster; } cluster_prob = block_probs_mfm_sbm(cluster_allocations, @@ -896,7 +895,8 @@ List gibbs_sampler(IntegerMatrix observations, log_Vn = compute_Vn_mfm_sbm(no_variables, dirichlet_alpha, - no_variables + 10); + no_variables, + lambda); } //The Gibbs sampler ---------------------------------------------------------- diff --git a/src/gibbs_functions_edge_prior.cpp b/src/gibbs_functions_edge_prior.cpp index 0cdbc535..fef05628 100644 --- a/src/gibbs_functions_edge_prior.cpp +++ b/src/gibbs_functions_edge_prior.cpp @@ -82,9 +82,11 @@ NumericMatrix add_row_col_block_prob_matrix(NumericMatrix X, // ----------------------------------------------------------------------------| // Compute partition coefficient for the MFM - SBM // ----------------------------------------------------------------------------| +// [[Rcpp::export]] NumericVector compute_Vn_mfm_sbm(int no_variables, double dirichlet_alpha, - int t_max) { + int t_max, + double lambda) { NumericVector log_Vn(t_max); double r; double tmp; @@ -101,6 +103,10 @@ NumericVector compute_Vn_mfm_sbm(int no_variables, } tmp -= std::lgamma(k + 1); + // Add the poisson term + double log_norm_factor = log(1.0 - exp(R::dpois(0, lambda, true))); + tmp += R::dpois(k-1, lambda, true) - log_norm_factor; + // Compute the maximum between r and tmp if (tmp > r) { r = std::log(std::exp(r - tmp) + 1) + tmp; @@ -227,7 +233,11 @@ IntegerVector block_allocations_mfm_sbm(IntegerVector cluster_assign, double loglike; double logmarg; - for(int node = 0; node < no_variables; node++) { + // Generate a randomized order using Rcpp's sample function + IntegerVector indices = sample(no_variables, no_variables); + + for (int idx = 0; idx < no_variables; ++idx) { + int node = indices[idx] - 1; // Convert to zero-based index old = cluster_assign[node]; IntegerVector cluster_size = table_cpp(cluster_assign); diff --git a/src/gibbs_functions_edge_prior.h b/src/gibbs_functions_edge_prior.h index 827c4ea9..6818dbb8 100644 --- a/src/gibbs_functions_edge_prior.h +++ b/src/gibbs_functions_edge_prior.h @@ -6,7 +6,8 @@ using namespace Rcpp; // ----------------------------------------------------------------------------| Rcpp::NumericVector compute_Vn_mfm_sbm(int no_variables, double dirichlet_alpha, - int t_max); + int t_max, + double lambda); // ----------------------------------------------------------------------------| // Sample the block allocations for the MFM - SBM