Bayesian-Graphical-Modelling-Lab · sekulovskin · Dec 5, 2024 · Dec 5, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -23,8 +23,10 @@ export(extract_pairwise_interactions)
 export(extract_pairwise_thresholds)
 export(extract_posterior_inclusion_probabilities)
 export(mrfSampler)
+export(summary_SBM)
 importFrom(Rcpp,evalCpp)
 importFrom(Rdpack,reprompt)
 importFrom(methods,hasArg)
+importFrom(stats,dpois)
 importFrom(utils,packageVersion)
 useDynLib(bgms, .registration=TRUE)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -9,11 +9,15 @@ sample_bcomrf_gibbs <- function(no_states, no_variables, no_categories, interact
     .Call(`_bgms_sample_bcomrf_gibbs`, no_states, no_variables, no_categories, interactions, thresholds, variable_type, reference_category, iter)
 }
 
-gibbs_sampler <- function(observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save = FALSE, display_progress = FALSE, edge_selection = TRUE) {
-    .Call(`_bgms_gibbs_sampler`, observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save, display_progress, edge_selection)
+gibbs_sampler <- function(observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, lambda, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save = FALSE, display_progress = FALSE, edge_selection = TRUE) {
+    .Call(`_bgms_gibbs_sampler`, observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, lambda, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save, display_progress, edge_selection)
 }
 
 compare_gibbs_sampler <- function(observations_gr1, observations_gr2, no_categories_gr1, no_categories_gr2, interaction_scale, pairwise_difference_scale, main_difference_scale, pairwise_difference_prior, main_difference_prior, inclusion_probability_difference, pairwise_beta_bernoulli_alpha, pairwise_beta_bernoulli_beta, main_beta_bernoulli_alpha, main_beta_bernoulli_beta, Index, iter, burnin, n_cat_obs_gr1, n_cat_obs_gr2, sufficient_blume_capel_gr1, sufficient_blume_capel_gr2, threshold_alpha, threshold_beta, na_impute, missing_index_gr1, missing_index_gr2, ordinal_variable, reference_category, independent_thresholds, save = FALSE, display_progress = FALSE, difference_selection = TRUE) {
     .Call(`_bgms_compare_gibbs_sampler`, observations_gr1, observations_gr2, no_categories_gr1, no_categories_gr2, interaction_scale, pairwise_difference_scale, main_difference_scale, pairwise_difference_prior, main_difference_prior, inclusion_probability_difference, pairwise_beta_bernoulli_alpha, pairwise_beta_bernoulli_beta, main_beta_bernoulli_alpha, main_beta_bernoulli_beta, Index, iter, burnin, n_cat_obs_gr1, n_cat_obs_gr2, sufficient_blume_capel_gr1, sufficient_blume_capel_gr2, threshold_alpha, threshold_beta, na_impute, missing_index_gr1, missing_index_gr2, ordinal_variable, reference_category, independent_thresholds, save, display_progress, difference_selection)
 }
 
+compute_Vn_mfm_sbm <- function(no_variables, dirichlet_alpha, t_max, lambda) {
+    .Call(`_bgms_compute_Vn_mfm_sbm`, no_variables, dirichlet_alpha, t_max, lambda)
+}
+
diff --git a/R/bgm.R b/R/bgm.R
@@ -116,9 +116,12 @@
 #' when nodes are expected to fall into distinct clusters. The inclusion
 #' probabilities for the edges are defined at the level of the clusters, with a
 #' beta prior for the unknown inclusion probability with shape parameters
-#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, and a Dirichlet
+#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, a Dirichlet
 #' prior on the cluster assignment probabilities with a common concentration
-#' parameter \code{dirichlet_alpha}. The default is
+#' parameter \code{dirichlet_alpha} and a zero-truncated Poisson prior on the
+#' number of clusters with a rate parameter \code{lambda}, indicating the
+#' expected number of clusters.
+#' The default is
 #' \code{edge_prior = "Bernoulli"}.
 #' @param inclusion_probability The prior edge inclusion probability for the
 #' Bernoulli model. Can be a single probability, or a matrix of \code{p} rows
@@ -130,6 +133,8 @@
 #' \code{beta_bernoulli_beta = 1}.
 #' @param dirichlet_alpha The shape of the Dirichlet prior on the node-to-block
 #' allocation probabilities for the Stochastic Block model.
+#' @param lambda The rate parameter of the zero-truncated Poisson prior on the
+#' number of cluster for the Stochastic Block model.
 #' @param na.action How do you want the function to handle missing data? If
 #' \code{na.action = "listwise"}, listwise deletion is used. If
 #' \code{na.action = "impute"}, missing data are imputed iteratively during the
@@ -147,24 +152,31 @@
 #' The default is \code{TRUE}.
 #'
 #' @return If \code{save = FALSE} (the default), the result is a list of class
-#' ``bgms'' containing the following matrices with model-averaged quantities:
+#' \code{"bgms"} containing the following matrices with model-averaged quantities:
 #' \itemize{
 #' \item \code{indicator}: A matrix with \code{p} rows and \code{p} columns,
 #' containing the posterior inclusion probabilities of individual edges.
 #' \item \code{interactions}: A matrix with \code{p} rows and \code{p} columns,
 #' containing model-averaged posterior means of the pairwise associations.
 #' \item \code{thresholds}: A matrix with \code{p} rows and \code{max(m)}
 #' columns, containing model-averaged category thresholds. In the case of
-#' ``blume-capel'' variables, the first entry is the parameter for the linear
+#' \code{"blume-capel"} variables, the first entry is the parameter for the linear
 #' effect and the second entry is the parameter for the quadratic effect, which
 #' models the offset to the reference category.
-#' \item In the case of
-#' \code{edge_prior = "Stochastic-Block"} two additional elements are returned:
-#' a vector \code{allocations} with the estimated cluster assignments of the nodes and an
-#' table \code{clusters} with the estimated posterior probability of the number
-#' of clusters in the network. The vector of node allocations is calculated using
-#' a method proposed by \insertCite{Dahl2009}{bgms} and also used by
-#' \insertCite{GengEtAl_2019}{bgms}.
+#' \item In the case of \code{edge_prior = "Stochastic-Block"}, two additional
+#' elements are returned:
+#'   \itemize{
+#'     \item A vector \code{allocations} with the estimated cluster assignments
+#'     of the nodes, calculated using a method proposed by \code{Dahl (2009)} and
+#'     also used by \code{Geng et al. (2019)}.
+#'     \item A matrix \code{components} with the estimated posterior probability
+#'     of the number of components (clusters) in the network. These probabilities
+#'     are calculated based on Equation 3.7 in \code{Miller (2018)}, which computes
+#'     the conditional probability of the number of components given the
+#'     number of clusters. The number of clusters is derived from the
+#'     cardinality of the sampled \code{allocations} vector for each iteration of
+#'     the MCMC sampler (see \code{save = TRUE}).
+#'   }
 #' }
 #' If \code{save = TRUE}, the result is a list of class ``bgms'' containing:
 #' \itemize{
@@ -180,8 +192,9 @@
 #' \item In the case of
 #' \code{edge_prior = "Stochastic-Block"} a matrix \code{allocations} with the
 #' cluster assignments of the nodes from each iteration is returned. This matrix
-#' can be used to calculate the posterior probability of the number of clusters
-#' by utilizing the \code{summary_SBM(bgm_output[["allocations"]])} function.
+#' can be used to calculate the posterior probability of the number of components
+#' and estimated cluster assignments of the nodes, by and  utilizing the
+#' \code{summary_SBM()} function.
 #' }
 #' Column averages of these matrices provide the model-averaged posterior means.
 #' Except for the \code{allocations} matrix, for which the \code{summary_SBM}
@@ -292,6 +305,7 @@ bgm = function(x,
                beta_bernoulli_alpha = 1,
                beta_bernoulli_beta = 1,
                dirichlet_alpha = 1,
+               lambda = 1,
                na.action = c("listwise", "impute"),
                save = FALSE,
                display_progress = TRUE) {
@@ -318,7 +332,8 @@ bgm = function(x,
                       inclusion_probability = inclusion_probability,
                       beta_bernoulli_alpha = beta_bernoulli_alpha,
                       beta_bernoulli_beta = beta_bernoulli_beta,
-                      dirichlet_alpha = dirichlet_alpha)
+                      dirichlet_alpha = dirichlet_alpha,
+                      lambda = lambda)
 
   # ----------------------------------------------------------------------------
   # The vector variable_type is now coded as boolean.
@@ -444,6 +459,7 @@ bgm = function(x,
                       beta_bernoulli_alpha = beta_bernoulli_alpha,
                       beta_bernoulli_beta = beta_bernoulli_beta,
                       dirichlet_alpha = dirichlet_alpha,
+                      lambda = lambda,
                       Index = Index,
                       iter = iter,
                       burnin = burnin,
@@ -477,6 +493,7 @@ bgm = function(x,
     beta_bernoulli_alpha = beta_bernoulli_alpha ,
     beta_bernoulli_beta =  beta_bernoulli_beta,
     dirichlet_alpha = dirichlet_alpha,
+    lambda = lambda,
     na.action = na.action,
     save = save,
     version = packageVersion("bgms")
@@ -521,13 +538,15 @@ bgm = function(x,
     if(edge_selection == TRUE) {
       if(edge_prior == "Stochastic-Block"){
 
-        summarySbm <- summary_SBM(cluster_allocations = out$allocations)
+        summarySbm <- summary_SBM(cluster_allocations = out$allocations,
+                                  dirichlet_alpha = dirichlet_alpha,
+                                  lambda = lambda)
 
         output = list(indicator = indicator,
                       interactions = interactions,
                       thresholds = thresholds,
+                      components = summarySbm$components,
                       allocations = summarySbm$allocations,
-                      clusters = summarySbm$no_clusters,
                       arguments = arguments)
       } else {
 

diff --git a/R/utility_functions.R b/R/utility_functions.R
@@ -1,6 +1,7 @@
 #' @importFrom Rcpp evalCpp
 #' @importFrom Rdpack reprompt
 #' @importFrom methods hasArg
+#' @importFrom stats dpois
 
 check_model = function(x,
                        variable_type,
@@ -13,7 +14,8 @@ check_model = function(x,
                        inclusion_probability = 0.5,
                        beta_bernoulli_alpha = 1,
                        beta_bernoulli_beta = 1,
-                       dirichlet_alpha = dirichlet_alpha) {
+                       dirichlet_alpha = dirichlet_alpha,
+                       lambda = lambda) {
 
   #Check variable type input ---------------------------------------------------
   if(length(variable_type) == 1) {
@@ -187,14 +189,14 @@ check_model = function(x,
     }
     if(edge_prior == "Stochastic-Block") {
       theta = matrix(0.5, nrow = ncol(x), ncol = ncol(x))
-      if(beta_bernoulli_alpha <= 0 || beta_bernoulli_beta <= 0 || dirichlet_alpha <= 0)
-        stop("The scale parameters of the beta and Dirichlet distribution need to be positive.")
-      if(!is.finite(beta_bernoulli_alpha) || !is.finite(beta_bernoulli_beta) || !is.finite(dirichlet_alpha))
-        stop("The scale parameters of the beta and Dirichlet distribution need to be finite.")
+      if(beta_bernoulli_alpha <= 0 || beta_bernoulli_beta <= 0 || dirichlet_alpha <= 0 || lambda <= 0)
+        stop("The scale parameters the beta distrubution, the concentration parameter of the Dirichlet distribution and the rate parameter of the Poisson distribution need to be positive.")
+      if(!is.finite(beta_bernoulli_alpha) || !is.finite(beta_bernoulli_beta) || !is.finite(dirichlet_alpha) || !is.finite(lambda))
+        stop("The scale parameters of the beta distrubution, the concentration parameter of the Dirichlet distribution and the rate parameter of the Poisson distribution need to be finite.")
       if(is.na(beta_bernoulli_alpha) || is.na(beta_bernoulli_beta) ||
          is.null(beta_bernoulli_alpha) || is.null(beta_bernoulli_beta) ||
-         is.null(dirichlet_alpha) || is.null(dirichlet_alpha))
-        stop("Values for both scale parameters of the beta and Dirichlet distribution need to be specified.")
+         is.null(dirichlet_alpha) || is.null(dirichlet_alpha) || is.null(lambda) || is.null(lambda))
+        stop("Values for both scale parameters of the betathe concentration parameter of the Dirichlet distribution and the rate parameter of the Poisson distribution need to be specified.")
     }
   } else {
     theta = matrix(0.5, nrow = 1, ncol = 1)
@@ -1013,7 +1015,6 @@ compare_reformat_data = function(x,
 #  Geng, J., Bhattacharya, A., & Pati, D. (2019). Probabilistic Community
 #  Detection With Unknown Number of Communities, Journal of the American
 #  Statistical Association, 114:526, 893-905, DOI:10.1080/01621459.2018.1458618
-
 getDahl <- function(cluster_allocations) {
   # Dimensions of the input matrix
   niters <- nrow(cluster_allocations)  # Number of iterations
@@ -1045,21 +1046,114 @@ getDahl <- function(cluster_allocations) {
   return(DahlAns)
 }
 
+# Calculate the conditional probability of the number of
+# components given the cardinality of a sampled
+# allocation vector based on Equation (3.7) from
+# Miller & Harrison (2018). Mixture Models With a Prior on the Number of
+# Components, Journal of the American Statistical Association, 113:521, 340-356,
+# DOI:10.1080/01621459.2016.1255636
+
+compute_p_k_given_t <- function(t,
+                                log_Vn,
+                                dirichlet_alpha,
+                                no_variables,
+                                lambda) {
+  # Define the K_values
+  K_values <- as.numeric(1:no_variables)
+
+  # Initialize vector for probabilities
+  p_k_given_t <- numeric(length(K_values))
+
+  # Normalization constant for t
+  log_vn_t <- log_Vn[t]
+
+  # Normalizing factor for the truncated Poisson distribution
+  norm_factor <- 1 - dpois(0, lambda)
+  truncated_poisson_pmf <- dpois(K_values, lambda) / norm_factor
+
+  # Loop through each value of K
+  for (i in seq_along(K_values)) {
+    K <- K_values[i]
+    if (K >= t) {
+      # Falling factorial
+      falling_factorial <- prod(K:(K - t + 1))
+      # Rising factorial
+      rising_factorial <- prod((dirichlet_alpha * K) + 0:(no_variables - 1))
+      # Compute log probability
+      log_p_k <- log(falling_factorial) - log(rising_factorial) + log(truncated_poisson_pmf[i]) - log_vn_t
+      # Convert log probability to probability
+      p_k_given_t[i] <- exp(log_p_k)
+    } else {
+      p_k_given_t[i] <- 0
+    }
+  }
+  # Normalize probabilities
+  p_k_given_t <- p_k_given_t / sum(p_k_given_t)
 
-# A function that computes the cluster posterior probabilities and allocations
+  return(p_k_given_t)
+}
 
-summary_SBM <- function(cluster_allocations) {
-  # Compute the number of unique clusters for each iteration
+#' Function for summarizing the block allocation vectors
+#'
+#' This function summarizes the sampled allocation vectors from each iteration
+#' of the Gibbs sampler from the output of  \code{bgm}
+#' when \code{edge_prior = "Stochastic-Block"} and \code{save = TRUE}.
+#'
+#' @param cluster_allocations a matrix of \code{iter} rows and \code{no_variables}
+#' columns containing the sampled cluster allocation vectors from each iteration.
+#' @param dirichlet_alpha the concentration parameter of the Dirichlet prior
+#' @param lambda the Poisson rate parameter
+#' @return Returns a list of two elements: \code{components} and \code{allocations},
+#' containing the posterior probabilties for the number of components (clusters)
+#' and the estimated cluster allocation of the nodes using Dahl's method.
+#' @examples
+#'
+#' # fit a model with the SBM prior
+#' bgm_object <- bgm(Wenchuan[, c(1:5)],
+#'                   edge_prior = "Stochastic-Block",
+#'                   save = TRUE,
+#'                   iter = 1e3)
+#'
+#' s <- summary_SBM(bgm_object$allocations,
+#'                  bgm_object$arguments$dirichlet_alpha,
+#'                  bgm_object$arguments$lambda)
+#'
+#' @export
+summary_SBM <- function(cluster_allocations,
+                        dirichlet_alpha,
+                        lambda) {
+
+  # precompute things
+  no_variables <- ncol(cluster_allocations)
+  log_Vn <- compute_Vn_mfm_sbm(no_variables,
+                               dirichlet_alpha,
+                               no_variables,
+                               lambda)
+  # Compute the number of unique clusters (t) for each iteration
+  # i.e., the cardinality  of the partition z
   clusters <- apply(cluster_allocations, 1, function(row) length(unique(row)))
+  # Compute the conditional probabilities of the number of clusters
+  # for each row in clusters
+  p_k_given_t <- matrix(NA, nrow = length(clusters), ncol = no_variables)
+
+  for (i in 1:length(clusters)) {
+    p_k_given_t[i, ] <- compute_p_k_given_t(clusters[i],
+                                            log_Vn,
+                                            dirichlet_alpha,
+                                            no_variables,
+                                            lambda)
+  }
+  # average across all iterations
+  p_k_given_t <- colMeans(p_k_given_t)
 
-  # Compute the posterior probabilities of the actual unique clusters
-  no_clusters <- table(clusters) / length(clusters)
+  # make it as a matrix
+  no_components <- 1:no_variables
+  components <- cbind(no_components, p_k_given_t)
+  colnames(components) <- c("no_components", "probability")
 
   # Compute the allocations of the nodes based on Dahl's method
   allocations <- getDahl(cluster_allocations)
 
-  # Return the results
-  return(list(no_clusters = no_clusters,
+  return(list(components = components,
               allocations = allocations))
 }
-
diff --git a/bgms.Rproj b/bgms.Rproj
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 2fb4a59f-6132-41a6-9659-f1d3297dc9d0
 
 RestoreWorkspace: Default
 SaveWorkspace: Default

diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib
@@ -1,3 +1,15 @@
+@ARTICLE{miller2018mixture,
+  title={Mixture models with a prior on the number of components},
+  author={Miller, Jeffrey W and Harrison, Matthew T},
+  journal={Journal of the American Statistical Association},
+  volume={113},
+  number={521},
+  pages={340--356},
+  year={2018},
+  doi = {10.1080/01621459.2016.1255636},
+  publisher={Taylor \& Francis}
+}
+
 @article{Dahl2009,
   author       = {David B. Dahl},
   title        = {Modal clustering in a class of product partition models},