Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6caf7fc
Update on the SBM prior: (1) now we save the sampled block allocation…
sekulovskin Dec 5, 2024
6633a70
update the restriction on Kmax
sekulovskin Dec 5, 2024
f74e723
changes: (1) take out the option to save out_cluster_prob and (2) cha…
sekulovskin Dec 6, 2024
4543486
take out saving K within the Gibbs sampler
sekulovskin Dec 6, 2024
f25e2f0
Include the option to calculate the number of clusters and cluster al…
sekulovskin Dec 9, 2024
21bedbc
Merge branch 'main' of https://github.com/sekulovskin/bgms
sekulovskin Dec 9, 2024
02be0e9
update the references
sekulovskin Dec 9, 2024
b83fc61
Merge branch 'MaartenMarsman:main' into main
sekulovskin Dec 10, 2024
73d42b8
Update: Include the Rao-Rlackwellized estimates for the probabilities…
sekulovskin Dec 19, 2024
28394fe
Update: Include the proper calculation of the probability of the numb…
sekulovskin Dec 20, 2024
b8f3de5
update: initial configuration of the cluster memberships.
sekulovskin Dec 23, 2024
7149ae1
delete unnecessary object
sekulovskin Jan 2, 2025
39e3ab0
correction to the truncated Poisson distribution
sekulovskin Jan 3, 2025
6a8ed38
update: randomize the order of the variables in the for loopp in the …
sekulovskin Jan 10, 2025
662d1c6
update: include a user-specified lambda
sekulovskin Jan 13, 2025
12fd23b
update SBM
sekulovskin Jan 21, 2025
42f0e86
update
sekulovskin Jan 21, 2025
2d4a8ac
sloving note: "Undefined global functions or variables: dpois"
sekulovskin Jan 21, 2025
ce8871d
solving notes and warning messages for the checks
sekulovskin Jan 21, 2025
1f09f8d
export the summary_SBM function
sekulovskin Jan 22, 2025
75cca47
export summary_SBM function (needed when save = TRUE, as well as for …
sekulovskin Jan 22, 2025
8564e28
Udate in the documentation
sekulovskin Jan 22, 2025
c7c2568
final update in the documentation
sekulovskin Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@ export(extract_pairwise_interactions)
export(extract_pairwise_thresholds)
export(extract_posterior_inclusion_probabilities)
export(mrfSampler)
export(summary_SBM)
importFrom(Rcpp,evalCpp)
importFrom(Rdpack,reprompt)
importFrom(methods,hasArg)
importFrom(stats,dpois)
importFrom(utils,packageVersion)
useDynLib(bgms, .registration=TRUE)
8 changes: 6 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,15 @@ sample_bcomrf_gibbs <- function(no_states, no_variables, no_categories, interact
.Call(`_bgms_sample_bcomrf_gibbs`, no_states, no_variables, no_categories, interactions, thresholds, variable_type, reference_category, iter)
}

gibbs_sampler <- function(observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save = FALSE, display_progress = FALSE, edge_selection = TRUE) {
.Call(`_bgms_gibbs_sampler`, observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save, display_progress, edge_selection)
gibbs_sampler <- function(observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, lambda, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save = FALSE, display_progress = FALSE, edge_selection = TRUE) {
.Call(`_bgms_gibbs_sampler`, observations, indicator, interactions, thresholds, no_categories, interaction_scale, proposal_sd, proposal_sd_blumecapel, edge_prior, theta, beta_bernoulli_alpha, beta_bernoulli_beta, dirichlet_alpha, lambda, Index, iter, burnin, n_cat_obs, sufficient_blume_capel, threshold_alpha, threshold_beta, na_impute, missing_index, variable_bool, reference_category, save, display_progress, edge_selection)
}

compare_gibbs_sampler <- function(observations_gr1, observations_gr2, no_categories_gr1, no_categories_gr2, interaction_scale, pairwise_difference_scale, main_difference_scale, pairwise_difference_prior, main_difference_prior, inclusion_probability_difference, pairwise_beta_bernoulli_alpha, pairwise_beta_bernoulli_beta, main_beta_bernoulli_alpha, main_beta_bernoulli_beta, Index, iter, burnin, n_cat_obs_gr1, n_cat_obs_gr2, sufficient_blume_capel_gr1, sufficient_blume_capel_gr2, threshold_alpha, threshold_beta, na_impute, missing_index_gr1, missing_index_gr2, ordinal_variable, reference_category, independent_thresholds, save = FALSE, display_progress = FALSE, difference_selection = TRUE) {
.Call(`_bgms_compare_gibbs_sampler`, observations_gr1, observations_gr2, no_categories_gr1, no_categories_gr2, interaction_scale, pairwise_difference_scale, main_difference_scale, pairwise_difference_prior, main_difference_prior, inclusion_probability_difference, pairwise_beta_bernoulli_alpha, pairwise_beta_bernoulli_beta, main_beta_bernoulli_alpha, main_beta_bernoulli_beta, Index, iter, burnin, n_cat_obs_gr1, n_cat_obs_gr2, sufficient_blume_capel_gr1, sufficient_blume_capel_gr2, threshold_alpha, threshold_beta, na_impute, missing_index_gr1, missing_index_gr2, ordinal_variable, reference_category, independent_thresholds, save, display_progress, difference_selection)
}

compute_Vn_mfm_sbm <- function(no_variables, dirichlet_alpha, t_max, lambda) {
.Call(`_bgms_compute_Vn_mfm_sbm`, no_variables, dirichlet_alpha, t_max, lambda)
}

51 changes: 35 additions & 16 deletions R/bgm.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,12 @@
#' when nodes are expected to fall into distinct clusters. The inclusion
#' probabilities for the edges are defined at the level of the clusters, with a
#' beta prior for the unknown inclusion probability with shape parameters
#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, and a Dirichlet
#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, a Dirichlet
#' prior on the cluster assignment probabilities with a common concentration
#' parameter \code{dirichlet_alpha}. The default is
#' parameter \code{dirichlet_alpha} and a zero-truncated Poisson prior on the
#' number of clusters with a rate parameter \code{lambda}, indicating the
#' expected number of clusters.
#' The default is
#' \code{edge_prior = "Bernoulli"}.
#' @param inclusion_probability The prior edge inclusion probability for the
#' Bernoulli model. Can be a single probability, or a matrix of \code{p} rows
Expand All @@ -130,6 +133,8 @@
#' \code{beta_bernoulli_beta = 1}.
#' @param dirichlet_alpha The shape of the Dirichlet prior on the node-to-block
#' allocation probabilities for the Stochastic Block model.
#' @param lambda The rate parameter of the zero-truncated Poisson prior on the
#' number of cluster for the Stochastic Block model.
#' @param na.action How do you want the function to handle missing data? If
#' \code{na.action = "listwise"}, listwise deletion is used. If
#' \code{na.action = "impute"}, missing data are imputed iteratively during the
Expand All @@ -147,24 +152,31 @@
#' The default is \code{TRUE}.
#'
#' @return If \code{save = FALSE} (the default), the result is a list of class
#' ``bgms'' containing the following matrices with model-averaged quantities:
#' \code{"bgms"} containing the following matrices with model-averaged quantities:
#' \itemize{
#' \item \code{indicator}: A matrix with \code{p} rows and \code{p} columns,
#' containing the posterior inclusion probabilities of individual edges.
#' \item \code{interactions}: A matrix with \code{p} rows and \code{p} columns,
#' containing model-averaged posterior means of the pairwise associations.
#' \item \code{thresholds}: A matrix with \code{p} rows and \code{max(m)}
#' columns, containing model-averaged category thresholds. In the case of
#' ``blume-capel'' variables, the first entry is the parameter for the linear
#' \code{"blume-capel"} variables, the first entry is the parameter for the linear
#' effect and the second entry is the parameter for the quadratic effect, which
#' models the offset to the reference category.
#' \item In the case of
#' \code{edge_prior = "Stochastic-Block"} two additional elements are returned:
#' a vector \code{allocations} with the estimated cluster assignments of the nodes and an
#' table \code{clusters} with the estimated posterior probability of the number
#' of clusters in the network. The vector of node allocations is calculated using
#' a method proposed by \insertCite{Dahl2009}{bgms} and also used by
#' \insertCite{GengEtAl_2019}{bgms}.
#' \item In the case of \code{edge_prior = "Stochastic-Block"}, two additional
#' elements are returned:
#' \itemize{
#' \item A vector \code{allocations} with the estimated cluster assignments
#' of the nodes, calculated using a method proposed by \code{Dahl (2009)} and
#' also used by \code{Geng et al. (2019)}.
#' \item A matrix \code{components} with the estimated posterior probability
#' of the number of components (clusters) in the network. These probabilities
#' are calculated based on Equation 3.7 in \code{Miller (2018)}, which computes
#' the conditional probability of the number of components given the
#' number of clusters. The number of clusters is derived from the
#' cardinality of the sampled \code{allocations} vector for each iteration of
#' the MCMC sampler (see \code{save = TRUE}).
#' }
#' }
#' If \code{save = TRUE}, the result is a list of class ``bgms'' containing:
#' \itemize{
Expand All @@ -180,8 +192,9 @@
#' \item In the case of
#' \code{edge_prior = "Stochastic-Block"} a matrix \code{allocations} with the
#' cluster assignments of the nodes from each iteration is returned. This matrix
#' can be used to calculate the posterior probability of the number of clusters
#' by utilizing the \code{summary_SBM(bgm_output[["allocations"]])} function.
#' can be used to calculate the posterior probability of the number of components
#' and estimated cluster assignments of the nodes, by and utilizing the
#' \code{summary_SBM()} function.
#' }
#' Column averages of these matrices provide the model-averaged posterior means.
#' Except for the \code{allocations} matrix, for which the \code{summary_SBM}
Expand Down Expand Up @@ -292,6 +305,7 @@ bgm = function(x,
beta_bernoulli_alpha = 1,
beta_bernoulli_beta = 1,
dirichlet_alpha = 1,
lambda = 1,
na.action = c("listwise", "impute"),
save = FALSE,
display_progress = TRUE) {
Expand All @@ -318,7 +332,8 @@ bgm = function(x,
inclusion_probability = inclusion_probability,
beta_bernoulli_alpha = beta_bernoulli_alpha,
beta_bernoulli_beta = beta_bernoulli_beta,
dirichlet_alpha = dirichlet_alpha)
dirichlet_alpha = dirichlet_alpha,
lambda = lambda)

# ----------------------------------------------------------------------------
# The vector variable_type is now coded as boolean.
Expand Down Expand Up @@ -444,6 +459,7 @@ bgm = function(x,
beta_bernoulli_alpha = beta_bernoulli_alpha,
beta_bernoulli_beta = beta_bernoulli_beta,
dirichlet_alpha = dirichlet_alpha,
lambda = lambda,
Index = Index,
iter = iter,
burnin = burnin,
Expand Down Expand Up @@ -477,6 +493,7 @@ bgm = function(x,
beta_bernoulli_alpha = beta_bernoulli_alpha ,
beta_bernoulli_beta = beta_bernoulli_beta,
dirichlet_alpha = dirichlet_alpha,
lambda = lambda,
na.action = na.action,
save = save,
version = packageVersion("bgms")
Expand Down Expand Up @@ -521,13 +538,15 @@ bgm = function(x,
if(edge_selection == TRUE) {
if(edge_prior == "Stochastic-Block"){

summarySbm <- summary_SBM(cluster_allocations = out$allocations)
summarySbm <- summary_SBM(cluster_allocations = out$allocations,
dirichlet_alpha = dirichlet_alpha,
lambda = lambda)

output = list(indicator = indicator,
interactions = interactions,
thresholds = thresholds,
components = summarySbm$components,
allocations = summarySbm$allocations,
clusters = summarySbm$no_clusters,
arguments = arguments)
} else {

Expand Down
126 changes: 110 additions & 16 deletions R/utility_functions.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#' @importFrom Rcpp evalCpp
#' @importFrom Rdpack reprompt
#' @importFrom methods hasArg
#' @importFrom stats dpois

check_model = function(x,
variable_type,
Expand All @@ -13,7 +14,8 @@ check_model = function(x,
inclusion_probability = 0.5,
beta_bernoulli_alpha = 1,
beta_bernoulli_beta = 1,
dirichlet_alpha = dirichlet_alpha) {
dirichlet_alpha = dirichlet_alpha,
lambda = lambda) {

#Check variable type input ---------------------------------------------------
if(length(variable_type) == 1) {
Expand Down Expand Up @@ -187,14 +189,14 @@ check_model = function(x,
}
if(edge_prior == "Stochastic-Block") {
theta = matrix(0.5, nrow = ncol(x), ncol = ncol(x))
if(beta_bernoulli_alpha <= 0 || beta_bernoulli_beta <= 0 || dirichlet_alpha <= 0)
stop("The scale parameters of the beta and Dirichlet distribution need to be positive.")
if(!is.finite(beta_bernoulli_alpha) || !is.finite(beta_bernoulli_beta) || !is.finite(dirichlet_alpha))
stop("The scale parameters of the beta and Dirichlet distribution need to be finite.")
if(beta_bernoulli_alpha <= 0 || beta_bernoulli_beta <= 0 || dirichlet_alpha <= 0 || lambda <= 0)
stop("The scale parameters the beta distrubution, the concentration parameter of the Dirichlet distribution and the rate parameter of the Poisson distribution need to be positive.")
if(!is.finite(beta_bernoulli_alpha) || !is.finite(beta_bernoulli_beta) || !is.finite(dirichlet_alpha) || !is.finite(lambda))
stop("The scale parameters of the beta distrubution, the concentration parameter of the Dirichlet distribution and the rate parameter of the Poisson distribution need to be finite.")
if(is.na(beta_bernoulli_alpha) || is.na(beta_bernoulli_beta) ||
is.null(beta_bernoulli_alpha) || is.null(beta_bernoulli_beta) ||
is.null(dirichlet_alpha) || is.null(dirichlet_alpha))
stop("Values for both scale parameters of the beta and Dirichlet distribution need to be specified.")
is.null(dirichlet_alpha) || is.null(dirichlet_alpha) || is.null(lambda) || is.null(lambda))
stop("Values for both scale parameters of the betathe concentration parameter of the Dirichlet distribution and the rate parameter of the Poisson distribution need to be specified.")
}
} else {
theta = matrix(0.5, nrow = 1, ncol = 1)
Expand Down Expand Up @@ -1013,7 +1015,6 @@ compare_reformat_data = function(x,
# Geng, J., Bhattacharya, A., & Pati, D. (2019). Probabilistic Community
# Detection With Unknown Number of Communities, Journal of the American
# Statistical Association, 114:526, 893-905, DOI:10.1080/01621459.2018.1458618

getDahl <- function(cluster_allocations) {
# Dimensions of the input matrix
niters <- nrow(cluster_allocations) # Number of iterations
Expand Down Expand Up @@ -1045,21 +1046,114 @@ getDahl <- function(cluster_allocations) {
return(DahlAns)
}

# Calculate the conditional probability of the number of
# components given the cardinality of a sampled
# allocation vector based on Equation (3.7) from
# Miller & Harrison (2018). Mixture Models With a Prior on the Number of
# Components, Journal of the American Statistical Association, 113:521, 340-356,
# DOI:10.1080/01621459.2016.1255636

compute_p_k_given_t <- function(t,
log_Vn,
dirichlet_alpha,
no_variables,
lambda) {
# Define the K_values
K_values <- as.numeric(1:no_variables)

# Initialize vector for probabilities
p_k_given_t <- numeric(length(K_values))

# Normalization constant for t
log_vn_t <- log_Vn[t]

# Normalizing factor for the truncated Poisson distribution
norm_factor <- 1 - dpois(0, lambda)
truncated_poisson_pmf <- dpois(K_values, lambda) / norm_factor

# Loop through each value of K
for (i in seq_along(K_values)) {
K <- K_values[i]
if (K >= t) {
# Falling factorial
falling_factorial <- prod(K:(K - t + 1))
# Rising factorial
rising_factorial <- prod((dirichlet_alpha * K) + 0:(no_variables - 1))
# Compute log probability
log_p_k <- log(falling_factorial) - log(rising_factorial) + log(truncated_poisson_pmf[i]) - log_vn_t
# Convert log probability to probability
p_k_given_t[i] <- exp(log_p_k)
} else {
p_k_given_t[i] <- 0
}
}
# Normalize probabilities
p_k_given_t <- p_k_given_t / sum(p_k_given_t)

# A function that computes the cluster posterior probabilities and allocations
return(p_k_given_t)
}

summary_SBM <- function(cluster_allocations) {
# Compute the number of unique clusters for each iteration
#' Function for summarizing the block allocation vectors
#'
#' This function summarizes the sampled allocation vectors from each iteration
#' of the Gibbs sampler from the output of \code{bgm}
#' when \code{edge_prior = "Stochastic-Block"} and \code{save = TRUE}.
#'
#' @param cluster_allocations a matrix of \code{iter} rows and \code{no_variables}
#' columns containing the sampled cluster allocation vectors from each iteration.
#' @param dirichlet_alpha the concentration parameter of the Dirichlet prior
#' @param lambda the Poisson rate parameter
#' @return Returns a list of two elements: \code{components} and \code{allocations},
#' containing the posterior probabilties for the number of components (clusters)
#' and the estimated cluster allocation of the nodes using Dahl's method.
#' @examples
#'
#' # fit a model with the SBM prior
#' bgm_object <- bgm(Wenchuan[, c(1:5)],
#' edge_prior = "Stochastic-Block",
#' save = TRUE,
#' iter = 1e3)
#'
#' s <- summary_SBM(bgm_object$allocations,
#' bgm_object$arguments$dirichlet_alpha,
#' bgm_object$arguments$lambda)
#'
#' @export
summary_SBM <- function(cluster_allocations,
dirichlet_alpha,
lambda) {

# precompute things
no_variables <- ncol(cluster_allocations)
log_Vn <- compute_Vn_mfm_sbm(no_variables,
dirichlet_alpha,
no_variables,
lambda)
# Compute the number of unique clusters (t) for each iteration
# i.e., the cardinality of the partition z
clusters <- apply(cluster_allocations, 1, function(row) length(unique(row)))
# Compute the conditional probabilities of the number of clusters
# for each row in clusters
p_k_given_t <- matrix(NA, nrow = length(clusters), ncol = no_variables)

for (i in 1:length(clusters)) {
p_k_given_t[i, ] <- compute_p_k_given_t(clusters[i],
log_Vn,
dirichlet_alpha,
no_variables,
lambda)
}
# average across all iterations
p_k_given_t <- colMeans(p_k_given_t)

# Compute the posterior probabilities of the actual unique clusters
no_clusters <- table(clusters) / length(clusters)
# make it as a matrix
no_components <- 1:no_variables
components <- cbind(no_components, p_k_given_t)
colnames(components) <- c("no_components", "probability")

# Compute the allocations of the nodes based on Dahl's method
allocations <- getDahl(cluster_allocations)

# Return the results
return(list(no_clusters = no_clusters,
return(list(components = components,
allocations = allocations))
}

1 change: 1 addition & 0 deletions bgms.Rproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Version: 1.0
ProjectId: 2fb4a59f-6132-41a6-9659-f1d3297dc9d0

RestoreWorkspace: Default
SaveWorkspace: Default
Expand Down
12 changes: 12 additions & 0 deletions inst/REFERENCES.bib
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
@ARTICLE{miller2018mixture,
title={Mixture models with a prior on the number of components},
author={Miller, Jeffrey W and Harrison, Matthew T},
journal={Journal of the American Statistical Association},
volume={113},
number={521},
pages={340--356},
year={2018},
doi = {10.1080/01621459.2016.1255636},
publisher={Taylor \& Francis}
}

@article{Dahl2009,
author = {David B. Dahl},
title = {Modal clustering in a class of product partition models},
Expand Down
Loading
Loading