Commit
initial commit, version 0.1 - base version
Showing 38 changed files with 1,317 additions and 1 deletion.
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
@@ -31,3 +31,4 @@ vignettes/*.pdf
# Temporary files created by R markdown
*.utf8.md
*.knit.md
.Rproj.user
@@ -0,0 +1,21 @@
Package: hdcd
Type: Package
Title: High-dimensional Changepoint Detection
Version: 0.1.0
Author: Lorenz Haubner
Maintainer: Lorenz Haubner <[email protected]>
Description: The hdcd package provides tools for detecting changepoints in high-dimensional sequential data that are independent but not identically distributed.
License: Apache License Version 2.0
Encoding: UTF-8
LazyData: true
Imports: MASS,
    glasso,
    glmnet,
    clues,
    huge,
    data.tree,
    doParallel,
    foreach
Suggests: testthat
Depends:
RoxygenNote: 6.0.1
@@ -0,0 +1,13 @@
# Generated by roxygen2: do not edit by hand

S3method(plot,bs_cv)
S3method(print,bs_tree)
export(BinarySegmentation)
export(ChainNetwork)
export(CompareChangepointsRand)
export(CreateModel)
export(CrossValidation)
export(HubNetwork)
export(PruneTreeGamma)
export(RandomNetwork)
export(SimulateFromModel)
@@ -0,0 +1,141 @@
#' BinarySegmentation
#'
#' Uses the binary segmentation algorithm to recursively build a binary tree of the data sequence
#' for given tuning parameters lambda and delta. The tree can then be pruned in order to obtain
#' a changepoint estimate.
#'
#' @param x An n times p matrix for which to find the best splitting point.
#' @param delta Value between 0 and 1. Tuning parameter which determines the minimal segment size
#'   proportional to the size of the dataset and hence an upper bound for the number of changepoints.
#' @param lambda Sparsity penalty parameter in the single lasso fits.
#' @param method Which method should be used for fitting the model? See defaults for possible choices.
#' @param penalize_diagonal Boolean, should the diagonal elements of the precision matrix be penalized?
#' @param threshold The threshold for halting the iteration in glasso or glmnet. In the former it controls the change of single parameters,
#'   in the latter it controls the total objective value.
#' @param use_ternary_search Use a ternary search algorithm in each level of the recursion to find a local optimum (EXPERIMENTAL).
#'
#' @return An object of class \strong{bs_tree}
#' @export
#'
#' @examples
#' dat <- SimulateFromModel(CreateModel(n_segments = 2, n = 100, p = 30, ChainNetwork))
#' res <- BinarySegmentation(dat, delta = 0.1, lambda = 0.01, method = "summed_regression")
#' print(res)
BinarySegmentation <- function(x, delta, lambda,
                               method = c("glasso", "nodewise_regression", "summed_regression", "ratio_regression"),
                               threshold = 1e-7,
                               penalize_diagonal = F,
                               use_ternary_search = F,
                               ...) {
  meth <- match.arg(method)
  SegmentLossFUN <- SegmentLoss(n_obs = nrow(x), lambda = lambda, penalize_diagonal = penalize_diagonal, method = meth, threshold = threshold, ...)

  tree <- data.tree::Node$new("bs_tree", start = 1, end = nrow(x))

  Rec <- function(x, n_obs, delta, SegmentLossFUN, node = tree, use_ternary_search) {
    n_selected_obs <- nrow(x)

    if (n_selected_obs / n_obs >= 2 * delta) { # check whether segment is still long enough

      res <- FindBestSplit(x, delta, n_obs, use_ternary_search, SegmentLossFUN)

      node$min_loss <- min(res[["loss"]])
      node$loss <- res[["loss"]]
      node$segment_loss <- res[["segment_loss"]]

      split_point <- res[["opt_split"]]

      if (is.na(split_point)) {
        return(NA)
      } else {
        start <- node$start

        child_left <- node$AddChild(as.character(start), start = start, end = start + split_point - 1)
        alpha_left <- Rec(x[1:(split_point - 1), , drop = F], n_obs, delta, SegmentLossFUN, child_left, use_ternary_search)

        child_right <- node$AddChild(as.character(start + split_point - 1), start = start + split_point - 1, end = start + n_selected_obs - 1)
        alpha_right <- Rec(x[split_point:n_selected_obs, , drop = F], n_obs, delta, SegmentLossFUN, child_right, use_ternary_search)
      }
    } else {
      return(NA)
    }
  }
  Rec(x, nrow(x), delta, SegmentLossFUN, use_ternary_search = use_ternary_search)
  class(tree) <- c("bs_tree", class(tree))
  tree
}

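A minimal usage sketch of the function above: it fits the tree on simulated data and prunes it for one gamma value. This is not part of the commit; it assumes the package is installed under its DESCRIPTION name hdcd and follows the PruneTreeGamma call pattern used in CrossValidation later in this diff, with an arbitrary gamma_max value.

# Sketch only (not part of the commit): fit and prune a bs_tree.
library(hdcd)

dat  <- SimulateFromModel(CreateModel(n_segments = 2, n = 100, p = 30, ChainNetwork))
tree <- BinarySegmentation(dat, delta = 0.1, lambda = 0.01, method = "summed_regression")

# min_loss recorded at the root by Rec(): loss of the best first split
tree$Get(function(node) node$min_loss, filterFun = data.tree::isRoot)

pruned <- PruneTreeGamma(tree, 1)  # gamma_max = 1 is an arbitrary illustration
pruned$cpts[[1]]                   # changepoint candidates for the first gamma value
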
#' FindBestSplit
#'
#' Uses the segment loss function for each possible split into two segments and
#' returns the best splitting index as well as the loss for each candidate.
#'
#' @inheritParams BinarySegmentation
#' @param SegmentLossFUN A loss function as created by \code{\link{SegmentLoss}}
FindBestSplit <- function(x, delta, n_obs, use_ternary_search, SegmentLossFUN) {
  obs_count <- nrow(x)
  min_seg_length <- ceiling(delta * n_obs)

  if (obs_count < 2 * min_seg_length || obs_count < 4)
    return(list(opt_split = NA, loss = NA))

  split_candidates <- seq(
    max(3, min_seg_length + 1),
    min(obs_count - 1, obs_count - min_seg_length + 1), 1
  )

  segment_loss <- SegmentLossFUN(x)

  if (use_ternary_search) {
    result <- TernarySearch(split_candidates, 1, length(split_candidates), x, SegmentLossFUN)
  } else {
    loss <- numeric(length(split_candidates))
    for (i in seq_along(split_candidates)) {
      loss[i] <- SplitLoss(x, split_point = split_candidates[i], SegmentLossFUN)
    }
    result <- list(opt_split = split_candidates[which.min(loss)], loss = loss)
  }

  if (round(min(result[["loss"]]), 15) >= round(segment_loss, 15))
    list(opt_split = NA, loss = result[["loss"]], segment_loss = segment_loss)
  else
    list(opt_split = result[["opt_split"]], loss = result[["loss"]], segment_loss = segment_loss) # split so that the minimum stays in the left segment (design choice)
}

#' TernarySearch
#'
#' Uses a ternary search technique to recursively find a local minimum for the next split.
#'
#' @param split_candidates A vector of indices of possible split points
#' @param x An n times p data matrix
#' @param left Start index on the left
#' @param right Start index on the right
#' @param SegmentLossFUN A loss function as created by \code{\link{SegmentLoss}}
TernarySearch <- function(split_candidates, left, right, x, SegmentLossFUN) {

  # Stopping condition for the recursion
  if (abs(left - right) <= 2) {

    # Check all three remaining points for the minimum
    inds <- split_candidates[left:right]
    loss <- sapply(inds, function(y) SplitLoss(x, y, SegmentLossFUN = SegmentLossFUN))

    return(list(opt_split = inds[which.min(loss)], loss = min(loss)))
  }

  # Calculate the new grid points
  left_third <- ceiling((right - left) * 1 / 3) + left
  right_third <- ceiling((right - left) * 2 / 3) + left

  # Evaluate the loss for splitting at those points
  loss_left <- SplitLoss(x, split_candidates[left_third], SegmentLossFUN = SegmentLossFUN)
  loss_right <- SplitLoss(x, split_candidates[right_third], SegmentLossFUN = SegmentLossFUN)

  # Discard the outer segment with the higher loss and recurse on the remaining indices
  if (loss_left < loss_right) {
    TernarySearch(split_candidates, left, right_third, x, SegmentLossFUN)
  } else {
    TernarySearch(split_candidates, left_third, right, x, SegmentLossFUN)
  }
}

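To make the search strategy concrete, here is a small standalone illustration of the same ternary-search recursion on a generic unimodal loss over indices. It is a sketch for exposition only; the function ternary_min is hypothetical and does not use the package's SplitLoss or SegmentLossFUN.

# Illustration only (not part of the commit): ternary search over indices of a unimodal loss.
ternary_min <- function(loss, left, right) {
  if (right - left <= 2) {                     # few enough points left: check them all
    inds <- left:right
    return(inds[which.min(sapply(inds, loss))])
  }
  left_third  <- left + ceiling((right - left) * 1 / 3)
  right_third <- left + ceiling((right - left) * 2 / 3)
  if (loss(left_third) < loss(right_third)) {  # drop the outer third with the higher loss
    ternary_min(loss, left, right_third)
  } else {
    ternary_min(loss, left_third, right)
  }
}

ternary_min(function(i) (i - 37)^2, 1, 100)  # finds the minimum at index 37
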
@@ -0,0 +1,121 @@
#' CrossValidation
#'
#' EXPERIMENTAL. Cross-validation for the desired method and parameter combinations.
#' Evaluating lambda values requires refitting the model, whereas gamma values can be evaluated using a
#' single fit.
#'
#' Will make use of a registered parallel backend over the folds. This will be changed to parallelize over lambdas
#' in each fold to make use of all nodes if the number of folds is smaller than the number of nodes.
#'
#' @inheritParams BinarySegmentation
#' @param n_folds Number of folds. Test data will be selected equi-spaced, i.e. every n_fold-th observation.
#' @param gamma_max If NULL the range of gamma will be determined by doing an extra fit on the full model and taking the
#'   difference between the full segment loss and the loss of the first split.
#' @param verbose If TRUE additional information will be printed.
#'
#' @return A nested list with the cv results and the full fitted models for each gamma, lambda combination.
#' @export
#'
#' @examples
#' dat <- SimulateFromModel(CreateModel(n_segments = 2, n = 100, p = 30, ChainNetwork))
#' CrossValidation(dat, delta = 0.1, method = "summed_regression")
CrossValidation <- function(x, delta,
                            lambda = seq(0.01, 0.1, 0.01),
                            method = c("summed_regression", "ratio_regression"),
                            threshold = 1e-4,
                            penalize_diagonal = F,
                            use_ternary_search = F,
                            n_folds = 10,
                            gamma_max = NULL,
                            verbose = T) {
  n_obs <- nrow(x)
  n_p <- ncol(x)
  mth <- match.arg(method)
  `%dopar%` <- foreach::`%dopar%`

  if (is.null(gamma_max)) {
    tree <- BinarySegmentation(x = x, delta = delta, lambda = min(lambda), method = mth,
                               penalize_diagonal = penalize_diagonal, use_ternary_search = use_ternary_search)
    gamma_max <- tree$Get(function(x) abs(x$min_loss - x$segment_loss), filterFun = data.tree::isRoot)
    rm(tree)
  }

  folds <- seq_len(n_folds)

  cv_results <- foreach::foreach(fold = folds, .inorder = F, .packages = "hdcd", .verbose = verbose) %dopar% {
    if (verbose) cat("\n CV fold ", fold, " of ", n_folds, "\n")

    test_inds <- seq(fold, n_obs, n_folds)
    train_inds <- setdiff(1:n_obs, test_inds)
    n_g <- length(test_inds)
    fold_results <- list()

    for (lam in seq_along(lambda)) {
      tree <- BinarySegmentation(x = x[train_inds, ], delta = delta, lambda = lambda[lam],
                                 method = mth, penalize_diagonal = penalize_diagonal, use_ternary_search = use_ternary_search)
      res <- PruneTreeGamma(tree, gamma_max)

      rss_gamma <- numeric(length(res$gamma))
      cpts <- list()
      for (gam in seq_along(res$gamma)) {
        fit <- FullRegression(x[train_inds, ], cpts = res$cpts[[gam]], lambda = lambda[lam])

        segment_bounds <- c(1, train_inds[res$cpts[[gam]]], n_obs) # transform cpts back to original indices

        rss <- 0
        for (seg in seq_along(fit[[1]])) {

          wi <- fit$est_coeffs[[seg]]
          intercepts <- fit$est_intercepts[[seg]]

          segment_test_inds <- test_inds[which(test_inds >= segment_bounds[seg] & test_inds < segment_bounds[seg + 1])]

          if (length(segment_test_inds) == 0) {
            warning("Segment had no test data. Consider reducing the number of folds.")
            next
          }

          rss <- rss + sum(sapply(1:n_p, function(z) RSS(x[segment_test_inds, -z], x[segment_test_inds, z, drop = F], wi[-z, z, drop = F], intercepts[z]))) / n_obs
        }
        rss_gamma[gam] <- rss / n_g
        cpts[[gam]] <- segment_bounds[-c(1, length(segment_bounds))]
      }
      fold_results[[lam]] <- list(rss = rss_gamma, gamma = res$gamma, cpts = cpts, tree = tree, lambda = lambda[lam])
    }
    fold_results
  }
  class(cv_results) <- "bs_cv"
  cv_results
}

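Since the foreach loop above uses %dopar%, a parallel backend should be registered before calling CrossValidation. A minimal sketch under that assumption (worker count and n_folds chosen arbitrarily):

# Sketch only (not part of the commit): register a backend, then cross-validate.
library(hdcd)
library(doParallel)
registerDoParallel(cores = 2)  # arbitrary number of workers

dat <- SimulateFromModel(CreateModel(n_segments = 2, n = 100, p = 30, ChainNetwork))
cv  <- CrossValidation(dat, delta = 0.1, method = "summed_regression", n_folds = 5, verbose = FALSE)
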
RSS <- function(x, y, beta, intercepts) {
  sum((y - x %*% beta - intercepts)^2)
}

#' plot.bs_cv
#'
#' S3 method for plotting the results of cross-validation. Only works for a single lambda value at the moment.
#'
#' EXPERIMENTAL
#'
#' @param cv_results An object of class \strong{bs_cv}
#'
#' @export
plot.bs_cv <- function(cv_results) {

  n_cpts <- sapply(cv_results, function(x) sapply(x[["cpts"]], function(y) length(y)))
  gamma <- cv_results[[1]][["gamma"]]
  rss <- rowMeans(sapply(cv_results, function(x) x[["rss"]]))
  sd <- apply(sapply(cv_results, function(x) x[["rss"]]), 1, sd)

  par(mar = c(5, 4, 4, 4) + 0.3) # leave space for the second y axis
  plot(gamma, rss, ylim = c(0, 1.1 * max(rss + sd)), col = "red", pch = 18, ylab = "rss / n_obs")
  segments(gamma, rss - sd, gamma, rss + sd)
  epsilon <- 0.0005
  segments(gamma - epsilon, rss - sd, gamma + epsilon, rss - sd)
  segments(gamma - epsilon, rss + sd, gamma + epsilon, rss + sd)
  par(new = TRUE)
  plot(gamma, n_cpts[, 1], type = "l", axes = FALSE, bty = "n", xlab = "", ylab = "", col = rgb(0, 0, 1, alpha = 0.2))
  for (i in 2:ncol(n_cpts)) {
    lines(gamma, n_cpts[, i], col = rgb(0, 0, 1, alpha = 0.2))
  }
  axis(side = 4, at = pretty(range(n_cpts[, 1])))
  mtext("n_cpts", side = 4, line = 3)
}

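Continuing the sketch after CrossValidation, and given the note that the method only handles a single lambda value, one way to call it is on the per-fold results for one lambda. This assumes the intended input shape is a list with one element per fold carrying gamma, rss and cpts directly, which is how the function indexes cv_results; the cv_lambda1 object is an illustration, not part of the commit.

# Sketch only (not part of the commit): reduce the nested cv result to one lambda
# per fold, keep the bs_cv class so plot() dispatches, and draw the cv curve.
cv_lambda1 <- lapply(cv, function(fold) fold[[1]])
class(cv_lambda1) <- "bs_cv"
plot(cv_lambda1)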