
Commit

initial commit, version 0.1 - base version
lorenzha committed Nov 18, 2017
1 parent 7c6c23d commit 69ed8fa
Showing 38 changed files with 1,317 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
1 change: 1 addition & 0 deletions .gitignore
@@ -31,3 +31,4 @@ vignettes/*.pdf
# Temporary files created by R markdown
*.utf8.md
*.knit.md
.Rproj.user
21 changes: 21 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,21 @@
Package: hdcd
Type: Package
Title: High-dimensional Changepoint Detection
Version: 0.1.0
Author: Lorenz Haubner
Maintainer: Lorenz Haubner <[email protected]>
Description: The hdcd package provides tools for the detection of changepoints in high-dimensional, sequential data that are independent but not identically distributed.
License: Apache License Version 2.0
Encoding: UTF-8
LazyData: true
Imports: MASS,
glasso,
glmnet,
clues,
huge,
data.tree,
doParallel,
foreach
Suggests: testthat
Depends:
RoxygenNote: 6.0.1
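
A minimal sketch of setting up the dependencies listed under Imports and installing the package from a local checkout of this repository; the local path "hdcd/" and the use of devtools are assumptions for illustration, not part of this commit:

# Dependencies from Imports (assumed to be available on CRAN at this time)
install.packages(c("MASS", "glasso", "glmnet", "clues", "huge",
                   "data.tree", "doParallel", "foreach"))
# Install hdcd from a local clone of the repository (hypothetical path)
devtools::install("hdcd/")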
13 changes: 13 additions & 0 deletions NAMESPACE
@@ -0,0 +1,13 @@
# Generated by roxygen2: do not edit by hand

S3method(plot,bs_cv)
S3method(print,bs_tree)
export(BinarySegmentation)
export(ChainNetwork)
export(CompareChangepointsRand)
export(CreateModel)
export(CrossValidation)
export(HubNetwork)
export(PruneTreeGamma)
export(RandomNetwork)
export(SimulateFromModel)
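
The exports above already outline the intended workflow: simulate data from a model, run binary segmentation, and prune the resulting tree. A minimal sketch combining them, based on the example calls in this commit; the gamma value passed to PruneTreeGamma is an arbitrary illustration, and the accessed fields follow the structure used inside CrossValidation below:

library(hdcd)

# Simulate a sequence with one changepoint from a chain-network model
dat <- SimulateFromModel(CreateModel(n_segments = 2, n = 100, p = 30, ChainNetwork))

# Build the binary segmentation tree and prune it up to gamma = 0.05 (illustrative value)
tree <- BinarySegmentation(dat, delta = 0.1, lambda = 0.01, method = "summed_regression")
res <- PruneTreeGamma(tree, 0.05)
res$cpts  # candidate changepoint sets, one entry per evaluated gamma
print(tree)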
141 changes: 141 additions & 0 deletions R/binary_segmentation.R
@@ -0,0 +1,141 @@
#' BinarySegmentation
#'
#' Uses the binary segmentation algorithm to recursively build a binary tree of the data sequence,
#' given the tuning parameters lambda and delta. The tree can then be pruned in order to obtain
#' a changepoint estimate.
#'
#' @param x An n times p matrix for which to find the best splitting point.
#' @param delta Value between 0 and 1. Tuning param which determines the minimal segment size
#' proportional to the size of the dataset and hence an upper bound for the number of changepoints.
#' @param lambda Sparsity penalty parameter in single lasso fits.
#' @param method Which method should be used for fitting the model? See defaults for possible choices.
#' @param penalize_diagonal Boolean, should the diagonal elements of the precision matrix be penalized?
#' @param threshold The threshold for halting the iteration in glasso or glmnet. In the former it controls the change of single parameters;
#' in the latter it controls the total objective value.
#' @param use_ternary_search Use a ternary search algorithm in each level of the recursion to find a local optimum (EXPERIMENTAL)
#'
#'
#' @return An object of class \strong{bs_tree}
#' @export
#'
#' @examples
#' dat <- SimulateFromModel(CreateModel(n_segments = 2,n = 100,p = 30, ChainNetwork))
#' res <- BinarySegmentation(dat, delta = 0.1, lambda = 0.01, method = "summed_regression")
#' print(res)
BinarySegmentation <- function(x, delta, lambda,
method = c("glasso", "nodewise_regression", "summed_regression", "ratio_regression"),
threshold = 1e-7,
penalize_diagonal = F,
use_ternary_search = F,
...) {
meth <- match.arg(method)
SegmentLossFUN <- SegmentLoss(n_obs = nrow(x), lambda = lambda, penalize_diagonal = penalize_diagonal, method = meth, threshold = threshold, ...)

tree <- data.tree::Node$new("bs_tree", start = 1, end = nrow(x))

Rec <- function(x, n_obs, delta, SegmentLossFUN, node = tree, use_ternary_search) {
n_selected_obs <- nrow(x)

if (n_selected_obs / n_obs >= 2 * delta) { # check whether segment is still long enough

res <- FindBestSplit(x, delta, n_obs, use_ternary_search, SegmentLossFUN)

node$min_loss <- min(res[["loss"]])
node$loss <- res[["loss"]]
node$segment_loss <- res[["segment_loss"]]

split_point <- res[["opt_split"]]

if (is.na(split_point)) {
return(NA)
} else {
start <- node$start

child_left <- node$AddChild(as.character(start), start = start, end = start + split_point - 1)
alpha_left <- Rec(x[1:(split_point - 1),, drop = F], n_obs, delta, SegmentLossFUN, child_left, use_ternary_search)

child_right <- node$AddChild(as.character(start + split_point - 1), start = start + split_point - 1, end = start + n_selected_obs -1)
alpha_right <- Rec(x[split_point:n_selected_obs,, drop = F], n_obs, delta, SegmentLossFUN, child_right, use_ternary_search)
}
} else {
return(NA)
}
}
Rec(x, nrow(x), delta, SegmentLossFUN, use_ternary_search = use_ternary_search)
class(tree) <- c("bs_tree", class(tree))
tree
}

#' FindBestSplit
#'
#' Uses the segment_loss function for each possible split into two segments and
#' returns the best splitting index as well as the loss for each candidate.
#'
#' @inheritParams BinarySegmentation
#' @param SegmentLossFUN A loss function as created by \code{\link{SegmentLoss}}
FindBestSplit <- function(x, delta, n_obs, use_ternary_search, SegmentLossFUN) {
obs_count <- nrow(x)
min_seg_length <- ceiling(delta * n_obs)

if (obs_count < 2 * min_seg_length || obs_count < 4)
return(list(opt_split = NA, loss = NA))

split_candidates <- seq(
max(3, min_seg_length + 1),
min(obs_count - 1, obs_count - min_seg_length + 1), 1
)

segment_loss <- SegmentLossFUN(x)

if (use_ternary_search) {
result <- TernarySearch(split_candidates, 1, length(split_candidates), x, SegmentLossFUN)
} else {
loss <- numeric(length(split_candidates))
for (i in seq_along(split_candidates)) {
loss[i] <- SplitLoss(x, split_point = split_candidates[i], SegmentLossFUN)
}
result <- list(opt_split = split_candidates[which.min(loss)], loss = loss)
}

if (round(min(result[["loss"]]), 15) >= round(segment_loss, 15))
list(opt_split = NA, loss = result[["loss"]], segment_loss = segment_loss)
else
list(opt_split = result[["opt_split"]], loss = result[["loss"]], segment_loss = segment_loss) # Add one since we want to split so that the minimum stays in the left segment (design choice)
}

#' TernarySearch
#'
#' Use a ternary search technique to find a local minimum for the next split recursively.
#'
#' @param split_candidates A vector of indices of possible split points
#' @param x An n times p data matrix
#' @param left start index on the left
#' @param right start index on the right
#' @param SegmentLossFUN A loss function as created by \code{\link{SegmentLoss}}
TernarySearch <- function(split_candidates, left, right, x, SegmentLossFUN) {

# Stopping condition for recursion
if (abs(left - right) <= 2) {

# Check all three remaining points for the minimum
inds <- split_candidates[left:right]
loss <- sapply(inds, function(y) SplitLoss(x, y, SegmentLossFUN = SegmentLossFUN))

return(list(opt_split = inds[which.min(loss)], loss = min(loss)))
}

# Calculate the new grid points
left_third <- ceiling((right - left) * 1 / 3) + left
right_third <- ceiling((right - left) * 2 / 3) + left

# Evaluate loss for splitting at those points
loss_left <- SplitLoss(x, split_candidates[left_third], SegmentLossFUN = SegmentLossFUN)
loss_right <- SplitLoss(x, split_candidates[right_third], SegmentLossFUN = SegmentLossFUN)

# Discard the outer third with the higher loss and recurse on the remaining interval
if (loss_left < loss_right) {
TernarySearch(split_candidates, left, right_third, x, SegmentLossFUN)
} else {
TernarySearch(split_candidates, left_third, right, x, SegmentLossFUN)
}
}
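
TernarySearch discards roughly one third of the candidate range per recursion, so it evaluates the loss at O(log n) split points instead of all of them, at the price of only finding a local minimum unless the loss curve is unimodal. A self-contained sketch of the same search pattern on a toy quadratic loss, independent of the package (all names are illustrative):

# Toy loss over candidate indices, unimodal with its minimum at index 60
loss_at <- function(i) (i - 60)^2

ternary_min <- function(left, right, f) {
  if (right - left <= 2) {                 # few candidates left: check them all
    idx <- left:right
    return(idx[which.min(sapply(idx, f))])
  }
  m1 <- left + floor((right - left) / 3)   # two interior grid points
  m2 <- right - floor((right - left) / 3)
  if (f(m1) < f(m2)) ternary_min(left, m2, f)  # drop the rightmost third
  else ternary_min(m1, right, f)               # drop the leftmost third
}

ternary_min(1, 100, loss_at)  # returns 60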
121 changes: 121 additions & 0 deletions R/cross_validation.R
@@ -0,0 +1,121 @@
#' CrossValidation
#'
#' EXPERIMENTAL. Cross-validation for the desired method and parameter combinations.
#' Evaluating lambda values will refit the model, whereas gamma values can be evaluated using a
#' single fit.
#'
#' Will make use of a registered parallel backend over the folds. This will be changed to parallelize over lambdas
#' in each fold to make use of all nodes if the number of folds is smaller than the number of nodes.
#'
#' @inheritParams BinarySegmentation
#' @param n_folds Number of folds. Test data will be selected equi-spaced, i.e. every n_fold-th observation.
#' @param gamma_max If NULL the range of gamma will be determined by doing an extra fit on the full model and taking the
#' difference between the full segment loss and the loss of the first split.
#' @param verbose If TRUE additional information will be printed.
#'
#' @return A nested list with the cv results and the full fitted models for each gamma, lambda combination.
#' @export
#'
#' @examples
#' dat <- SimulateFromModel(CreateModel(n_segments = 2,n = 100,p = 30, ChainNetwork))
#' CrossValidation(dat, delta = 0.1, method = "summed_regression")
CrossValidation <- function(x, delta,
lambda = seq(0.01, 0.1, 0.01),
method = c("summed_regression", "ratio_regression"),
threshold = 1e-4,
penalize_diagonal = F,
use_ternary_search = F,
n_folds = 10,
gamma_max = NULL,
verbose = T) {
n_obs <- nrow(x)
n_p <- ncol(x)
mth <- match.arg(method)
`%dopar%` <- foreach::`%dopar%`

if(is.null(gamma_max)){
tree <- BinarySegmentation(x = x, delta = delta, lambda = min(lambda), method = mth,
penalize_diagonal = penalize_diagonal, use_ternary_search = use_ternary_search)
gamma_max <- tree$Get(function(x) abs(x$min_loss - x$segment_loss), filterFun = data.tree::isRoot)
rm(tree)
}

folds <- seq_len(n_folds)

cv_results <- foreach::foreach(fold = folds, .inorder = F, .packages = "hdcd", .verbose = verbose) %dopar% {
if (verbose) cat("\n CV fold ",fold, " of ",n_folds,"\n")

test_inds <- seq(fold, n_obs, n_folds)
train_inds <- setdiff(1:n_obs, test_inds)
n_g <- length(test_inds)
fold_results <- list()

for (lam in seq_along(lambda)){
tree <- BinarySegmentation(x = x[train_inds, ], delta = delta, lambda = lambda[lam],
method = mth, penalize_diagonal = penalize_diagonal, use_ternary_search = use_ternary_search)
res <- PruneTreeGamma(tree, gamma_max)

rss_gamma <- numeric(length(res$gamma))
cpts <- list()
for (gam in seq_along(res$gamma)){
fit <- FullRegression(x[train_inds, ], cpts = res$cpts[[gam]], lambda = lambda[lam])

segment_bounds <- c(1, train_inds[res$cpts[[gam]]], n_obs) # transform cpts back to original indices

rss <- 0
for (seg in seq_along(fit[[1]])){

wi <- fit$est_coeffs[[seg]]
intercepts <- fit$est_intercepts[[seg]]

segment_test_inds <- test_inds[which(test_inds >= segment_bounds[seg] & test_inds < segment_bounds[seg + 1])]

if(length(segment_test_inds) == 0){warning("Segment had no test data. Consider reducing the number of folds."); next}

rss <- rss + sum(sapply(1:n_p, function(z) RSS(x[segment_test_inds, -z], x[segment_test_inds, z, drop = F], wi[-z, z, drop = F], intercepts[z]))) / n_obs
}
rss_gamma[gam] <- rss / n_g
cpts[[gam]] <- segment_bounds[-c(1, length(segment_bounds))]
}
fold_results[[lam]] <- list(rss = rss_gamma, gamma = res$gamma, cpts = cpts, tree = tree, lambda = lambda[lam])
}
fold_results
}
class(cv_results) <- "bs_cv"
cv_results
}

RSS <- function(x, y, beta, intercepts){
sum((y - x %*% beta - intercepts)^2)
}

#' plot.bs_cv
#'
#' S3 method for plotting the results of cross-validation. Only works for a specific lambda value at the moment.
#'
#' EXPERIMENTAL
#'
#' @param cv_results An object of class \strong{bs_cv}
#'
#' @export
plot.bs_cv <- function(cv_results){

n_cpts <- sapply(cv_results, function(x) sapply(x[["cpts"]], function(y) length(y)))
gamma <- cv_results[[1]][["gamma"]]
rss <- rowMeans(sapply(cv_results, function(x) x[["rss"]]))
sd <- apply(sapply(cv_results, function(x) x[["rss"]]), 1, sd)

par(mar = c(5, 4, 4, 4) + 0.3) # Leave space for z axis
plot(gamma, rss, ylim = c(0, 1.1 * max(rss + sd)), col = "red", pch = 18, ylab = "rss / n_obs")
segments(gamma,rss-sd,gamma,rss+sd)
epsilon <- 0.0005
segments(gamma-epsilon,rss-sd,gamma+epsilon,rss-sd)
segments(gamma-epsilon,rss+sd,gamma+epsilon,rss+sd)
par(new = TRUE)
plot(gamma, n_cpts[, 1], type = "l", axes = FALSE, bty = "n", xlab = "", ylab = "", col = rgb(0,0,1,alpha=0.2))
for (i in 2:ncol(n_cpts)){
lines(gamma, n_cpts[, i], col = rgb(0,0,1,alpha=0.2))
}
axis(side=4, at = pretty(range(n_cpts[, 1])))
mtext("n_cpts", side=4, line=3)
}
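
Because the per-fold loop in CrossValidation uses foreach's %dopar%, a parallel backend should be registered before the call; otherwise foreach falls back to sequential execution with a warning. A minimal sketch using the doParallel backend already listed in Imports; the core count and parameter values are illustrative choices, not taken from this commit:

library(hdcd)
library(doParallel)

registerDoParallel(cores = 2)  # register a backend so the folds run in parallel

dat <- SimulateFromModel(CreateModel(n_segments = 2, n = 100, p = 30, ChainNetwork))
cv <- CrossValidation(dat, delta = 0.1, method = "summed_regression")

plot(cv)  # dispatches to plot.bs_cv: mean test RSS across folds against gamma, changepoint counts on the right axis

stopImplicitCluster()  # release the implicit cluster created by registerDoParallel(cores = ...)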