
Commit

initial commit, version 0.1 - base version
lorenzha committed Nov 18, 2017
1 parent 7c6c23d commit 69ed8fa
Showing 38 changed files with 1,317 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
1 change: 1 addition & 0 deletions .gitignore
@@ -31,3 +31,4 @@ vignettes/*.pdf
# Temporary files created by R markdown
*.utf8.md
*.knit.md
.Rproj.user
21 changes: 21 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,21 @@
Package: hdcd
Type: Package
Title: High-dimensional Changepoint Detection
Version: 0.1.0
Author: Lorenz Haubner
Maintainer: Lorenz Haubner <[email protected]>
Description: The hdcd package provides tools for the detection of changepoints in high-dimensional, sequential data that are independent but not identically distributed.
License: Apache License Version 2.0
Encoding: UTF-8
LazyData: true
Imports: MASS,
glasso,
glmnet,
clues,
huge,
data.tree,
doParallel,
foreach
Suggests: testthat
Depends:
RoxygenNote: 6.0.1
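
A minimal sketch of setting up the dependencies listed under Imports and installing the package from a local checkout of this repository; the local path "hdcd/" and the use of devtools are assumptions for illustration, not part of this commit:

# Dependencies from Imports (assumed to be available on CRAN at this time)
install.packages(c("MASS", "glasso", "glmnet", "clues", "huge",
                   "data.tree", "doParallel", "foreach"))
# Install hdcd from a local clone of the repository (hypothetical path)
devtools::install("hdcd/")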
13 changes: 13 additions & 0 deletions NAMESPACE
@@ -0,0 +1,13 @@
# Generated by roxygen2: do not edit by hand

S3method(plot,bs_cv)
S3method(print,bs_tree)
export(BinarySegmentation)
export(ChainNetwork)
export(CompareChangepointsRand)
export(CreateModel)
export(CrossValidation)
export(HubNetwork)
export(PruneTreeGamma)
export(RandomNetwork)
export(SimulateFromModel)
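
The exports above already outline the intended workflow: simulate data from a model, run binary segmentation, and prune the resulting tree. A minimal sketch combining them, based on the example calls in this commit; the gamma value passed to PruneTreeGamma is an arbitrary illustration, and the accessed fields follow the structure used inside CrossValidation below:

library(hdcd)

# Simulate a sequence with one changepoint from a chain-network model
dat <- SimulateFromModel(CreateModel(n_segments = 2, n = 100, p = 30, ChainNetwork))

# Build the binary segmentation tree and prune it up to gamma = 0.05 (illustrative value)
tree <- BinarySegmentation(dat, delta = 0.1, lambda = 0.01, method = "summed_regression")
res <- PruneTreeGamma(tree, 0.05)
res$cpts  # candidate changepoint sets, one entry per evaluated gamma
print(tree)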
141 changes: 141 additions & 0 deletions R/binary_segmentation.R
@@ -0,0 +1,141 @@
#' BinarySegmentation
#'
#' Uses the binary segmentation algorithm to recursively build a binary tree of the data sequence,
#' given the tuning parameters lambda and delta. The tree can then be pruned in order to obtain
#' a changepoint estimate.
#'
#' @param x An n times p matrix for which to find the best splitting point.
#' @param delta Value between 0 and 1. Tuning param which determines the minimal segment size
#' proportional to the size of the dataset and hence an upper bound for the number of changepoints.
#' @param lambda Sparsity penalty parameter in single lasso fits.
#' @param method Which method should be used for fitting the model? See defaults for possible choices.
#' @param penalize_diagonal Boolean, should the diagonal elements of the precision matrix be penalized?
#' @param threshold The threshold for halting the iteration in glasso or glmnet. In the former it controls the change of single parameters;
#' in the latter it controls the total objective value.
#' @param use_ternary_search Use a ternary search algorithm in each level of the recursion to find a local optimum (EXPERIMENTAL)
#'
#'
#' @return An object of class \strong{bs_tree}
#' @export
#'
#' @examples
#' dat <- SimulateFromModel(CreateModel(n_segments = 2,n = 100,p = 30, ChainNetwork))
#' res <- BinarySegmentation(dat, delta = 0.1, lambda = 0.01, method = "summed_regression")
#' print(res)
BinarySegmentation <- function(x, delta, lambda,
method = c("glasso", "nodewise_regression", "summed_regression", "ratio_regression"),
threshold = 1e-7,
penalize_diagonal = F,
use_ternary_search = F,
...) {
meth <- match.arg(method)
SegmentLossFUN <- SegmentLoss(n_obs = nrow(x), lambda = lambda, penalize_diagonal = penalize_diagonal, method = meth, threshold = threshold, ...)

tree <- data.tree::Node$new("bs_tree", start = 1, end = nrow(x))

Rec <- function(x, n_obs, delta, SegmentLossFUN, node = tree, use_ternary_search) {
n_selected_obs <- nrow(x)

if (n_selected_obs / n_obs >= 2 * delta) { # check whether segment is still long enough

res <- FindBestSplit(x, delta, n_obs, use_ternary_search, SegmentLossFUN)

node$min_loss <- min(res[["loss"]])
node$loss <- res[["loss"]]
node$segment_loss <- res[["segment_loss"]]

split_point <- res[["opt_split"]]

if (is.na(split_point)) {
return(NA)
} else {
start <- node$start

child_left <- node$AddChild(as.character(start), start = start, end = start + split_point - 1)
alpha_left <- Rec(x[1:(split_point - 1),, drop = F], n_obs, delta, SegmentLossFUN, child_left, use_ternary_search)

child_right <- node$AddChild(as.character(start + split_point - 1), start = start + split_point - 1, end = start + n_selected_obs -1)
alpha_right <- Rec(x[split_point:n_selected_obs,, drop = F], n_obs, delta, SegmentLossFUN, child_right, use_ternary_search)
}
} else {
return(NA)
}
}
Rec(x, nrow(x), delta, SegmentLossFUN, use_ternary_search = use_ternary_search)
class(tree) <- c("bs_tree", class(tree))
tree
}

#' FindBestSplit
#'
#' Uses the segment_loss function for each possible split into two segments and
#' returns the best splitting index as well as the loss for each candidate.
#'
#' @inheritParams BinarySegmentation
#' @param SegmentLossFUN A loss function as created by \code{\link{SegmentLoss}}
FindBestSplit <- function(x, delta, n_obs, use_ternary_search, SegmentLossFUN) {
obs_count <- nrow(x)
min_seg_length <- ceiling(delta * n_obs)

if (obs_count < 2 * min_seg_length || obs_count < 4)
return(list(opt_split = NA, loss = NA))

split_candidates <- seq(
max(3, min_seg_length + 1),
min(obs_count - 1, obs_count - min_seg_length + 1), 1
)

segment_loss <- SegmentLossFUN(x)

if (use_ternary_search) {
result <- TernarySearch(split_candidates, 1, length(split_candidates), x, SegmentLossFUN)
} else {
loss <- numeric(length(split_candidates))
for (i in seq_along(split_candidates)) {
loss[i] <- SplitLoss(x, split_point = split_candidates[i], SegmentLossFUN)
}
result <- list(opt_split = split_candidates[which.min(loss)], loss = loss)
}

if (round(min(result[["loss"]]), 15) >= round(segment_loss, 15))
list(opt_split = NA, loss = result[["loss"]], segment_loss = segment_loss)
else
list(opt_split = result[["opt_split"]], loss = result[["loss"]], segment_loss = segment_loss) # Add one since we want to split so that the minimum stays in the left segment (design choice)
}

#' TernarySearch
#'
#' Use a ternary search technique to find a local minimum for the next split recursively.
#'
#' @param split_candidates A vector of indices of possible split points
#' @param x An n times p data matrix
#' @param left start index on the left
#' @param right start index on the right
#' @param SegmentLossFUN A loss function as created by \code{\link{SegmentLoss}}
TernarySearch <- function(split_candidates, left, right, x, SegmentLossFUN) {

# Stopping condition for recursion
if (abs(left - right) <= 2) {

# Check all three remaining points for the minimum
inds <- split_candidates[left:right]
loss <- sapply(inds, function(y) SplitLoss(x, y, SegmentLossFUN = SegmentLossFUN))

return(list(opt_split = inds[which.min(loss)], loss = min(loss)))
}

# Calculate the new grid points
left_third <- ceiling((right - left) * 1 / 3) + left
right_third <- ceiling((right - left) * 2 / 3) + left

# Evaluate loss for splitting at those points
loss_left <- SplitLoss(x, split_candidates[left_third], SegmentLossFUN = SegmentLossFUN)
loss_right <- SplitLoss(x, split_candidates[right_third], SegmentLossFUN = SegmentLossFUN)

# Discard the outer third with the higher loss and recurse on the remaining interval
if (loss_left < loss_right) {
TernarySearch(split_candidates, left, right_third, x, SegmentLossFUN)
} else {
TernarySearch(split_candidates, left_third, right, x, SegmentLossFUN)
}
}
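
TernarySearch discards roughly one third of the candidate range per recursion, so it evaluates the loss at O(log n) split points instead of all of them, at the price of only finding a local minimum unless the loss curve is unimodal. A self-contained sketch of the same search pattern on a toy quadratic loss, independent of the package (all names are illustrative):

# Toy loss over candidate indices, unimodal with its minimum at index 60
loss_at <- function(i) (i - 60)^2

ternary_min <- function(left, right, f) {
  if (right - left <= 2) {                 # few candidates left: check them all
    idx <- left:right
    return(idx[which.min(sapply(idx, f))])
  }
  m1 <- left + floor((right - left) / 3)   # two interior grid points
  m2 <- right - floor((right - left) / 3)
  if (f(m1) < f(m2)) ternary_min(left, m2, f)  # drop the rightmost third
  else ternary_min(m1, right, f)               # drop the leftmost third
}

ternary_min(1, 100, loss_at)  # returns 60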
121 changes: 121 additions & 0 deletions R/cross_validation.R
@@ -0,0 +1,121 @@
#' CrossValidation
#'
#' EXPERIMENTAL. Cross-validation for the desired method and parameter combinations.
#' Evaluating lambda values will refit the model, whereas gamma values can be evaluated using a
#' single fit.
#'
#' Will make use of a registered parallel backend over the folds. This will be changed to parallelize over lambdas
#' in each fold to make use of all nodes if the number of folds is smaller than the number of nodes.
#'
#' @inheritParams BinarySegmentation
#' @param n_folds Number of folds. Test data will be selected equi-spaced, i.e. every n_fold-th observation.
#' @param gamma_max If NULL the range of gamma will be determined by doing an extra fit on the full model and taking the
#' difference between the full segment loss and the loss of the first split.
#' @param verbose If TRUE additional information will be printed.
#'
#' @return A nested list with the cv results and the full fitted models for each gamma, lambda combination.
#' @export
#'
#' @examples
#' dat <- SimulateFromModel(CreateModel(n_segments = 2,n = 100,p = 30, ChainNetwork))
#' CrossValidation(dat, delta = 0.1, method = "summed_regression")
CrossValidation <- function(x, delta,
lambda = seq(0.01, 0.1, 0.01),
method = c("summed_regression", "ratio_regression"),
threshold = 1e-4,
penalize_diagonal = F,
use_ternary_search = F,
n_folds = 10,
gamma_max = NULL,
verbose = T) {
n_obs <- nrow(x)
n_p <- ncol(x)
mth <- match.arg(method)
`%dopar%` <- foreach::`%dopar%`

if(is.null(gamma_max)){
tree <- BinarySegmentation(x = x, delta = delta, lambda = min(lambda), method = mth,
penalize_diagonal = penalize_diagonal, use_ternary_search = use_ternary_search)
gamma_max <- tree$Get(function(x) abs(x$min_loss - x$segment_loss), filterFun = data.tree::isRoot)
rm(tree)
}

folds <- seq_len(n_folds)

cv_results <- foreach::foreach(fold = folds, .inorder = F, .packages = "hdcd", .verbose = verbose) %dopar% {
if (verbose) cat("\n CV fold ",fold, " of ",n_folds,"\n")

test_inds <- seq(fold, n_obs, n_folds)
train_inds <- setdiff(1:n_obs, test_inds)
n_g <- length(test_inds)
fold_results <- list()

for (lam in seq_along(lambda)){
tree <- BinarySegmentation(x = x[train_inds, ], delta = delta, lambda = lambda[lam],
method = mth, penalize_diagonal = penalize_diagonal, use_ternary_search = use_ternary_search)
res <- PruneTreeGamma(tree, gamma_max)

rss_gamma <- numeric(length(res$gamma))
cpts <- list()
for (gam in seq_along(res$gamma)){
fit <- FullRegression(x[train_inds, ], cpts = res$cpts[[gam]], lambda = lambda[lam])

segment_bounds <- c(1, train_inds[res$cpts[[gam]]], n_obs) # transform cpts back to original indices

rss <- 0
for (seg in seq_along(fit[[1]])){

wi <- fit$est_coeffs[[seg]]
intercepts <- fit$est_intercepts[[seg]]

segment_test_inds <- test_inds[which(test_inds >= segment_bounds[seg] & test_inds < segment_bounds[seg + 1])]

if(length(segment_test_inds) == 0){warning("Segment had no test data. Consider reducing the number of folds."); next}

rss <- rss + sum(sapply(1:n_p, function(z) RSS(x[segment_test_inds, -z], x[segment_test_inds, z, drop = F], wi[-z, z, drop = F], intercepts[z]))) / n_obs
}
rss_gamma[gam] <- rss / n_g
cpts[[gam]] <- segment_bounds[-c(1, length(segment_bounds))]
}
fold_results[[lam]] <- list(rss = rss_gamma, gamma = res$gamma, cpts = cpts, tree = tree, lambda = lambda[lam])
}
fold_results
}
class(cv_results) <- "bs_cv"
cv_results
}

RSS <- function(x, y, beta, intercepts){
sum((y - x %*% beta - intercepts)^2)
}

#' plot.bs_cv
#'
#' S3 method for plotting the results of cross-validation. Only works for a specific lambda value at the moment.
#'
#' EXPERIMENTAL
#'
#' @param cv_results An object of class \strong{bs_cv}
#'
#' @export
plot.bs_cv <- function(cv_results){

n_cpts <- sapply(cv_results, function(x) sapply(x[["cpts"]], function(y) length(y)))
gamma <- cv_results[[1]][["gamma"]]
rss <- rowMeans(sapply(cv_results, function(x) x[["rss"]]))
sd <- apply(sapply(cv_results, function(x) x[["rss"]]), 1, sd)

par(mar = c(5, 4, 4, 4) + 0.3) # Leave space for z axis
plot(gamma, rss, ylim = c(0, 1.1 * max(rss + sd)), col = "red", pch = 18, ylab = "rss / n_obs")
segments(gamma,rss-sd,gamma,rss+sd)
epsilon <- 0.0005
segments(gamma-epsilon,rss-sd,gamma+epsilon,rss-sd)
segments(gamma-epsilon,rss+sd,gamma+epsilon,rss+sd)
par(new = TRUE)
plot(gamma, n_cpts[, 1], type = "l", axes = FALSE, bty = "n", xlab = "", ylab = "", col = rgb(0,0,1,alpha=0.2))
for (i in 2:ncol(n_cpts)){
lines(gamma, n_cpts[, i], col = rgb(0,0,1,alpha=0.2))
}
axis(side=4, at = pretty(range(n_cpts[, 1])))
mtext("n_cpts", side=4, line=3)
}
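
Because the per-fold loop in CrossValidation uses foreach's %dopar%, a parallel backend should be registered before the call; otherwise foreach falls back to sequential execution with a warning. A minimal sketch using the doParallel backend already listed in Imports; the core count and parameter values are illustrative choices, not taken from this commit:

library(hdcd)
library(doParallel)

registerDoParallel(cores = 2)  # register a backend so the folds run in parallel

dat <- SimulateFromModel(CreateModel(n_segments = 2, n = 100, p = 30, ChainNetwork))
cv <- CrossValidation(dat, delta = 0.1, method = "summed_regression")

plot(cv)  # dispatches to plot.bs_cv: mean test RSS across folds against gamma, changepoint counts on the right axis

stopImplicitCluster()  # release the implicit cluster created by registerDoParallel(cores = ...)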