gerkovink · lottemensink · Nov 9, 2022 · Jan 22, 2023 · Jan 22, 2023
diff --git a/Practicals/Lotte/Assignment 2/Exercise 2.R b/Practicals/Lotte/Assignment 2/Exercise 2.R
@@ -0,0 +1,180 @@
+# mini simulation: 
+# test different versions of penalties for delta+ and delta-. The goal of the 
+# penalties is to weigh cases that are added to the audit sample more heavily
+# then cases that are removed from the audit sample. 
+
+# package for solving constrained minimization problem
+library(nloptr)
+
+# set seed for simulation study
+set.seed(123)
+
+# specify probabilities for the relationships between W, X, Y and Z
+WX <- matrix(c(.9/3, .05/3, .05/3, 
+               .1/3, .8/3, .1/3,
+               .15/3, .15/3, .7/3), nrow = 3, ncol = 3, byrow = TRUE)
+WY <-  matrix(c(.8/3, .1/3, .1/3, 
+                .1/3, .8/3, .1/3,
+                .1/3, .1/3, .8/3), nrow = 3, ncol = 3, byrow = TRUE)
+XZ <- matrix(c(.97/3, .018,
+               .97/3, .010,
+               .97/3, .002), nrow = 3, ncol = 2, byrow = TRUE)
+
+# list the number of categories for each variable
+X <- c(1,2,3)
+Y <- c(1,2,3)
+W <- c(1,2,3)
+Z <- c(0,1)
+
+# create grid 
+variables <- list(X = X, Y = Y, W = W, Z=Z)
+XYWZ <- expand.grid(variables)
+
+# find probabilties
+for (i in 1:nrow(XYWZ)){
+  # if X=... and W=... find the corresponding probability in the matrix WX
+  if(XYWZ[i,"W"] == 1 & XYWZ[i,"X"] == 1){XYWZ[i,"WX"] = WX[1,1]}
+  if(XYWZ[i,"W"] == 1 & XYWZ[i,"X"] == 2){XYWZ[i,"WX"] = WX[1,2]}
+  if(XYWZ[i,"W"] == 1 & XYWZ[i,"X"] == 3){XYWZ[i,"WX"] = WX[1,3]}
+  if(XYWZ[i,"W"] == 2 & XYWZ[i,"X"] == 1){XYWZ[i,"WX"] = WX[2,1]}
+  if(XYWZ[i,"W"] == 2 & XYWZ[i,"X"] == 2){XYWZ[i,"WX"] = WX[2,2]}
+  if(XYWZ[i,"W"] == 2 & XYWZ[i,"X"] == 3){XYWZ[i,"WX"] = WX[2,3]}
+  if(XYWZ[i,"W"] == 3 & XYWZ[i,"X"] == 1){XYWZ[i,"WX"] = WX[3,1]}
+  if(XYWZ[i,"W"] == 3 & XYWZ[i,"X"] == 2){XYWZ[i,"WX"] = WX[3,2]}
+  if(XYWZ[i,"W"] == 3 & XYWZ[i,"X"] == 3){XYWZ[i,"WX"] = WX[3,3]}
+  # if X=... and Y=... find the corresponding probability in the matrix WY
+  if(XYWZ[i,"W"] == 1 & XYWZ[i,"Y"] == 1){XYWZ[i,"WY"] = WY[1,1]}
+  if(XYWZ[i,"W"] == 1 & XYWZ[i,"Y"] == 2){XYWZ[i,"WY"] = WY[1,2]}
+  if(XYWZ[i,"W"] == 1 & XYWZ[i,"Y"] == 3){XYWZ[i,"WY"] = WY[1,3]}
+  if(XYWZ[i,"W"] == 2 & XYWZ[i,"Y"] == 1){XYWZ[i,"WY"] = WY[2,1]}
+  if(XYWZ[i,"W"] == 2 & XYWZ[i,"Y"] == 2){XYWZ[i,"WY"] = WY[2,2]}
+  if(XYWZ[i,"W"] == 2 & XYWZ[i,"Y"] == 3){XYWZ[i,"WY"] = WY[2,3]}
+  if(XYWZ[i,"W"] == 3 & XYWZ[i,"Y"] == 1){XYWZ[i,"WY"] = WY[3,1]}
+  if(XYWZ[i,"W"] == 3 & XYWZ[i,"Y"] == 2){XYWZ[i,"WY"] = WY[3,2]}
+  if(XYWZ[i,"W"] == 3 & XYWZ[i,"Y"] == 3){XYWZ[i,"WY"] = WY[3,3]}
+  # If Y=.. and Z=... find the corresponding probability in the matrix WZ
+  if(XYWZ[i,"X"] == 1 & XYWZ[i,"Z"] == 0){XYWZ[i,"XZ"] = XZ[1,1]}
+  if(XYWZ[i,"X"] == 1 & XYWZ[i,"Z"] == 1){XYWZ[i,"XZ"] = XZ[1,2]}
+  if(XYWZ[i,"X"] == 2 & XYWZ[i,"Z"] == 0){XYWZ[i,"XZ"] = XZ[2,1]}
+  if(XYWZ[i,"X"] == 2 & XYWZ[i,"Z"] == 1){XYWZ[i,"XZ"] = XZ[2,2]}
+  if(XYWZ[i,"X"] == 3 & XYWZ[i,"Z"] == 0){XYWZ[i,"XZ"] = XZ[3,1]}
+  if(XYWZ[i,"X"] == 3 & XYWZ[i,"Z"] == 1){XYWZ[i,"XZ"] = XZ[3,2]}
+}
+
+# compute final probabilities 
+XYWZ[, "prob"] <- (XYWZ[,"WX"]*XYWZ[,"WY"]*XYWZ[,"XZ"])*9
+
+# generate data based on the final probabilities
+generated_data <- cbind(XYWZ[,1:4], "freq" = rmultinom(n = 1, size = 10000, 
+                                                       prob = XYWZ[, "prob"]))
+
+# the code for the constrained minimization problem
+
+# create tab
+tab <- aggregate(generated_data[,5], by = list(X = generated_data$X, 
+                                               Y = generated_data$Y, 
+                                               Z = generated_data$Z), 
+                 FUN  = sum, drop = TRUE)
+colnames(tab) = c("X","Y","Z","freq")
+
+
+# we want a non-linear deviance constraint: the deviance for each model should 
+# be below a boundary that is specified by the user via alpha
+alpha <- 0.05 
+I <- 2 
+J <- 2 
+# compute the user-specified deviance threshold
+bound <- qchisq(p = 1 - alpha, df = J * (I-1), lower.tail = TRUE)
+
+# compute the constant term of the deviance for any solution to the problem 
+# (this constant term does not change because it only depends on X and Y, 
+# which do not change. Only Z changes)
+tot  <- aggregate(tab$freq, by = tab[ , "Y", drop = FALSE], FUN = sum)
+rtot <- aggregate(tab$freq, by = tab[ , c("X","Y")], FUN = sum)
+cst <- 2 * sum(tot$x * log(tot$x), na.rm=TRUE) - 2 * sum(rtot$x * log(rtot$x), na.rm=TRUE)
+
+# specify function for the non-linear constraint. This is a function of the 
+# solution x and the user specified deviance threshold. The deviance for 
+# solution x should fall below the threshold
+deviance_bound <- function(x, bound) {
+  deltaplus <- x[1:(length(x)/2)]
+  deltamin <- x[(1+(length(x)/2)):length(x)]
+  # compute the new frequency table under solution x
+  tab$freq <- tab$freq +
+    c(-deltaplus, -deltamin) + 
+    c(deltamin, deltaplus)
+  # calculate deviance under solution x using new frequency table
+  ktot <- aggregate(tab$freq, by = tab[ , c('Y','Z')], FUN = sum)
+  G2 <- cst + 2 * sum(tab$freq * log(tab$freq)) -
+    2 * sum(ktot$x * log(ktot$x))
+  # the constraint should be in the form hin(x) >= 0, therefore we return
+  # the threshold - calculated deviance under the solution
+  return(bound - G2)
+}
+
+# nloptr() function can only be used with numeric, tables not integers
+# convert table to numeric:
+tab[] <- lapply(tab, as.numeric)
+
+# set varying weights for the delta+ values
+plus_weight <- c(1,2,3,4,5,6,7,8,9,10)
+# set varying weights for the delta- values
+min_weight <- c(1,1/2,1/3,1/4,1/5,1/6,1/7,1/8,1/9,1/10)
+
+# initialize convergence, added units and removed units
+convergence <- c()
+add_units <- c()
+rem_units <- c()
+
+# set the options for the optimization procedure
+opts <- nl.opts()
+opts$algorithm <- "NLOPT_LD_AUGLAG"
+#SLSQP
+opts$local_opts <- list("algorithm" = "NLOPT_LD_SLSQP", "eval_grad_f" = NULL, 
+                        xtol_rel = 1e-6, "maxeval" = 1000, "tol_constraints_ineq" = 1e-2)
+opts$tol_constraints_ineq <- 1e-2
+hin <- function(x) deviance_bound(x, bound = bound)
+.hin <- match.fun(hin)
+hin <- function(x) (-1) * .hin(x)
+hinjac <- function(x) nl.jacobian(x, hin)
+
+# perform the procedure for every set of delta weights
+for (i in 1:length(plus_weight)){
+  sol <- nloptr(
+    # specify starting values 
+    x0 = c(rep(1, nrow(tab)/2), rep(0, nrow(tab)/2)),
+    # specify target function: the sum of delta+ and delta- values
+    eval_f = function(x) { plus_weight[i]*sum(x[1:(length(x)/2)]) +
+        min_weight[i]*sum(x[(1+(length(x)/2)):length(x)])},
+    # specify the derivative of the target function
+    eval_grad_f = function(x) { c(rep(plus_weight[i], (nrow(tab)/2)), 
+                                  rep(min_weight[i], (nrow(tab)/2))) },
+    # specify lower bounds for the solution: the delta values cannot be below 0
+    lb = rep(0, nrow(tab)),
+    # specify upper bounds for the solution: there can't be more units moved 
+    # from Z = 0 to Z = 1 then are initially in Z = 0, and vice versa
+    ub = tab$freq[c(which(tab$Z == 0), which(tab$Z == 1))],
+    # non-linear constraint on the deviance with the pre-specified function
+    eval_g_ineq = hin,
+    eval_jac_g_ineq = hinjac,
+    # solver to the problem
+    # localsolver = "LBFGS", 
+    opts = opts)
+  # save convergence, added units and removed units for every iteration
+  convergence[i] <- sol$status
+  add_units[i] <- sum(sol$solution[1:nrow(tab)/2])
+  rem_units[i] <- sum(sol$solution[(1+nrow(tab)/2):nrow(tab)])
+}
+
+# print the results 
+round(cbind(plus_weight, min_weight, convergence, add_units, rem_units), 3)
+
+# from the results, we can conclude that as delta plus increases and delta min
+# decreases, the number of added units decreases, and the number of removed units
+# increases. This is what we expect to happen. 
+
+# This simulation was done as an intermediate step of my thesis project
+
+
+
+
diff --git a/Practicals/Lotte/Assignment 3/Exercise 3.pdf b/Practicals/Lotte/Assignment 3/Exercise 3.pdf
diff --git a/Practicals/Lotte/Assignment 3/Exercise 3.synctex.gz b/Practicals/Lotte/Assignment 3/Exercise 3.synctex.gz
diff --git a/Practicals/Lotte/Assignment 3/Exercise 3.tex b/Practicals/Lotte/Assignment 3/Exercise 3.tex
@@ -0,0 +1,84 @@
+\documentclass[aspectratio=169]{beamer}
+\title{Example document to recreate with beamer in \LaTeX}
+\author{Lotte Mensink}
+\date[MLRPS]{\vspace{.5 in}\\ FALL 2022 \\ Markup Languages and Reproducible Programming in Statistics  \vskip6mm}
+
+\usecolortheme{beaver}
+
+\begin{document}
+
+\frame{\titlepage}
+
+\begin{frame}
+\frametitle{Outline}
+\tableofcontents
+\end{frame}
+
+\section{Working with equations}
+\begin{frame}
+\frametitle{Working with equations}
+We define a set of equations as
+\begin{equation}
+    a = b + c^2
+\end{equation}
+\begin{equation}
+    a - c^2 = b 
+\end{equation}
+\begin{equation}
+    \text{left side} = \text{right side}
+\end{equation}
+\begin{equation}
+    \text{left side} + \text{something} \geq \text{right side}
+\end{equation}
+for all something $>0$
+\end{frame}
+
+\subsection{Aligning the same equations}
+\begin{frame}
+\frametitle{Aligning the same equations}
+Aligning the equations by the equal sign gives a much better view into the placements of the separate equation components.
+\begin{eqnarray}
+    a &=& b + c^2 \\
+    a - c^2 &=& b \\
+    \text{left side} &=& \text{right side} \\
+    \text{left side} + \text{something} &\geq& \text{right side}
+\end{eqnarray}
+\end{frame}
+
+\subsection{Omit equation numbering}
+\begin{frame}
+\frametitle{Omit equation numbering}
+Alternatively, the equation numbering can be omitted.
+ \begin{eqnarray*}
+    a &=& b + c^2 \\
+    a - c^2 &=& b \\
+    \text{left side} &=& \text{right side} \\
+    \text{left side} + \text{something} &\geq& \text{right side}
+\end{eqnarray*}   
+\end{frame}
+
+\subsection{Ugly alignment}
+\begin{frame}
+\frametitle{Ugly alignment}
+Some components do not look well, when aligned. Especially equations with different heights and spacing. For example, 
+\begin{eqnarray}
+    E &=& mc^2 \\
+    m &=& \frac{E}{c^2}\\
+    c &=& \sqrt{\frac{E}{m}} 
+\end{eqnarray}   
+Take that into account.
+\end{frame}
+
+\section{Discussion}
+\begin{frame}
+\frametitle{Discussion}
+This is where you’d normally give your audience a recap of your talk, where you could discuss e.g. the following
+\begin{itemize}
+\item Your main findings
+\item The consequences of your main findings
+\item Things to do
+\item Any other business not currently investigated, but related to your talk
+\end{itemize}
+\end{frame}
+
+\end{document}
diff --git a/Practicals/Lotte/Assignment 5/Exercise 5.Rmd b/Practicals/Lotte/Assignment 5/Exercise 5.Rmd
@@ -0,0 +1,108 @@
+---
+title: "Exercise 5" 
+author: "Lotte Mensink"
+date: "2023-01-21"
+output: 
+  ioslides_presentation:
+    logo: uu_logo.png
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(ggplot2)
+```
+
+## Thesis Presentation
+
+<div class="columns-2">
+In this Markdown presentation, I will present some results of the pilot simulation study that I did for my thesis. My thesis is about efficient audit sample selection. In the pilot simulation study, I compare two methods for efficient sample selection against each other: the sample size approach and the deviance approach. Here follows an overview of some preliminary results. 
+</div>
+
+## Results in a table 
+
+This table shows the number of iterations and the deviance before and after in the first 100 datasets. 
+
+```{r, echo = FALSE, warning = FALSE, message = FALSE}
+require(DT)
+load("processed_results.RData")
+results <- results_data[1:100, c(4,7, 8)]
+datatable(results, options = list(pageLength = 5))
+```
+
+## Results in a Figure
+
+```{r, cache = TRUE}
+# load the results of the original simulation study
+load("results_data_original.RData")
+
+# get the data in the right structure for plotting
+total_results <- rbind(results_data, results_data_original)
+total_results$study <- as.factor(c(rep("Sample Size Approach", 8000), rep("Deviance Approach", 8000)))
+total_results$condition <- as.factor(total_results$condition)
+levels(total_results$condition) <- c("WY1WX1YZ1", "WY2WX1YZ1", "WY1WX2YZ1",
+                    "WY2WX2YZ1","WY1WX1YZ2","WY2WX1YZ2","WY1WX2YZ2","WY2WX2YZ2")
+```
+
+```{r, echo = FALSE}
+ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) +
+  geom_boxplot() +
+  theme_classic() + 
+  scale_fill_grey() +
+  xlab("Condition") +
+  ylab("Added Units") +
+  theme(legend.title = element_blank(), text = element_text(size = 18),
+        axis.text.x = element_text(angle = -30, hjust = 0))
+```
+
+
+## Results in an Interactive Figure
+
+```{r, echo = FALSE, message = FALSE, warning = FALSE}
+library(plotly)
+
+p <- ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) +
+  geom_boxplot() +
+  theme_classic() + 
+  scale_fill_grey() +
+  xlab("Condition") +
+  ylab("Added Units") +
+  theme(legend.title = element_blank(), text = element_text(size = 18),
+        axis.text.x = element_text(angle = -30, hjust = 0))
+
+ggplotly(p)
+```
+
+
+
+
+## The Code to Create the Interactive Figure
+
+```{r, results= 'hide', echo = TRUE}
+library(plotly)
+
+p <- ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) +
+  geom_boxplot() +
+  theme_classic() + 
+  scale_fill_grey() +
+  xlab("Condition") +
+  ylab("Added Units") +
+  theme(legend.title = element_blank(), text = element_text(size = 18),
+        axis.text.x = element_text(angle = -30, hjust = 0))
+
+ggplotly(p)
+```
+
+## The Problem in an Equation
+
+The minimization procedure can be written as follows: 
+$$min\{\sum_{i,j} \delta_{ij}^+ + \sum_{i,j} \delta_{ij}^-\}$$
+under constraints
+$$m_{ij1} = n_{ij1} + \delta_{ij}^+ - \delta_{ij}^-;$$
+$$m_{ij0} = n_{ij0} - \delta_{ij}^+ + \delta_{ij}^-;$$
+$$D \leq \chi_{I(J-1)}^2 (1-\alpha);$$
+$$0 \leq \delta_{ij}^+ \leq n_{ij0};$$
+$$0 \leq \delta_{ij}^- \leq n_{ij1}.$$
+
+## The End
+
+