Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions Practicals/Lotte/Assignment 2/Exercise 2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# mini simulation:
# test different versions of penalties for delta+ and delta-. The goal of the
# penalties is to weigh cases that are added to the audit sample more heavily
# then cases that are removed from the audit sample.

# package for solving constrained minimization problem
library(nloptr)

# set seed for simulation study
set.seed(123)

# specify probabilities for the relationships between W, X, Y and Z
WX <- matrix(c(.9/3, .05/3, .05/3,
.1/3, .8/3, .1/3,
.15/3, .15/3, .7/3), nrow = 3, ncol = 3, byrow = TRUE)
WY <- matrix(c(.8/3, .1/3, .1/3,
.1/3, .8/3, .1/3,
.1/3, .1/3, .8/3), nrow = 3, ncol = 3, byrow = TRUE)
XZ <- matrix(c(.97/3, .018,
.97/3, .010,
.97/3, .002), nrow = 3, ncol = 2, byrow = TRUE)

# list the number of categories for each variable
X <- c(1,2,3)
Y <- c(1,2,3)
W <- c(1,2,3)
Z <- c(0,1)

# create grid
variables <- list(X = X, Y = Y, W = W, Z=Z)
XYWZ <- expand.grid(variables)

# find probabilties
for (i in 1:nrow(XYWZ)){
# if X=... and W=... find the corresponding probability in the matrix WX
if(XYWZ[i,"W"] == 1 & XYWZ[i,"X"] == 1){XYWZ[i,"WX"] = WX[1,1]}
if(XYWZ[i,"W"] == 1 & XYWZ[i,"X"] == 2){XYWZ[i,"WX"] = WX[1,2]}
if(XYWZ[i,"W"] == 1 & XYWZ[i,"X"] == 3){XYWZ[i,"WX"] = WX[1,3]}
if(XYWZ[i,"W"] == 2 & XYWZ[i,"X"] == 1){XYWZ[i,"WX"] = WX[2,1]}
if(XYWZ[i,"W"] == 2 & XYWZ[i,"X"] == 2){XYWZ[i,"WX"] = WX[2,2]}
if(XYWZ[i,"W"] == 2 & XYWZ[i,"X"] == 3){XYWZ[i,"WX"] = WX[2,3]}
if(XYWZ[i,"W"] == 3 & XYWZ[i,"X"] == 1){XYWZ[i,"WX"] = WX[3,1]}
if(XYWZ[i,"W"] == 3 & XYWZ[i,"X"] == 2){XYWZ[i,"WX"] = WX[3,2]}
if(XYWZ[i,"W"] == 3 & XYWZ[i,"X"] == 3){XYWZ[i,"WX"] = WX[3,3]}
# if X=... and Y=... find the corresponding probability in the matrix WY
if(XYWZ[i,"W"] == 1 & XYWZ[i,"Y"] == 1){XYWZ[i,"WY"] = WY[1,1]}
if(XYWZ[i,"W"] == 1 & XYWZ[i,"Y"] == 2){XYWZ[i,"WY"] = WY[1,2]}
if(XYWZ[i,"W"] == 1 & XYWZ[i,"Y"] == 3){XYWZ[i,"WY"] = WY[1,3]}
if(XYWZ[i,"W"] == 2 & XYWZ[i,"Y"] == 1){XYWZ[i,"WY"] = WY[2,1]}
if(XYWZ[i,"W"] == 2 & XYWZ[i,"Y"] == 2){XYWZ[i,"WY"] = WY[2,2]}
if(XYWZ[i,"W"] == 2 & XYWZ[i,"Y"] == 3){XYWZ[i,"WY"] = WY[2,3]}
if(XYWZ[i,"W"] == 3 & XYWZ[i,"Y"] == 1){XYWZ[i,"WY"] = WY[3,1]}
if(XYWZ[i,"W"] == 3 & XYWZ[i,"Y"] == 2){XYWZ[i,"WY"] = WY[3,2]}
if(XYWZ[i,"W"] == 3 & XYWZ[i,"Y"] == 3){XYWZ[i,"WY"] = WY[3,3]}
# If Y=.. and Z=... find the corresponding probability in the matrix WZ
if(XYWZ[i,"X"] == 1 & XYWZ[i,"Z"] == 0){XYWZ[i,"XZ"] = XZ[1,1]}
if(XYWZ[i,"X"] == 1 & XYWZ[i,"Z"] == 1){XYWZ[i,"XZ"] = XZ[1,2]}
if(XYWZ[i,"X"] == 2 & XYWZ[i,"Z"] == 0){XYWZ[i,"XZ"] = XZ[2,1]}
if(XYWZ[i,"X"] == 2 & XYWZ[i,"Z"] == 1){XYWZ[i,"XZ"] = XZ[2,2]}
if(XYWZ[i,"X"] == 3 & XYWZ[i,"Z"] == 0){XYWZ[i,"XZ"] = XZ[3,1]}
if(XYWZ[i,"X"] == 3 & XYWZ[i,"Z"] == 1){XYWZ[i,"XZ"] = XZ[3,2]}
}

# compute final probabilities
XYWZ[, "prob"] <- (XYWZ[,"WX"]*XYWZ[,"WY"]*XYWZ[,"XZ"])*9

# generate data based on the final probabilities
generated_data <- cbind(XYWZ[,1:4], "freq" = rmultinom(n = 1, size = 10000,
prob = XYWZ[, "prob"]))

# the code for the constrained minimization problem

# create tab
tab <- aggregate(generated_data[,5], by = list(X = generated_data$X,
Y = generated_data$Y,
Z = generated_data$Z),
FUN = sum, drop = TRUE)
colnames(tab) = c("X","Y","Z","freq")


# we want a non-linear deviance constraint: the deviance for each model should
# be below a boundary that is specified by the user via alpha
alpha <- 0.05
I <- 2
J <- 2
# compute the user-specified deviance threshold
bound <- qchisq(p = 1 - alpha, df = J * (I-1), lower.tail = TRUE)

# compute the constant term of the deviance for any solution to the problem
# (this constant term does not change because it only depends on X and Y,
# which do not change. Only Z changes)
tot <- aggregate(tab$freq, by = tab[ , "Y", drop = FALSE], FUN = sum)
rtot <- aggregate(tab$freq, by = tab[ , c("X","Y")], FUN = sum)
cst <- 2 * sum(tot$x * log(tot$x), na.rm=TRUE) - 2 * sum(rtot$x * log(rtot$x), na.rm=TRUE)

# specify function for the non-linear constraint. This is a function of the
# solution x and the user specified deviance threshold. The deviance for
# solution x should fall below the threshold
deviance_bound <- function(x, bound) {
deltaplus <- x[1:(length(x)/2)]
deltamin <- x[(1+(length(x)/2)):length(x)]
# compute the new frequency table under solution x
tab$freq <- tab$freq +
c(-deltaplus, -deltamin) +
c(deltamin, deltaplus)
# calculate deviance under solution x using new frequency table
ktot <- aggregate(tab$freq, by = tab[ , c('Y','Z')], FUN = sum)
G2 <- cst + 2 * sum(tab$freq * log(tab$freq)) -
2 * sum(ktot$x * log(ktot$x))
# the constraint should be in the form hin(x) >= 0, therefore we return
# the threshold - calculated deviance under the solution
return(bound - G2)
}

# nloptr() function can only be used with numeric, tables not integers
# convert table to numeric:
tab[] <- lapply(tab, as.numeric)

# set varying weights for the delta+ values
plus_weight <- c(1,2,3,4,5,6,7,8,9,10)
# set varying weights for the delta- values
min_weight <- c(1,1/2,1/3,1/4,1/5,1/6,1/7,1/8,1/9,1/10)

# initialize convergence, added units and removed units
convergence <- c()
add_units <- c()
rem_units <- c()

# set the options for the optimization procedure
opts <- nl.opts()
opts$algorithm <- "NLOPT_LD_AUGLAG"
#SLSQP
opts$local_opts <- list("algorithm" = "NLOPT_LD_SLSQP", "eval_grad_f" = NULL,
xtol_rel = 1e-6, "maxeval" = 1000, "tol_constraints_ineq" = 1e-2)
opts$tol_constraints_ineq <- 1e-2
hin <- function(x) deviance_bound(x, bound = bound)
.hin <- match.fun(hin)
hin <- function(x) (-1) * .hin(x)
hinjac <- function(x) nl.jacobian(x, hin)

# perform the procedure for every set of delta weights
for (i in 1:length(plus_weight)){
sol <- nloptr(
# specify starting values
x0 = c(rep(1, nrow(tab)/2), rep(0, nrow(tab)/2)),
# specify target function: the sum of delta+ and delta- values
eval_f = function(x) { plus_weight[i]*sum(x[1:(length(x)/2)]) +
min_weight[i]*sum(x[(1+(length(x)/2)):length(x)])},
# specify the derivative of the target function
eval_grad_f = function(x) { c(rep(plus_weight[i], (nrow(tab)/2)),
rep(min_weight[i], (nrow(tab)/2))) },
# specify lower bounds for the solution: the delta values cannot be below 0
lb = rep(0, nrow(tab)),
# specify upper bounds for the solution: there can't be more units moved
# from Z = 0 to Z = 1 then are initially in Z = 0, and vice versa
ub = tab$freq[c(which(tab$Z == 0), which(tab$Z == 1))],
# non-linear constraint on the deviance with the pre-specified function
eval_g_ineq = hin,
eval_jac_g_ineq = hinjac,
# solver to the problem
# localsolver = "LBFGS",
opts = opts)
# save convergence, added units and removed units for every iteration
convergence[i] <- sol$status
add_units[i] <- sum(sol$solution[1:nrow(tab)/2])
rem_units[i] <- sum(sol$solution[(1+nrow(tab)/2):nrow(tab)])
}

# print the results
round(cbind(plus_weight, min_weight, convergence, add_units, rem_units), 3)

# from the results, we can conclude that as delta plus increases and delta min
# decreases, the number of added units decreases, and the number of removed units
# increases. This is what we expect to happen.

# This simulation was done as an intermediate step of my thesis project




Binary file added Practicals/Lotte/Assignment 3/Exercise 3.pdf
Binary file not shown.
Binary file not shown.
84 changes: 84 additions & 0 deletions Practicals/Lotte/Assignment 3/Exercise 3.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
\documentclass[aspectratio=169]{beamer}
\title{Example document to recreate with beamer in \LaTeX}
\author{Lotte Mensink}
\date[MLRPS]{\vspace{.5 in}\\ FALL 2022 \\ Markup Languages and Reproducible Programming in Statistics \vskip6mm}

\usecolortheme{beaver}

\begin{document}

\frame{\titlepage}

\begin{frame}
\frametitle{Outline}
\tableofcontents
\end{frame}

\section{Working with equations}
\begin{frame}
\frametitle{Working with equations}
We define a set of equations as
\begin{equation}
a = b + c^2
\end{equation}
\begin{equation}
a - c^2 = b
\end{equation}
\begin{equation}
\text{left side} = \text{right side}
\end{equation}
\begin{equation}
\text{left side} + \text{something} \geq \text{right side}
\end{equation}
for all something $>0$
\end{frame}

\subsection{Aligning the same equations}
\begin{frame}
\frametitle{Aligning the same equations}
Aligning the equations by the equal sign gives a much better view into the placements of the separate equation components.
\begin{eqnarray}
a &=& b + c^2 \\
a - c^2 &=& b \\
\text{left side} &=& \text{right side} \\
\text{left side} + \text{something} &\geq& \text{right side}
\end{eqnarray}
\end{frame}

\subsection{Omit equation numbering}
\begin{frame}
\frametitle{Omit equation numbering}
Alternatively, the equation numbering can be omitted.
\begin{eqnarray*}
a &=& b + c^2 \\
a - c^2 &=& b \\
\text{left side} &=& \text{right side} \\
\text{left side} + \text{something} &\geq& \text{right side}
\end{eqnarray*}
\end{frame}

\subsection{Ugly alignment}
\begin{frame}
\frametitle{Ugly alignment}
Some components do not look well, when aligned. Especially equations with different heights and spacing. For example,
\begin{eqnarray}
E &=& mc^2 \\
m &=& \frac{E}{c^2}\\
c &=& \sqrt{\frac{E}{m}}
\end{eqnarray}
Take that into account.
\end{frame}

\section{Discussion}
\begin{frame}
\frametitle{Discussion}
This is where you’d normally give your audience a recap of your talk, where you could discuss e.g. the following
\begin{itemize}
\item Your main findings
\item The consequences of your main findings
\item Things to do
\item Any other business not currently investigated, but related to your talk
\end{itemize}
\end{frame}

\end{document}
108 changes: 108 additions & 0 deletions Practicals/Lotte/Assignment 5/Exercise 5.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
---
title: "Exercise 5"
author: "Lotte Mensink"
date: "2023-01-21"
output:
ioslides_presentation:
logo: uu_logo.png
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
library(ggplot2)
```

## Thesis Presentation

<div class="columns-2">
In this Markdown presentation, I will present some results of the pilot simulation study that I did for my thesis. My thesis is about efficient audit sample selection. In the pilot simulation study, I compare two methods for efficient sample selection against each other: the sample size approach and the deviance approach. Here follows an overview of some preliminary results.
</div>

## Results in a table

This table shows the number of iterations and the deviance before and after in the first 100 datasets.

```{r, echo = FALSE, warning = FALSE, message = FALSE}
require(DT)
load("processed_results.RData")
results <- results_data[1:100, c(4,7, 8)]
datatable(results, options = list(pageLength = 5))
```

## Results in a Figure

```{r, cache = TRUE}
# load the results of the original simulation study
load("results_data_original.RData")

# get the data in the right structure for plotting
total_results <- rbind(results_data, results_data_original)
total_results$study <- as.factor(c(rep("Sample Size Approach", 8000), rep("Deviance Approach", 8000)))
total_results$condition <- as.factor(total_results$condition)
levels(total_results$condition) <- c("WY1WX1YZ1", "WY2WX1YZ1", "WY1WX2YZ1",
"WY2WX2YZ1","WY1WX1YZ2","WY2WX1YZ2","WY1WX2YZ2","WY2WX2YZ2")
```

```{r, echo = FALSE}
ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) +
geom_boxplot() +
theme_classic() +
scale_fill_grey() +
xlab("Condition") +
ylab("Added Units") +
theme(legend.title = element_blank(), text = element_text(size = 18),
axis.text.x = element_text(angle = -30, hjust = 0))
```


## Results in an Interactive Figure

```{r, echo = FALSE, message = FALSE, warning = FALSE}
library(plotly)

p <- ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) +
geom_boxplot() +
theme_classic() +
scale_fill_grey() +
xlab("Condition") +
ylab("Added Units") +
theme(legend.title = element_blank(), text = element_text(size = 18),
axis.text.x = element_text(angle = -30, hjust = 0))

ggplotly(p)
```




## The Code to Create the Interactive Figure

```{r, results= 'hide', echo = TRUE}
library(plotly)

p <- ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) +
geom_boxplot() +
theme_classic() +
scale_fill_grey() +
xlab("Condition") +
ylab("Added Units") +
theme(legend.title = element_blank(), text = element_text(size = 18),
axis.text.x = element_text(angle = -30, hjust = 0))

ggplotly(p)
```

## The Problem in an Equation

The minimization procedure can be written as follows:
$$min\{\sum_{i,j} \delta_{ij}^+ + \sum_{i,j} \delta_{ij}^-\}$$
under constraints
$$m_{ij1} = n_{ij1} + \delta_{ij}^+ - \delta_{ij}^-;$$
$$m_{ij0} = n_{ij0} - \delta_{ij}^+ + \delta_{ij}^-;$$
$$D \leq \chi_{I(J-1)}^2 (1-\alpha);$$
$$0 \leq \delta_{ij}^+ \leq n_{ij0};$$
$$0 \leq \delta_{ij}^- \leq n_{ij1}.$$

## The End


Loading