# Mini simulation:
# test different versions of penalties for delta+ and delta-. The goal of the
# penalties is to weigh cases that are added to the audit sample more heavily
# than cases that are removed from the audit sample.

# package for solving the constrained minimization problem
library(nloptr)

# set seed so the simulation study is reproducible
set.seed(123)

# specify probabilities for the relationships between W, X, Y and Z
WX <- matrix(c(.9/3,  .05/3, .05/3,
               .1/3,  .8/3,  .1/3,
               .15/3, .15/3, .7/3), nrow = 3, ncol = 3, byrow = TRUE)
WY <- matrix(c(.8/3, .1/3, .1/3,
               .1/3, .8/3, .1/3,
               .1/3, .1/3, .8/3), nrow = 3, ncol = 3, byrow = TRUE)
XZ <- matrix(c(.97/3, .018,
               .97/3, .010,
               .97/3, .002), nrow = 3, ncol = 2, byrow = TRUE)

# list the categories for each variable (Z is binary: 0/1)
X <- c(1, 2, 3)
Y <- c(1, 2, 3)
W <- c(1, 2, 3)
Z <- c(0, 1)

# create the grid of all category combinations
variables <- list(X = X, Y = Y, W = W, Z = Z)
XYWZ <- expand.grid(variables)

# find probabilities: the category values double as row/column indices into
# the probability matrices, so we can look them up directly instead of
# enumerating every (row, column) pair with if-statements.
# For W and X, look up the corresponding probability in the matrix WX;
# for W and Y, in the matrix WY; for X and Z, in the matrix XZ
# (Z is coded 0/1, hence the + 1 to get a valid column index).
XYWZ[, "WX"] <- WX[cbind(XYWZ$W, XYWZ$X)]
XYWZ[, "WY"] <- WY[cbind(XYWZ$W, XYWZ$Y)]
XYWZ[, "XZ"] <- XZ[cbind(XYWZ$X, XYWZ$Z + 1)]

# compute final cell probabilities (the factor 9 renormalizes the product)
XYWZ[, "prob"] <- (XYWZ[, "WX"] * XYWZ[, "WY"] * XYWZ[, "XZ"]) * 9

# generate data based on the final probabilities
generated_data <- cbind(XYWZ[, 1:4], "freq" = rmultinom(n = 1, size = 10000,
                                                        prob = XYWZ[, "prob"]))

# the code for the constrained minimization problem

# create tab: aggregate the generated frequencies over W to get X x Y x Z
tab <- aggregate(generated_data[, 5], by = list(X = generated_data$X,
                                                Y = generated_data$Y,
                                                Z = generated_data$Z),
                 FUN = sum, drop = TRUE)
colnames(tab) <- c("X", "Y", "Z", "freq")


# we want a non-linear deviance constraint: the deviance for each model should
# be below a boundary that is specified by the user via alpha
alpha <- 0.05
I <- 2
J <- 2
# compute the user-specified deviance threshold
bound <- qchisq(p = 1 - alpha, df = J * (I - 1), lower.tail = TRUE)

# compute the constant term of the deviance for any solution to the problem
# (this constant term does not change because it only depends on X and Y,
# which do not change. Only Z changes)
tot <- aggregate(tab$freq, by = tab[, "Y", drop = FALSE], FUN = sum)
rtot <- aggregate(tab$freq, by = tab[, c("X", "Y")], FUN = sum)
cst <- 2 * sum(tot$x * log(tot$x), na.rm = TRUE) -
  2 * sum(rtot$x * log(rtot$x), na.rm = TRUE)

# Non-linear constraint function: given solution x and the user-specified
# deviance threshold `bound`, the deviance under x should fall below the
# threshold. The first half of x holds the delta+ values, the second half
# the delta- values. NOTE: reads `tab` and `cst` from the enclosing
# environment; `tab$freq` is only modified locally.
deviance_bound <- function(x, bound) {
  half <- length(x) / 2
  deltaplus <- x[seq_len(half)]
  deltamin <- x[seq_len(half) + half]
  # compute the new frequency table under solution x: delta+ units move
  # from Z = 0 to Z = 1, delta- units move the other way
  tab$freq <- tab$freq +
    c(-deltaplus, -deltamin) +
    c(deltamin, deltaplus)
  # calculate deviance under solution x using the new frequency table
  ktot <- aggregate(tab$freq, by = tab[, c("Y", "Z")], FUN = sum)
  G2 <- cst + 2 * sum(tab$freq * log(tab$freq)) -
    2 * sum(ktot$x * log(ktot$x))
  # the constraint should be in the form hin(x) >= 0, therefore we return
  # the threshold - calculated deviance under the solution
  bound - G2
}

# nloptr() can only be used with numeric values, not integers;
# convert the table columns to numeric:
tab[] <- lapply(tab, as.numeric)

# set varying weights for the delta+ values
plus_weight <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
# set varying weights for the delta- values (reciprocals of plus_weight)
min_weight <- c(1, 1/2, 1/3, 1/4, 1/5, 1/6, 1/7, 1/8, 1/9, 1/10)

# preallocate convergence status, added units and removed units
n_runs <- length(plus_weight)
convergence <- numeric(n_runs)
add_units <- numeric(n_runs)
rem_units <- numeric(n_runs)

# set the options for the optimization procedure: augmented Lagrangian
# outer solver with an SLSQP local solver
opts <- nl.opts()
opts$algorithm <- "NLOPT_LD_AUGLAG"
opts$local_opts <- list("algorithm" = "NLOPT_LD_SLSQP", "eval_grad_f" = NULL,
                        xtol_rel = 1e-6, "maxeval" = 1000,
                        "tol_constraints_ineq" = 1e-2)
opts$tol_constraints_ineq <- 1e-2

# nloptr expects inequality constraints in the form g(x) <= 0, so flip the
# sign of deviance_bound (which was written in the hin(x) >= 0 form)
hin <- function(x) -deviance_bound(x, bound = bound)
hinjac <- function(x) nl.jacobian(x, hin)

# number of delta+ (= number of delta-) values in a solution vector
half <- nrow(tab) / 2

# perform the procedure for every set of delta weights
for (i in seq_along(plus_weight)) {
  sol <- nloptr(
    # starting values: delta+ = 1, delta- = 0
    x0 = c(rep(1, half), rep(0, half)),
    # target function: the weighted sum of delta+ and delta- values
    eval_f = function(x) {
      plus_weight[i] * sum(x[seq_len(half)]) +
        min_weight[i] * sum(x[seq_len(half) + half])
    },
    # the derivative of the target function
    eval_grad_f = function(x) {
      c(rep(plus_weight[i], half), rep(min_weight[i], half))
    },
    # lower bounds for the solution: the delta values cannot be below 0
    lb = rep(0, nrow(tab)),
    # upper bounds for the solution: there can't be more units moved
    # from Z = 0 to Z = 1 than are initially in Z = 0, and vice versa
    ub = tab$freq[c(which(tab$Z == 0), which(tab$Z == 1))],
    # non-linear constraint on the deviance with the pre-specified function
    eval_g_ineq = hin,
    eval_jac_g_ineq = hinjac,
    opts = opts)
  # save convergence, added units and removed units for every iteration
  convergence[i] <- sol$status
  # BUG FIX: the original `1:nrow(tab)/2` parses as `(1:nrow(tab))/2`
  # (`:` binds tighter than `/`), giving fractional indices that truncate
  # and double-count the first delta values. Index the halves explicitly.
  add_units[i] <- sum(sol$solution[seq_len(half)])
  rem_units[i] <- sum(sol$solution[seq_len(half) + half])
}

# print the results
round(cbind(plus_weight, min_weight, convergence, add_units, rem_units), 3)

# from the results, we can conclude that as the delta+ weight increases and
# the delta- weight decreases, the number of added units decreases, and the
# number of removed units increases. This is what we expect to happen.
+ +# This simulation was done as an intermediate step of my thesis project + + + + diff --git a/Practicals/Lotte/Assignment 3/Exercise 3.pdf b/Practicals/Lotte/Assignment 3/Exercise 3.pdf new file mode 100644 index 0000000..b80292e Binary files /dev/null and b/Practicals/Lotte/Assignment 3/Exercise 3.pdf differ diff --git a/Practicals/Lotte/Assignment 3/Exercise 3.synctex.gz b/Practicals/Lotte/Assignment 3/Exercise 3.synctex.gz new file mode 100644 index 0000000..41302a2 Binary files /dev/null and b/Practicals/Lotte/Assignment 3/Exercise 3.synctex.gz differ diff --git a/Practicals/Lotte/Assignment 3/Exercise 3.tex b/Practicals/Lotte/Assignment 3/Exercise 3.tex new file mode 100644 index 0000000..a3a94c3 --- /dev/null +++ b/Practicals/Lotte/Assignment 3/Exercise 3.tex @@ -0,0 +1,84 @@ +\documentclass[aspectratio=169]{beamer} +\title{Example document to recreate with beamer in \LaTeX} +\author{Lotte Mensink} +\date[MLRPS]{\vspace{.5 in}\\ FALL 2022 \\ Markup Languages and Reproducible Programming in Statistics \vskip6mm} + +\usecolortheme{beaver} + +\begin{document} + +\frame{\titlepage} + +\begin{frame} +\frametitle{Outline} +\tableofcontents +\end{frame} + +\section{Working with equations} +\begin{frame} +\frametitle{Working with equations} +We define a set of equations as +\begin{equation} + a = b + c^2 +\end{equation} +\begin{equation} + a - c^2 = b +\end{equation} +\begin{equation} + \text{left side} = \text{right side} +\end{equation} +\begin{equation} + \text{left side} + \text{something} \geq \text{right side} +\end{equation} +for all something $>0$ +\end{frame} + +\subsection{Aligning the same equations} +\begin{frame} +\frametitle{Aligning the same equations} +Aligning the equations by the equal sign gives a much better view into the placements of the separate equation components. 
+\begin{eqnarray} + a &=& b + c^2 \\ + a - c^2 &=& b \\ + \text{left side} &=& \text{right side} \\ + \text{left side} + \text{something} &\geq& \text{right side} +\end{eqnarray} +\end{frame} + +\subsection{Omit equation numbering} +\begin{frame} +\frametitle{Omit equation numbering} +Alternatively, the equation numbering can be omitted. + \begin{eqnarray*} + a &=& b + c^2 \\ + a - c^2 &=& b \\ + \text{left side} &=& \text{right side} \\ + \text{left side} + \text{something} &\geq& \text{right side} +\end{eqnarray*} +\end{frame} + +\subsection{Ugly alignment} +\begin{frame} +\frametitle{Ugly alignment} +Some components do not look well, when aligned. Especially equations with different heights and spacing. For example, +\begin{eqnarray} + E &=& mc^2 \\ + m &=& \frac{E}{c^2}\\ + c &=& \sqrt{\frac{E}{m}} +\end{eqnarray} +Take that into account. +\end{frame} + +\section{Discussion} +\begin{frame} +\frametitle{Discussion} +This is where you’d normally give your audience a recap of your talk, where you could discuss e.g. the following +\begin{itemize} +\item Your main findings +\item The consequences of your main findings +\item Things to do +\item Any other business not currently investigated, but related to your talk +\end{itemize} +\end{frame} + +\end{document} diff --git a/Practicals/Lotte/Assignment 5/Exercise 5.Rmd b/Practicals/Lotte/Assignment 5/Exercise 5.Rmd new file mode 100644 index 0000000..cd3b4a8 --- /dev/null +++ b/Practicals/Lotte/Assignment 5/Exercise 5.Rmd @@ -0,0 +1,108 @@ +--- +title: "Exercise 5" +author: "Lotte Mensink" +date: "2023-01-21" +output: + ioslides_presentation: + logo: uu_logo.png +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(ggplot2) +``` + +## Thesis Presentation + +
+In this Markdown presentation, I will present some results of the pilot simulation study that I did for my thesis. My thesis is about efficient audit sample selection. In the pilot simulation study, I compare two methods for efficient sample selection against each other: the sample size approach and the deviance approach. Here follows an overview of some preliminary results. +
+ +## Results in a table + +This table shows the number of iterations and the deviance before and after in the first 100 datasets. + +```{r, echo = FALSE, warning = FALSE, message = FALSE} +require(DT) +load("processed_results.RData") +results <- results_data[1:100, c(4,7, 8)] +datatable(results, options = list(pageLength = 5)) +``` + +## Results in a Figure + +```{r, cache = TRUE} +# load the results of the original simulation study +load("results_data_original.RData") + +# get the data in the right structure for plotting +total_results <- rbind(results_data, results_data_original) +total_results$study <- as.factor(c(rep("Sample Size Approach", 8000), rep("Deviance Approach", 8000))) +total_results$condition <- as.factor(total_results$condition) +levels(total_results$condition) <- c("WY1WX1YZ1", "WY2WX1YZ1", "WY1WX2YZ1", + "WY2WX2YZ1","WY1WX1YZ2","WY2WX1YZ2","WY1WX2YZ2","WY2WX2YZ2") +``` + +```{r, echo = FALSE} +ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) + + geom_boxplot() + + theme_classic() + + scale_fill_grey() + + xlab("Condition") + + ylab("Added Units") + + theme(legend.title = element_blank(), text = element_text(size = 18), + axis.text.x = element_text(angle = -30, hjust = 0)) +``` + + +## Results in an Interactive Figure + +```{r, echo = FALSE, message = FALSE, warning = FALSE} +library(plotly) + +p <- ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) + + geom_boxplot() + + theme_classic() + + scale_fill_grey() + + xlab("Condition") + + ylab("Added Units") + + theme(legend.title = element_blank(), text = element_text(size = 18), + axis.text.x = element_text(angle = -30, hjust = 0)) + +ggplotly(p) +``` + + + + +## The Code to Create the Interactive Figure + +```{r, results= 'hide', echo = TRUE} +library(plotly) + +p <- ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) + + geom_boxplot() + + theme_classic() + + scale_fill_grey() + + xlab("Condition") + + 
ylab("Added Units") + + theme(legend.title = element_blank(), text = element_text(size = 18), + axis.text.x = element_text(angle = -30, hjust = 0)) + +ggplotly(p) +``` + +## The Problem in an Equation + +The minimization procedure can be written as follows: +$$min\{\sum_{i,j} \delta_{ij}^+ + \sum_{i,j} \delta_{ij}^-\}$$ +under constraints +$$m_{ij1} = n_{ij1} + \delta_{ij}^+ - \delta_{ij}^-;$$ +$$m_{ij0} = n_{ij0} - \delta_{ij}^+ + \delta_{ij}^-;$$ +$$D \leq \chi_{I(J-1)}^2 (1-\alpha);$$ +$$0 \leq \delta_{ij}^+ \leq n_{ij0};$$ +$$0 \leq \delta_{ij}^- \leq n_{ij1}.$$ + +## The End + + diff --git a/Practicals/Lotte/Assignment 5/Exercise-5.html b/Practicals/Lotte/Assignment 5/Exercise-5.html new file mode 100644 index 0000000..442131b --- /dev/null +++ b/Practicals/Lotte/Assignment 5/Exercise-5.html @@ -0,0 +1,6958 @@ + + + + Exercise 5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

+ +

+

2023-01-21

+
+
+ +

Thesis Presentation

+ +
+

In this Markdown presentation, I will present some results of the pilot simulation study that I did for my thesis. My thesis is about efficient audit sample selection. In the pilot simulation study, I compare two methods for efficient sample selection against each other: the sample size approach and the deviance approach. Here follows an overview of some preliminary results.

+ +

Results in a table

+ +

This table shows the number of iterations and the deviance before and after in the first 100 datasets.

+ +
+ + +

Results in a Figure

+ +

+ +

Results in an Interactive Figure

+ +
+ + +

The Code to Create the Interactive Figure

+ +
library(plotly)
+
+p <- ggplot(aes(x = condition, y = added_units, fill = study), data = total_results) +
+  geom_boxplot() +
+  theme_classic() + 
+  scale_fill_grey() +
+  xlab("Condition") +
+  ylab("Added Units") +
+  theme(legend.title = element_blank(), text = element_text(size = 18),
+        axis.text.x = element_text(angle = -30, hjust = 0))
+
+ggplotly(p)
+ +

The Problem in an Equation

+ +

The minimization procedure can be written as follows: \[min\{\sum_{i,j} \delta_{ij}^+ + \sum_{i,j} \delta_{ij}^-\}\] under constraints \[m_{ij1} = n_{ij1} + \delta_{ij}^+ - \delta_{ij}^-;\] \[m_{ij0} = n_{ij0} - \delta_{ij}^+ + \delta_{ij}^-;\] \[D \leq \chi_{I(J-1)}^2 (1-\alpha);\] \[0 \leq \delta_{ij}^+ \leq n_{ij0};\] \[0 \leq \delta_{ij}^- \leq n_{ij1}.\]

+ +

The End

+ + + + +
+ + + + + + + + + diff --git a/Practicals/Lotte/Assignment 5/Exercise-5_files/figure-html/unnamed-chunk-3-1.png b/Practicals/Lotte/Assignment 5/Exercise-5_files/figure-html/unnamed-chunk-3-1.png new file mode 100644 index 0000000..76f4941 Binary files /dev/null and b/Practicals/Lotte/Assignment 5/Exercise-5_files/figure-html/unnamed-chunk-3-1.png differ diff --git a/Practicals/Lotte/Assignment 5/processed_results.RData b/Practicals/Lotte/Assignment 5/processed_results.RData new file mode 100644 index 0000000..12aff6a Binary files /dev/null and b/Practicals/Lotte/Assignment 5/processed_results.RData differ diff --git a/Practicals/Lotte/Assignment 5/results_data_original.RData b/Practicals/Lotte/Assignment 5/results_data_original.RData new file mode 100644 index 0000000..7d61054 Binary files /dev/null and b/Practicals/Lotte/Assignment 5/results_data_original.RData differ diff --git a/Practicals/Lotte/Assignment 5/uu logo.png b/Practicals/Lotte/Assignment 5/uu logo.png new file mode 100644 index 0000000..e2789c8 Binary files /dev/null and b/Practicals/Lotte/Assignment 5/uu logo.png differ diff --git a/Practicals/Lotte/Assignment 5/uu_logo.png b/Practicals/Lotte/Assignment 5/uu_logo.png new file mode 100644 index 0000000..16ce5db Binary files /dev/null and b/Practicals/Lotte/Assignment 5/uu_logo.png differ diff --git a/Practicals/Lotte/Assignment 6/app.R b/Practicals/Lotte/Assignment 6/app.R new file mode 100644 index 0000000..d9007e8 --- /dev/null +++ b/Practicals/Lotte/Assignment 6/app.R @@ -0,0 +1,108 @@ +# +# This is a Shiny web application. You can run the application by clicking +# the 'Run App' button above. 
#
# Find out more about building applications with Shiny here:
#
# http://shiny.rstudio.com/
#

library(shiny)
library(eurostat)
library(tmap)
library(s2)
library(sf)
library(ggplot2)
library(dplyr)

# download immigration counts and keep the totals (all ages, both sexes,
# age defined as completed years)
migr_data <- get_eurostat("migr_imm8")
migr_data <- migr_data %>%
  filter(age == "TOTAL", sex == "T", agedef == "COMPLET")

# country polygons (NUTS level 0) used for the map
geo_data <- st_read("https://raw.githubusercontent.com/eurostat/Nuts2json/master/pub/v2/2021/4326/20M/nutsrg_0.json", quiet = T)

# rename the key column so the immigration data joins onto the polygons
migr_data_adj <- migr_data %>%
  rename(id = geo)

tot_data <- inner_join(geo_data, migr_data_adj, by = "id")

# population totals, used to compute immigration per capita
pop_data <- get_eurostat("tps00001")

pop_data <- pop_data %>%
  rename(id = geo)

# Define the UI: two drop-down menus (country and year) plus two plots
ui <- fluidPage(

  # Application title
  titlePanel("Immigration in the EU"),

  sidebarPanel(
    # drop-down menu with the available country names; the raw column holds
    # one entry per data row, so deduplicate and sort for a usable list
    selectInput("country", "Countries to select:",
                sort(unique(tot_data$na))),

    # drop-down menu with the available reference years
    selectInput("year", "Years to select:",
                c("2011-01-01", "2012-01-01", "2013-01-01", "2014-01-01",
                  "2015-01-01", "2016-01-01", "2017-01-01", "2018-01-01",
                  "2019-01-01")),
  ),
  # main panel: explanation text, the time series plot and the map
  mainPanel(
    p("This Shiny app can be used to get a more detailed overview into immigration in Europe. 
      The top plot shows immigration over the years for the selected country. The bottom plot 
      shows the immigration per capita for Europe in the selected year. 
      Use the drop-down menus to select the country and the year you would like to see in the plots."),
    plotOutput("timeseriesplot"),
    plotOutput("impcmap"))

)



# Define the server logic: render the time series plot for the selected
# country and the per-capita map for the selected year
server <- function(input, output) {


  output$timeseriesplot <- renderPlot({

    country <- input$country

    # keep only the rows for the selected country
    tot_data_country <- tot_data %>%
      filter(na == country)

    ggplot(tot_data_country, aes(x = time, y = values)) +
      geom_line() +
      ggtitle(paste("Immigration over time for", country)) +
      ylab("Absolute immigration counts") +
      xlab("Time")

  })


  output$impcmap <- renderPlot({

    year <- input$year

    # restrict both datasets to the selected year (local copies only)
    tot_data <- tot_data %>%
      filter(time == year)

    pop_data <- pop_data %>%
      filter(time == year)

    # immigration per capita = immigration count / population total
    tot_pop_data <- inner_join(tot_data, pop_data, by = "id")
    tot_pop_data$imm_pc <- tot_pop_data$values.x / tot_pop_data$values.y

    ggplot(tot_pop_data, aes(fill = imm_pc)) +
      geom_sf() +
      coord_sf() +
      scale_fill_viridis_c() +
      ggtitle(paste("Immigration per capita per country in", year)) +
      labs(fill = "Immigration per capita")

  })



}

# Run the application
shinyApp(ui = ui, server = server)