gerkovink · jmnolte · Sep 7, 2022 · Sep 7, 2022 · Sep 7, 2022 · Sep 18, 2022
diff --git a/.DS_Store b/.DS_Store
diff --git a/Jakob/.DS_Store b/Jakob/.DS_Store
diff --git a/Jakob/Exercise 1/Exercise1.Rmd b/Jakob/Exercise 1/Exercise1.Rmd
@@ -0,0 +1,74 @@
+---
+title: "Exercise 1 - Reproducible Programming"
+author: "Jakob Nolte"
+date: "Sep 18, 2022"
+output: html_notebook
+---
+
+```{r}
+set.seed(123) # fix the RNG seed so the simulated draws below are reproducible
+```
+
+# 1. Perform a simulation that does the following:
+
+## a. Sample 100 samples from a standard normal distribution.
+
+```{r}
+library(plyr)
+samples <- rlply(100, rnorm(1000, 0, 1)) # evaluate rnorm() 100 times: a list of 100 samples, each 1000 draws from N(0, 1)
+```
+
+## b. For each of these samples, calculate the following statistics for the mean: 
+
+- absolute bias, 
+- standard error, 
+- lower bound of the 95% confidence interval, 
+- and upper bound of the 95% confidence interval
+
+```{r}
+# Summarise one simulated sample `x` drawn from a population with mean 0.
+#
+# Returns an unnamed numeric vector of length 5, in the order given by
+# `format` below: sample mean, absolute bias, standard error, and the lower
+# and upper bound of the 95% confidence interval for the mean.
+sim.stats <- function(x) {
+  n <- length(x)
+  avg <- mean(x) # not named `mean`, so base::mean() is not shadowed
+  std_err <- 1 / sqrt(n) # standard error of the mean for a population SD of 1
+  margin <- qt(.975, df = n - 1) * std_err # half-width of the 95% interval
+  # The population mean is 0, so the absolute bias is |avg - 0|.
+  c(avg, abs(avg - 0), std_err, avg - margin, avg + margin)
+}
+# Template passed to vapply(): fixes the type, length and names of each result.
+format <- c("Mean" = 0, "Bias" = 0, "Std.Err" = 0, "Lower" = 0, "Upper" = 0)
+```
+
+```{r}
+library(dplyr)
+# One row per simulated sample (columns named after `format`); `Included`
+# flags the intervals that cover the true population mean of 0.
+results <- mutate(
+  as.data.frame(t(vapply(samples, sim.stats, format))),
+  Included = Lower < 0 & Upper > 0
+)
+```
+
+95 out of 100 samples include the true population value.
+
+```{r}
+colMeans(results) # column averages over all simulated samples; for the logical `Included` column this is the share of TRUE values
+```
+
+## c. Create a plot that demonstrates the simulation results.
+
+```{r}
+library(ggplot2)
+# Map interval bounds by bare column name: aes() is evaluated inside `results`,
+# and ggplot2 discourages `results$...` within aes().
+limits <- aes(ymax = Upper, ymin = Lower)
+# One point range per simulated sample, coloured by whether the interval
+# covers the population mean (reference line at 0, drawn once).
+ggplot(results, aes(x = seq_len(nrow(results)), y = Mean, colour = Included)) +
+  geom_hline(yintercept = 0, color = "black") +
+  geom_pointrange(limits) +
+  xlab("Simulated Samples") +
+  ylab("Means and 95% Confidence Intervals")
+```
+
+## d. Present a table containing all simulated samples for which the resulting confidence interval does not contain the population value.
+
+```{r}
+library(kableExtra)
+# Keep only the samples whose confidence interval misses the population mean,
+# and show the five statistic columns (the `Included` flag is dropped).
+table <- results[!results$Included, ]
+kable_styling(kable(table[, 1:5], "html"), full_width = FALSE)
+```
+
diff --git a/Jakob/Exercise 1/Exercise1.nb.html b/Jakob/Exercise 1/Exercise1.nb.html
diff --git a/Jakob/Exercise 2/Exercise2.Rmd b/Jakob/Exercise 2/Exercise2.Rmd
@@ -0,0 +1,31 @@
+---
+title: "Exercise 2 - Reproducible Programming"
+author: "Jakob Nolte"
+date: "Sep 28, 2022"
+output: html_notebook
+---
+
+The given document simulates the analysis of a randomized controlled trial. Specifically, it fits a linear model explaining the trial's outcome as a function of its treatment and the patient's baseline score.
+
+```{r simulation}
+set.seed(123) # fixed seed so repeated runs can be compared
+n <- 500 # number of simulated patients
+treatment <- rbinom(n, size = 1, prob = 0.3) # 0/1 treatment indicator, P(treated) = 0.3
+baseline <- rnorm(n, mean = 30, sd = 5) # baseline score
+# outcome = linear function of both predictors plus N(0, 5) noise
+outcome <- 25 + 3.8 * treatment + 1.2 * baseline + rnorm(n, mean = 0, sd = 5)
+data <- data.frame(outcome, treatment, baseline) # collect the variables in one data frame
+```
+
+```{r}
+# Adjust for `baseline`, the covariate stored in `data`; every term in the
+# formula must be a column of `data` for lm() to find it.
+summary(lm(outcome ~ treatment + baseline, data = data)) # compute the treatment's effect
+```
+
+```{r}
+# Scatter of outcome against baseline; colour index 1 marks treatment == 0
+# and colour index 2 marks treatment == 1.
+with(data, plot(x = baseline, y = outcome, col = treatment + 1,
+                xlab = "baseline", ylab = "outcome")) # plot the treatment's effect
+```
+
+```{r}
+sessionInfo() # display session info
+```
+
diff --git a/Jakob/Exercise 2/Exercise2.nb.html b/Jakob/Exercise 2/Exercise2.nb.html
diff --git a/Jakob/Exercise 3/.DS_Store b/Jakob/Exercise 3/.DS_Store
diff --git a/Jakob/Exercise 3/beamer_markup2022.pdf b/Jakob/Exercise 3/beamer_markup2022.pdf
diff --git a/Jakob/Exercise 3/markup2022.tex b/Jakob/Exercise 3/markup2022.tex
@@ -0,0 +1,141 @@
+\documentclass[9pt]{beamer}
+\usetheme{Rochester}
+\usecolortheme{wolverine}
+
+\usepackage[nodisplayskipstretch]{setspace}
+\usepackage{amsmath}
+\usepackage{hyperref}
+
+\hypersetup{
+    colorlinks=true,
+}
+
+% title page
+\setbeamerfont{title}{size=\large}
+\title{Example document to recreate with beamer in \LaTeX}
+\author{Jakob Nolte}
+\date{FALL 2022 \\
+Markup Languages and Reproducible Programming in Statistics}
+
+\begin{document}
+
+\frame{\titlepage}
+
+% table of contents
+\section*{Outline}
+{
+\setbeamerfont{frametitle}{size=\large}
+\setbeamertemplate{sections/subsections in toc}[default]
+\begin{frame}
+    \frametitle{Outline}
+    \hypersetup{linkcolor=black}
+    \tableofcontents %
+\end{frame}
+}
+
+% slide 1
+\section{Working with equations}
+{
+\setbeamerfont{frametitle}{size=\large}
+\setstretch{0.5}
+\begin{frame}
+\frametitle{Working with equations}
+
+We define a set of equations as \\
+\begin{equation}
+    a = b + c^2,
+\end{equation}
+\begin{equation}
+    a - c^2 = b,
+\end{equation}
+\begin{equation}
+    \text{left side} = \text{right side},
+\end{equation}
+\begin{equation}
+    \text{left side + something} \geq \text{right side},
+\end{equation}
+for all something $>$ 0.
+
+\end{frame}
+}
+
+% slide 2
+\subsection{Aligning the same equations}
+{
+\setbeamerfont{frametitle}{size=\large}
+\setstretch{0.5}
+\begin{frame}
+\frametitle{Aligning the same equations}
+
+Aligning the equations by the equal sign gives a much better view into the placements of the separate equation components. \\
+\begin{align}
+    a &= b + c^2, \\
+    a - c^2 &= b, \\
+    \text{left side} &= \text{right side}, \\
+    \text{left side + something} &\geq \text{right side},
+\end{align}
+for all something $>$ 0.
+
+\end{frame}
+}
+
+% slide 3
+\subsection{Omit equation numbering}
+{
+\setbeamerfont{frametitle}{size=\large}
+\setstretch{0.5}
+\begin{frame}
+\frametitle{Omit equation numbering}
+
+Alternatively, the equation numbering can be omitted. \\
+\begin{align}
+    a &= b + c^2, \notag\\
+    a - c^2 &= b, \notag\\
+    \text{left side} &= \text{right side}, \notag\\
+    \text{left side + something} &\geq \text{right side}, \notag
+\end{align}
+for all something $>$ 0.
+
+\end{frame}
+}
+
+% slide 4
+\subsection{Ugly alignment}
+{
+\setbeamerfont{frametitle}{size=\large}
+\setstretch{0.5}
+\begin{frame}
+\frametitle{Ugly alignment}
+
+Some components do not look well, when aligned. Especially equations with different heights and spacing. For example, \\
+\begin{align}
+    E &= mc^2, \\
+    m &= \frac{E}{c^2}, \\
+    c &= \sqrt{\frac{E}{m}}.
+\end{align}
+Take that into account.
+
+\end{frame}
+}
+
+% discussion
+\section{Discussion}
+{
+\setbeamerfont{frametitle}{size=\large}
+\setstretch{0.5}
+\setbeamertemplate{itemize items}[triangle]
+\begin{frame}
+\frametitle{Discussion}
+
+This is where you’d normally give your audience a recap of your talk, where you could discuss e.g. the following:
+\begin{itemize}
+    \item Your main findings
+    \item The consequences of your main findings
+    \item Things to do
+    \item Any other business not currently investigated, but related to your talk
+\end{itemize}
+
+\end{frame}
+}
+
+\end{document}
diff --git a/Jakob/Exercise 5/.DS_Store b/Jakob/Exercise 5/.DS_Store
diff --git a/Jakob/Exercise 5/UU_logo.png b/Jakob/Exercise 5/UU_logo.png
diff --git a/Jakob/Exercise 5/exercise5.Rmd b/Jakob/Exercise 5/exercise5.Rmd
@@ -0,0 +1,131 @@
+---
+title: "Has Hollywood Become more Sexist?"
+subtitle: "Investigating the Gender Bias in Movie Production Budgets"
+author: "Jakob Nolte"
+date: "January 10, 2023"
+output:
+  ioslides_presentation:
+    logo: UU_logo.png
+  widescreen: default
+  citation_package: natbib
+bibliography: exercise5.bib  
+---
+
+```{r attach required libraries, include=FALSE}
+library(dplyr)
+library(tidyverse)
+library(ggplot2)
+```
+
+```{r data preparation 1, include=FALSE}
+# Read both input files via full paths instead of calling setwd(), so the
+# working directory of the session/knit process is left untouched.
+# NOTE(review): this directory is machine-specific; consider moving the CSVs
+# next to this document so the presentation renders on other machines.
+data_dir <- "/Users/noltinho/Desktop/Jakob/Uni/Master/2. Semester/Bayesian Statistics/Assignment"
+# Empty strings and "NA" are both read as missing values.
+movie.data <- read.csv(file.path(data_dir, "movies.csv"), na.strings = c("", "NA"))
+actor.data <- read.csv(file.path(data_dir, "oscar.csv"), na.strings = c("", "NA"))
+```
+
+```{r data preparation 2, include=FALSE}
+# Rename key columns, drop incomplete rows, derive the model variables and
+# keep the analysis columns in a fixed order.
+movie.data <- movie.data %>%
+  rename(title = name, main_actor = star) %>%
+  na.exclude() %>%
+  mutate(
+    budget_log = log(budget),
+    budget_log_mc = budget_log - mean(budget_log), # mean-centred log budget
+    gross_log = log(gross),
+    year_1980 = (year - 1980) / 10, # decades since 1980
+    genre_action = as.numeric(genre == "Action") # 1 for action movies, else 0
+  ) %>%
+  dplyr::select(
+    title, genre, genre_action, year, year_1980, score, votes,
+    main_actor, budget, budget_log, budget_log_mc, gross, gross_log
+  )
+```
+
+```{r data preparation 3, include=FALSE}
+# Derive a 0/1 gender code per person from the award category and keep one
+# row per person (the first occurrence).
+# NOTE(review): every category that does not contain "ACTOR" is coded 1;
+# confirm that oscar.csv only holds acting categories.
+actor.data <- actor.data %>%
+  # `category` is resolved inside the piped data frame, not the global object
+  mutate(gender_main_chr = if_else(str_detect(category, "ACTOR"), 0, 1)) %>%
+  rename(main_actor = name) %>%
+  dplyr::select(main_actor, gender_main_chr) %>%
+  distinct(main_actor, .keep_all = TRUE)
+```
+
+```{r data preparation 4, include=FALSE}
+# Inner join on `main_actor`; merge() places that key column first.
+data <- merge(movie.data, actor.data, by = "main_actor")
+# Add the release decade (1 = 1980s ... 4 = 2010-2020) and, per decade, the
+# share of movies whose main actor has gender_main_chr == 1.
+# Years outside 1980-2020 match no case and get decade = NA.
+data <- data %>%
+  mutate(decade = case_when(
+    year >= 1980 & year < 1990 ~ 1,
+    year >= 1990 & year < 2000 ~ 2,
+    year >= 2000 & year < 2010 ~ 3,
+    year >= 2010 & year <= 2020 ~ 4
+  )) %>%
+  group_by(decade) %>%
+  mutate(decade_prop = mean(gender_main_chr)) %>%
+  ungroup() # drop the grouping so later chunks work on a plain table
+```
+
+## What the Research says {.smaller}
+
+- @erighaRaceGenderHollywood2015 finds that despite overall trends towards diversification, blockbuster movies have seen a decrease in female protagonists.
+
+- @lindnerMillionDollarMaybe2015 assess that the trend has worsened since the early 2000s.
+
+
+## Investigating the Trend over Time using the IMDb Database {.smaller}
+
+```{r, echo=FALSE, message=FALSE, warning=FALSE}
+#install.packages("DT")
+library(DT)
+# Pick the displayed columns by name rather than by position, so the table
+# does not silently change if the column order of `data` changes.
+datatable(data[c("main_actor", "title", "genre", "year", "budget")],
+          options = list(pageLength = 5))
+```
+
+
+## Modeling the Trend over Time {.smaller}
+
+$$
+\begin{aligned}
+\ln(Budget_i) &= \beta_0 + \beta_1 Time_i + \beta_2 Gender_i + \beta_3 (Time_i \times Gender_i) + e_i \\
+Budget_i &= e^{\beta_0 + \beta_1 Time_i + \beta_2 Gender_i + \beta_3 (Time_i \times Gender_i) + e_i}
+\end{aligned}
+$$
+
+## Modeling the Trend over Time using R {.smaller}
+
+```{r, message=FALSE}
+# the code fits a regression line per group
+reg <- lm(budget_log ~ year_1980 * gender_main_chr, data=data)
+```
+
+
+## Female Actors are indeed increasingly discriminated against {.smaller}
+
+```{r, echo=FALSE}
+summary(reg)
+```
+
+
+## Displaying the Trend visually {.smaller}
+
+```{r, message=FALSE, echo=FALSE, fig.cap="Figure 1: Increase in Movie Production Budgets over Time by Gender (interactive view)"}
+#install.packages("plotly")
+library(plotly)
+# Log budget by release year, with one colour and one fitted line per gender code.
+movie_plot <- ggplot(
+  data,
+  aes(x = year, y = budget_log, color = factor(gender_main_chr))
+) +
+  geom_point() +
+  geom_smooth(method = "lm", fill = NA) +
+  labs(x = "Year of Release", y = "Production Budget (log)") +
+  scale_colour_discrete(
+    name = "Main Characters\nGender",
+    breaks = c(0, 1),
+    labels = c("Male", "Female")
+  )
+ggplotly(movie_plot) # interactive rendering of the same plot
+```
+
+
+## Interpreting the Results
+
+<div class="columns-2">
+```{r, message=FALSE, echo=FALSE, fig.width=4, fig.height=3}
+movie_plot + theme(legend.position="bottom")
+```
+
+
+- The data supports previous research findings.
+- Over time, the gap in movie production budgets between genders has worsened.
+</div>
+
+
+## References
+
+