
Commit

prior to merge with Cale's work in two chapters
emilioalaca committed Nov 2, 2018
1 parent a4f8b69 commit 3992c27
Showing 21 changed files with 2,204 additions and 111 deletions.
186 changes: 93 additions & 93 deletions .Rhistory
@@ -1,96 +1,3 @@
sd = 18,
p = 0.10,
lblx = "what is this value \n if area is 0.10?",
cex = 0.95)
pnorm(q = 180, mean = 200, sd = 18)
left.tail(mean = 200,
sd = 18,
p = 0.10,
lblx = "what is this value \n if area is 0.10?",
cex = 0.95)
left.tail(mean = 200,
sd = 18,
p = 0.10,
lblx = "what is this value?")
qnorm(p = 0.1, mean = 200, sd = 18)
knitr::opts_chunk$set(echo = TRUE)
library(arrangements)
ncombinations(k = 11, n = 15, replace = FALSE)
ncombinations(k = 10, n = 14) * ncombinations(k = 1, n = 3)
ncombinations(k = 1, n = 3)
ncombinations(k = 10, n = 14)
# choose(14, 10) by hand; note the parentheses around the denominator
14*13*12*11/(4*3*2)
ncombinations(k = 10, n = 14) * ncombinations(k = 1, n = 3)
ncombinations(k = 5, n = 52)
ncombinations(k = 1, n = 48)
(P4Aces <- ncombinations(k = 1, n = 48) * ncombinations(k = 4, n = 4) / ncombinations(k = 5, n = 52))
# install.packages("prob")  # once, if not already installed
library(prob)
(SampleSpace <- tosscoin(times = 5, makespace = TRUE))
nrow(tosscoin(times = 5, makespace = TRUE))
# Number of H's in each outcome
SampleSpace$nH <- as.numeric(SampleSpace$toss1 == "H") +
as.numeric(SampleSpace$toss2 == "H") +
as.numeric(SampleSpace$toss3 == "H") +
as.numeric(SampleSpace$toss4 == "H") +
as.numeric(SampleSpace$toss5 == "H")
(nHfreq <- as.data.frame(table(SampleSpace$nH)))
# Rename columns
names(nHfreq) <- c("nHeads", "AbsFreq")
nHfreq$RelFreq <- nHfreq$AbsFreq / sum(nHfreq$AbsFreq)
pander::pander(nHfreq)
ncombinations(k = 3, n = 5)
1 - 0.30 - 0.20 - 0.15 - 0.05 - 0.10
0.80*0.05/0.50
0.08+(0.10/0.50)*0.20
0.08/0.12
SampleSpace
(5000 - 3500)/500
pnorm(q = 4500, mean = 3500, sd = 500)
pnorm(q = 1)
1 - pnorm(q = 1)
0.9772499 - 0.1586553
pnorm(q = 4500, mean = 3500, sd = 500) - pnorm(3000, mean = 3500, sd = 500)
qnorm(p = 0.10, mean = 3500, sd = 500)
rnorm(9, 97, 8)
round(rnorm(9, 97, 10), 1)
# F density, df1 = 3, df2 = 5
curve(df(x, df1 = 3, df2 = 5),
from = 0,
to = 35,
lwd = 2.5,
xlab = "",
ylab = "")
# F density, df1 = 5, df2 = 5
curve(df(x, df1 = 5, df2 = 5),
from = 0,
to = 35,
lwd = 2.5,
xlab = "",
ylab = "")
# F density, df1 = 5, df2 = 15
curve(df(x, df1 = 5, df2 = 15),
from = 0,
to = 35,
lwd = 2.5,
xlab = "",
ylab = "")
# F density, df1 = 5, df2 = 15, zoomed in (x from 0 to 10)
curve(df(x, df1 = 5, df2 = 15),
from = 0,
to = 10,
lwd = 2.5,
xlab = "",
ylab = "")
@@ -510,3 +417,96 @@ qt(0.025, df = df1)
tcrit
df1
qt(0.025, df1)
walking.spiders <- data.frame(
'spider' = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
'control' = c(2.5, 5.5, 1.1, 2.7, 2.8, 1.6, 3.2, 4.5, 5.0, 6.9, 2.2, 3.9, 3.8, 3.5, 5.7),
'mantis' = c(0.4, 1.9, 1.2, 2.6, 4.3, 0.3, 1.0, 1.5, 3.3, 2.6, 0.7, 1.4, 2.1, 3.4, 2.3),
'dif' = c(2.1, 3.6, -0.1, 0.1, -1.5, 1.3, 2.2, 3.0, 1.7, 4.3, 1.5, 2.5, 1.7, 0.1, 3.4))
dif = c(2.1, 3.6, -0.1, 0.1, -1.5, 1.3, 2.2, 3.0, 1.7, 4.3, 1.5, 2.5, 1.7, 0.1, 3.4)
mean(dif)
var(dif)
qt(p = 0.975, df = 14)
spd <- data.frame(
spider = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
control = c(2.5, 5.5, 1.1, 2.7, 2.8, 1.6, 3.2, 4.5, 5.0, 6.9, 2.2, 3.9, 3.8, 3.5, 5.7),
mantis = c(0.4, 1.9, 1.2, 2.6, 4.3, 0.3, 1.0, 1.5, 3.3, 2.6, 0.7, 1.4, 2.1, 3.4, 2.3))
t.test(x = spd$control,
y = spd$mantis,
alternative = "two.sided",
mu = 0,
paired = TRUE,
conf.level = 0.95)
rats <- data.frame(
'males' = c(700, 825, 425, 500, 575, 725, 800, 475, 575, 725, 500, 700, 575, 775),
'females' = c(450, 725, 675, 725, 750, 850, 690, 725, 475, 700, 725, 475, 825, 725))
(sample.avg <- sapply(rats, mean))
str(sample.avg)
sample.avg["males"]
sample.avg["males"]
sample.avg[1]
sample.avg["females"]
sample.avg[2]
t.test(x = rats$males,
y = rats$females,
alternative = "two.sided",
mu = 0,
paired = FALSE,
conf.level = 0.95)
spd <- data.frame(
spider = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
control = c(2.5, 5.5, 1.1, 2.7, 2.8, 1.6, 3.2, 4.5, 5.0, 6.9, 2.2, 3.9, 3.8, 3.5, 5.7),
mantis = c(0.4, 1.9, 1.2, 2.6, 4.3, 0.3, 1.0, 1.5, 3.3, 2.6, 0.7, 1.4, 2.1, 3.4, 2.3))
(control.avg <- mean(spd$control))
(mantis.avg <- mean(spd$mantis))
(control.var <- var(spd$control))
(mantis.var <- var(spd$mantis))
(spd$dif <- spd$control - spd$mantis)
(d.bar <- mean(spd$dif))
(dif.var <- var(spd$dif))
(r <- length(spd$dif))
(d.bar.var <- dif.var / r)
(tcalc <- d.bar / sqrt(d.bar.var))
(tcrit <- qt(p = 0.975, df = r - 1))
(CIlo <- d.bar - tcrit * sqrt(d.bar.var))
(CIhi <- d.bar + tcrit * sqrt(d.bar.var))
t.test(x = spd$control,
y = spd$mantis,
alternative = "two.sided",
mu = 0,
paired = TRUE,
conf.level = 0.95)
rats <- data.frame(
'males' = c(700, 825, 425, 500, 575, 725, 800, 475, 575, 725, 500, 700, 575, 775),
'females' = c(450, 725, 675, 725, 750, 850, 690, 725, 475, 700, 725, 475, 825, 725))
(sample.avg <- sapply(rats, mean))
(sample.var <- sapply(rats, var))
sample.avg["males"]
sample.avg[1]
sample.avg["females"]
sample.avg[2]
r1 <- length(rats$males)
r2 <- length(rats$females)
(fem.var <- var(rats$females))
(male.var <- var(rats$males))
(pooled.var <- ((r1 - 1) * male.var + (r2 - 1) * fem.var) / (r1 + r2 - 2))
(rat.d.bar.var <- pooled.var / r1 + pooled.var / r2)
rat.d.bar <- sample.avg[1] - sample.avg[2]
(rat.tcalc <- rat.d.bar / sqrt(rat.d.bar.var))
t.test(x = rats$males,
y = rats$females,
alternative = "two.sided",
mu = 0,
paired = FALSE,
conf.level = 0.95)
(rat.df <- r1 + r2 - 2)
pt(q = abs(rat.tcalc), df = rat.df, lower.tail = FALSE)
2 * pt(q = abs(rat.tcalc), df = rat.df, lower.tail = FALSE)
r1
transg.aa.m1 <- lm(wgain ~ variety, data = aaAnovaExmpl1.dat)
summary(transg.aa.m1)
anova(transg.aa.m1)
4 changes: 2 additions & 2 deletions 08.0_TwoPops.Rmd
@@ -171,7 +171,7 @@ Why use independent or paired procedures? Each has advantages and disadvantages,



- ## Comparing Sample Variances
+ ## Comparing Sample Variances {#compare2Variances}

When observations are independent, we have two estimated variances, one for each sample. In some cases previous information indicates whether the population variances are equal, and we use that information to decide if the variance estimates from the two samples can be pooled. When there is no information about the equality of the variances, we can use an F-test to determine whether the variance estimates can be pooled. Pooling variance estimates is very desirable because it increases the degrees of freedom and yields tests with a lower probability of Type II error.
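
A minimal sketch of this F-test in R, with invented data (not one of the chapter's examples): base R's `var.test()` computes the ratio of the two sample variances and compares it to the F distribution.

```{r}
# Invented samples; under H0 the ratio of the variances is about 1
set.seed(42)
g1 <- rnorm(12, mean = 50, sd = 4)
g2 <- rnorm(15, mean = 50, sd = 4)
var.test(g1, g2)      # F = var(g1) / var(g2) with (11, 14) df
var(g1) / var(g2)     # the same F-value computed by hand
```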

@@ -771,7 +771,7 @@ Table: (\#tab:CaseEquations)
| Case | Pop Variances | Sample Size | Paired / Independent | Pooled Sample Variance | Standard Error of the Difference | df | t-statistic | Confidence Interval |
|-----:|:-------------:|:-------------:|:------------------:|:---------------------------------------------------------------:|:-------------------------------------------------:|:-----------:| :-------------------------------------------:|:----------------:|
- | 1 | Equal |Equal/ Not Equal| Independent | $\frac{({{S_1}^2})({r_1-1})+({{S_2}^2})({r_2-1})}{(r_1+r_2-2)}$ | $(\frac{S^2}{r_1 + r_2})^{1/2}$ | $r_1+r_2-2$ | $\frac{\bar{d}-\mu_{\bar{d}}}{S_{\bar{d}}}$ | ${L\atop U} = \bar{d} \pm t_\alpha S_\bar{d}$ |
+ | 1 | Equal |Equal/ Not Equal| Independent | $\frac{({{S_1}^2})({r_1-1})+({{S_2}^2})({r_2-1})}{(r_1+r_2-2)}$ | $\left (\frac{S^2}{r_1} + \frac{S^2}{r_2} \right )^{1/2}$ | $r_1+r_2-2$ | $\frac{\bar{d}-\mu_{\bar{d}}}{S_{\bar{d}}}$ | ${L\atop U} = \bar{d} \pm t_\alpha S_\bar{d}$ |
| 1 | Equal | Equal | Independent | $\frac{{{S_1}^2}+{{S_2}^2}}{2}$ | $(\frac{{2S}^2}{r})^{1/2}$ | $2(r-1)$ | $\frac{(\bar{d}-\mu_{\bar{d}})}{S_{\bar{d}}}$| ${L\atop U} = \bar{d} \pm t_\alpha S_\bar{d}$ |
| 2 | Not Equal |Equal/ Not Equal| Independent | | $(\frac{{S_1}^2}{r_1}+\frac{{S_2}^2}{r_2})^{1/2}$ | $r_1+r_2-2$ | $\frac{\bar{d}-\mu_{\bar{d}}}{S_{\bar{d}}}$ | ${L\atop U} = \bar{d} \pm t_\alpha S_\bar{d}$ |
| 3 | Equal | Equal | Paired | $S_d^2=\frac{\sum(d_i-\bar{d})^2}{r-1}$ | $(\frac{S_d^2}{r})^{1/2}$ | $r-1$ | $\frac{(\bar{d}-\mu_{\bar{d}})}{S_{\bar{d}}}$| ${L\atop U} = \bar{d} \pm t_\alpha S_\bar{d}$ |
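
As a hedged sketch connecting the table to R (reusing the rat weight data that appears in this commit's .Rhistory): `t.test()` covers Case 1 versus Case 2 through its `var.equal` argument.

```{r}
males   <- c(700, 825, 425, 500, 575, 725, 800, 475, 575, 725, 500, 700, 575, 775)
females <- c(450, 725, 675, 725, 750, 850, 690, 725, 475, 700, 725, 475, 825, 725)
# Case 1: pooled sample variance, df = r1 + r2 - 2
t.test(males, females, var.equal = TRUE)
# Case 2: variances not pooled (Welch); note the non-integer df
t.test(males, females, var.equal = FALSE)
```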
14 changes: 11 additions & 3 deletions 09.0_Anova1.Rmd
@@ -170,7 +170,7 @@ $$S^2_Y = \frac{1}{k \ (r - 1)} \sum_{i=1}^k \sum_{j=1}^r (Y_{ij} - \bar{Y}_{i.})^2$$

The pooled variance is the average of the squared deviations from each sample average, and thus it does not need the means to be equal to be a valid estimate of the common variance for all treatments. This pooled variance is based on the same concept used for comparing two means with independent samples in [Chapter 8](#Case1). Pooling variances is desirable because it leads to better precision and narrower confidence intervals. A pooled estimated variance is based on more observations and is therefore more stable (the variance of the estimated variance is smaller!); moreover, the variance of Student's t distribution is also reduced as the degrees of freedom increase. Pooling leads to smaller critical t-values, which means more power (lower probability of Type II error) and narrower confidence intervals.
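
A one-line sketch of the payoff: critical t-values shrink as the degrees of freedom grow, so pooled (higher-df) tests use smaller critical values and give narrower intervals.

```{r}
# Critical values for 95% two-sided intervals at increasing df
qt(p = 0.975, df = c(5, 10, 20, 60, 120))
```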

- If indeed the first estimate of the variance above, based on the variation among treatment averages, is a valid one, the quotient of the first estimate over the second one, based on the pooled variance, should have an F distribution with values about 1.0. We saw this when testing for differences between independent variance estimates in [section XXX](#addReference). The statistical distribution resulting from the quotient of two variance estimates is called an F-distribution, which was fully described in the *F Distribution* section of the [Probability chapter](#chProb). If the quotient calculated, called "calculated F-value," is too far from 1.0 we conclude that the two variances are not equal, therefore, the estimation of the variance using the averages is not a valid estimate, and the averages are considered too different to come from the same population. The means of the source populations must be different, so the null hypothesis is rejected. This is further illustrated with a numerical example below.
+ If indeed the first estimate of the variance above, based on the variation among treatment averages, is a valid one, the quotient of the first estimate over the second one, based on the pooled variance, should have an F distribution with values near 1.0. We saw this when testing for [differences between independent variance estimates](#compare2Variances). The statistical distribution resulting from the quotient of two variance estimates is called an F-distribution, which was fully described in the *F Distribution* section of the [Probability chapter](#chProb). If the calculated quotient, called the "calculated F-value," is too far from 1.0, we conclude that the two variances are not equal; therefore, the estimate of the variance based on the averages is not valid, and the averages are too different to have come from the same population. Thus, one reasons, at least two means must be different, and the null hypothesis is rejected. This is further illustrated with a numerical example below.
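
A small simulated sketch of the two estimates (all settings invented; the null hypothesis holds because every treatment mean is 10):

```{r}
set.seed(1)
k <- 4; r <- 6
y <- matrix(rnorm(k * r, mean = 10, sd = 2), nrow = r)  # columns are treatments
trt.means <- colMeans(y)
est.among  <- r * var(trt.means)      # variance estimated from the k treatment averages
est.within <- mean(apply(y, 2, var))  # pooled within-treatment variance
est.among / est.within                # near 1 under H0; distributed as F(k - 1, k(r - 1))
```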



@@ -237,14 +237,16 @@ SSE &= \sum_{i=1}^k \sum_{j=1}^r (Y_{ij} - \bar{Y}_{i.})^2\\[15pt]
\end{align}
<br>

- ## Degrees of freedom (The bulk of this section should be moved to the first time df are mentioned)
+ ## Degrees of freedom

Degrees of freedom are quantities associated with sums of squares. Each sum of squares has an associated number called its degrees of freedom. We operationally define the **degrees of freedom** associated with a sum of squares as the number of different terms in the sum minus the number of independent parameters estimated and used in the calculation.

In reality, degrees of freedom refer to the number of dimensions in which vectors can vary. The basic mathematics of statistics involves high-dimensional spaces and subspaces. In short, and for those who are curious, a sample of size N is a vector in N dimensions and has N degrees of freedom because it can vary freely over N dimensions. Once a sample is set, the overall average is a vector in N dimensions with each coordinate equal to the average of the observations. The average vector and the observation vector form a hyperplane in N-1 dimensions where the difference between observation and average resides. The difference $Y_{i} - \bar{Y}_{.}$ is the vector of deviations or residuals (for the simplest model $Y_i = \mu + \epsilon_i$) from observation to average. Given a set sample, this vector of residuals has only N-1 degrees of freedom, because it must stay in the subspace spanned by the observation and the overall average. The restriction comes from the fact that, given the overall average and N-1 of the observation coordinates, the last one can be calculated as $Y_N = N \ \bar{Y}_. - \sum_{i=1}^{N-1} Y_i$.
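
A tiny numerical sketch of this restriction, with invented numbers:

```{r}
y <- c(3.2, 5.1, 4.4, 6.0)        # invented sample, N = 4
N <- length(y)
N * mean(y) - sum(y[1:(N - 1)])   # recovers the last observation, 6.0
```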

This concept is illustrated for a sample consisting of three observations, so we can plot them in a 3D figure.

<br>

```{r 3Dsample, echo=FALSE, warning=FALSE, fig.cap="Three-dimensional representation of a sample with 3 observations. The longest arrow (hypotenuse) represents the sample vector. The base of the triangle is the average and the third side is the vector of deviations from the average to the observation."}
# library(rgl)
@@ -297,6 +299,8 @@ arrows3D(x0 = xyz.orig[, 1],
```

<br>

Note that the three arrows form a right triangle with the observation vector as the hypotenuse and the average and residual vectors as the other two sides. Therefore the squared length of the observation vector equals the squared length of the average vector plus the squared length of the residual vector. Applying the Pythagorean theorem repeatedly, we can see that the squared length of the observation vector is the sum of squares of the individual observations; the squared length of the average vector is 3 times the squared average; and the squared length of the residual vector is the sum of squares of the residuals or deviations. The squared length of the average vector is called the "correction factor" because it is used to calculate the total variation about the average by subtracting it from the squared length of the observation vector.
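
The partition can be checked numerically; a sketch with an invented 3-observation sample:

```{r}
y <- c(2, 4, 6)
sum(y^2)                  # squared length of the observation vector: 56
length(y) * mean(y)^2     # squared length of the average vector (correction factor): 48
sum((y - mean(y))^2)      # squared length of the residual vector: 8 (and 48 + 8 = 56)
```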

A really good reading to understand the concept of degrees of freedom is Walker [@Walker1940]. The following quote is taken from that paper:
@@ -363,13 +367,17 @@ TSS &= SST + SSE \\[15pt]

The equations above show how each observation and the total variation are partitioned into treatment and residual. The total sum of squares is partitioned into the SS of treatments and the SS of residuals or error ^[We try to use the term "residual" for the calculated values, which are estimates of the unknown true errors. We mention the term "error" because most books use "SSE" to refer to the SS of residuals. The "E" comes from "error."]. Each sum of squares has associated degrees of freedom. By dividing the SS by the df we obtain **mean squares**. The mean square of residuals (MSE) is the best estimate of the common variance for all samples, and it does not depend on treatment means being equal. The mean square of treatments (MST) has an expectation equal to the common variance PLUS a component that is directly proportional to the differences among means. Only when means are equal is the MST another estimate of the error variance. See the optional section on Expected Mean Squares for details.

<br>

|Source | df | Sum of Squares (SS) | Mean Squares (MS) | Calculated F |
|:----------------:|:--------:|:-------------------:|:-----------------:|:------------:|
|Total | k r - 1 | TSS | | |
|Among Treatments | k - 1 | SST | MST | MST / MSE |
|Within Treatments | k (r-1) | SSE | MSE | |

<br>


Sums of squares, mean squares, and the calculated F are all statistics. The values based on one sample are specific realized values of random variables, which have their own expected values, variances, and distributions. If the null hypothesis that all treatment means are the same is true and all assumptions are correct, the calculated F has an F distribution with mean or expectation equal to $dfe/(dfe - 2)$, where dfe is the residual or error degrees of freedom.
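
A simulation sketch of that last claim (settings invented; the null hypothesis is true in every replicate):

```{r}
set.seed(7)
k <- 4; r <- 6
dfe <- k * (r - 1)
Fvals <- replicate(2000, {
  y <- rnorm(k * r, mean = 10, sd = 2)
  anova(lm(y ~ gl(k, r)))[1, "F value"]
})
mean(Fvals)       # close to the expectation below
dfe / (dfe - 2)   # 20/18, about 1.11
```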

```{block, type = 'stattip'}
@@ -1963,7 +1971,7 @@ ssRes <- sum((milk.prot$logProt - milk.prot$AvgLogProt) ^ 2)
```

- Degrees of freedom are calculates as the total number of independent (different) terms in each sum, minus the number of parameters estimated for each sum. In the case of total sum of square, there are as many different terms are there are observations, because each observation is potentially different. Only the overall mean is estimated with the overall average. For the sum of squares of treatments we have as many different terms as there are treatments, and we use the overall average as estimate of the overall mean. Because the total degrees of freedom equal the sum of the components, the residual df can be calculated as the total df (n-1) minus treatment df.
+ Degrees of freedom are calculated as the total number of independent (different) terms in each sum, minus the number of parameters estimated for each sum. In the case of the total sum of squares, there are as many different terms as there are observations, because each observation is potentially different. Only the overall mean is estimated with the overall average. For the sum of squares of treatments we have as many different terms as there are treatments, and we use the overall average as an estimate of the overall mean. Because the total degrees of freedom equal the sum of the components, the residual df can be calculated as the total df (n-1) minus the treatment df.
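
The counts are easy to verify; a sketch with hypothetical sizes (the milk.prot example below computes its own):

```{r}
n <- 24; k <- 4                  # hypothetical: 24 observations, 4 treatments
df.total <- n - 1                # one parameter (the overall mean) is estimated
df.trt   <- k - 1                # k treatment terms, minus the overall mean
(df.res  <- df.total - df.trt)   # residual df by subtraction
```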

```{r}