From 1deda8d1fcb0a0d3e6d0b59a753bca0b682222bf Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:50:24 +0000 Subject: [PATCH 01/26] Update 03-ch3.Rmd --- 03-ch3.Rmd | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/03-ch3.Rmd b/03-ch3.Rmd index cd32564c..1019c870 100644 --- a/03-ch3.Rmd +++ b/03-ch3.Rmd @@ -1019,9 +1019,11 @@ Now that we have computed the statistics of interest for both genders, we can in ```{r, echo=T} # split the dataset by gender -male <- avgs %>% dplyr::filter(a_sex == 1) +male <- avgs %>% + dplyr::filter(a_sex == 1) -female <- avgs %>% dplyr::filter(a_sex == 2) +female <- avgs %>% + dplyr::filter(a_sex == 2) # rename columns of both splits colnames(male) <- c("Sex", "Year", "Y_bar_m", "s_m", "n_m") @@ -1141,16 +1143,32 @@ example4 <- cbind(X, Y) par(mfrow = c(2, 2)) # plot datasets -plot(example1, col = "steelblue", pch = 20, xlab = "X", ylab = "Y", +plot(example1, + col = "steelblue", + pch = 20, + xlab = "X", + ylab = "Y", main = "Correlation = 0.81") -plot(example2, col = "steelblue", pch = 20, xlab = "X", ylab = "Y", +plot(example2, + col = "steelblue", + pch = 20, + xlab = "X", + ylab = "Y", main = "Correlation = -0.81") -plot(example3, col = "steelblue", pch = 20, xlab = "X", ylab = "Y", +plot(example3, + col = "steelblue", + pch = 20, + xlab = "X", + ylab = "Y", main = "Correlation = 0") -plot(example4, col = "steelblue", pch = 20, xlab = "X", ylab = "Y", +plot(example4, + col = "steelblue", + pch = 20, + xlab = "X", + ylab = "Y", main = "Correlation = 0") ``` From a621a99edcb880d0c29d0f575593fb352cccc03f Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:56:00 +0000 Subject: [PATCH 02/26] Update 04-ch4.Rmd --- 04-ch4.Rmd | 56 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/04-ch4.Rmd b/04-ch4.Rmd index 1f7ab88d..1d343844 100644 --- a/04-ch4.Rmd +++ b/04-ch4.Rmd @@ -44,7 +44,8 @@ frame <- data.frame( ) rownames(frame) <- 1:7 -kable(t(frame), "latex", booktabs = T) %>% kable_styling(position = "center") +kable(t(frame), "latex", booktabs = T) %>% + kable_styling(position = "center") ``` To work with these data in `r ttcode("R")` we begin by generating two vectors: one for the student-teacher ratios (`r ttcode("STR")`) and one for test scores (`r ttcode("TestScore")`), both containing the data from the table above. @@ -70,7 +71,9 @@ The following code reproduces Figure 4.1 from the textbook. 
```{r , echo=TRUE, fig.align='center', cache=TRUE} # create a scatterplot of the data -plot(TestScore ~ STR,ylab="Test Score",pch=20) +plot(TestScore ~ STR, + ylab="Test Score", + pch=20) # add the systematic relationship to the plot abline(a = 713, b = -3) @@ -329,7 +332,8 @@ The first argument of the function to be specified is, similar to `r ttcode("plo ```{r} # estimate the model and assign the result to linear_model -linear_model <- lm(score ~ STR, data = CASchools) +linear_model <- lm(score ~ STR, + data = CASchools) # print the standard output of the estimated lm object to the console linear_model @@ -497,10 +501,19 @@ mod_quadratic <- lm( Y ~ X + I(X^2)) prediction <- predict(mod_quadratic, data.frame(X = sort(X))) # plot the results -plot( Y ~ X, col = "black", pch = 20, xlab = "X", ylab = "Y") -abline( mod_simple, col = "blue",lwd=2) +plot( Y ~ X, + col = "black", + pch = 20, + xlab = "X", + ylab = "Y") + +abline( mod_simple, + col = "blue", + lwd=2) #red line = incorrect linear regression (this violates the first OLS assumption) -lines( sort(X), prediction,col="red",lwd=2) +lines( sort(X), + prediction,col="red", + lwd=2) legend("topleft", legend = c("Simple Regression Model", "Quadratic Model"), @@ -586,9 +599,14 @@ fit <- lm(Y ~ X) fitWithoutOutlier <- lm(Y[-9] ~ X[-9]) # plot the results -plot(Y ~ X,pch=20) -abline(fit,lwd=2,col="blue") -abline(fitWithoutOutlier, col = "red",lwd=2) +plot(Y ~ X, + pch=20) +abline(fit, + lwd=2, + col="blue") +abline(fitWithoutOutlier, + col = "red", + lwd=2) legend("topleft", legend = c("Model with Outlier", "Model without Outlier"), @@ -797,7 +815,8 @@ for (j in 1:length(n)) { } # draw density estimates - plot(density(fit[ ,2]), xlim=c(2.5, 4.5), + plot(density(fit[ ,2]), + xlim=c(2.5, 4.5), col = j, main = paste("n=", n[j]), xlab = bquote(hat(beta)[1])) @@ -871,12 +890,21 @@ lm.set1 <- lm(Y ~ X, data = set1) lm.set2 <- lm(Y ~ X, data = set2) # plot observations -plot(set1, xlab = "X", ylab = "Y", pch = 19) -points(set2, col = "steelblue", pch = 19) +plot(set1, + xlab = "X", + ylab = "Y", + pch = 19) +points(set2, + col = "steelblue", + pch = 19) # add both lines to the plot -abline(lm.set1, col = "black",lwd=2) -abline(lm.set2, col = "steelblue",lwd=2) +abline(lm.set1, + col = "black", + lwd=2) +abline(lm.set2, + col = "steelblue", + lwd=2) legend("bottomright", legend = c("Set1", "Set2"), From 75417f61a7be577d714fc3d65ea8d997b2f82fc1 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:25:25 +0000 Subject: [PATCH 03/26] Update index.Rmd --- index.Rmd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/index.Rmd b/index.Rmd index 911eb8ae..1183b6b2 100755 --- a/index.Rmd +++ b/index.Rmd @@ -176,7 +176,9 @@ Here are two simple examples. ```{r, eval = T} # generate the vector `z` -z <- seq(from = 1, to = 5, by = 1) +z <- seq(from = 1, + to = 5, + by = 1) # compute the mean of the entries in `z` mean(z) From 77cf450b425273ba3af43ec039ad8c34c623835d Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:26:12 +0000 Subject: [PATCH 04/26] Update 02-ch2.Rmd --- 02-ch2.Rmd | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/02-ch2.Rmd b/02-ch2.Rmd index 7d68720f..a3c0e718 100644 --- a/02-ch2.Rmd +++ b/02-ch2.Rmd @@ -32,7 +32,8 @@ events, e.g., 'the observed outcome lies between $2$ and $5$'. 
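As a quick aside on the probability statement referenced in the hunk context above (a minimal sketch, not part of the patched chapter; it uses only base `R`): for a fair die, the probability of the event 'the observed outcome lies between $2$ and $5$' is $4/6$, and it can be computed exactly or approximated by simulation.

```{r}
# exact probability of the event 'outcome lies between 2 and 5' for a fair die
probability <- rep(1/6, 6)
sum(probability[2:5])

# approximate the same probability by simulating a large number of rolls
mean(sample(1:6, size = 10000, replace = TRUE) %in% 2:5)
```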
A basic function to draw random samples from a specified set of elements is the function `r ttcode("sample()")`, see `?sample`. We can use it to simulate the random outcome of a dice roll. Let's roll the dice! ```{r, echo = T, eval = T, message = F, warning = F} -sample(1:6, size=1) +sample(1:6, + size=1) ``` The probability distribution of a discrete random variable is the list of all possible values of the variable and their probabilities that sum to $1$. The cumulative probability distribution function gives the probability that the random variable is less than or equal to a particular value. @@ -40,8 +41,13 @@ The probability distribution of a discrete random variable is the list of all po For the dice roll, the probability distribution and the cumulative probability distribution are summarized in Table \@ref(tab:pdist). ```{r pdist, echo=FALSE, purl=FALSE} -pdfdata <- rbind("Outcome"=as.character(1:6), "Probability"=c("1/6","1/6","1/6","1/6","1/6","1/6"), "Cumulative Probability"=c("1/6","2/6","3/6","4/6","5/6","1")) -knitr::kable(pdfdata, format = my_output, caption = "PDF and CDF of a Dice Roll") +pdfdata <- rbind("Outcome" = as.character(1:6), + "Probability" = c("1/6","1/6","1/6","1/6","1/6","1/6"), + "Cumulative Probability" = c("1/6","2/6","3/6","4/6","5/6","1") + ) +knitr::kable(pdfdata, + format = my_output, + caption = "PDF and CDF of a Dice Roll") ``` We can easily plot both functions using `r ttcode("R")`. Since the probability is equal to $1/6$ for each outcome, we can set up the `r ttcode("probability")` vector using the function `r ttcode("rep()")`, which replicates a given value a specified number of times. @@ -114,7 +120,9 @@ observing $4$, $5$, $6$ or $7$ successes for $B(10, 0.5)$. This may be computed ```{r, echo = T, eval = T, message = F, warning = F} # compute P(4 <= k <= 7) using 'dbinom()' -sum(dbinom(x = 4:7, size = 10, prob = 0.5)) +sum(dbinom(x = 4:7, + size = 10, + prob = 0.5)) ``` An alternative approach is to use `r ttcode("pbinom()")`, the distribution function of the binomial distribution to compute $$P(4 \leq k \leq 7) = P(k \leq 7) - P(k\leq3 ).$$ @@ -229,7 +237,9 @@ An example of sampling with replacement is rolling a dice three times in a row. set.seed(1) # rolling a dice three times in a row -sample(1:6, 3, replace = T) +sample(1:6, + 3, + replace = T) ``` Note that every call of `sample(1:6, 3, replace = T)` gives a different outcome since we draw with replacement at random. To allow you to reproduce the results of computations that involve random numbers, we will use `set.seed()` to set R's random number generator to a specific state. You should check that it actually works: set the seed in your R session to 1 and verify that you obtain the same three random numbers! @@ -774,7 +784,9 @@ By definition, the support of both PDF and CDF of an $F_{M,n}$ distributed rando Say we have an $F$ distributed random variable $Y$ with numerator degrees of freedom $3$ and denominator degrees of freedom $14$ and are interested in $P(Y \geq 2)$. This can be computed with the help of the function `r ttcode("pf()")`. By setting the argument `r ttcode("lower.tail")` to `r ttcode("FALSE")` we ensure that `r ttcode("R")` computes $1- P(Y \leq 2)$, i.e, the probability mass in the tail right of $2$. 
```{r, echo = T, eval = T, message = F, warning = F} -pf(2, df1 = 3, df2 = 14, lower.tail = F) +pf(2, + df1 = 3, df2 = 14, + lower.tail = F) ``` We can visualize this probability by drawing a line plot of the related density and adding a color shading with `r ttcode("polygon()")`. @@ -792,7 +804,9 @@ curve(df(x ,3 ,14), main = "Density Function") # draw the polygon -polygon(x, y, col = "orange") +polygon(x, + y, + col = "orange") ``` @@ -837,7 +851,9 @@ In simple random sampling, $n$ objects are drawn at random from a population. Ea What happens if we consider functions of the sample data? Consider the example of rolling a dice two times in a row once again. A sample now consists of two independent random draws from the set $\{1,2,3,4,5,6\}$. It is apparent that any function of these two random variables, e.g. their sum, is also random. Convince yourself by executing the code below several times. ```{r, echo = T, eval = T, message = F, warning = F} -sum(sample(1:6, 2, replace = T)) +sum(sample(1:6, + 2, + replace = T)) ``` Clearly, this sum, let us call it $S$, is a random variable as it depends on randomly drawn summands. For this example, we can completely enumerate all outcomes and hence write down the theoretical probability distribution of our function of the sample data $S$: @@ -897,7 +913,8 @@ So the distribution of $S$ is known. It is also evident that its distribution di ```{r, echo = T, eval = T, message = F, warning = F, fig.align='center'} # divide the plotting area into one row with two columns -par(mfrow = c(1, 2),cex.main=1) +par(mfrow = c(1, 2), + cex.main=1) # plot the distribution of S barplot(PS, @@ -1123,7 +1140,9 @@ set.seed(1) # set number of coin tosses and simulate N <- 30000 -Y <- sample(0:1, N, replace = T) +Y <- sample(0:1, + N, + replace = T) # Calculate R_n for 1:N S <- cumsum(Y) From 0d1e1540e780ecc905cfdf3cea34cdb9cd70381d Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:28:24 +0000 Subject: [PATCH 05/26] Update 03-ch3.Rmd --- 03-ch3.Rmd | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/03-ch3.Rmd b/03-ch3.Rmd index 1019c870..51e54f61 100644 --- a/03-ch3.Rmd +++ b/03-ch3.Rmd @@ -168,11 +168,14 @@ For comparison purposes we store results for the estimator $Y_1$, the first obse pop <- rnorm(10000, 10, 1) # sample from the population and estimate the mean -est1 <- replicate(expr = mean(sample(x = pop, size = 5)), n = 25000) +est1 <- replicate(expr = mean(sample(x = pop, size = 5)), + n = 25000) -est2 <- replicate(expr = mean(sample(x = pop, size = 25)), n = 25000) +est2 <- replicate(expr = mean(sample(x = pop, size = 25)), + n = 25000) -fo <- replicate(expr = sample(x = pop, size = 5)[1], n = 25000) +fo <- replicate(expr = sample(x = pop, size = 5)[1], + n = 25000) ``` Check that `r ttcode("est1")` and `r ttcode("est2")` are vectors of length $25000$: @@ -210,7 +213,8 @@ lines(density(est2), lwd = 2) # add a vertical line at the true parameter -abline(v = 10, lty = 2) +abline(v = 10, + lty = 2) # add N(10,1) density to the plot curve(dnorm(x, mean = 10), @@ -429,7 +433,8 @@ axis(1, padj = 0.75, labels = c(expression(-frac(bar(Y)^"act"~-~bar(mu)["Y,0"], sigma[bar(Y)])), 0, - expression(frac(bar(Y)^"act"~-~bar(mu)["Y,0"], sigma[bar(Y)])))) + expression(frac(bar(Y)^"act"~-~bar(mu)["Y,0"], sigma[bar(Y)]))) + ) # shade p-value/2 region in left tail polygon(x = c(-6, seq(-6, -1.5, 0.01), -1.5), @@ -1040,7 +1045,8 @@ gap_ci_l <- gap - 1.96 * gap_se gap_ci_u <- gap + 1.96 * 
gap_se -result <- cbind(male[,-1], female[,-(1:2)], gap, gap_se, gap_ci_l, gap_ci_u) +result <- cbind(male[,-1], female[,-(1:2)], + gap, gap_se, gap_ci_l, gap_ci_u) # print the results to the console print(result, digits = 3) From 6aa85c94a2094f4c2a9d5fef7d439e6db9601db7 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:29:51 +0000 Subject: [PATCH 06/26] Update 04-ch4.Rmd --- 04-ch4.Rmd | 61 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/04-ch4.Rmd b/04-ch4.Rmd index 1d343844..d82aca2f 100644 --- a/04-ch4.Rmd +++ b/04-ch4.Rmd @@ -30,8 +30,9 @@ frame <- data.frame( ) rownames(frame) <- 1:7 -t(frame) %>% kable("latex", booktabs = T) %>% -kable_styling(latex_options = "striped") +t(frame) %>% + kable("latex", booktabs = T) %>% + kable_styling(latex_options = "striped") ``` ```{r, echo=F, purl=F,, eval=my_output=="latex"} @@ -76,7 +77,8 @@ plot(TestScore ~ STR, pch=20) # add the systematic relationship to the plot -abline(a = 713, b = -3) +abline(a = 713, + b = -3) ``` @@ -485,8 +487,11 @@ After generating the data we estimate both a simple regression model and a quadr set.seed(321) # simulate the data -X <- runif(50, min = -5, max = 5) -u <- rnorm(50, sd = 1) +X <- runif(50, + min = -5, + max = 5) +u <- rnorm(50, + sd = 1) # the true relation Y <- X^2 + 2 * X + u @@ -498,7 +503,8 @@ mod_simple <- lm(Y ~ X) mod_quadratic <- lm( Y ~ X + I(X^2)) # predict using a quadratic model -prediction <- predict(mod_quadratic, data.frame(X = sort(X))) +prediction <- predict(mod_quadratic, + data.frame(X = sort(X))) # plot the results plot( Y ~ X, @@ -512,7 +518,8 @@ abline( mod_simple, lwd=2) #red line = incorrect linear regression (this violates the first OLS assumption) lines( sort(X), - prediction,col="red", + prediction, + col="red", lwd=2) legend("topleft", legend = c("Simple Regression Model", @@ -542,7 +549,9 @@ $$ employment_t = -50 + 0.98 \cdot employment_{t-1} + u_t. $$ set.seed(123) # generate a date vector -Date <- seq(as.Date("1951/1/1"), as.Date("2000/1/1"), "years") +Date <- seq(as.Date("1951/1/1"), + as.Date("2000/1/1"), + "years") # initialize the employment vector X <- c(5000, rep(NA, length(Date)-1)) @@ -588,8 +597,12 @@ The following code roughly reproduces what is shown in figure 4.5 in the book. A set.seed(123) # generate the data -X <- sort(runif(10, min = 30, max = 70)) -Y <- rnorm(10 , mean = 200, sd = 50) +X <- sort(runif(10, + min = 30, + max = 70)) +Y <- rnorm(10, + mean = 200, + sd = 50) Y[9] <- 2000 # fit model with outlier @@ -685,8 +698,11 @@ Finally, we store the results in a data.frame. 
```{r} # simulate data N <- 100000 -X <- runif(N, min = 0, max = 20) -u <- rnorm(N, sd = 10) +X <- runif(N, + min = 0, + max = 20) +u <- rnorm(N, + sd = 10) # population regression Y <- -2 + 3.5 * X + u @@ -729,7 +745,8 @@ n <- 100 reps <- 10000 # initialize the matrix of outcomes -fit <- matrix(ncol = 2, nrow = reps) +fit <- matrix(ncol = 2, + nrow = reps) # loop sampling and estimation of the coefficients for (i in 1:reps){ @@ -761,7 +778,8 @@ curve(dnorm(x, -2, sqrt(var_b0)), add = T, - col = "darkred",lwd=2) + col = "darkred", + lwd=2) # plot histograms of beta_hat_1 hist(fit[, 2], @@ -775,7 +793,8 @@ curve(dnorm(x, 3.5, sqrt(var_b1)), add = T, - col = "darkred",lwd=2) + col = "darkred", + lwd=2) ``` @@ -873,8 +892,7 @@ points(set2, col = "steelblue", pch = 19) legend("topleft", - legend = c("Set1", - "Set2"), + legend = c("Set1", "Set2"), cex = 1, pch = 19, col = c("black","steelblue")) @@ -886,8 +904,10 @@ It is clear that observations that are close to the sample average of the $X_i$ ```{r, fig.align='center'} # estimate both regression lines -lm.set1 <- lm(Y ~ X, data = set1) -lm.set2 <- lm(Y ~ X, data = set2) +lm.set1 <- lm(Y ~ X, + data = set1) +lm.set2 <- lm(Y ~ X, + data = set2) # plot observations plot(set1, @@ -906,8 +926,7 @@ abline(lm.set2, col = "steelblue", lwd=2) legend("bottomright", - legend = c("Set1", - "Set2"), + legend = c("Set1", "Set2"), cex = 1, lwd=2, col = c("black","steelblue")) From 5351f2ac3ac91448267bc5c012943a3089c91c45 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:31:22 +0000 Subject: [PATCH 07/26] Update 05-ch5.Rmd --- 05-ch5.Rmd | 131 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 92 insertions(+), 39 deletions(-) diff --git a/05-ch5.Rmd b/05-ch5.Rmd index ea3a0470..fd53a09d 100644 --- a/05-ch5.Rmd +++ b/05-ch5.Rmd @@ -78,21 +78,27 @@ Recall the definition of the $p$-value: ```{r, eval = my_output == "latex", results='asis', echo=F, purl=F} cat('\\begin{keyconcepts}[Testing Hypotheses regarding $\\beta_1$]{5.2} For testing the hypothesis $H_0: \\beta_1 = \\beta_{1,0}$, we need to perform the following steps:\\newline + \\begin{enumerate} \\item Compute the standard error of $\\hat{\\beta}_1$, $SE(\\hat{\\beta}_1)$ -\\begin{align*} -SE(\\hat{\\beta}_1) = \\sqrt{ \\hat{\\sigma}^2_{\\hat{\\beta}_1} }, \\quad + +\\[ SE(\\hat{\\beta}_1) = \\sqrt{ \\hat{\\sigma}^2_{\\hat{\\beta}_1} } \\ \\ , \\ \\ \\hat{\\sigma}^2_{\\hat{\\beta}_1} = \\frac{1}{n} \\times \\frac{\\frac{1}{n-2} \\sum_{i=1}^n (X_i - \\overline{X})^2 \\hat{u_i}^2 }{ \\left[ \\frac{1}{n} \\sum_{i=1}^n (X_i - \\overline{X})^2 \\right]^2}. -\\end{align*} +\\] + \\item Compute the $t$-statistic + \\[ t = \\frac{\\hat{\\beta}_1 - \\beta_{1,0}}{ SE(\\hat{\\beta}_1) }. \\] + \\item Given a two sided alternative ($H_1:\\beta_1 \\neq \\beta_{1,0}$) we reject at the $5\\%$ level if $|t^{act}| > 1.96$ or, equivalently, if the $p$-value is less than $0.05$.\\newline Recall the definition of the $p$-value: -\\begin{align*} - p \\text{-value} =& \\, \\text{Pr}_{H_0} \\left[ \\left| \\frac{ \\hat{\\beta}_1 - \\beta_{1,0} }{ SE(\\hat{\\beta}_1) } \\right| > \\left| \\frac{ \\hat{\\beta}_1^{act} - \\beta_{1,0} }{ SE(\\hat{\\beta}_1) } \\right| \\right] \\\\ - =& \\, \\text{Pr}_{H_0} (|t| > |t^{act}|) \\\\ - =& \\, 2 \\cdot \\Phi(-|t^{act}|). 
-\\end{align*} + + \\begin{align*} + p \\text{-value} =& \\, \\text{Pr}_{H_0} \\left[ \\left| \\frac{ \\hat{\\beta}_1 - \\beta_{1,0} }{ SE(\\hat{\\beta}_1) } \\right| > \\left| \\frac{ \\hat{\\beta}_1^{act} - \\beta_{1,0} }{ SE(\\hat{\\beta}_1) } \\right| \\right] \\\\ + =& \\, \\text{Pr}_{H_0} (|t| > |t^{act}|) \\\\ + =& \\, 2 \\cdot \\Phi(-|t^{act}|). + \\end{align*} + The last transformation is due to the normal approximation for large samples. \\end{enumerate} \\end{keyconcepts} @@ -116,7 +122,8 @@ CASchools$STR <- CASchools$students/CASchools$teachers CASchools$score <- (CASchools$read + CASchools$math)/2 # estimate the model -linear_model <- lm(score ~ STR, data = CASchools) +linear_model <- lm(score ~ STR, + data = CASchools) ``` For testing a hypothesis concerning the slope parameter (the coefficient on $STR$), we need $SE(\hat{\beta}_1)$, the standard error of the respective point estimator. As is common in the literature, standard errors are presented in parentheses below the point estimates. @@ -191,7 +198,9 @@ plot(x = t, tact <- -4.75 -axis(1, at = c(0, -1.96, 1.96, -tact, tact), cex.axis = 0.7) +axis(1, + at = c(0, -1.96, 1.96, -tact, tact), + cex.axis = 0.7) # Shade the critical regions using polygon(): @@ -207,11 +216,15 @@ polygon(x = c(1.96, seq(1.96, 6, 0.01), 6), col = 'orange') # Add arrows and texts indicating critical regions and the p-value -arrows(-3.5, 0.2, -2.5, 0.02, length = 0.1) -arrows(3.5, 0.2, 2.5, 0.02, length = 0.1) +arrows(-3.5, 0.2, -2.5, 0.02, + length = 0.1) +arrows(3.5, 0.2, 2.5, 0.02, + length = 0.1) -arrows(-5, 0.16, -4.75, 0, length = 0.1) -arrows(5, 0.16, 4.75, 0, length = 0.1) +arrows(-5, 0.16, -4.75, 0, + length = 0.1) +arrows(5, 0.16, 4.75, 0, + length = 0.1) text(-3.5, 0.22, labels = expression("0.025"~"="~over(alpha, 2)), @@ -228,8 +241,14 @@ text(5, 0.18, cex = 0.7) # Add ticks indicating critical values at the 0.05-level, t^act and -t^act -rug(c(-1.96, 1.96), ticksize = 0.145, lwd = 2, col = "darkred") -rug(c(-tact, tact), ticksize = -0.0451, lwd = 2, col = "darkgreen") +rug(c(-1.96, 1.96), + ticksize = 0.145, + lwd = 2, + col = "darkred") +rug(c(-tact, tact), + ticksize = -0.0451, + lwd = 2, + col = "darkgreen") ``` @@ -310,7 +329,8 @@ CI^{\mu}_{0.95} = \left[\hat\mu - 1.96 \times \frac{5}{\sqrt{100}} \ , \ \hat\mu It is fairly easy to compute this interval in `r ttcode('R')` by hand. The following code chunk generates a named vector containing the interval bounds: ```{r} -cbind(CIlower = mean(Y) - 1.96 * 5 / 10, CIupper = mean(Y) + 1.96 * 5 / 10) +cbind(CIlower = mean(Y) - 1.96 * 5 / 10, + CIupper = mean(Y) + 1.96 * 5 / 10) ``` Knowing that $\mu = 5$ we see that, for our example data, the confidence interval covers the true value. 
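One natural extension of this hand computation (a sketch, assuming the simulated vector `Y` from the chunk above is still in scope): if the population standard deviation were unknown, one would replace $\sigma = 5$ by the sample standard deviation and use a quantile of the $t$ distribution instead of $1.96$.

```{r}
# sketch: the same interval with an estimated standard deviation and a t quantile
n <- length(Y)
cbind(CIlower = mean(Y) - qt(0.975, df = n - 1) * sd(Y) / sqrt(n),
      CIupper = mean(Y) + qt(0.975, df = n - 1) * sd(Y) / sqrt(n))
```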
@@ -334,7 +354,9 @@ upper <- numeric(10000) # loop sampling / estimation / CI for(i in 1:10000) { - Y <- rnorm(100, mean = 5, sd = 5) + Y <- rnorm(100, + mean = 5, + sd = 5) lower[i] <- mean(Y) - 1.96 * 5 / 10 upper[i] <- mean(Y) + 1.96 * 5 / 10 @@ -449,8 +471,14 @@ plot( CASchools$score ~ CASchools$D, # provide the data to be plotted main = "Dummy Regression") # Add the average for each group -points( y = mean.score.for.D.0, x = 0, col="red", pch = 19) -points( y = mean.score.for.D.1, x = 1, col="red", pch = 19) +points( y = mean.score.for.D.0, + x = 0, + col="red", + pch = 19) +points( y = mean.score.for.D.1, + x = 1, + col="red", + pch = 19) ``` @@ -461,15 +489,23 @@ points( y = mean.score.for.D.1, x = 1, col="red", pch = 19) CASchools$D <- CASchools$STR < 20 # estimte the dummy regression -dummy_model <- lm(score ~ D, data = CASchools) +dummy_model <- lm(score ~ D, + data = CASchools) # Plot the data -plot(CASchools$D, CASchools$score, - pch = 20, cex = 0.5 , col = "Steelblue", - xlab = expression(D[i]), ylab = "Test Score", +plot(CASchools$D, + CASchools$score, + pch = 20, + cex = 0.5 , + col = "Steelblue", + xlab = expression(D[i]), + ylab = "Test Score", main = "Dummy Regression") # -points(CASchools$D, predict(dummy_model), col = "red", pch = 20) +points(CASchools$D, + predict(dummy_model), + col = "red", + pch = 20) ``` @@ -486,7 +522,8 @@ We will now use `r ttcode('R')` to estimate the dummy regression model as define ```{r} # estimate the dummy regression model -dummy_model <- lm(score ~ D, data = CASchools) +dummy_model <- lm(score ~ D, + data = CASchools) summary(dummy_model) ``` @@ -571,16 +608,21 @@ library(scales) set.seed(123) # set up vector of x coordinates -x <- rep(c(10, 15, 20, 25), each = 25) +x <- rep(c(10, 15, 20, 25), + each = 25) # initialize vector of errors e <- c() # sample 100 errors such that the variance increases with x -e[1:25] <- rnorm(25, sd = 10) -e[26:50] <- rnorm(25, sd = 15) -e[51:75] <- rnorm(25, sd = 20) -e[76:100] <- rnorm(25, sd = 25) +e[1:25] <- rnorm(25, + sd = 10) +e[26:50] <- rnorm(25, + sd = 15) +e[51:75] <- rnorm(25, + sd = 20) +e[76:100] <- rnorm(25, + sd = 25) # set up y y <- 720 - 3.3 * x + e @@ -600,7 +642,8 @@ plot(x = x, ylim = c(600, 710)) # Add the regression line to the plot -abline(mod, col = "darkred") +abline(mod, + col = "darkred") # Add boxplots to the plot boxplot(formula = y ~ x, @@ -730,7 +773,8 @@ Let us now compute robust standard error estimates for the coefficients in `r tt ```{r} # compute heteroskedasticity-robust standard errors -vcov <- vcovHC(linear_model, type = "HC1") +vcov <- vcovHC(linear_model, + type = "HC1") vcov ``` @@ -765,7 +809,8 @@ Now assume we want to generate a coefficient summary as provided by `r ttcode(' ```{r} # we invoke the function `coeftest()` on our model -coeftest(linear_model, vcov. = vcov) +coeftest(linear_model, + vcov. = vcov) ``` We see that the values reported in the column `r ttcode('Std.Error')` are equal those from `r ttcode('sqrt(diag(vcov))')`. @@ -781,7 +826,9 @@ set.seed(905) # generate heteroskedastic data X <- 1:500 -Y <- rnorm(n = 500, mean = X, sd = 0.6 * X) +Y <- rnorm(n = 500, + mean = X, + sd = 0.6 * X) # estimate a simple regression model reg <- lm(Y ~ X) @@ -792,7 +839,8 @@ We plot the data and add the regression line. 
```{r, fig.align='center'} # plot the data -plot(x = X, y = Y, +plot(x = X, + y = Y, pch = 19, col = "steelblue", cex = 0.8) @@ -801,7 +849,10 @@ plot(x = X, y = Y, abline(reg, col = "red", lwd = 1.5) -legend("topleft","Regression line",col="red",lwd=1.5) +legend("topleft", + "Regression line", + col="red", + lwd=1.5) ``` @@ -857,7 +908,8 @@ for (i in 1:10000) { } # compute the fraction of false rejections -round(cbind(t = mean(t), t.rob = mean(t.rob)), 3) +round(cbind(t = mean(t), + t.rob = mean(t.rob)), 3) ``` These results reveal the increased risk of falsely rejecting the null using the homoskedasticity-only standard error for the testing problem at hand: with the common standard error, $`r round(sum(t)/length(t)*100,3)`\%$ of all tests falsely reject the null hypothesis. In contrast, with the robust test statistic we are closer to the nominal level of $5\%$. @@ -958,7 +1010,8 @@ lines(density(weightedestimator), lwd = 2) # add a dashed line at 0 and add a legend to the plot -abline(v = 0, lty = 2) +abline(v = 0, + lty = 2) legend('topright', c("OLS", "Weighted"), From d4fd30da15d258ab96e68ffb4b274b8af82dd95f Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:32:12 +0000 Subject: [PATCH 08/26] Update 06-ch6.Rmd --- 06-ch6.Rmd | 61 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/06-ch6.Rmd b/06-ch6.Rmd index 3056193c..a88c94d5 100644 --- a/06-ch6.Rmd +++ b/06-ch6.Rmd @@ -94,8 +94,10 @@ Let us estimate both regression models and compare. Performing a multiple regres ```{r} # estimate both regression models -mod <- lm(score ~ STR, data = CASchools) -mult.mod <- lm(score ~ STR + english, data = CASchools) +mod <- lm(score ~ STR, + data = CASchools) +mult.mod <- lm(score ~ STR + english, + data = CASchools) # print the results to the console mod @@ -186,13 +188,20 @@ df <- data.frame(y, x1, x2) reg <- lm(y ~ x1 + x2) cf.mod <- coef(reg) -x1.seq <- seq(min(x1),max(x1),length.out=25) -x2.seq <- seq(min(x2),max(x2),length.out=25) -z <- t(outer(x1.seq, x2.seq, function(x,y) cf.mod[1] + cf.mod[2]*x + cf.mod[3]*y)) +x1.seq <- seq(min(x1), + max(x1), + length.out=25) +x2.seq <- seq(min(x2), + max(x2), + length.out=25) +z <- t(outer(x1.seq, + x2.seq, + function(x,y) cf.mod[1] + cf.mod[2]*x + cf.mod[3]*y)) rbPal <- colorRampPalette(c('red','blue')) -cols <- rbPal(10)[as.numeric(cut(abs(y-reg$fitted.values), breaks = 10))] +cols <- rbPal(10)[as.numeric(cut(abs(y-reg$fitted.values), + breaks = 10))] m <- list( t = 5 @@ -356,7 +365,8 @@ How does `r ttcode("R")` react if we try to estimate a model with perfectly corr CASchools$FracEL <- CASchools$english / 100 # estimate the model -mult.mod <- lm(score ~ STR + english + FracEL, data = CASchools) +mult.mod <- lm(score ~ STR + english + FracEL, + data = CASchools) # obtain a summary of the model summary(mult.mod) @@ -385,7 +395,8 @@ We add the corresponding column to `r ttcode("CASchools")` and estimate a multip CASchools$NS <- ifelse(CASchools$STR < 12, 0, 1) # estimate the model -mult.mod <- lm(score ~ computer + english + NS, data = CASchools) +mult.mod <- lm(score ~ computer + english + NS, + data = CASchools) # obtain a model summary summary(mult.mod) @@ -447,7 +458,7 @@ since then for all observations $i=1,\dots,n$ the constant term is a linear comb \begin{align} intercept = \, & \lambda_1 \cdot (North + West + South + East) \\ - \begin{pmatrix} 1 \\ \vdots \\ 1\end{pmatrix} = \, & \lambda_1 \cdot \begin{pmatrix} 1 \\ \vdots 
\\ 1\end{pmatrix} \\ \Leftrightarrow \, & \lambda_1 = 1 + \begin{pmatrix} 1 \\ \vdots \\ 1\end{pmatrix} = \, & \lambda_1 \cdot \begin{pmatrix} 1 \vdots \\ 1\end{pmatrix} \\ \Leftrightarrow \, & \lambda_1 = 1 \end{align} and we have perfect multicollinearity. Thus the "dummy variable trap" means not paying attention and falsely including exhaustive dummies *and* a constant in a regression model. @@ -464,7 +475,8 @@ CASchools$direction <- sample(c("West", "North", "South", "East"), replace = T) # estimate the model -mult.mod <- lm(score ~ STR + english + direction, data = CASchools) +mult.mod <- lm(score ~ STR + english + direction, + data = CASchools) # obtain a model summary summary(mult.mod) @@ -487,7 +499,8 @@ Let us do this in `r ttcode("R")`. CASchools$PctES <- 100 - CASchools$english # estimate the model -mult.mod <- lm(score ~ STR + english + PctES, data = CASchools) +mult.mod <- lm(score ~ STR + english + PctES, + data = CASchools) # obtain a model summary summary(mult.mod) @@ -536,7 +549,8 @@ library(mvtnorm) n <- 50 # initialize vectors of coefficients -coefs1 <- cbind("hat_beta_1" = numeric(10000), "hat_beta_2" = numeric(10000)) +coefs1 <- cbind("hat_beta_1" = numeric(10000), + "hat_beta_2" = numeric(10000)) coefs2 <- coefs1 # set seed @@ -546,13 +560,15 @@ set.seed(1) for (i in 1:10000) { # for cov(X_1,X_2) = 0.25 - X <- rmvnorm(n, c(0, 0), sigma = cbind(c(10, 2.5), c(2.5, 10))) + X <- rmvnorm(n, c(0, 0), + sigma = cbind(c(10, 2.5), c(2.5, 10))) u <- rnorm(n, sd = 5) Y <- 5 + 2.5 * X[, 1] + 3 * X[, 2] + u coefs1[i, ] <- lm(Y ~ X[, 1] + X[, 2])$coefficients[-1] # for cov(X_1,X_2) = 0.85 - X <- rmvnorm(n, c(0, 0), sigma = cbind(c(10, 8.5), c(8.5, 10))) + X <- rmvnorm(n, c(0, 0), + sigma = cbind(c(10, 8.5), c(8.5, 10))) Y <- 5 + 2.5 * X[, 1] + 3 * X[, 2] + u coefs2[i, ] <- lm(Y ~ X[, 1] + X[, 2])$coefficients[-1] @@ -611,7 +627,8 @@ library(mvtnorm) n <- 50 # initialize vector of coefficients -coefs <- cbind("hat_beta_1" = numeric(10000), "hat_beta_2" = numeric(10000)) +coefs <- cbind("hat_beta_1" = numeric(10000), + "hat_beta_2" = numeric(10000)) # set seed for reproducibility set.seed(1) @@ -619,7 +636,8 @@ set.seed(1) # loop sampling and estimation for (i in 1:10000) { - X <- rmvnorm(n, c(50, 100), sigma = cbind(c(10, 2.5), c(2.5, 10))) + X <- rmvnorm(n, c(50, 100), + sigma = cbind(c(10, 2.5), c(2.5, 10))) u <- rnorm(n, sd = 5) Y <- 5 + 2.5 * X[, 1] + 3 * X[, 2] + u coefs[i,] <- lm(Y ~ X[, 1] + X[, 2])$coefficients[-1] @@ -653,10 +671,15 @@ Where does this correlation come from? Notice that, due to the way we generated if(knitr::opts_knit$get("rmarkdown.pandoc.to") == "html") { library(plotly) -kde <- kde2d(coefs[, 1], coefs[, 2], n = 100) +kde <- kde2d(coefs[, 1], + coefs[, 2], + n = 100) -p <- plot_ly(x = kde$x, y = kde$y, z = kde$z, - type = "surface", showscale = FALSE) +p <- plot_ly(x = kde$x, + y = kde$y, + z = kde$z, + type = "surface", + showscale = FALSE) p %>% layout(scene = list(zaxis = list(title = "Est. 
Density" ), From d903b9511062f17243eb93e1c58d5d6358de0c2f Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:32:52 +0000 Subject: [PATCH 09/26] Update 07-ch7.Rmd --- 07-ch7.Rmd | 98 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 31 deletions(-) diff --git a/07-ch7.Rmd b/07-ch7.Rmd index f75e57e7..a0f0d109 100644 --- a/07-ch7.Rmd +++ b/07-ch7.Rmd @@ -62,8 +62,11 @@ data(CASchools) CASchools$STR <- CASchools$students/CASchools$teachers CASchools$score <- (CASchools$read + CASchools$math)/2 -model <- lm(score ~ STR + english, data = CASchools) -coeftest(model, vcov. = vcovHC, type = "HC1") +model <- lm(score ~ STR + english, + data = CASchools) +coeftest(model, + vcov. = vcovHC, + type = "HC1") ``` You can verify that these quantities are computed as in the simple regression model by manually calculating the $t$-statistics or $p$-values using the provided output above and using `r ttcode("R")` as a calculator. @@ -72,8 +75,10 @@ For example, using the definition of the $p$-value for a two-sided test as given ```{r, warning=F, message=F} # compute two-sided p-value -2 * (1 - pt(abs(coeftest(model, vcov. = vcovHC, type = "HC1")[2, 3]), - df = model$df.residual)) +2 * (1 - pt(abs(coeftest(model, + vcov. = vcovHC, type = "HC1")[2, 3]), + df = model$df.residual) + ) ``` ```{r, eval = my_output == "html", results='asis', echo=F, purl=F} @@ -104,7 +109,8 @@ Let us take a look at the regression from Section \@ref(mofimr) again. Computing confidence intervals for individual coefficients in the multiple regression model proceeds as in the simple regression model using the function `r ttcode("confint()")`. ```{r, warning=F, message=F} -model <- lm(score ~ STR + english, data = CASchools) +model <- lm(score ~ STR + english, + data = CASchools) confint(model) ``` @@ -120,7 +126,8 @@ One drawback of using `r ttcode("confint()")` is that it doesn't utilize robust ```{r, warning=F, message=F} # compute robust standard errors -rob_se <- diag(vcovHC(model, type = "HC1"))^0.5 +rob_se <- diag(vcovHC(model, + type = "HC1"))^0.5 # compute robust 95% confidence intervals rbind("lower" = coef(model) - qnorm(0.975) * rob_se, @@ -165,8 +172,11 @@ CASchools$score <- (CASchools$read + CASchools$math)/2 CASchools$expenditure <- CASchools$expenditure/1000 # estimate the model -model <- lm(score ~ STR + english + expenditure, data = CASchools) -coeftest(model, vcov. = vcovHC, type = "HC1") +model <- lm(score ~ STR + english + expenditure, + data = CASchools) +coeftest(model, + vcov. = vcovHC, + type = "HC1") ``` The estimated effect of a one unit change in the student-teacher ratio on test scores with expenditure and the share of english learning pupils held constant is $-0.29$, which is rather small. What is more, the coefficient on $STR$ is not significantly different from zero anymore even at $10\%$ since $p\text{-value}=0.55$. Can you come up with an interpretation for these findings (see Chapter 7.1 of the book)? The insignificance of $\hat\beta_1$ could be due to a larger standard error of $\hat{\beta}_1$ resulting from adding $expenditure$ to the model so that we estimate the coefficient on $size$ less precisely. This illustrates the issue of strongly correlated regressors (imperfect multicollinearity). The correlation between $STR$ and $expenditure$ can be computed using `r ttcode("cor()")`. @@ -196,11 +206,13 @@ It is fairly easy to conduct $F$-tests in `r ttcode("R")`. 
We can use the functi ```{r, warning=F, message=F} # estimate the multiple regression model -model <- lm(score ~ STR + english + expenditure, data = CASchools) +model <- lm(score ~ STR + english + expenditure, + data = CASchools) # execute the function on the model object and provide both linear restrictions # to be tested as strings -linearHypothesis(model, c("STR=0", "expenditure=0")) +linearHypothesis(model, + c("STR=0", "expenditure=0")) ``` The output reveals that the $F$-statistic for this joint hypothesis test is about $8.01$ and the corresponding $p$-value is $0.0004$. Thus, we can reject the null hypothesis that both coefficients are zero at any level of significance commonly used in practice. @@ -209,7 +221,9 @@ A heteroskedasticity-robust version of this $F$-test (which leads to the same co ```{r, warning=F, message=F} # heteroskedasticity-robust F-test -linearHypothesis(model, c("STR=0", "expenditure=0"), white.adjust = "hc1") +linearHypothesis(model, + c("STR=0", "expenditure=0"), + white.adjust = "hc1") ``` The standard output of a model summary also reports an $F$-statistic and the corresponding $p$-value. The null hypothesis belonging to this $F$-test is that *all* of the population coefficients in the model except for the intercept are zero, so the hypotheses are $$H_0: \beta_1=0, \ \beta_2 =0, \ \beta_3 =0 \quad \text{vs.} \quad H_1: \beta_j \neq 0 \ \text{for at least one} \ j=1,2,3.$$ @@ -221,7 +235,8 @@ We now check whether the $F$-statistic belonging to the $p$-value listed in the ```{r, warning=F, message=F} # execute the function on the model object and provide the restrictions # to be tested as a character vector -linearHypothesis(model, c("STR=0", "english=0", "expenditure=0")) +linearHypothesis(model, + c("STR=0", "english=0", "expenditure=0")) # Access the overall F-statistic from the model's summary summary(model)$fstatistic @@ -239,7 +254,9 @@ We now plot the $95\%$ confidence ellipse for the coefficients on `r ttcode("STR ```{r, echo=2:3, fig.align = 'center', warning=F, message=F} -model <- lm(score ~ STR + english + expenditure, data = CASchools) +model <- lm(score ~ STR + english + expenditure, + data = CASchools) + # draw the 95% confidence set for coefficients on STR and expenditure confidenceEllipse(model, fill = T, @@ -325,8 +342,11 @@ CASchools$STR <- CASchools$students/CASchools$teachers CASchools$score <- (CASchools$read + CASchools$math)/2 # estimate the model and print the summary to console -model <- lm(score ~ STR + english + lunch, data = CASchools) -coeftest(model, vcov. = vcovHC, type = "HC1") +model <- lm(score ~ STR + english + lunch, + data = CASchools) +coeftest(model, + vcov. = vcovHC, + type = "HC1") ``` Thus, the estimated regression line is @@ -405,7 +425,8 @@ plot(CASchools$PLS, col = "steelblue") # regress test score on PLS -summary(lm(score ~ PLS, data = CASchools)) +summary(lm(score ~ PLS, + data = CASchools)) ``` @@ -500,11 +521,16 @@ The best way to communicate regression results is in a table. 
The `r ttcode("sta library(stargazer) # estimate different model specifications -spec1 <- lm(score ~ STR, data = CASchools) -spec2 <- lm(score ~ STR + english, data = CASchools) -spec3 <- lm(score ~ STR + english + lunch, data = CASchools) -spec4 <- lm(score ~ STR + english + calworks, data = CASchools) -spec5 <- lm(score ~ STR + english + lunch + calworks, data = CASchools) +spec1 <- lm(score ~ STR, + data = CASchools) +spec2 <- lm(score ~ STR + english, + data = CASchools) +spec3 <- lm(score ~ STR + english + lunch, + data = CASchools) +spec4 <- lm(score ~ STR + english + calworks, + data = CASchools) +spec5 <- lm(score ~ STR + english + lunch + calworks, + data = CASchools) # gather robust standard errors in a list rob_se <- list(sqrt(diag(vcovHC(spec1, type = "HC1"))), @@ -525,11 +551,16 @@ stargazer(spec1, spec2, spec3, spec4, spec5, ```{r, results='asis', echo=F, cache=T, message=FALSE, warning=FALSE, eval=my_output == "html"} library(stargazer) -spec1 <- lm(score ~ STR, data = CASchools) -spec2 <- lm(score ~ STR + english, data = CASchools) -spec3 <- lm(score ~ STR + english + lunch, data = CASchools) -spec4 <- lm(score ~ STR + english + calworks, data = CASchools) -spec5 <- lm(score ~ STR + english + lunch + calworks, data = CASchools) +spec1 <- lm(score ~ STR, + data = CASchools) +spec2 <- lm(score ~ STR + english, + data = CASchools) +spec3 <- lm(score ~ STR + english + lunch, + data = CASchools) +spec4 <- lm(score ~ STR + english + calworks, + data = CASchools) +spec5 <- lm(score ~ STR + english + lunch + calworks, + data = CASchools) # gather robust standard errors in a list rob_se <- list( @@ -559,11 +590,16 @@ stargazer_html_title("Regressions of Test Scores on the Student-Teacher Ratio an ```{r, results='asis', echo=F, cache=T, message=FALSE, warning=FALSE, eval=my_output == "latex"} library(stargazer) -spec1 <- lm(score ~ STR, data = CASchools) -spec2 <- lm(score ~ STR + english, data = CASchools) -spec3 <- lm(score ~ STR + english + lunch, data = CASchools) -spec4 <- lm(score ~ STR + english + calworks, data = CASchools) -spec5 <- lm(score ~ STR + english + lunch + calworks, data = CASchools) +spec1 <- lm(score ~ STR, + data = CASchools) +spec2 <- lm(score ~ STR + english, + data = CASchools) +spec3 <- lm(score ~ STR + english + lunch, + data = CASchools) +spec4 <- lm(score ~ STR + english + calworks, + data = CASchools) +spec5 <- lm(score ~ STR + english + lunch + calworks, + data = CASchools) # gather robust standard errors in a list rob_se <- list( From 0f8c7f6e9f33d01e9f9707e3f02d84514e4a8d88 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:33:41 +0000 Subject: [PATCH 10/26] Update 08-ch8.Rmd --- 08-ch8.Rmd | 123 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 85 insertions(+), 38 deletions(-) diff --git a/08-ch8.Rmd b/08-ch8.Rmd index 06b21adb..393a8dd3 100644 --- a/08-ch8.Rmd +++ b/08-ch8.Rmd @@ -32,7 +32,8 @@ Here, income and test scores are positively related: school districts with above ```{r fig.align = 'center'} # fit a simple linear model -linear_model<- lm(score ~ income, data = CASchools) +linear_model<- lm(score ~ income, + data = CASchools) # plot the observations plot(CASchools$income, CASchools$score, @@ -47,7 +48,10 @@ plot(CASchools$income, CASchools$score, abline(linear_model, col = "red", lwd = 2) -legend("bottomright", legend="linear fit",lwd=2,col="red") +legend("bottomright", + legend="linear fit", + lwd=2, + col="red") ``` @@ -59,10 +63,13 @@ $$TestScore_i = \beta_0 + 
\beta_1 \times income_i + \beta_2 \times income_i^2 + ```{r} # fit the quadratic Model -quadratic_model <- lm(score ~ income + I(income^2), data = CASchools) +quadratic_model <- lm(score ~ income + I(income^2), + data = CASchools) # obtain the model summary -coeftest(quadratic_model, vcov. = vcovHC, type = "HC1") +coeftest(quadratic_model, + vcov. = vcovHC, + type = "HC1") ``` The output tells us that the estimated regression function is @@ -97,8 +104,10 @@ lines(x = CASchools$income[order_id], y = fitted(quadratic_model)[order_id], col = "red", lwd = 2) -legend("bottomright",legend=c("Linear Line","Quadratic Line"), - lwd=2,col=c("green","red")) +legend("bottomright", + legend=c("Linear Line","Quadratic Line"), + lwd=2, + col=c("green","red")) ``` @@ -115,7 +124,8 @@ A cubic model for instance can be estimated in the same way as the quadratic mod ```{r cubic} # estimate a cubic model -cubic_model <- lm(score ~ poly(income, degree = 3, raw = TRUE), data = CASchools) +cubic_model <- lm(score ~ poly(income, degree = 3, raw = TRUE), + data = CASchools) ``` `r ttcode("poly()")` generates orthogonal polynomials which are orthogonal to the constant by default. Here, we set `r ttcode("raw = TRUE")` such that raw polynomials are evaluated, see `?poly`. @@ -195,7 +205,9 @@ The $t$-statistic on $income^3$ is $1.45$ so the null that the relationship is q ```{r} # test the hypothesis using robust standard errors -coeftest(cubic_model, vcov. = vcovHC, type = "HC1") +coeftest(cubic_model, + vcov. = vcovHC, + type = "HC1") ``` The reported standard errors have changed. Furthermore, the coefficient for `income^3` is now significant at the $5\%$ level. This means we reject the hypothesis that the regression function is quadratic against the alternative that it is cubic. Furthermore, we can also test if the coefficients for `r ttcode("income^2")` and `r ttcode("income^3")` are jointly significant using a robust version of the $F$-test. @@ -204,7 +216,8 @@ The reported standard errors have changed. Furthermore, the coefficient for `inc # perform robust F-test linearHypothesis(cubic_model, hypothesis.matrix = R, - vcov. = vcovHC, type = "HC1") + vcov. = vcovHC, + type = "HC1") ``` With a $p$-value of $8.945e-13$, i.e., much less than $0.05$, the null hypothesis of linearity is rejected in favor of the alternative that the relationship is quadratic or cubic. @@ -273,7 +286,8 @@ To compute $\widehat{Y}$ using `r ttcode("R")` we may use `r ttcode("predict()") ```{r quadratic} # compute and assign the quadratic model -quadratic_model <- lm(score ~ income + I(income^2), data = CASchools) +quadratic_model <- lm(score ~ income + I(income^2), + data = CASchools) # set up data for prediction new_data <- data.frame(income = c(10, 11)) @@ -322,11 +336,13 @@ The regression model then is $$Y_i = \beta_0 + \beta_1 \times \ln(X_i) + u_i \t ```{r} # estimate a level-log model -LinearLog_model <- lm(score ~ log(income), data = CASchools) +LinearLog_model <- lm(score ~ log(income), + data = CASchools) # compute robust summary coeftest(LinearLog_model, - vcov = vcovHC, type = "HC1") + vcov = vcovHC, + type = "HC1") ``` Hence, the estimated regression function is @@ -353,7 +369,10 @@ lines(CASchools$income[order_id], fitted(LinearLog_model)[order_id], col = "red", lwd = 2) -legend("bottomright",legend = "Linear-log line",lwd = 2,col ="red") +legend("bottomright", + legend = "Linear-log line", + lwd = 2, + col ="red") ``` @@ -386,11 +405,13 @@ $$ \ln(Y_i) = \beta_0 + \beta_1 \times X_i + u_i , \ \ i=1,...,n. 
$$ ```{r} # estimate a log-linear model -LogLinear_model <- lm(log(score) ~ income, data = CASchools) +LogLinear_model <- lm(log(score) ~ income, + data = CASchools) # obtain a robust coefficient summary coeftest(LogLinear_model, - vcov = vcovHC, type = "HC1") + vcov = vcovHC, + type = "HC1") ``` The estimated regression function is $$\widehat{\ln(TestScore)} = 6.439 + 0.00284 \times income.$$ An increase in district income by $\$1000$ is expected to increase test scores by $100\times 0.00284 \% = 0.284\%$. @@ -403,11 +424,13 @@ The log-log regression model is $$\ln(Y_i) = \beta_0 + \beta_1 \times \ln(X_i) + ```{r log log} # estimate the log-log model -LogLog_model <- lm(log(score) ~ log(income), data = CASchools) +LogLog_model <- lm(log(score) ~ log(income), + data = CASchools) # print robust coefficient summary to the console coeftest(LogLog_model, - vcov = vcovHC, type = "HC1") + vcov = vcovHC, + type = "HC1") ``` The estimated regression function hence is $$\widehat{\ln(TestScore)} = 6.336 + 0.0554 \times \ln(income).$$ In a log-log model, a $1\%$ change in $X$ is associated with an estimated $\hat\beta_1 \%$ change in $Y$. @@ -514,7 +537,8 @@ polyLog_model <- lm(score ~ log(income) + I(log(income)^2) + I(log(income)^3), # print robust summary to the console coeftest(polyLog_model, - vcov = vcovHC, type = "HC1") + vcov = vcovHC, + type = "HC1") ``` Comparing by $\bar{R}^2$ we find that, leaving out the log-linear model, all models have a similar adjusted fit. In the class of polynomial models, the cubic specification has the highest $\bar{R}^2$ whereas the linear-log specification is the best of the log-models. @@ -688,10 +712,13 @@ There are several ways to add the interaction term to the `r ttcode("formula")` ```{r} # estimate the model with a binary interaction term -bi_model <- lm(score ~ HiSTR * HiEL, data = CASchools) +bi_model <- lm(score ~ HiSTR * HiEL, + data = CASchools) # print a robust summary of the coefficients -coeftest(bi_model, vcov. = vcovHC, type = "HC1") +coeftest(bi_model, + vcov. = vcovHC, + type = "HC1") ``` The estimated regression model is $$\widehat{TestScore} = \underset{(1.39)}{664.1} - \underset{(1.93)}{1.9} \times HiSTR - \underset{(2.33)}{18.3} \times HiEL - \underset{(3.12)}{3.3} \times (HiSTR \times HiEL),$$ and it predicts that the effect of moving from a school district with a low student-teacher ratio to a district with a high student-teacher ratio, depending on high or low percentage of english learners is $-1.9-3.3\times HiEL$. So for districts with a low share of english learners ($HiEL = 0$), the estimated effect is a decrease of $1.9$ points in test scores while for districts with a large fraction of English learners ($HiEL = 1$), the predicted decrease in test scores amounts to $1.9 + 3.3 = 5.2$ points. @@ -702,16 +729,20 @@ We can also use the model to estimate the mean test score for each possible comb # estimate means for all combinations of HiSTR and HiEL # 1. -predict(bi_model, newdata = data.frame("HiSTR" = 0, "HiEL" = 0)) +predict(bi_model, + newdata = data.frame("HiSTR" = 0, "HiEL" = 0)) # 2. -predict(bi_model, newdata = data.frame("HiSTR" = 0, "HiEL" = 1)) +predict(bi_model, + newdata = data.frame("HiSTR" = 0, "HiEL" = 1)) # 3. -predict(bi_model, newdata = data.frame("HiSTR" = 1, "HiEL" = 0)) +predict(bi_model, + newdata = data.frame("HiSTR" = 1, "HiEL" = 0)) # 4. 
-predict(bi_model, newdata = data.frame("HiSTR" = 1, "HiEL" = 1)) +predict(bi_model, + newdata = data.frame("HiSTR" = 1, "HiEL" = 1)) ``` We now verify that these predictions are differences in the coefficient estimates presented in equation \@ref(eq:im): @@ -872,10 +903,13 @@ $$ \widehat{TestScore_i} = \beta_0 + \beta_1 \times size_i + \beta_2 \times HiEL ```{r} # estimate the model -bci_model <- lm(score ~ size + HiEL + size * HiEL, data = CASchools) +bci_model <- lm(score ~ size + HiEL + size * HiEL, + data = CASchools) # print robust summary of coefficients to the console -coeftest(bci_model, vcov. = vcovHC, type = "HC1") +coeftest(bci_model, + vcov. = vcovHC, + type = "HC1") ``` The estimated regression model is $$ \widehat{TestScore} = \underset{(11.87)}{682.2} - \underset{(0.59)}{0.97} \times size + \underset{(19.51)}{5.6} \times HiEL - \underset{(0.97)}{1.28} \times (size \times HiEL). $$ The estimated regression line for districts with a low fraction of English learners ($HiEL_i=0$) is $$ \widehat{TestScore} = 682.2 - 0.97\times size_i. $$ @@ -978,10 +1012,13 @@ We now examine the interaction between the continuous variables student-teacher ```{r} # estimate regression model including the interaction between 'PctEL' and 'size' -cci_model <- lm(score ~ size + english + english * size, data = CASchools) +cci_model <- lm(score ~ size + english + english * size, + data = CASchools) # print a summary to the console -coeftest(cci_model, vcov. = vcovHC, type = "HC1") +coeftest(cci_model, + vcov. = vcovHC, + type = "HC1") ``` The estimated model equation is $$ \widehat{TestScore} = \underset{(11.76)}{686.3} - \underset{(0.59)}{1.12} \times STR - \underset{(0.37)}{0.67} \times PctEL + \underset{(0.02)}{0.0012} \times (STR\times PctEL). $$ @@ -1083,7 +1120,8 @@ We use an $F$-Test to test if the transformations of $\ln(PricePerCitation)$ in # F-Test for significance of cubic terms linearHypothesis(J_mod3, c("I(log(PricePerCitation)^2)=0", "I(log(PricePerCitation)^3)=0"), - vcov. = vcovHC, type = "HC1") + vcov. = vcovHC, + type = "HC1") ``` Clearly, we cannot reject the null hypothesis $H_0: \beta_3=\beta_4=0$ in model $(III)$. @@ -1275,13 +1313,16 @@ TS_mod4 <- lm(score ~ size + HiEL + HiEL:size + lunch + log(income), data = CASchools) TS_mod5 <- lm(score ~ size + I(size^2) + I(size^3) + HiEL +lunch - + log(income), data = CASchools) + + log(income), + data = CASchools) TS_mod6 <- lm(score ~ size + I(size^2) + I(size^3) + HiEL + HiEL:size - +HiEL:I(size^2) + HiEL:I(size^3) + lunch + log(income),data = CASchools) + +HiEL:I(size^2) + HiEL:I(size^3) + lunch + log(income), + data = CASchools) TS_mod7 <- lm(score ~ size + I(size^2) + I(size^3) + english - +lunch + log(income),data = CASchools) + +lunch + log(income), + data = CASchools) ``` We may use `r ttcode("summary()")` to assess the models' fit. Using `r ttcode("stargazer()")` we may also obtain a tabular representation of all regression outputs and which is more convenient for comparison of the models. @@ -1388,7 +1429,8 @@ Consequently, regression (6) further explores whether the fraction of English le # check joint significance of the interaction terms linearHypothesis(TS_mod6, c("size:HiEL=0", "I(size^2):HiEL=0", "I(size^3):HiEL=0"), - vcov. = vcovHC, type = "HC1") + vcov. = vcovHC, + type = "HC1") ``` We find that the null can be rejected at the level of $5\%$ and conclude that the regression function differs for districts with high and low percentage of English learners. 
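A simple cross-check one could add at this point (a sketch, assuming the `lmtest`/`sandwich` functions that ship with `AER` are loaded and the models `TS_mod5` and `TS_mod6` are as defined above): since specification (5) is specification (6) without the three interaction terms, the same null hypothesis can be tested by a robust comparison of the two nested models, which should lead to the same conclusion.

```{r}
# sketch: test the interaction terms by comparing the nested models (5) and (6),
# using HC1 robust standard errors for the Wald/F statistic
waldtest(TS_mod5, TS_mod6,
         vcov = function(x) vcovHC(x, type = "HC1"))
```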
@@ -1427,7 +1469,8 @@ new_data <- data.frame("size" = seq(16, 24, 0.05), "HiEL" = mean(CASchools$HiEL)) # add estimated regression function for model (2) -fitted <- predict(TS_mod2, newdata = new_data) +fitted <- predict(TS_mod2, + newdata = new_data) lines(new_data$size, fitted, @@ -1435,7 +1478,8 @@ lines(new_data$size, col = "blue") # add estimated regression function for model (5) -fitted <- predict(TS_mod5, newdata = new_data) +fitted <- predict(TS_mod5, + newdata = new_data) lines(new_data$size, fitted, @@ -1443,7 +1487,8 @@ lines(new_data$size, col = "red") # add estimated regression function for model (7) -fitted <- predict(TS_mod7, newdata = new_data) +fitted <- predict(TS_mod7, + newdata = new_data) lines(new_data$size, fitted, @@ -1493,7 +1538,8 @@ new_data <- data.frame("size" = seq(12, 28, 0.05), "HiEL" = 0) # add estimated regression function for model (6) with HiEL=0 -fitted <- predict(TS_mod6, newdata = new_data) +fitted <- predict(TS_mod6, + newdata = new_data) lines(new_data$size, fitted, @@ -1503,7 +1549,8 @@ lines(new_data$size, # add estimated regression function for model (6) with HiEL=1 new_data$HiEL <- 1 -fitted <- predict(TS_mod6, newdata = new_data) +fitted <- predict(TS_mod6, + newdata = new_data) lines(new_data$size, fitted, From 23f3444cbad646476aa3a3239397da346e4f8d06 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:34:23 +0000 Subject: [PATCH 11/26] Update 09-ch9.Rmd --- 09-ch9.Rmd | 140 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 53 deletions(-) diff --git a/09-ch9.Rmd b/09-ch9.Rmd index cb455d98..e464e1bf 100644 --- a/09-ch9.Rmd +++ b/09-ch9.Rmd @@ -20,7 +20,8 @@ CASchools$HiSTR <- as.numeric(CASchools$STR >= 20) CASchools$HiEL <- as.numeric(CASchools$english >= 10) # model (2) for California -TestScore_mod2 <- lm(score ~ STR + english + lunch + log(income), data = CASchools) +TestScore_mod2 <- lm(score ~ STR + english + lunch + log(income), + data = CASchools) ``` The majority of Chapter 9 of the book is of a theoretical nature. Therefore this section briefly reviews the concepts of internal and external validity in general and discusses examples of threats to internal and external validity of multiple regression models. We discuss consequences of @@ -284,8 +285,7 @@ set.seed(1) # load the package 'mvtnorm' and simulate bivariate normal data library(mvtnorm) -dat <- data.frame( - rmvnorm(1000, c(50, 100), +dat <- data.frame(rmvnorm(1000, c(50, 100), sigma = cbind(c(10, 5), c(5, 10)))) # set columns names @@ -296,11 +296,13 @@ We now estimate a simple linear regression of $Y$ on $X$ using this sample data ```{r, eval = T} # estimate the model (without measurement error) -noerror_mod <- lm(Y ~ X, data = dat) +noerror_mod <- lm(Y ~ X, + data = dat) # estimate the model (with measurement error in X) dat$X <- dat$X + rnorm(n = 1000, sd = sqrt(10)) -error_mod <- lm(Y ~ X, data = dat) +error_mod <- lm(Y ~ X, + data = dat) # print estimated coefficients to console noerror_mod$coefficients @@ -385,14 +387,14 @@ If data are missing at random, this is nothing but losing the observations. 
For set.seed(1) # simulate data -dat <- data.frame( - rmvnorm(1000, c(50, 100), +dat <- data.frame(rmvnorm(1000, c(50, 100), sigma = cbind(c(10, 5), c(5, 10)))) colnames(dat) <- c("X", "Y") # mark 500 randomly selected observations -id <- sample(1:1000, size = 500) +id <- sample(1:1000, + size = 500) plot(dat$X[-id], dat$Y[-id], @@ -419,8 +421,10 @@ abline(noerror_mod) # estimate model case 1 and add the regression line dat <- dat[-id, ] -c1_mod <- lm(dat$Y ~ dat$X, data = dat) -abline(c1_mod, col = "purple") +c1_mod <- lm(dat$Y ~ dat$X, + data = dat) +abline(c1_mod, + col = "purple") # add a legend legend("bottomright", @@ -442,8 +446,7 @@ Selecting data randomly based on the value of a regressor has also the effect of set.seed(1) # simulate data -dat <- data.frame( - rmvnorm(1000, c(50, 100), +dat <- data.frame(rmvnorm(1000, c(50, 100), sigma = cbind(c(10, 5), c(5, 10)))) colnames(dat) <- c("X", "Y") @@ -477,8 +480,10 @@ abline(noerror_mod) id <- which(dat$X >= 45) dat <- dat[-id, ] -c2_mod <- lm(dat$Y ~ dat$X, data = dat) -abline(c2_mod, col = "purple") +c2_mod <- lm(dat$Y ~ dat$X, + data = dat) +abline(c2_mod, + col = "purple") # add legend legend("topleft", @@ -486,8 +491,9 @@ legend("topleft", bg = "transparent", cex = 0.8, col = c("darkgreen", "black", "purple"), - legend = c("Population", "Full sample",expression(paste("Obs.with ", - X <= 45)))) + legend = c("Population", + "Full sample", + expression(paste("Obs.with ",X <= 45)))) ``` @@ -501,8 +507,7 @@ In the third case we face sample selection bias. We can illustrate this by using set.seed(1) # simulate data -dat <- data.frame( - rmvnorm(1000, c(50,100), +dat <- data.frame(rmvnorm(1000, c(50,100), sigma = cbind(c(10,5), c(5,10)))) colnames(dat) <- c("X","Y") @@ -535,8 +540,10 @@ abline(noerror_mod) # estimate model case 1, add regression line dat <- dat[id, ] -c3_mod <- lm(dat$Y ~ dat$X, data = dat) -abline(c3_mod, col = "purple") +c3_mod <- lm(dat$Y ~ dat$X, + data = dat) +abline(c3_mod, + col = "purple") # add legend legend("bottomright", @@ -544,8 +551,9 @@ legend("bottomright", bg = "transparent", cex = 0.8, col = c("darkgreen", "black", "purple"), - legend = c("Population", "Full sample",expression(paste(X <= 55,"&", - Y >= 100)))) + legend = c("Population", + "Full sample", + expression(paste(X <= 55,"&",Y >= 100)))) ``` @@ -581,10 +589,12 @@ After loading the data set, we pick observations for the year 1995 and plot loga # load the data set library(AER) data("CigarettesSW") -c1995 <- subset(CigarettesSW, year == "1995") +c1995 <- subset(CigarettesSW, + year == "1995") # estimate the model -cigcon_mod <- lm(log(packs) ~ log(price), data = c1995) +cigcon_mod <- lm(log(packs) ~ log(price), + data = c1995) cigcon_mod ``` @@ -601,7 +611,9 @@ abline(cigcon_mod, col = "darkred", lwd = 1.5) # add legend -legend("topright",lty=1,col= "darkred", "Estimated Regression Line") +legend("topright", + lty=1, + col= "darkred", "Estimated Regression Line") ``` @@ -671,7 +683,8 @@ Furthermore, if one does not adjust for heteroskedasticity \\textit{and}/\\texti Recall the regression of test scores on the student-teacher ratio ($STR$) performed in Chapter \@ref(lrwor): ```{r} -linear_model <- lm(score ~ STR, data = CASchools) +linear_model <- lm(score ~ STR, + data = CASchools) linear_model ``` @@ -725,15 +738,18 @@ Following the book we examine the relationship between district income and test ```{r} # estimate linear model -Linear_model_MA <- lm(score ~ income, data = MASchools) +Linear_model_MA <- lm(score ~ income, + data = MASchools) 
Linear_model_MA # estimate linear-log model -Linearlog_model_MA <- lm(score ~ log(income), data = MASchools) +Linearlog_model_MA <- lm(score ~ log(income), + data = MASchools) Linearlog_model_MA # estimate Cubic model -cubic_model_MA <- lm(score ~ I(income) + I(income^2) + I(income^3), data = MASchools) +cubic_model_MA <- lm(score ~ I(income) + I(income^2) + I(income^3), + data = MASchools) cubic_model_MA ``` @@ -749,7 +765,8 @@ plot(MASchools$income, MASchools$score, ylim = c(620, 780)) # add estimated regression line for the linear model -abline(Linear_model_MA, lwd = 2) +abline(Linear_model_MA, + lwd = 2) # add estimated regression function for Linear-log model order_id <- order(MASchools$income) @@ -781,22 +798,27 @@ We continue by estimating most of the model specifications used for analysis of MASchools$HiEL <- as.numeric(MASchools$english > median(MASchools$english)) # estimate the model specifications from Table 9.2 of the book -TSMA_mod1 <- lm(score ~ STR, data = MASchools) +TSMA_mod1 <- lm(score ~ STR, + data = MASchools) TSMA_mod2 <- lm(score ~ STR + english + lunch + log(income), data = MASchools) TSMA_mod3 <- lm(score ~ STR + english + lunch + income + I(income^2) - + I(income^3), data = MASchools) + + I(income^3), + data = MASchools) TSMA_mod4 <- lm(score ~ STR + I(STR^2) + I(STR^3) + english + lunch + income - + I(income^2) + I(income^3), data = MASchools) + + I(income^2) + I(income^3), + data = MASchools) TSMA_mod5 <- lm(score ~ STR + HiEL + I(income^2) + I(income^3) + HiEL:STR - + lunch + income, data = MASchools) + + lunch + income, + data = MASchools) TSMA_mod6 <- lm(score ~ STR + I(income^2) + I(income^3)+ lunch - + income, data = MASchools) + + income, + data = MASchools) # gather robust standard errors rob_se <- list(sqrt(diag(vcovHC(TSMA_mod1, type = "HC1"))), @@ -827,17 +849,22 @@ stargazer(TSMA_mod1, TSMA_mod2, TSMA_mod3, library(TTR) library(stargazer) MASchools$HiEL <- as.numeric(MASchools$english > median(MASchools$english)) -TSMA_mod1 <- lm(score ~ STR, data = MASchools) +TSMA_mod1 <- lm(score ~ STR, + data = MASchools) TSMA_mod2 <- lm(score ~ STR + english + lunch + log(income), data = MASchools) -TSMA_mod3 <- lm(score ~ STR + english + lunch + income + I(income^2) - + I(income^3), data = MASchools) -TSMA_mod4 <- lm(score ~ STR + I(STR^2) + I(STR^3) + english + lunch + income + I(income^2) - + I(income^3), data = MASchools) -TSMA_mod5 <- lm(score ~ STR + HiEL + HiEL:STR + lunch + income + I(income^2) - + I(income^3), data = MASchools) +TSMA_mod3 <- lm(score ~ STR + english + lunch + income + + I(income^2) + I(income^3), + data = MASchools) +TSMA_mod4 <- lm(score ~ STR + I(STR^2) + I(STR^3) + english + +lunch + income + I(income^2) + I(income^3), + data = MASchools) +TSMA_mod5 <- lm(score ~ STR + HiEL + HiEL:STR + lunch + income + + I(income^2) + I(income^3), + data = MASchools) TSMA_mod6 <- lm(score ~ STR + lunch + income + I(income^2) - + I(income^3), data = MASchools) + + I(income^3), + data = MASchools) rob_se <- list( sqrt(diag(vcovHC(TSMA_mod1, type="HC1"))), @@ -848,7 +875,8 @@ rob_se <- list( sqrt(diag(vcovHC(TSMA_mod6, type="HC1"))) ) -stargazer(TSMA_mod1, TSMA_mod2, TSMA_mod3, TSMA_mod4, TSMA_mod5, TSMA_mod6, +stargazer(TSMA_mod1, TSMA_mod2, TSMA_mod3, + TSMA_mod4, TSMA_mod5, TSMA_mod6, se = rob_se, type = "html", header = FALSE, @@ -866,17 +894,22 @@ stargazer_html_title("Regressions Using Massachusetts Test Score Data", "rumtsd" ```{r, results='asis', echo=F, cache=T, message=FALSE, warning=FALSE, eval=my_output == "latex"} library(stargazer) 
MASchools$HiEL <- as.numeric(MASchools$english > median(MASchools$english)) -TSMA_mod1 <- lm(score ~ STR, data = MASchools) +TSMA_mod1 <- lm(score ~ STR, + data = MASchools) TSMA_mod2 <- lm(score ~ STR + english + lunch + log(income), data = MASchools) -TSMA_mod3 <- lm(score ~ STR + english + lunch + income + I(income^2) - + I(income^3), data = MASchools) -TSMA_mod4 <- lm(score ~ STR + I(STR^2) + I(STR^3) + english + lunch + income + I(income^2) - + I(income^3), data = MASchools) -TSMA_mod5 <- lm(score ~ STR + HiEL + HiEL:STR + lunch + income + I(income^2) - + I(income^3), data = MASchools) +TSMA_mod3 <- lm(score ~ STR + english + lunch + income + + I(income^2) + I(income^3), + data = MASchools) +TSMA_mod4 <- lm(score ~ STR + I(STR^2) + I(STR^3) + english + + lunch + income + I(income^2)+ I(income^3), + data = MASchools) +TSMA_mod5 <- lm(score ~ STR + HiEL + HiEL:STR + lunch + + income + I(income^2) + I(income^3), + data = MASchools) TSMA_mod6 <- lm(score ~ STR + lunch + income + I(income^2) - + I(income^3), data = MASchools) + + I(income^3), + data = MASchools) rob_se <- list( sqrt(diag(vcovHC(TSMA_mod1, type="HC1"))), @@ -887,7 +920,8 @@ rob_se <- list( sqrt(diag(vcovHC(TSMA_mod6, type="HC1"))) ) -stargazer(TSMA_mod1, TSMA_mod2, TSMA_mod3, TSMA_mod4, TSMA_mod5, TSMA_mod6, +stargazer(TSMA_mod1, TSMA_mod2, TSMA_mod3, + TSMA_mod4, TSMA_mod5, TSMA_mod6, digits = 3, title = "\\label{tab:rumtsd} Regressions Using Massachusetts Test Score Data", type = "latex", From 97626b777055e2a1f51314c1340ccf78e6eea2f9 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:34:50 +0000 Subject: [PATCH 12/26] Update 10-ch10.Rmd --- 10-ch10.Rmd | 118 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 31 deletions(-) diff --git a/10-ch10.Rmd b/10-ch10.Rmd index 7adbc89c..54f7ad07 100644 --- a/10-ch10.Rmd +++ b/10-ch10.Rmd @@ -58,7 +58,8 @@ library(AER) library(plm) data(Fatalities) # pdata.frame() declares the data as panel data. -Fatalities<- pdata.frame(Fatalities, index = c("state", "year")) +Fatalities<- pdata.frame(Fatalities, + index = c("state", "year")) ``` ```{r, warning=FALSE, message=FALSE} @@ -97,17 +98,25 @@ We start by reproducing Figure 10.1 of the book. To this end we estimate simple Fatalities$fatal_rate <- Fatalities$fatal / Fatalities$pop * 10000 # subset the data -Fatalities1982 <- subset(Fatalities, year == "1982") -Fatalities1988 <- subset(Fatalities, year == "1988") +Fatalities1982 <- subset(Fatalities, + year == "1982") +Fatalities1988 <- subset(Fatalities, + year == "1988") ``` ```{r, warning=FALSE, message=FALSE} # estimate simple regression models using 1982 and 1988 data -fatal1982_mod <- lm(fatal_rate ~ beertax, data = Fatalities1982) -fatal1988_mod <- lm(fatal_rate ~ beertax, data = Fatalities1988) +fatal1982_mod <- lm(fatal_rate ~ beertax, + data = Fatalities1982) +fatal1988_mod <- lm(fatal_rate ~ beertax, + data = Fatalities1988) -coeftest(fatal1982_mod, vcov. = vcovHC, type = "HC1") -coeftest(fatal1988_mod, vcov. = vcovHC, type = "HC1") +coeftest(fatal1982_mod, + vcov. = vcovHC, + type = "HC1") +coeftest(fatal1988_mod, + vcov. 
= vcovHC, + type = "HC1") ``` The estimated regression functions are @@ -128,8 +137,12 @@ plot(x = as.double(Fatalities1982$beertax), pch = 20, col = "steelblue") -abline(fatal1982_mod, lwd = 1.5, col="darkred") -legend("topright",lty=1,col="darkred","Estimated Regression Line") +abline(fatal1982_mod, + lwd = 1.5, + col="darkred") +legend("topright", + lty=1, + col="darkred","Estimated Regression Line") # plot observations and add estimated regression line for 1988 data plot(x = as.double(Fatalities1988$beertax), @@ -141,8 +154,12 @@ plot(x = as.double(Fatalities1988$beertax), pch = 20, col = "steelblue") -abline(fatal1988_mod, lwd = 1.5,col="darkred") -legend("bottomright",lty=1,col="darkred","Estimated Regression Line") +abline(fatal1988_mod, + lwd = 1.5, + col="darkred") +legend("bottomright", + lty=1, + col="darkred","Estimated Regression Line") ``` @@ -168,7 +185,9 @@ diff_beertax <- Fatalities1988$beertax - Fatalities1982$beertax # estimate a regression using differenced data fatal_diff_mod <- lm(diff_fatal_rate ~ diff_beertax) -coeftest(fatal_diff_mod, vcov = vcovHC, type = "HC1") +coeftest(fatal_diff_mod, + vcov = vcovHC, + type = "HC1") ``` Including the intercept allows for a change in the mean fatality rate in the time between 1982 and 1988 in the absence of a change in the beer tax. @@ -190,9 +209,13 @@ plot(x = as.double(diff_beertax), col = "steelblue") # add the regression line to plot -abline(fatal_diff_mod, lwd = 1.5,col="darkred") +abline(fatal_diff_mod, + lwd = 1.5, + col="darkred") #add legend -legend("topright",lty=1,col="darkred","Estimated Regression Line") +legend("topright", + lty=1, + col="darkred","Estimated Regression Line") ``` @@ -301,7 +324,8 @@ a regression of the traffic fatality rate on beer tax and 48 binary regressors - We can simply use the function `r ttcode("lm()")` to obtain an estimate of $\beta_1$. ```{r} -fatal_fe_lm_mod <- lm(fatal_rate ~ beertax + state - 1, data = Fatalities) +fatal_fe_lm_mod <- lm(fatal_rate ~ beertax + state - 1, + data = Fatalities) fatal_fe_lm_mod ``` @@ -316,7 +340,8 @@ fatal_demeaned <- with(Fatalities, beertax = beertax - ave(beertax, state))) # estimate the regression -summary(lm(fatal_rate ~ beertax - 1, data = fatal_demeaned)) +summary(lm(fatal_rate ~ beertax - 1, + data = fatal_demeaned)) ``` The function `r ttcode("ave")` is convenient for computing group averages. We use it to obtain state specific averages of the fatality rate and the beer tax. @@ -339,7 +364,9 @@ fatal_fe_mod <- plm(fatal_rate ~ beertax, model = "within") -coeftest(fatal_fe_mod, vcov. = vcovHC, type = "HC1") +coeftest(fatal_fe_mod, + vcov. = vcovHC, + type = "HC1") ``` The estimated coefficient is again $-0.6559$. Note that `r ttcode("plm()")` uses the entity-demeaned OLS algorithm and thus does not report dummy coefficients. 
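Since the within transformation and the dummy-variable approach are numerically equivalent for the slope coefficients, a minimal consistency check can be run (a sketch, assuming the objects `fatal_fe_lm_mod` and `fatal_fe_mod` estimated above are still in the workspace):

```{r}
# slope from the dummy-variable regression estimated with lm()
coef(fatal_fe_lm_mod)["beertax"]

# slope from the entity-demeaned (within) regression estimated with plm()
coef(fatal_fe_mod)
```

Both calls should return the same value of about $-0.66$, matching the estimate reported above.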
The estimated regression function is @@ -362,7 +389,8 @@ The following code chunk shows how to estimate the combined entity and time fixe # estimate a combined time and entity fixed effects regression model # via lm() -fatal_tefe_lm_mod <- lm(fatal_rate ~ beertax + state + year - 1, data = Fatalities) +fatal_tefe_lm_mod <- lm(fatal_rate ~ beertax + state + year - 1, + data = Fatalities) fatal_tefe_lm_mod # via plm() @@ -372,7 +400,9 @@ fatal_tefe_mod <- plm(fatal_rate ~ beertax, model = "within", effect = "twoways") -coeftest(fatal_tefe_mod, vcov = vcovHC, type = "HC1") +coeftest(fatal_tefe_mod, + vcov = vcovHC, + type = "HC1") ``` Before discussing the outcomes we convince ourselves that `r ttcode("state")` and `r ttcode("year")` are of the class `r ttcode("factor")` . @@ -451,14 +481,18 @@ class(fatal_tefe_lm_mod) # obtain a summary based on heteroskedasticity-robust standard errors # (no adjustment for heteroskedasticity only) -coeftest(fatal_tefe_lm_mod, vcov = vcovHC, type = "HC1")[1, ] +coeftest(fatal_tefe_lm_mod, + vcov = vcovHC, + type = "HC1")[1, ] # check class of the (plm) model object class(fatal_tefe_mod) # obtain a summary based on clustered standard errors # (adjustment for autocorrelation + heteroskedasticity) -coeftest(fatal_tefe_mod, vcov = vcovHC, type = "HC1") +coeftest(fatal_tefe_mod, + vcov = vcovHC, + type = "HC1") ``` The outcomes differ rather strongly: imposing no autocorrelation we obtain a standard error of $0.25$ which implies significance of $\hat\beta_1$, the coefficient on $BeerTax$ at the level of $5\%$. On the contrary, using the clustered standard error $0.35$ results in a failure to reject the null hypothesis $H_0: \beta_1 = 0$ at the same level, see equation \@ref(eq:cbnfemod). Consult Appendix 10.2 of the book (Stock and Watson) for insights on the computation of clustered standard errors. @@ -489,20 +523,24 @@ Fatalities$drinkagec <- cut(Fatalities$drinkage, Fatalities$drinkagec <- relevel(Fatalities$drinkagec, "[21,22]") # mandatory jail or community service? -Fatalities$punish <- with(Fatalities, factor(jail == "yes" | service == "yes", +Fatalities$punish <- with(Fatalities, + factor(jail == "yes" | service == "yes", labels = c("no", "yes"))) # the set of observations on all variables for 1982 and 1988 -fatal_1982_1988 <- Fatalities[with(Fatalities, year == 1982 | year == 1988), ] +fatal_1982_1988 <- Fatalities[with(Fatalities, + year == 1982 | year == 1988), ] ``` Next, we estimate all seven models using `r ttcode("plm()")`. 
```{r} # estimate all seven models -fat_mod1 <- lm(fatal_rate ~ beertax, data = Fatalities) +fat_mod1 <- lm(fatal_rate ~ beertax, + data = Fatalities) -fat_mod2 <- plm(fatal_rate ~ beertax + state, data = Fatalities) +fat_mod2 <- plm(fatal_rate ~ beertax + state, + data = Fatalities) fat_mod3 <- plm(fatal_rate ~ beertax + state + year, index = c("state","year"), @@ -586,14 +624,22 @@ rob_se <- list( sqrt(diag(vcovHC(fat_mod7, type="HC1"))) ) -stargazer(fat_mod1, fat_mod2, fat_mod3, fat_mod4, fat_mod5, fat_mod6, fat_mod7, +stargazer(fat_mod1, + fat_mod2, + fat_mod3, + fat_mod4, + fat_mod5, + fat_mod6, + fat_mod7, digits = 3, type = "html", header = FALSE, se = rob_se, + model.names = FALSE, dep.var.caption = "Dependent Variable: Fatality Rate", - model.numbers = FALSE, - column.labels = c("(1)", "(2)", "(3)", "(4)", "(5)", "(6)", "(7)") + dep.var.labels.include =FALSE, + model.numbers = TRUE, + column.labels = c('OLS','','','Linear Panel Regression') ) stargazer_html_title("Linear Panel Regression Models of Traffic Fatalities due to Drunk Driving", "lprmotfdtdd") @@ -614,13 +660,21 @@ rob_se <- list( sqrt(diag(vcovHC(fat_mod7, type="HC1"))) ) -stargazer(fat_mod1, fat_mod2, fat_mod3, fat_mod4, fat_mod5, fat_mod6, fat_mod7, +stargazer(fat_mod1, + fat_mod2, + fat_mod3, + fat_mod4, + fat_mod5, + fat_mod6, + fat_mod7, digits = 3, type = "latex", float.env = "sidewaystable", column.sep.width = "-5pt", se = rob_se, header = FALSE, + dep.var.caption = "Dependent Variable: Fatality Rate", + dep.var.labels.include =FALSE, model.names = FALSE, column.labels = c('OLS','','','Linear Panel Regression'), omit.stat = "f", @@ -640,7 +694,8 @@ The model specifications (4) to (7) include covariates that shall capture the ef linearHypothesis(fat_mod4, test = "F", c("drinkagec[18,19)=0", "drinkagec[19,20)=0", "drinkagec[20,21)"), - vcov. = vcovHC, type = "HC1") + vcov. = vcovHC, + type = "HC1") ``` 3. There is no evidence that punishment for first offenders has a deterring effects on drunk driving: the corresponding coefficient is not significant at the $10\%$ level. @@ -651,7 +706,8 @@ linearHypothesis(fat_mod4, linearHypothesis(fat_mod4, test = "F", c("log(income)", "unemp"), - vcov. = vcovHC, type = "HC1") + vcov. = vcovHC, + type = "HC1") ``` Model (5) omits the economic factors. The result supports the notion that economic indicators should remain in the model as the coefficient on beer tax is sensitive to the inclusion of the latter. From 91d50db20576eee5b4a578d1df21c0499371e594 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:30:51 +0200 Subject: [PATCH 13/26] Update 03-ch3.Rmd --- 03-ch3.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/03-ch3.Rmd b/03-ch3.Rmd index 51e54f61..3f55e680 100644 --- a/03-ch3.Rmd +++ b/03-ch3.Rmd @@ -365,7 +365,7 @@ In this section we briefly review concepts in hypothesis testing and discuss how #### About Hypotheses and Hypothesis Testing {-} -In a significance test we want to exploit the information contained in a sample as evidence in favor of against a hypothesis. Essentially, hypotheses are simple questions that can be answered by 'yes' or 'no'. In a hypothesis test we typically deal with two different hypotheses: +In a significance test, we want to exploit the information contained in a sample as evidence in favor of or against a hypothesis. Essentially, hypotheses are simple questions that can be answered by 'yes' or 'no'. 
In a hypothesis test we typically deal with two different hypotheses: - The *null hypothesis*, denoted by $H_0$, is the hypothesis we are interested in testing. From c12eb3606153d18675071d98ef7de64f62f6d2d4 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:31:51 +0200 Subject: [PATCH 14/26] Update 04-ch4.Rmd --- 04-ch4.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/04-ch4.Rmd b/04-ch4.Rmd index d82aca2f..c7497e93 100644 --- a/04-ch4.Rmd +++ b/04-ch4.Rmd @@ -1028,7 +1028,7 @@ The vectors cs and ts are available in the working environment **Instructions:** -+ The function lm() is part of the package AER. Attach the package using library(). ++ The function lm() is part of the package stats. + Use lm() to estimate the regression model $$TestScore_i = \\beta_0 + \\beta_1 STR_i + u_i.$$ Assign the result to mod. From 2b1c142343c63d69bd579db7d8bc6453d8e5e817 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:32:48 +0200 Subject: [PATCH 15/26] Update 08-ch8.Rmd --- 08-ch8.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/08-ch8.Rmd b/08-ch8.Rmd index 393a8dd3..7cc433ab 100644 --- a/08-ch8.Rmd +++ b/08-ch8.Rmd @@ -970,7 +970,7 @@ legend("topleft", Consider a regression model with $Y$ the log earnings and two continuous regressors $X_1$, the years of work experience, and $X_2$, the years of schooling. We want to estimate the effect on wages of an additional year of work experience depending on a given level of schooling. This effect can be assessed by including the interaction term $(X_{1i} \times X_{2i})$ in the model: -$$ \Delta Y_i = \beta_0 + \beta_1 \times X_{1i} + \beta_2 \times X_{2i} + \beta_3 \times (X_{1i} \times X_{2i}) + u_i. $$ +$$Y_i = \beta_0 + \beta_1 \times X_{1i} + \beta_2 \times X_{2i} + \beta_3 \times (X_{1i} \times X_{2i}) + u_i. $$ Following Key Concept 8.1, we find that the effect on $Y$ for a change on $X_1$ given $X_2$ is $$ \frac{\Delta Y}{\Delta X_1} = \beta_1 + \beta_3 X_2. $$ From be0c9b81a156ca8c607880730ab3fb5b125d9cab Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:33:32 +0200 Subject: [PATCH 16/26] Update 10-ch10.Rmd --- 10-ch10.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/10-ch10.Rmd b/10-ch10.Rmd index 54f7ad07..094bde55 100644 --- a/10-ch10.Rmd +++ b/10-ch10.Rmd @@ -388,7 +388,7 @@ The following code chunk shows how to estimate the combined entity and time fixe ```{r} # estimate a combined time and entity fixed effects regression model -# via lm() +# via lm(). "-1" in the model excludes the intercept. fatal_tefe_lm_mod <- lm(fatal_rate ~ beertax + state + year - 1, data = Fatalities) fatal_tefe_lm_mod From df744ebf47e61e67b8926f9d9f29ac3cca97b7a2 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:34:03 +0200 Subject: [PATCH 17/26] Update 11-ch11.Rmd --- 11-ch11.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/11-ch11.Rmd b/11-ch11.Rmd index 644dbd23..1c2445ed 100644 --- a/11-ch11.Rmd +++ b/11-ch11.Rmd @@ -34,7 +34,7 @@ cat(' The linear regression model $$Y_i = \\beta_0 + \\beta_1 X_{1i} + \\beta_2 X_{2i} + \\dots + \\beta_k X_{ki} + u_i$$ -with a binary dependent variable $Y_i$ is called the linear probability model. 
In the linear probability model we have $$E(Y\\vert X_1,X_2,\\dots,X_k) = P(Y=1\\vert X_1, X_2,\\dots, X_3)$$ where $$ P(Y = 1 \\vert X_1, X_2, \\dots, X_k) = \\beta_0 + \\beta_1 + X_1 + \\beta_2 X_2 + \\dots + \\beta_k X_k.$$ +with a binary dependent variable $Y_i$ is called the linear probability model. In the linear probability model we have $$E(Y\\vert X_1,X_2,\\dots,X_k) = P(Y=1\\vert X_1, X_2,\\dots, X_3)$$ where $$ P(Y = 1 \\vert X_1, X_2, \\dots, X_k) = \\beta_0 + \\beta_1 X_1 + \\beta_2 X_2 + \\dots + \\beta_k X_k.$$ Thus, $\\beta_j$ can be interpreted as the change in the probability that $Y_i=1$, holding constant the other $k-1$ regressors. Just as in common multiple regression, the $\\beta_j$ can be estimated using OLS and the robust standard error formulas can be used for hypothesis testing and computation of confidence intervals. @@ -52,7 +52,7 @@ Linear probability models are easily estimated in R using the function cat('\\begin{keyconcepts}[The Linear Probability Model]{11.1} The linear regression model $$Y_i = \\beta_0 + \\beta_1 + X_{1i} + \\beta_2 X_{2i} + \\dots + \\beta_k X_{ki} + u_i$$ -with a binary dependent variable $Y_i$ is called the linear probability model. In the linear probability model we have $$E(Y\\vert X_1,X_2,\\dots,X_k) = P(Y=1\\vert X_1, X_2,\\dots, X_3)$$ where $$ P(Y = 1 \\vert X_1, X_2, \\dots, X_k) = \\beta_0 + \\beta_1 + X_1 + \\beta_2 X_2 + \\dots + \\beta_k X_k.$$ +with a binary dependent variable $Y_i$ is called the linear probability model. In the linear probability model we have $$E(Y\\vert X_1,X_2,\\dots,X_k) = P(Y=1\\vert X_1, X_2,\\dots, X_3)$$ where $$ P(Y = 1 \\vert X_1, X_2, \\dots, X_k) = \\beta_0 + \\beta_1 X_1 + \\beta_2 X_2 + \\dots + \\beta_k X_k.$$ Thus, $\\beta_j$ can be interpreted as the change in the probability that $Y_i=1$, holding constant the other $k-1$ regressors. Just as in common multiple regression, the $\\beta_j$ can be estimated using OLS and the robust standard error formulas can be used for hypothesis testing and computation of confidence intervals.\\newline From 131a41dfaaed09de346f3bac3dc411fb518a2cdc Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:34:42 +0200 Subject: [PATCH 18/26] Update 14-ch14.Rmd --- 14-ch14.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/14-ch14.Rmd b/14-ch14.Rmd index 2a5cca4c..53325d77 100644 --- a/14-ch14.Rmd +++ b/14-ch14.Rmd @@ -66,7 +66,7 @@ In a time series context, the parent could use data on present and past years te ## Time Series Data and Serial Correlation {#tsdasc} -GDP is commonly defined as the total value of goods and services produced during a given time period. The dataset us_macro_quarterly.xlsx, provided by the authors, can be downloaded [here](http://wps.pearsoned.co.uk/wps/media/objects/16103/16489878/data3eu/us_macro_quarterly.xlsx). It contains quarterly data on U.S. real GDP (i.e., inflation-adjusted) from 1947 to 2004. +GDP is commonly defined as the total value added of goods and services produced during a given time period. The dataset us_macro_quarterly.xlsx, provided by the authors, can be downloaded [here](https://github.com/mca91/EconometricsWithR/raw/master/data/us_macro_quarterly.xlsx). It contains quarterly data on U.S. real GDP (i.e., inflation-adjusted) from 1947 to 2004. As before, a good starting point is to plot the data. The package `r ttcode("quantmod")` provides some convenient functions for plotting and computing with time series data. 
We also load the package `r ttcode("readxl")` to read the data into `r ttcode("R")`. @@ -132,7 +132,7 @@ plot(as.zoo(GDPGrowth), ### Notation, Lags, Differences, Logarithms and Growth Rates {-} -For observations of a variable $Y$ recorded over time, $Y_t$ denotes the value observed at time $t$. The period between two sequential observations $Y_t$ and $Y_{t-1}$ is a unit of time: hours, days, weeks, months, quarters, years, etc. Key Concept 14.1 introduces the essential terminology and notation for time series data we use in the subsequent sections. +For observations of a variable $Y$ recorded over time, $Y_t$ denotes the total value observed at time $t$. The period between two sequential observations $Y_t$ and $Y_{t-1}$ is a unit of time: hours, days, weeks, months, quarters, years, etc. Key Concept 14.1 introduces the essential terminology and notation for time series data we use in the subsequent sections. ```{r, eval = my_output == "html", results='asis', echo=F, purl=F} cat(' @@ -492,7 +492,7 @@ coeftest(GDPGR_AR2, vcov. = sandwich) The estimation yields \begin{align} - \widehat{GDPGR}_t = \underset{(0.40)}{1.63} + \underset{(0.08)}{0.28} GDPGR_{t-1} + \underset{(0.08)}{0.18} GDPGR_{t-1}. (\#eq:GDPGRAR2) + \widehat{GDPGR}_t = \underset{(0.40)}{1.63} + \underset{(0.08)}{0.28} GDPGR_{t-1} + \underset{(0.08)}{0.18} GDPGR_{t-2}. (\#eq:GDPGRAR2) \end{align} We see that the coefficient on the second lag is significantly different from zero. The fit improves slightly: $\bar{R}^2$ grows from $0.11$ for the AR($1$) model to about $0.14$ and the $SER$ reduces to $3.13$. From 44b3f845b52eafdd9f43f9f6209541cab7aaea56 Mon Sep 17 00:00:00 2001 From: Ocalak <96614838+Ocalak@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:35:58 +0200 Subject: [PATCH 19/26] Update index.Rmd --- index.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index.Rmd b/index.Rmd index 1183b6b2..e732c4b8 100755 --- a/index.Rmd +++ b/index.Rmd @@ -63,7 +63,7 @@ cat('
- # attach the package AER
-
-
- # estimate the model
+ # estimate the model
# obtain a model summary
- # attach the package AER
- library(AER)
-
# estimate the model
lm(ts ~ cs)
@@ -74,9 +68,8 @@
test_predefined_objects(c("ts","cs"))
- test_student_typed("library(AER)")
test_student_typed("lm(ts ~ cs)")
test_function("summary")
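For reference, a sketch of a submission that would pass the remaining checks (assuming `ts` and `cs` are the predefined vectors of test scores and class sizes used in this exercise):

```{r}
# estimate the model
mod <- lm(ts ~ cs)

# obtain a model summary
summary(mod)
```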