library(tidyverse)
# 1) In the data set below, what is the true value of beta 1
#    (the effect of education on earnings)?
tibble(
  # Underlying ability: 20 independent Uniform(0, 100) draws.
  ability = runif(n = 20, min = 0, max = 100),
  # Education: half ability, half a fresh Uniform(0, 100) draw.
  education = 0.5 * ability +
    0.5 * runif(n = 20, min = 0, max = 100),
  # Earnings: linear in education and ability, plus Normal noise
  # with mean 50 and standard deviation 30.
  earnings = 0.3 * education +
    0.5 * ability +
    rnorm(n = 20, mean = 50, sd = 30)
)
# 2) Pipe the data into lm() and estimate the model
#    `earnings ~ education`.
#    What is your estimate for beta 1?
#    When you run the code repeatedly, do the estimates
#    seem to be correct on average?
tibble(
  ability = runif(n = 20, min = 0, max = 100),
  education = 0.5 * ability +
    0.5 * runif(n = 20, min = 0, max = 100),
  earnings = 0.3 * education +
    0.5 * ability +
    rnorm(n = 20, mean = 50, sd = 30)
) %>%
  # Fill in the blank with your lm() call; note that `ability` is
  # deliberately left out of the model formula.
  ___ %>%
  # Turn the fitted model into a tibble of coefficient estimates.
  broom::tidy()
# 3) Use slice and select to get the estimate of beta 1 only.
tibble(
  ability = runif(n = 20, min = 0, max = 100),
  education = 0.5 * ability +
    0.5 * runif(n = 20, min = 0, max = 100),
  earnings = 0.3 * education +
    0.5 * ability +
    rnorm(n = 20, mean = 50, sd = 30)
) %>%
  # Blank: the same lm() call as in question 2.
  ___ %>%
  broom::tidy() %>%
  # Remaining blanks: slice() and select() calls that keep only the
  # row and column holding the beta 1 estimate.
  ___ %>%
  ___
# 4) Use map() to run the simulation 100 times, storing the estimate
#    for beta 1 every time. Then pipe the simulation results into
#    a ggplot to create a histogram of estimates of beta 1.
#    What value are the estimates centered around? Add a red vertical
#    line to represent the true value for beta 1.
map(
  # Blank: one element per simulation run.
  .x = ___,
  # `x` is not referenced by the visible code below; each call draws
  # a fresh data set and returns that run's result.
  .f = function(x) {
    tibble(
      ability = runif(n = 20, min = 0, max = 100),
      education = 0.5 * ability +
        0.5 * runif(n = 20, min = 0, max = 100),
      earnings = 0.5 * ability +
        0.3 * education +
        rnorm(n = 20, mean = 50, sd = 30)
    ) %>%
      # Blanks: the lm(), slice() and select() steps from question 3.
      ___ %>%
      broom::tidy() %>%
      ___ %>%
      ___
  }
) %>%
  # Stack the per-run results into a single tibble for plotting.
  bind_rows() %>%
  # Blanks: the aesthetic mapping for the histogram, and the
  # arguments that draw the red vertical line.
  ggplot(___) +
  geom_histogram() +
  geom_vline(___)
# 5) Write a function beta_1_estimate that takes a samplesize,
#    runs the simulation above 100 times with map() under the
#    samplesize given by the function's user, and then returns a
#    100x2 tibble of beta1 estimates along with the samplesize.
#
# Argument:
#   samplesize  number of observations drawn for each simulated
#               data set.
# Returns:
#   the row-bound simulation results with an extra factor column
#   `n` that records `samplesize`.
beta_1_estimate <- function(samplesize) {
  map(
    # Blank: one element per simulation run.
    .x = ___,
    .f = function(x) {
      tibble(
        ability = runif(n = samplesize, min = 0, max = 100),
        education = 0.5 * ability +
          0.5 * runif(n = samplesize, min = 0, max = 100),
        earnings = 0.5 * ability +
          0.3 * education +
          rnorm(n = samplesize, mean = 50, sd = 30)
      ) %>%
        # Blanks: the lm(), slice() and select() steps from question 3.
        ___ %>%
        broom::tidy() %>%
        ___ %>%
        ___
    }
  ) %>%
    bind_rows() %>%
    # Stored as a factor so it can act as a discrete grouping
    # variable (e.g. a fill aesthetic) in later plots.
    mutate(n = factor(samplesize))
}
# 6) Test that beta_1_estimate works. When you call it on a samplesize
#    of 20, what are the minimum and maximum estimates you get?
#    Compare: run beta_1_estimate(1000): what are the minimum and
#    maximum estimates there?

# Histogram of the estimates from samples of 20 observations each.
beta_1_estimate(20) %>%
  ggplot(___) +
  geom_histogram()

# Histogram of the estimates from samples of 1000 observations each.
beta_1_estimate(1000) %>%
  ggplot(___) +
  geom_histogram()
# 7) Use `map()` to call `beta_1_estimate` on samplesizes
#    c(20, 50, 200, 1000), and plot the distribution of estimates
#    with geom_density, where samplesize is represented by fill.
#    Add a red vertical line to represent the true value of beta 1.
map(
  .x = c(20, 50, 200, 1000),
  # Blank: the function to apply to each sample size.
  .f = ___
) %>%
  # Combine the four result tibbles into one.
  bind_rows() %>%
  # Blanks: the aesthetic mapping (including fill), and the
  # arguments that draw the red vertical line.
  ggplot(___) +
  # Semi-transparent so overlapping densities stay visible.
  geom_density(alpha = .5) +
  geom_vline(___)
# 8) Interpret: does omitted variable bias disappear as the sample
# size increases?
#
#
# 12 Omitted Variable Bias Simulation
In this classwork, you’ll do another simulation. This time, you’ll:
- Generate a simulated data set with variables `ability`, `education`, and `earnings`.
- Let `education` depend somewhat on a person’s underlying ability.
- Let `earnings` depend on both `education` and `ability`.
- Then you’ll estimate the model `earnings ~ education` to see if we get omitted variable bias when we omit ability.
- Finally, evaluate: does omitted variable bias disappear when we let the sample size grow?
Download this assignment
Here’s a link to download this assignment.