Load Libraries
1
- Set random seed generator to 123
- hr_2_tidy.csv is the name of your data subset
- Read it into and assign to hr
hr <- read_csv("https://estanny.com/static/week13/data/hr_2_tidy.csv",
col_types = "fddfff")
- use the skim to summarize the data in hr
Table 1: Data summary
Name |
hr |
Number of rows |
500 |
Number of columns |
6 |
_______________________ |
|
Column type frequency: |
|
factor |
4 |
numeric |
2 |
________________________ |
|
Group variables |
None |
Variable type: factor
gender |
0 |
1 |
FALSE |
2 |
mal: 256, fem: 244 |
evaluation |
0 |
1 |
FALSE |
4 |
bad: 154, fai: 142, goo: 108, ver: 96 |
salary |
0 |
1 |
FALSE |
6 |
lev: 95, lev: 94, lev: 87, lev: 85 |
status |
0 |
1 |
FALSE |
3 |
fir: 194, pro: 179, ok: 127 |
Variable type: numeric
age |
0 |
1 |
39.86 |
11.55 |
20.3 |
29.60 |
40.2 |
50.1 |
59.9 |
▇▇▇▇▇ |
hours |
0 |
1 |
49.39 |
13.15 |
35.0 |
37.48 |
45.6 |
58.9 |
79.9 |
▇▃▂▂▂ |
- specify that hours is the variable of interest
Response: hours (numeric)
# A tibble: 500 x 1
hours
<dbl>
1 78.1
2 35.1
3 36.9
4 38.5
5 36.1
6 78.1
7 76
8 35.6
9 35.6
10 56.8
# ... with 490 more rows
- hypothesize that the average hours worked is 48
Response: hours (numeric)
Null Hypothesis: point
# A tibble: 500 x 1
hours
<dbl>
1 78.1
2 35.1
3 36.9
4 38.5
5 36.1
6 78.1
7 76
8 35.6
9 35.6
10 56.8
# ... with 490 more rows
- generate 1000 replicates representing the null hypothesis
Response: hours (numeric)
Null Hypothesis: point
# A tibble: 500,000 x 2
# Groups: replicate [1,000]
replicate hours
<int> <dbl>
1 1 39.7
2 1 44.3
3 1 46.8
4 1 33.7
5 1 39.6
6 1 39.5
7 1 40.5
8 1 55.8
9 1 72.6
10 1 35.7
# ... with 499,990 more rows
- calculate the distribution of statistics from the generated data
- Assign the output null_t_distribution
- Display null_t_distribution
null_t_distribution <- hr %>%
specify(response = age) %>%
hypothesize(null = "point", mu = 48) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "t")
null_t_distribution
# A tibble: 1,000 x 2
replicate stat
<int> <dbl>
1 1 0.144
2 2 -1.72
3 3 0.404
4 4 -1.11
5 5 0.00894
6 6 1.46
7 7 -0.905
8 8 -0.663
9 9 0.291
10 10 3.09
# ... with 990 more rows
- visualize the simulated null distribution
- calculate the statistic from your observed data
- Assign the output observed_t_statistic
- Display observed_t_statistic
observed_t_statistic <- hr %>%
specify(response = hours) %>%
hypothesize(null = "point", mu = 48) %>%
calculate(stat = "t")
observed_t_statistic
# A tibble: 1 x 1
stat
<dbl>
1 2.37
- get_p_value from the simulated null distribution and the observed statistic
null_t_distribution %>%
get_p_value(obs_stat = observed_t_statistic, direction = "two-sided")
# A tibble: 1 x 1
p_value
<dbl>
1 0.014
- shade_p_value on the simulated null distribution
2
- hr_3_tidy.csv is the name of your data subset
- Read it into and assign to hr_2
hr_2 <- read_csv("https://estanny.com/static/week13/data/hr_3_tidy.csv",
col_types = "fddfff")
- Q: Is the average number of hours worked the same for both genders in hr_2?
- use skim to summarize the data in hr_2 by gender
hr_2 %>%
group_by(gender) %>%
skim()
Table 2: Data summary
Name |
Piped data |
Number of rows |
500 |
Number of columns |
6 |
_______________________ |
|
Column type frequency: |
|
factor |
3 |
numeric |
2 |
________________________ |
|
Group variables |
gender |
Variable type: factor
evaluation |
male |
0 |
1 |
FALSE |
4 |
bad: 72, fai: 67, goo: 61, ver: 47 |
evaluation |
female |
0 |
1 |
FALSE |
4 |
bad: 76, fai: 71, goo: 61, ver: 45 |
salary |
male |
0 |
1 |
FALSE |
6 |
lev: 47, lev: 43, lev: 43, lev: 42 |
salary |
female |
0 |
1 |
FALSE |
6 |
lev: 51, lev: 46, lev: 45, lev: 43 |
status |
male |
0 |
1 |
FALSE |
3 |
fir: 98, pro: 81, ok: 68 |
status |
female |
0 |
1 |
FALSE |
3 |
fir: 98, pro: 91, ok: 64 |
Variable type: numeric
age |
male |
0 |
1 |
38.23 |
10.86 |
20 |
28.9 |
37.9 |
47.05 |
59.9 |
▇▇▇▇▅ |
age |
female |
0 |
1 |
40.56 |
11.67 |
20 |
31.0 |
40.3 |
50.50 |
59.8 |
▆▆▇▆▇ |
hours |
male |
0 |
1 |
49.55 |
13.11 |
35 |
38.4 |
45.4 |
57.65 |
79.9 |
▇▃▂▂▂ |
hours |
female |
0 |
1 |
49.80 |
13.38 |
35 |
38.2 |
45.6 |
59.40 |
79.8 |
▇▂▃▂▂ |
- Use geom_boxplot to plot distributions of hours worked by gender
hr_2 %>%
ggplot(aes(x = gender, y = hours)) +
geom_boxplot()
-hypothesize that the number of hours worked and gender are independent
hr_2 %>%
specify(response = hours, explanatory = gender) %>%
hypothesize(null = "independence")
Response: hours (numeric)
Explanatory: gender (factor)
Null Hypothesis: independence
# A tibble: 500 x 2
hours gender
<dbl> <fct>
1 49.6 male
2 39.2 female
3 63.2 female
4 42.2 male
5 54.7 male
6 54.3 female
7 37.3 female
8 45.6 female
9 35.1 female
10 53 male
# ... with 490 more rows
- generate 1000 replicates representing the null hypothesis
hr_2 %>%
specify(response = hours, explanatory = gender) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute")
Response: hours (numeric)
Explanatory: gender (factor)
Null Hypothesis: independence
# A tibble: 500,000 x 3
# Groups: replicate [1,000]
hours gender replicate
<dbl> <fct> <int>
1 55.7 male 1
2 35.5 female 1
3 35.1 female 1
4 44.2 male 1
5 52.8 male 1
6 46 female 1
7 41.2 female 1
8 52.9 female 1
9 35.6 female 1
10 35 male 1
# ... with 499,990 more rows
- calculate the distribution of statistics from the generated data
- Assign the output null_distribution_2_sample_permute
- Display null_distribution_2_sample_permute
null_distribution_2_sample_permute <- hr_2 %>%
specify(response = hours, explanatory = gender) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "t", order = c("female", "male"))
null_distribution_2_sample_permute
# A tibble: 1,000 x 2
replicate stat
<int> <dbl>
1 1 -1.81
2 2 -1.29
3 3 0.0525
4 4 -0.793
5 5 0.826
6 6 0.429
7 7 0.0843
8 8 -0.264
9 9 2.42
10 10 0.603
# ... with 990 more rows
- visualize the simulated null distribution
visualise(null_distribution_2_sample_permute)
- calculate the statistic from your observed data
- Assign the output observed_t_2_sample_stat
- observed_t_2_sample_stat
observed_t_2_sample_stat <- hr_2 %>%
specify(response = hours, explanatory = gender) %>%
calculate(stat = "t", order = c("female", "male"))
observed_t_2_sample_stat
# A tibble: 1 x 1
stat
<dbl>
1 0.208
- get_p_value from the simulated null distribution and the observed statistic
null_t_distribution %>%
get_p_value(obs_stat = observed_t_2_sample_stat, direction = "two-sided")
# A tibble: 1 x 1
p_value
<dbl>
1 0.878
- shade_p_value on the simulated null distribution
null_t_distribution %>%
visualize() +
shade_p_value(obs_stat = observed_t_2_sample_stat, direction = "two-sided")
ggsave("preview_2.png",
path = here("_posts", "2021-05-18-hypothesis-testing"))
3
- hr_1_tidy.csv is the name of your data subset
- Read it into and assign to hr_anova
hr_anova <- read_csv("https://estanny.com/static/week13/data/hr_1_tidy.csv",
col_types = "fddfff")
- Q: Is the average number of hours worked the same for all three status (fired, ok and promoted) ?
- use skim to summarize the data in hr_anova by status
hr_anova %>%
group_by(status) %>%
skim()
Table 3: Data summary
Name |
Piped data |
Number of rows |
500 |
Number of columns |
6 |
_______________________ |
|
Column type frequency: |
|
factor |
3 |
numeric |
2 |
________________________ |
|
Group variables |
status |
Variable type: factor
gender |
fired |
0 |
1 |
FALSE |
2 |
fem: 96, mal: 89 |
gender |
ok |
0 |
1 |
FALSE |
2 |
fem: 77, mal: 76 |
gender |
promoted |
0 |
1 |
FALSE |
2 |
fem: 87, mal: 75 |
evaluation |
fired |
0 |
1 |
FALSE |
4 |
bad: 65, fai: 63, goo: 31, ver: 26 |
evaluation |
ok |
0 |
1 |
FALSE |
4 |
bad: 69, fai: 59, goo: 15, ver: 10 |
evaluation |
promoted |
0 |
1 |
FALSE |
4 |
ver: 63, goo: 60, fai: 20, bad: 19 |
salary |
fired |
0 |
1 |
FALSE |
6 |
lev: 41, lev: 37, lev: 32, lev: 32 |
salary |
ok |
0 |
1 |
FALSE |
6 |
lev: 40, lev: 37, lev: 29, lev: 23 |
salary |
promoted |
0 |
1 |
FALSE |
6 |
lev: 37, lev: 35, lev: 29, lev: 23 |
Variable type: numeric
age |
fired |
0 |
1 |
38.64 |
11.43 |
20.2 |
28.30 |
38.30 |
47.60 |
59.6 |
▇▇▇▅▆ |
age |
ok |
0 |
1 |
41.34 |
12.11 |
20.3 |
31.00 |
42.10 |
51.70 |
59.9 |
▆▆▆▆▇ |
age |
promoted |
0 |
1 |
42.13 |
10.98 |
21.0 |
33.40 |
42.95 |
50.98 |
59.9 |
▆▅▆▇▇ |
hours |
fired |
0 |
1 |
41.67 |
7.88 |
35.0 |
36.10 |
38.90 |
43.90 |
75.5 |
▇▂▁▁▁ |
hours |
ok |
0 |
1 |
48.05 |
11.65 |
35.0 |
37.70 |
45.60 |
56.10 |
78.2 |
▇▃▃▂▁ |
hours |
promoted |
0 |
1 |
59.27 |
12.90 |
35.0 |
51.12 |
60.10 |
70.15 |
79.7 |
▆▅▇▇▇ |
- Use geom_boxplot to plot distributions of hours worked by status
hr_anova %>%
ggplot(aes(x = status, y = hours)) +
geom_boxplot()
- specify the variables of interest are hours and status
hr_anova %>%
specify(response = hours, explanatory = status)
Response: hours (numeric)
Explanatory: status (factor)
# A tibble: 500 x 2
hours status
<dbl> <fct>
1 36.5 fired
2 55.8 ok
3 35 fired
4 52 promoted
5 35.1 ok
6 36.3 ok
7 40.1 promoted
8 42.7 fired
9 66.6 promoted
10 35.5 ok
# ... with 490 more rows
- hypothesize that the number of hours worked and status are independent
hr_anova %>%
specify(response = hours, explanatory = status) %>%
hypothesise(null = "independence")
Response: hours (numeric)
Explanatory: status (factor)
Null Hypothesis: independence
# A tibble: 500 x 2
hours status
<dbl> <fct>
1 36.5 fired
2 55.8 ok
3 35 fired
4 52 promoted
5 35.1 ok
6 36.3 ok
7 40.1 promoted
8 42.7 fired
9 66.6 promoted
10 35.5 ok
# ... with 490 more rows
- generate 1000 replicates representing the null hypothesis
hr_anova %>%
specify(response = hours, explanatory = status) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute")
Response: hours (numeric)
Explanatory: status (factor)
Null Hypothesis: independence
# A tibble: 500,000 x 3
# Groups: replicate [1,000]
hours status replicate
<dbl> <fct> <int>
1 40.3 fired 1
2 40.3 ok 1
3 37.3 fired 1
4 50.5 promoted 1
5 35.1 ok 1
6 67.8 ok 1
7 39.3 promoted 1
8 35.7 fired 1
9 40.2 promoted 1
10 38.4 ok 1
# ... with 499,990 more rows
- calculate the distribution of statistics from the generated data
- Assign the output null_distribution_anova
- Display null_distribution_anova
null_distribution_anova <- hr_anova %>%
specify(response = hours, explanatory = gender) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "F")
null_distribution_anova
# A tibble: 1,000 x 2
replicate stat
<int> <dbl>
1 1 0.365
2 2 0.650
3 3 0.185
4 4 0.0184
5 5 0.163
6 6 0.0194
7 7 4.92
8 8 2.11
9 9 0.341
10 10 0.855
# ... with 990 more rows
- visualize the simulated null distribution
- calculate the statistic from your observed data
- Assign the output observed_f_sample_stat
- Display observed_f_sample_stat
observed_f_sample_stat <- hr_anova %>%
specify(response = hours, explanatory = status) %>%
calculate(stat = "F")
observed_f_sample_stat
# A tibble: 1 x 1
stat
<dbl>
1 115.
- get_p_value from the simulated null distribution and the observed statistic
null_distribution_anova %>%
get_p_value(obs_stat = observed_f_sample_stat, direction = "greater")
# A tibble: 1 x 1
p_value
<dbl>
1 0
- shade_p_value on the simulated null distribution