# Boxplots of rodent weight by species, colored by genus.
# Instead of a legend, genus names are annotated directly on the plot:
# one label per species row (y = 1:14), placed at weight = 250.
# Genus spellings corrected: "Neotoma" (was "Neomota"), "Onychomys" (was "Onychromys").
ggplot(data = survey_data,
       mapping = aes(x = weight, y = species)) +
  geom_boxplot(aes(color = genus),
               outliers = FALSE) +
  scale_colour_brewer(palette = "Set1") +
  annotate("text", y = 1:14, x = 250, label = c("Neotoma",
                                                "Chaetodipus",
                                                "Peromyscus",
                                                "Perognathus",
                                                "Reithrodontomys",
                                                "Sigmodon",
                                                "Onychomys",
                                                "Peromyscus",
                                                "Reithrodontomys",
                                                "Dipodomys",
                                                "Dipodomys",
                                                "Chaetodipus",
                                                "Dipodomys",
                                                "Onychomys")) +
  theme(legend.position = "none") +
  labs(title = "Rodent Weight Distribution by Species",
       x = "Weight (g)",
       y = "")
Gallery of Exemplary Student Work
Challenge 2
This code is brought to us by Zack Kramer! I think this code is so special because Zack managed to specify every annotation within one call to the annotate()
function—very efficient!
Lab 3
This code is brought to us by a student who wishes to remain anonymous. What I appreciated about their code is both their efficiency and their concise output!
3. Provide a brief overview (~4 sentences) of the dataset.
This data set contains the metrics generated from students’ teacher evaluations at a University in Poland during the winter semester of the 2020-2021 academic year. The data set is 8015 by 22, representing 8015 observations with 22 variables. Each row of data corresponds to a specific teacher, their class identifier and an evaluation question. And for each of these, the average score of the question, the teacher’s information, and additional student metrics are observed.
6. How many unique instructors and unique courses are present in the cleaned dataset?
# Q6: count the distinct teachers, courses, and teacher-course pairings
# in the cleaned evaluation data.
teacher_evals_clean |>
  summarise(unique_teachers = n_distinct(teacher_id),
            unique_courses = n_distinct(course_id),
            unique_combinations = n_distinct(course_id, teacher_id))
# A tibble: 1 × 3
unique_teachers unique_courses unique_combinations
<int> <int> <int>
1 297 939 1094
9. Each course seems to have used a different subset of the nine evaluation questions. How many teacher-course combinations asked all nine questions?
# Q9: how many teacher-course combinations asked all nine evaluation questions?
teacher_evals_clean |>
  group_by(course_id, teacher_id) |>
  summarise(question_count = n_distinct(question_no),
            .groups = "drop") |>
  filter(question_count == 9) |>
  count() |>
  rename(teacher_course = n)
# A tibble: 1 × 1
teacher_course
<int>
1 49
10. Which instructors had the highest and lowest average rating for Question 1 (I learnt a lot during the course.) across all their courses?
# Q10: instructors with the highest and lowest average rating for Question 1
# (question_no 901 corresponds to "I learnt a lot during the course.").
teacher_evals_clean |>
  group_by(teacher_id) |>
  filter(question_no == 901) |>
  summarize(avg_q1_score = mean(SET_score_avg),
            .groups = "drop") |>
  filter(avg_q1_score == min(avg_q1_score) |
           avg_q1_score == max(avg_q1_score))
# A tibble: 82 × 2
teacher_id avg_q1_score
<fct> <dbl>
1 3471 5
2 3513 5
3 3543 5
4 3608 5
5 3662 5
6 3682 5
7 37013 5
8 37023 5
9 37026 5
10 37988 5
# ℹ 72 more rows
Lab 4
4. Let’s consider the median household income of each region, and how that income has changed over time. Create a table with ten rows, one for each region, and two columns, one for 2008 and one for 2018. The cells should contain the median of the median household income (expressed in 2018 dollars) for each region and study_year. Arrange the rows by 2018 values.
This code is brought to us by Oliver Lane & Cassandra Miller! What I appreciate about their code is how their tables have descriptive titles!
Oliver’s Code:
# Median of the regional median household income (2018 dollars) for 2008 and
# 2018, pivoted to one column per study year, ordered by the 2018 values.
ca_childcare |>
  filter(study_year %in% c(2008, 2018)) |>
  group_by(region, study_year) |>
  summarise(median_income = median(mhi_2018, na.rm = TRUE), .groups = 'drop') |>
  pivot_wider(names_from = study_year,
              values_from = median_income,
              names_prefix = "median_income_") |>
  arrange(desc(median_income_2018))
# A tibble: 10 × 3
region median_income_2008 median_income_2018
<fct> <dbl> <dbl>
1 San Francisco Bay Area 90412. 104552
2 Orange County 86452. 85398
3 Central Coast 72979 74849
4 Los Angeles County 63471. 64251
5 Inland Empire 65977. 62056
6 San Diego - Imperial 58201. 60344.
7 Northern San Joaquin Valley 59108. 57769
8 Superior California 57831. 53270
9 Southern San Joaquin Valley 52676. 52479
10 North Coast 47862. 45528
Cassandra’s Code:
# Same regional income summary, saved as `median_income_by_region` for reuse
# in later questions (columns Income_2008 / Income_2018).
median_income_by_region <- ca_childcare |>
  filter(study_year %in% c(2008, 2018)) |>
  group_by(region, study_year) |>
  summarise(median_income = median(mhi_2018, na.rm = TRUE), .groups = 'drop') |>
  pivot_wider(names_from = study_year, values_from = median_income, names_prefix = "Income_")
5. Which California region had the lowest full-time median weekly price for center-based childcare for infants in 2018? Does this region correspond to the region with the lowest median income in 2018 that you found in Q4?
This code is brought to us by Eva Moylan & Tillman Erb! What I appreciate about their code is how they went above and beyond by including both the median price of center-based childcare for infants and median household income for each region.
Eva’s Code:
# Region with the lowest median weekly infant-care price in 2018, shown
# alongside that region's median household income.
ca_childcare |>
  group_by(region, study_year) |>
  summarize(mhi_2018 = median(mhi_2018), # values now are median for each year at each county
            mc_infant = median(mc_infant),
            .groups = 'drop') |>
  filter(study_year == 2018) |>
  pivot_wider(id_cols = "region", # region stays from original table orientation
              names_from = study_year,
              values_from = c(mhi_2018,
                              mc_infant)) |>
  slice_min(mc_infant_2018) # Superior CA has the lowest med childcare for infants in 2018
# A tibble: 1 × 3
region mhi_2018_2018 mc_infant_2018
<fct> <dbl> <dbl>
1 Superior California 53270 215.
Tillman’s Code:
# Regions ranked by 2018 median weekly infant-care price, joined with the
# 2018 median household income for comparison.
# NOTE(review): `median_inc` is presumably the Q4 income table with `2008`/`2018`
# columns — confirm it exists under this name in the session.
ca_childcare |>
  filter(study_year == 2018) |>
  group_by(region) |>
  summarize(median_weekly_cc_price = median(mc_infant)) |>
  arrange(median_weekly_cc_price) |>
  inner_join(select(median_inc, -`2008`)) |>
  rename(median_household_income = `2018`)
# A tibble: 10 × 3
region median_weekly_cc_price median_household_income
<fct> <dbl> <dbl>
1 Superior California 215. 53270
2 North Coast 226. 45528
3 Northern San Joaquin Valley 238. 57769
4 San Diego - Imperial 262. 60344.
5 Southern San Joaquin Valley 287. 52479
6 Inland Empire 294. 62056
7 Central Coast 318. 74849
8 Los Angeles County 334. 64251
9 Orange County 341. 85398
10 San Francisco Bay Area 402. 104552
6. The following plot shows, for all ten regions, the change over time of the full-time median price for center-based childcare for infants, toddlers, and preschoolers. Recreate the plot. You do not have to replicate the exact colors or theme, but your plot should have the same content, including the order of the facets and legend, reader-friendly labels, axes breaks, and a loess smoother.
This code is brought to us by Isabel Villafuerte! What I appreciate about their code is the reordering of the facets and legend, adding dollar signs to the y-axis labels, and dodging the x-axis labels (a clever way to not need to remove some labels!).
# Change over time of median weekly childcare prices for infants, toddlers,
# and preschoolers, faceted by age group with a loess smoother per region.
# fct_recode/fct_relevel set reader-friendly facet and legend order;
# fct_reorder2 orders the legend to match the right-hand ends of the lines.
ca_childcare |>
  filter(study_year >= 2008) |>
  pivot_longer(cols = c(mc_infant,
                        mc_toddler,
                        mc_preschool),
               names_to = "age_demo",
               values_to = "median_weekly_price") |>
  mutate(age_demo = fct_recode(age_demo,
                               "Infant" = "mc_infant",
                               "Toddler" = "mc_toddler",
                               "Preschool" = "mc_preschool"),
         age_demo = fct_relevel(age_demo,
                                "Infant",
                                "Toddler",
                                "Preschool")
         ) |>
  ggplot(mapping = aes(x = study_year,
                       y = median_weekly_price,
                       color = fct_reorder2(region,
                                            .x = study_year,
                                            .y = median_weekly_price))) +
  geom_point() +
  geom_smooth(method = "loess",
              se = TRUE) +
  facet_wrap(~age_demo) +
  labs(x = "Study Year",
       y = "",
       title = "Weekly Median Price for Center-Based Childcare ($)",
       color = "California Region") +
  scale_x_continuous(breaks = seq(2008,
                                  2018,
                                  by = 2)) +
  # Dodge x-axis labels onto two rows so none need to be dropped
  guides(x = guide_axis(n.dodge = 2)) +
  theme_bw() +
  scale_y_continuous(limits = c(0,
                                500),
                     labels = scales::label_dollar())
7. Create a scatterplot showing the relationship between median household income (expressed in 2018 dollars) and the full-time median weekly price charged for center-based childcare for an infant in California. Overlay a linear regression line (lm) to show the trend.
This code is brought to us by Oliver Lane & Cassandra Miller! What I appreciate about their plot is their use of color, themes, and thoughtful plot titles!
# Scatterplot of median household income vs. median weekly infant-care price,
# with a dashed linear-regression trend line overlaid.
ggplot(ca_childcare, aes(x = mhi_2018, y = mc_infant)) +
  geom_point(alpha = 0.6, color = "steelblue") + # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  theme_bw() +
  labs(
    title = "Household Income vs. Infant Care Prices",
    x = "Median Household Income (dollars)",
    y = "",
    subtitle = "Median Weekly Price for Infant Care (dollars)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
Lab 7
2. Create ONE thoughtful visualization that explores the frequency of missing values across the different years, sections, and trips.
This code is brought to us by Tillman Erb! I appreciate Tillman’s use of non-standard colors and facet titles that are clear to the reader.
# Bar chart of rows containing any missing value, counted by year and
# section, faceted by trip.
fish |>
  filter(if_any(.cols = everything(), .fns = is.na)) |>
  mutate(trip = as.character(trip),
         trip = fct_recode(trip,
                           "Trip 1" = "1",
                           "Trip 2" = "2")) |>
  ggplot(mapping = aes(x = year, fill = section)) +
  facet_grid(~ trip) +
  geom_bar() +
  scale_fill_viridis_d() +
  labs(x = "Year",
       y = NULL,
       title = "Missing observation frequency by section, trip and year",
       subtitle = "Number of missing observations",
       # the plot maps `fill`, not `color`, so the legend title must use `fill`
       fill = "Section")
This code is brought to us by Christopher Hawkins! I really like how Christopher thought outside the box and combined the year and the trip number to a single variable on the y-axis (with a descriptive plot title!).
# Dodged bar chart of missing weight counts, with year and trip combined
# into a single "year-trip" label on the y-axis.
fish |>
  group_by(year, section, trip) |>
  summarize(missing_weight = sum(is.na(weight)), .groups = "drop") |>
  # Create a combined label for year and trip
  mutate(year_trip = paste(year, trip, sep = "-")) |>
  # Plotting the data
  ggplot(mapping = aes(x = missing_weight, y = year_trip, fill = section)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Count of Missing Values in Weight by Year, Section, and Trip",
    subtitle = "Year-Trip #",
    x = "Count of Missing Weight Values",
    y = "",
    fill = "Section"
  ) +
  theme_minimal()
This visualization is brought to us by Siwuthara Prak! What I really appreciate about this plot is how it plots every year that there were measurements.
# Stacked bar chart of missing-weight counts per year, colored by section
# and faceted by trip; every year with measurements appears on the x-axis.
fish |>
  filter(is.na(weight)) |>
  mutate(trip = factor(trip, levels = c("1", "2")),
         trip = fct_recode(trip,
                           `Trip 1` = "1",
                           `Trip 2` = "2")
         ) |>
  group_by(year,
           section,
           trip) |>
  summarise(missing_count = n(),
            .groups = 'drop') |>
  ggplot(aes(x = factor(year),
             y = missing_count,
             fill = section)) +
  geom_bar(stat = "identity",
           position = "stack") +
  facet_wrap(~ trip, ncol = 2) +
  theme_bw(base_size = 8) +
  labs(
    x = "Year",
    y = "",
    subtitle = "Number of Missing Values",
    fill = "Section",
    title = "Frequency of Missing Values in 'Weight' by Year, Section, and Trip"
  ) +
  scale_fill_manual(values = c("Johnsrud" = "steelblue", "ScottyBrown" = "gray")) # Adjusted color for ScottyBrown
This code is brought to us by Alex Lung! I love the idea of faceting by year, since it gets at the same idea as Siwuthara’s plot, where the reader can see every year there are data values for. Alex also uses some fun (non-standard) colors!
# Missing-weight counts by section and trip, faceted by year so the reader
# can see every year that has data.
fish |>
  # Keep only the rows lacking weight data
  filter(is.na(weight)) |>
  # Convert trip into a discrete (factor) variable for the fill aesthetic
  mutate(trip = as.factor(trip)) |>
  group_by(year, section, trip) |>
  summarise(Missing_Count = n(), .groups = "drop") |>
  ggplot(aes(x = section, y = Missing_Count, fill = trip)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ year, ncol = 5) +
  labs(
    title = "Frequency of Missing Weight Values by Year, Section, and Trip",
    subtitle = "Count of Missing Values",
    x = "Section",
    y = "",
    fill = "Trip"
  ) +
  theme_bw(base_size = 8) +
  theme(
    strip.text = element_text(size = 12),
    aspect.ratio = 0.6,
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  scale_fill_manual(values = c("darkcyan", "cyan2"))
This code is brought to us by a student who wishes to remain anonymous. I think a line plot is a great approach here to see the changes in missing values over time, plus they use some great colors!
# Line plot of missing-weight counts over time, one line per section,
# faceted by trip.
fish |>
  group_by(year, section, trip) |>
  summarize(missing_count = sum(is.na(weight)), .groups = "drop") |>
  mutate(trip = case_when(
    trip == 1 ~ "Trip 1",
    trip == 2 ~ "Trip 2")) |>
  ggplot(aes(x = year, y = missing_count, color = section)) +
  geom_line(size = 1) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Missing Data in Measurements of Trout Species (Blackfoot River)",
    x = "Year",
    subtitle = "Weight Values",
    y = "",
    color = "Section") +
  facet_wrap(~ trip) +
  theme_minimal()
This code is brought to us by Cassandra Miller! Cassandra also used a line plot, with great facet labels and a great plot title!
# Line plot of missing-weight counts per year and section, with trip
# recoded into reader-friendly facet labels.
fish |>
  filter(is.na(weight)) |>
  group_by(year, section, trip) |>
  mutate(trip = recode(trip,
                       '1' = 'Trip 1',
                       '2' = 'Trip 2')) |>
  summarise(Missing_amount = n(), .groups = 'drop') |>
  ggplot(aes(x = year, y = Missing_amount, color = section)) +
  geom_line() +
  facet_wrap(~trip) +
  theme_minimal() +
  labs(
    x = 'Year',
    y = '',
    title = 'Frequency of Missing Values Across Various Years, Sections and Trips',
    subtitle = 'Count of Missing Values Across Years',
    color = 'Section'
  )
5. Let’s incorporate some input validation into your function. Modify your previous code so that the function stops if …
- … the input vector is not numeric.
- … the length of the input vector is not greater than 1.
This code is brought to us by Ryan Chen! I was excited to see that Ryan figured out how to add messages to the stopifnot()
function!
# Rescale a numeric vector to the [0, 1] range.
# Stops with a labelled error if `vec` is not numeric or has length <= 1.
# Returns a vector the same length as `vec`; NAs pass through (min/max use na.rm).
rescale_01 <- function(vec) {
  # validate input vector
  # (fixed: the length check previously referenced `x`, which is undefined here)
  stopifnot("Error: Input vector must be numeric" = is.numeric(vec),
            "Error: Input vector must contain more than one element" = length(vec) > 1)
  min_val <- min(vec,
                 na.rm = TRUE)
  max_val <- max(vec,
                 na.rm = TRUE)
  return((vec - min_val) / (max_val - min_val))
}
This code is brought to us by Daniel Bush! I thought it was really clever to add an additional check to see if the min and the max values were identical.
# Rescale a numeric vector to the [0, 1] range.
# Stops if the input is not numeric or has length <= 1; returns a vector of
# NAs when all values are identical (which would otherwise divide by zero).
rescale_01 <- function(vector) {
  stopifnot(is.numeric(vector),
            length(vector) > 1)
  range_vals <- range(vector, na.rm = TRUE)
  min_val <- range_vals[1]
  max_val <- range_vals[2]

  # A constant vector has zero range; return NA rather than NaN/Inf
  if (min_val == max_val) {
    return(rep(NA,
               length(vector)))
  }
  rescaled_vector <- (vector - min_val) / (max_val - min_val)

  return(rescaled_vector)
}
This code is brought to us by Tillman! I was very impressed with the addition of a na.rm
argument (that’s optional!), to control the output of the final vector.
# Rescale a numeric vector to the [0, 1] range.
# Stops if the input is not numeric or has length <= 1. The optional `na.rm`
# argument (default TRUE) drops NA entries from the *output*; the range itself
# always ignores NA/non-finite values (range(..., na.rm = TRUE, finite = TRUE)).
rescale_01 <- function(vec, na.rm = TRUE) {
  if (!is.numeric(vec[1])) {
    stop("Error: vector provided is not numeric")
  } else if (length(vec) <= 1) {
    stop("Error: vector length must be greater than 1")
  }
  vec_range <- range(vec, na.rm = TRUE, finite = TRUE)
  vec <- (vec - vec_range[1]) /
    (vec_range[2] - vec_range[1])
  if (na.rm) {
    vec <- vec[!is.na(vec)]
  }
  return(vec)
}