Project Checkpoint 3

Setup

For simplicity here, I will study only the top 10 countries in terms of female medals. This code is from last week’s project step.

# Function to count medals but not overcount team sports

num_female_medals <- function(country) {
  
  female_medals <- olympics |>
    filter(team == country,
           sex == "F") |>
    distinct(games, event, medal) |>
    pull(medal)
  
  
  sum(!is.na(female_medals), na.rm = TRUE)
  
}
top_countries <- olympics |>
  distinct(team) |>
  mutate(
    n_f_medals = map_dbl(team, num_female_medals)
  ) |>
  slice_max(n_f_medals, n = 10) |>
  pull(team)

top_countries
 [1] "United States" "Germany"       "Soviet Union"  "China"        
 [5] "Russia"        "East Germany"  "Great Britain" "Australia"    
 [9] "Canada"        "Netherlands"  

New data from API

I’m choosing to use the API-Ninjas API for country populations. This API requires an account and API key. In the code that you see below, I have replaced hidden the line that stores my API key, so that it is not publicly shared.

get_population <- function(country, key = my_key)
  {
  
  url <- glue("https://api.api-ninjas.com/v1/population?country={country};X-Api-Key={key}")
  my_data <- safely(fromJSON)(url)
  
  if (is.null(my_data$result)) {
    return(NA)
  } else {
    return(my_data$result$historical_population)
  }
}

get_population("Japan")
   year population yearly_change_percentage yearly_change migrants median_age
1  2024  123753041                    -0.50       -617906   153357       49.4
2  2023  124370947                    -0.50       -626631   175003       49.0
3  2022  124997578                    -0.54       -681760   175003       48.5
4  2020  126304543                    -0.31       -394881    42001       47.7
5  2015  127275872                    -0.14       -181881   168896       45.8
6  2010  128185275                     0.04         54389   131860       44.2
7  2005  127913330                     0.14        177108   113017       42.6
8  2000  127027789                     0.21        271025    23468       40.8
9  1995  125672665                     0.37        454580    87714       39.0
10 1990  123399765                     0.42        513520   124582       36.9
11 1985  120832163                     0.41        494681  -141323       34.5
12 1980  118358756                     0.87       1007512   -92314       31.7
13 1975  113321196                     1.21       1321766      107       29.7
14 1970  106712368                     1.12       1159270    33731       28.3
15 1965  100916019                     0.92        903253   -50667       26.6
16 1960   96399754                     0.85        794855  -150371       24.7
17 1955   92425478                     1.35       1196440   -95975       22.7
   fertility_rate density urban_population_pct urban_population
1            1.22     339                 92.9        114979260
2            1.21     341                 92.7        115292289
3            1.26     343                 92.5        115583843
4            1.30     346                 91.9        116099672
5            1.42     349                 91.9        116944428
6            1.36     352                 91.1        116741034
7            1.25     351                 86.3        110340709
8            1.35     348                 79.0        100303716
9            1.41     345                 78.5         98593178
10           1.51     338                 78.0         96298507
11           1.74     331                 77.4         93507944
12           1.74     325                 75.8         89755553
13           1.92     311                 75.1         85121987
14           2.04     293                 70.7         75417163
15           2.09     277                 66.2         66812422
16           1.98     264                 61.5         59269408
17           2.35     254                 56.3         52005319
   percentage_of_world_population rank
1                            1.52   12
2                            1.54   12
3                            1.56   12
4                            1.60   11
5                            1.70   10
6                            1.83   10
7                            1.94   10
8                            2.06    9
9                            2.18    8
10                           2.32    7
11                           2.48    7
12                           2.66    7
13                           2.78    6
14                           2.89    6
15                           3.03    5
16                           3.20    5
17                           3.37    5
get_population("United States")
Warning in open.connection(con, "rb"): URL
'https://api.api-ninjas.com/v1/population?country=United
States;X-Api-Key=1ekT3dWHWPDuwhNECaYVFxJqyiEDVZo6HJqPh9xF': status was 'URL
using bad/illegal format or missing URL'
[1] NA

Since “United States” has a space in it, it isn’t being searched properly by the API URL. After some investigation on the API site and experimentation, I see that I can use country abbreviations instead.

top_countries <- olympics |>
  distinct(team, noc) |>
  mutate(
    n_f_medals = map_dbl(team, num_female_medals)
  ) |>
  slice_max(n_f_medals, n = 10) |>
  pull(noc)

top_countries
 [1] "USA" "GER" "URS" "CHN" "RUS" "GDR" "GBR" "AUS" "CAN" "NED"
data_list <- top_countries |>
  map(get_population)

# Check for ones that didn't work
data_list |> is.na() |> sum()
[1] 0

Now let’s make this a tibble so we can join it to our original data:

pop_df <- data_list |>
  set_names(top_countries) |>
  bind_rows(
    .id = "noc"
  ) 

pop_df |> head()
  noc year population yearly_change_percentage yearly_change migrants
1 USA 2024  345426571                     0.57       1949236  1286132
2 USA 2023  343477335                     0.57       1943289  1322668
3 USA 2022  341534046                     0.40       1372605  1319009
4 USA 2020  339436159                     0.49       1646092   329769
5 USA 2015  326126497                     0.95       3012741  1722127
6 USA 2010  311062790                     1.02       3069225  1594453
  median_age fertility_rate density urban_population_pct urban_population
1       38.3           1.62      38                 82.4        284698234
2       38.0           1.62      38                 82.1        281984165
3       37.7           1.67      37                 81.8        279286931
4       37.2           1.62      37                 80.7        273975139
5       36.4           1.83      36                 80.1        261287811
6       35.9           1.92      34                 80.1        249297076
  percentage_of_world_population rank
1                           4.23    3
2                           4.24    3
3                           4.26    3
4                           4.30    3
5                           4.37    3
6                           4.43    3

Now, we join the datasets:

olymp_top <- olympics |>
  filter(noc %in% top_countries) |>
  left_join(pop_df)
Joining with `by = join_by(noc, year)`
olymp_top |> head()
# A tibble: 6 × 26
     id name      sex     age height weight team  noc   games  year season city 
  <dbl> <chr>     <chr> <dbl>  <dbl>  <dbl> <chr> <chr> <chr> <dbl> <chr>  <chr>
1     1 A Dijiang M        24    180     80 China CHN   1992…  1992 Summer Barc…
2     2 A Lamusi  M        23    170     60 China CHN   2012…  2012 Summer Lond…
3     5 Christin… F        21    185     82 Neth… NED   1988…  1988 Winter Calg…
4     5 Christin… F        21    185     82 Neth… NED   1988…  1988 Winter Calg…
5     5 Christin… F        25    185     82 Neth… NED   1992…  1992 Winter Albe…
6     5 Christin… F        25    185     82 Neth… NED   1992…  1992 Winter Albe…
# ℹ 14 more variables: sport <chr>, event <chr>, medal <chr>, population <int>,
#   yearly_change_percentage <dbl>, yearly_change <dbl>, migrants <dbl>,
#   median_age <dbl>, fertility_rate <dbl>, density <int>,
#   urban_population_pct <dbl>, urban_population <int>,
#   percentage_of_world_population <dbl>, rank <int>

Note: There are a lot of NAs because the population data doesn’t go as far back in time as the olympic data!

Finally, we use our new information to produce an interesting summary:

olymp_top |>
  summarize(
    n_gold = sum(medal == "Gold", na.rm = TRUE),
    population = population[1],
    .by = c(noc, sex, year)
  ) |>
  mutate(
    n_gold_pp = n_gold/population
  ) |>
  filter(sex == "F") |>
  group_by(noc) |>
  summarize(
    n_gold_pp = median(n_gold_pp, na.rm = TRUE)
  ) |>
  arrange(desc(n_gold_pp))
# A tibble: 10 × 2
   noc   n_gold_pp
   <chr>     <dbl>
 1 URS     2.69e-5
 2 GER     1.48e-7
 3 NED     1.34e-7
 4 AUS     9.38e-8
 5 USA     7.26e-8
 6 CAN     5.59e-8
 7 GBR     1.75e-8
 8 CHN     5.92e-9
 9 GDR    NA      
10 RUS    NA