Chapter 11 APIs (Application Programming Interface)

11.1 Leading Causes of Deaths

library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.0.2     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
lcod <- read_csv("data/NCHS_-_Leading_Causes_of_Death__United_States.csv")
Rows: 10868 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): 113 Cause Name, Cause Name, State
dbl (1): Year

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lcod
# A tibble: 10,868 × 6
    Year `113 Cause Name`        `Cause Name`   State   Deaths `Age-adjusted De…
   <dbl> <chr>                   <chr>          <chr>    <dbl>             <dbl>
 1  2017 Accidents (unintention… Unintentional… United… 169936              49.4
 2  2017 Accidents (unintention… Unintentional… Alabama   2703              53.8
 3  2017 Accidents (unintention… Unintentional… Alaska     436              63.7
 4  2017 Accidents (unintention… Unintentional… Arizona   4184              56.2
 5  2017 Accidents (unintention… Unintentional… Arkans…   1625              51.8
 6  2017 Accidents (unintention… Unintentional… Califo…  13840              33.2
 7  2017 Accidents (unintention… Unintentional… Colora…   3037              53.6
 8  2017 Accidents (unintention… Unintentional… Connec…   2078              53.2
 9  2017 Accidents (unintention… Unintentional… Delawa…    608              61.9
10  2017 Accidents (unintention… Unintentional… Distri…    427              61  
# … with 10,858 more rows

Exercise

  1. Filter the dataset to just look at the year 2017
  2. Get the unique list of Causes from the 113 Cause Name columns
  3. Get the values for the All Causes death counts
lcod_2017 <- lcod %>%
  filter(Year == 2017)
lcod_2017 %>%
  pull(`113 Cause Name`) %>%
  unique()
 [1] "Accidents (unintentional injuries) (V01-X59,Y85-Y86)"                 
 [2] "All Causes"                                                           
 [3] "Alzheimer's disease (G30)"                                            
 [4] "Cerebrovascular diseases (I60-I69)"                                   
 [5] "Chronic lower respiratory diseases (J40-J47)"                         
 [6] "Diabetes mellitus (E10-E14)"                                          
 [7] "Diseases of heart (I00-I09,I11,I13,I20-I51)"                          
 [8] "Influenza and pneumonia (J09-J18)"                                    
 [9] "Intentional self-harm (suicide) (*U03,X60-X84,Y87.0)"                 
[10] "Malignant neoplasms (C00-C97)"                                        
[11] "Nephritis, nephrotic syndrome and nephrosis (N00-N07,N17-N19,N25-N27)"
lcod_2017_all <- lcod_2017 %>%
  filter(`113 Cause Name`== "All Causes")

11.2 Adjusted Counts

Let’s go do population adjusted rates instead

11.3 Census API

library(tidycensus)
census_api_key("YOUR API KEY GOES HERE", install = TRUE)

11.3.1 Find the tables you need

acs5_2010_vars <- load_variables(2010, "acs5", cache = TRUE)
acs5_2010_vars
acs5_2010_vars <- read_csv("data/census/acs5_2010_vars.csv")
Rows: 20927 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): name, label, concept

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
acs5_2010_vars_age <- acs5_2010_vars %>%
  filter(stringr::str_detect(concept, "AGE")) %>%
  filter(concept == "SEX BY AGE")

11.3.2 Download the data

census_dat <- get_acs(geography = "state",
                        variables = "B01001_003",
                        year = 2010)
census_dat
census_dat <- read_csv("data/census/state-B01001_003-2010.csv")
Rows: 52 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): GEOID, NAME, variable
dbl (2): estimate, moe

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
census_dat %>%
  mutate(lablel = "B01001_003")
# A tibble: 52 × 6
   GEOID NAME                 variable   estimate   moe lablel    
   <chr> <chr>                <chr>         <dbl> <dbl> <chr>     
 1 01    Alabama              B01001_003   154287   569 B01001_003
 2 02    Alaska               B01001_003    26714   233 B01001_003
 3 04    Arizona              B01001_003   236518   411 B01001_003
 4 05    Arkansas             B01001_003   100110   620 B01001_003
 5 06    California           B01001_003  1300849   740 B01001_003
 6 08    Colorado             B01001_003   173902   544 B01001_003
 7 09    Connecticut          B01001_003   105318   199 B01001_003
 8 10    Delaware             B01001_003    28476    88 B01001_003
 9 11    District of Columbia B01001_003    16463    74 B01001_003
10 12    Florida              B01001_003   552027   785 B01001_003
# … with 42 more rows

11.3.3 Write a function

download_census <- function(var_name, label) {
  census_dat <- get_acs(geography = "state",
                        variables = var_name,
                        year = 2010)
  census_dat <- census_dat %>%
    mutate(label = label)
  return(census_dat)
}
census_tbl <- download_census("B01001_003","Estimate!!Total!!Male!!Under 5 years")
census_tbl
census_tbl <- read_csv("data/census/B01001_003-state-2010.csv")
Rows: 52 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): GEOID, NAME, variable, label
dbl (2): estimate, moe

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

11.3.4 Download and Tidy Data

census <- purrr::map2_df(acs5_2010_vars_age$name,
                         acs5_2010_vars_age$label,
                         download_census)
census <- read_csv("data/census/B01001-state-2010.csv")
Rows: 2548 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): GEOID, NAME, variable, label
dbl (2): estimate, moe

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
census %>%
  distinct(label)
# A tibble: 49 × 1
   label                                 
   <chr>                                 
 1 Estimate!!Total                       
 2 Estimate!!Total!!Male                 
 3 Estimate!!Total!!Male!!Under 5 years  
 4 Estimate!!Total!!Male!!5 to 9 years   
 5 Estimate!!Total!!Male!!10 to 14 years 
 6 Estimate!!Total!!Male!!15 to 17 years 
 7 Estimate!!Total!!Male!!18 and 19 years
 8 Estimate!!Total!!Male!!20 years       
 9 Estimate!!Total!!Male!!21 years       
10 Estimate!!Total!!Male!!22 to 24 years 
# … with 39 more rows
census_age <- census %>%
  filter(stringr::str_detect(label, "!!Male!!") | stringr::str_detect(label, "!!Female!!")) %>%
  separate(label, into = c("estimate_text", "total", "gender", "age_group"), sep="!!") %>%
  select(-estimate_text, -total)

11.3.5 Population counts

pop_state_age <- census_age %>%
  group_by(NAME, age_group) %>%
  summarize(pop = sum(estimate)) %>%
  ungroup()
`summarise()` has grouped output by 'NAME'. You can override using the `.groups` argument.
pop_state_age
# A tibble: 1,196 × 3
   NAME    age_group          pop
   <chr>   <chr>            <dbl>
 1 Alabama 10 to 14 years  316989
 2 Alabama 15 to 17 years  202056
 3 Alabama 18 and 19 years 142038
 4 Alabama 20 years         73079
 5 Alabama 21 years         69099
 6 Alabama 22 to 24 years  188000
 7 Alabama 25 to 29 years  307389
 8 Alabama 30 to 34 years  289796
 9 Alabama 35 to 39 years  309435
10 Alabama 40 to 44 years  323835
# … with 1,186 more rows
# check to see if we got everything
census_age$age_group %>%
  unique() %>%
  length() * 52
[1] 1196
# save if you want
# write_csv("data/census_state_age_group.csv")

11.4 Population Adjusted Death

pop_state <- pop_state_age %>%
  group_by(NAME) %>%
  summarize(pop = sum(pop)) %>%
  ungroup()
pop_state
# A tibble: 52 × 2
   NAME                      pop
   <chr>                   <dbl>
 1 Alabama               4712651
 2 Alaska                 691189
 3 Arizona               6246816
 4 Arkansas              2872684
 5 California           36637290
 6 Colorado              4887061
 7 Connecticut           3545837
 8 Delaware               881278
 9 District of Columbia   584400
10 Florida              18511620
# … with 42 more rows
cod_state_pop <- inner_join(lcod_2017_all, pop_state, by = c("State" = "NAME"))
dim(cod_state_pop)
[1] 51  7
cod_state_pop
# A tibble: 51 × 7
    Year `113 Cause Name` `Cause Name` State    Deaths `Age-adjusted Dea…    pop
   <dbl> <chr>            <chr>        <chr>     <dbl>              <dbl>  <dbl>
 1  2017 All Causes       All causes   Alabama   53238               918. 4.71e6
 2  2017 All Causes       All causes   Alaska     4411               709. 6.91e5
 3  2017 All Causes       All causes   Arizona   57758               678. 6.25e6
 4  2017 All Causes       All causes   Arkansas  32588               900. 2.87e6
 5  2017 All Causes       All causes   Califor… 268189               619. 3.66e7
 6  2017 All Causes       All causes   Colorado  38063               663. 4.89e6
 7  2017 All Causes       All causes   Connect…  31312               651. 3.55e6
 8  2017 All Causes       All causes   Delaware   9178               750. 8.81e5
 9  2017 All Causes       All causes   Distric…   4965               725. 5.84e5
10  2017 All Causes       All causes   Florida  203636               672. 1.85e7
# … with 41 more rows
cod_state_pop <- cod_state_pop %>%
  mutate(death_rate_adj_pop = (Deaths / pop) * 100000)
cod_state_pop
# A tibble: 51 × 8
    Year `113 Cause Name` `Cause Name` State    Deaths `Age-adjusted Dea…    pop
   <dbl> <chr>            <chr>        <chr>     <dbl>              <dbl>  <dbl>
 1  2017 All Causes       All causes   Alabama   53238               918. 4.71e6
 2  2017 All Causes       All causes   Alaska     4411               709. 6.91e5
 3  2017 All Causes       All causes   Arizona   57758               678. 6.25e6
 4  2017 All Causes       All causes   Arkansas  32588               900. 2.87e6
 5  2017 All Causes       All causes   Califor… 268189               619. 3.66e7
 6  2017 All Causes       All causes   Colorado  38063               663. 4.89e6
 7  2017 All Causes       All causes   Connect…  31312               651. 3.55e6
 8  2017 All Causes       All causes   Delaware   9178               750. 8.81e5
 9  2017 All Causes       All causes   Distric…   4965               725. 5.84e5
10  2017 All Causes       All causes   Florida  203636               672. 1.85e7
# … with 41 more rows, and 1 more variable: death_rate_adj_pop <dbl>