Chapter 11 APIs (Application Programming Interface)
11.1 Leading Causes of Deaths
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5 ✔ purrr 0.3.4
✔ tibble 3.1.4 ✔ dplyr 1.0.7
✔ tidyr 1.1.4 ✔ stringr 1.4.0
✔ readr 2.0.2 ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
<- read_csv("data/NCHS_-_Leading_Causes_of_Death__United_States.csv") lcod
Rows: 10868 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): 113 Cause Name, Cause Name, State
dbl (1): Year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lcod
# A tibble: 10,868 × 6
Year `113 Cause Name` `Cause Name` State Deaths `Age-adjusted De…
<dbl> <chr> <chr> <chr> <dbl> <dbl>
1 2017 Accidents (unintention… Unintentional… United… 169936 49.4
2 2017 Accidents (unintention… Unintentional… Alabama 2703 53.8
3 2017 Accidents (unintention… Unintentional… Alaska 436 63.7
4 2017 Accidents (unintention… Unintentional… Arizona 4184 56.2
5 2017 Accidents (unintention… Unintentional… Arkans… 1625 51.8
6 2017 Accidents (unintention… Unintentional… Califo… 13840 33.2
7 2017 Accidents (unintention… Unintentional… Colora… 3037 53.6
8 2017 Accidents (unintention… Unintentional… Connec… 2078 53.2
9 2017 Accidents (unintention… Unintentional… Delawa… 608 61.9
10 2017 Accidents (unintention… Unintentional… Distri… 427 61
# … with 10,858 more rows
Exercise
- Filter the dataset to just look at the year 2017
- Get the unique list of Causes from the
113 Cause Name
columns - Get the values for the
All Causes
death counts
<- lcod %>%
lcod_2017 filter(Year == 2017)
%>%
lcod_2017 pull(`113 Cause Name`) %>%
unique()
[1] "Accidents (unintentional injuries) (V01-X59,Y85-Y86)"
[2] "All Causes"
[3] "Alzheimer's disease (G30)"
[4] "Cerebrovascular diseases (I60-I69)"
[5] "Chronic lower respiratory diseases (J40-J47)"
[6] "Diabetes mellitus (E10-E14)"
[7] "Diseases of heart (I00-I09,I11,I13,I20-I51)"
[8] "Influenza and pneumonia (J09-J18)"
[9] "Intentional self-harm (suicide) (*U03,X60-X84,Y87.0)"
[10] "Malignant neoplasms (C00-C97)"
[11] "Nephritis, nephrotic syndrome and nephrosis (N00-N07,N17-N19,N25-N27)"
<- lcod_2017 %>%
lcod_2017_all filter(`113 Cause Name`== "All Causes")
11.2 Adjusted Counts
Reference population
count / population at risk * 100,000
Cannot compare values with different reference populations
mortality data: https://www.cdc.gov/nchs/data_access/vitalstatsonline.htm#Mortality_Multiple
file format: https://github.com/tommaho/VS13MORT.DUSMCPUB-Parser
Median age: https://www.statsamerica.org/sip/rank_list.aspx?rank_label=pop46&ct=S09
Let’s go do population adjusted rates instead
11.3 Census API
library(tidycensus)
- Get your API key: https://api.census.gov/data/key_signup.html
census_api_key("YOUR API KEY GOES HERE", install = TRUE)
11.3.1 Find the tables you need
<- load_variables(2010, "acs5", cache = TRUE)
acs5_2010_vars acs5_2010_vars
<- read_csv("data/census/acs5_2010_vars.csv") acs5_2010_vars
Rows: 20927 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): name, label, concept
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- acs5_2010_vars %>%
acs5_2010_vars_age filter(stringr::str_detect(concept, "AGE")) %>%
filter(concept == "SEX BY AGE")
11.3.2 Download the data
<- get_acs(geography = "state",
census_dat variables = "B01001_003",
year = 2010)
census_dat
<- read_csv("data/census/state-B01001_003-2010.csv") census_dat
Rows: 52 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): GEOID, NAME, variable
dbl (2): estimate, moe
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
%>%
census_dat mutate(lablel = "B01001_003")
# A tibble: 52 × 6
GEOID NAME variable estimate moe lablel
<chr> <chr> <chr> <dbl> <dbl> <chr>
1 01 Alabama B01001_003 154287 569 B01001_003
2 02 Alaska B01001_003 26714 233 B01001_003
3 04 Arizona B01001_003 236518 411 B01001_003
4 05 Arkansas B01001_003 100110 620 B01001_003
5 06 California B01001_003 1300849 740 B01001_003
6 08 Colorado B01001_003 173902 544 B01001_003
7 09 Connecticut B01001_003 105318 199 B01001_003
8 10 Delaware B01001_003 28476 88 B01001_003
9 11 District of Columbia B01001_003 16463 74 B01001_003
10 12 Florida B01001_003 552027 785 B01001_003
# … with 42 more rows
11.3.3 Write a function
<- function(var_name, label) {
download_census <- get_acs(geography = "state",
census_dat variables = var_name,
year = 2010)
<- census_dat %>%
census_dat mutate(label = label)
return(census_dat)
}
<- download_census("B01001_003","Estimate!!Total!!Male!!Under 5 years")
census_tbl census_tbl
<- read_csv("data/census/B01001_003-state-2010.csv") census_tbl
Rows: 52 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): GEOID, NAME, variable, label
dbl (2): estimate, moe
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
11.3.4 Download and Tidy Data
<- purrr::map2_df(acs5_2010_vars_age$name,
census $label,
acs5_2010_vars_age download_census)
<- read_csv("data/census/B01001-state-2010.csv") census
Rows: 2548 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): GEOID, NAME, variable, label
dbl (2): estimate, moe
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
%>%
census distinct(label)
# A tibble: 49 × 1
label
<chr>
1 Estimate!!Total
2 Estimate!!Total!!Male
3 Estimate!!Total!!Male!!Under 5 years
4 Estimate!!Total!!Male!!5 to 9 years
5 Estimate!!Total!!Male!!10 to 14 years
6 Estimate!!Total!!Male!!15 to 17 years
7 Estimate!!Total!!Male!!18 and 19 years
8 Estimate!!Total!!Male!!20 years
9 Estimate!!Total!!Male!!21 years
10 Estimate!!Total!!Male!!22 to 24 years
# … with 39 more rows
<- census %>%
census_age filter(stringr::str_detect(label, "!!Male!!") | stringr::str_detect(label, "!!Female!!")) %>%
separate(label, into = c("estimate_text", "total", "gender", "age_group"), sep="!!") %>%
select(-estimate_text, -total)
11.3.5 Population counts
<- census_age %>%
pop_state_age group_by(NAME, age_group) %>%
summarize(pop = sum(estimate)) %>%
ungroup()
`summarise()` has grouped output by 'NAME'. You can override using the `.groups` argument.
pop_state_age
# A tibble: 1,196 × 3
NAME age_group pop
<chr> <chr> <dbl>
1 Alabama 10 to 14 years 316989
2 Alabama 15 to 17 years 202056
3 Alabama 18 and 19 years 142038
4 Alabama 20 years 73079
5 Alabama 21 years 69099
6 Alabama 22 to 24 years 188000
7 Alabama 25 to 29 years 307389
8 Alabama 30 to 34 years 289796
9 Alabama 35 to 39 years 309435
10 Alabama 40 to 44 years 323835
# … with 1,186 more rows
# check to see if we got everything
$age_group %>%
census_ageunique() %>%
length() * 52
[1] 1196
# save if you want
# write_csv("data/census_state_age_group.csv")
11.4 Population Adjusted Death
<- pop_state_age %>%
pop_state group_by(NAME) %>%
summarize(pop = sum(pop)) %>%
ungroup()
pop_state
# A tibble: 52 × 2
NAME pop
<chr> <dbl>
1 Alabama 4712651
2 Alaska 691189
3 Arizona 6246816
4 Arkansas 2872684
5 California 36637290
6 Colorado 4887061
7 Connecticut 3545837
8 Delaware 881278
9 District of Columbia 584400
10 Florida 18511620
# … with 42 more rows
<- inner_join(lcod_2017_all, pop_state, by = c("State" = "NAME"))
cod_state_pop dim(cod_state_pop)
[1] 51 7
cod_state_pop
# A tibble: 51 × 7
Year `113 Cause Name` `Cause Name` State Deaths `Age-adjusted Dea… pop
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 2017 All Causes All causes Alabama 53238 918. 4.71e6
2 2017 All Causes All causes Alaska 4411 709. 6.91e5
3 2017 All Causes All causes Arizona 57758 678. 6.25e6
4 2017 All Causes All causes Arkansas 32588 900. 2.87e6
5 2017 All Causes All causes Califor… 268189 619. 3.66e7
6 2017 All Causes All causes Colorado 38063 663. 4.89e6
7 2017 All Causes All causes Connect… 31312 651. 3.55e6
8 2017 All Causes All causes Delaware 9178 750. 8.81e5
9 2017 All Causes All causes Distric… 4965 725. 5.84e5
10 2017 All Causes All causes Florida 203636 672. 1.85e7
# … with 41 more rows
<- cod_state_pop %>%
cod_state_pop mutate(death_rate_adj_pop = (Deaths / pop) * 100000)
cod_state_pop
# A tibble: 51 × 8
Year `113 Cause Name` `Cause Name` State Deaths `Age-adjusted Dea… pop
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 2017 All Causes All causes Alabama 53238 918. 4.71e6
2 2017 All Causes All causes Alaska 4411 709. 6.91e5
3 2017 All Causes All causes Arizona 57758 678. 6.25e6
4 2017 All Causes All causes Arkansas 32588 900. 2.87e6
5 2017 All Causes All causes Califor… 268189 619. 3.66e7
6 2017 All Causes All causes Colorado 38063 663. 4.89e6
7 2017 All Causes All causes Connect… 31312 651. 3.55e6
8 2017 All Causes All causes Delaware 9178 750. 8.81e5
9 2017 All Causes All causes Distric… 4965 725. 5.84e5
10 2017 All Causes All causes Florida 203636 672. 1.85e7
# … with 41 more rows, and 1 more variable: death_rate_adj_pop <dbl>