Chapter 5 Descriptive Calculations
5.1 Introduction
5.2 Learning Objectives
5.3 Building the pipeline
New script, clear your environment, re-load your libraries
R
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5 ✔ purrr 0.3.4
✔ tibble 3.1.4 ✔ dplyr 1.0.7
✔ tidyr 1.1.4 ✔ stringr 1.4.0
✔ readr 2.0.2 ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
tumor_subset <- read_csv("data/tumor_filtered.csv")
Rows: 60 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (4): Group, ID, Day, Size
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tumor_subset
# A tibble: 60 × 4
Group ID Day Size
<dbl> <dbl> <dbl> <dbl>
1 1 101 0 41.8
2 1 101 13 1030.
3 1 102 0 79.4
4 1 102 13 619.
5 1 103 0 44.8
6 1 104 0 67.7
7 1 105 0 54.7
8 1 105 13 1699.
9 1 106 0 60
10 1 107 0 46.8
# … with 50 more rows
Python
import pandas as pd
tumor_subset = pd.read_csv("data/tumor_filtered.csv")
tumor_subset
Group ID Day Size
0 1 101 0 41.8
1 1 101 13 1030.4
2 1 102 0 79.4
3 1 102 13 618.8
4 1 103 0 44.8
5 1 104 0 67.7
6 1 105 0 54.7
7 1 105 13 1699.3
8 1 106 0 60.0
9 1 107 0 46.8
10 1 107 13 2342.6
11 1 108 0 49.4
12 1 108 13 2295.9
13 2 201 0 49.1
14 2 201 13 455.5
15 2 202 0 60.6
16 2 203 0 41.5
17 2 204 0 46.8
18 2 205 0 39.5
19 2 205 13 552.5
20 2 206 0 53.5
21 2 207 0 43.5
22 2 207 13 62.4
23 2 208 0 64.4
24 2 209 0 47.5
25 2 210 0 71.7
26 2 210 13 743.5
27 3 301 0 44.1
28 3 301 13 2058.7
29 3 302 0 42.1
30 3 302 13 589.5
31 3 303 0 42.5
32 3 303 13 442.4
33 3 304 0 56.9
34 3 304 13 1066.4
35 3 305 0 46.7
36 3 305 13 455.3
37 3 306 0 51.2
38 3 306 13 1177.5
39 3 307 0 44.0
40 3 307 13 651.8
41 3 308 0 59.8
42 3 308 13 956.9
43 3 309 0 40.7
44 3 309 13 565.4
45 3 310 0 58.2
46 3 310 13 1372.7
47 4 401 0 41.3
48 4 402 0 53.5
49 4 403 0 45.8
50 4 403 13 624.1
51 4 404 0 48.2
52 4 405 0 47.7
53 4 405 13 527.1
54 4 406 0 69.2
55 4 406 13 1227.7
56 4 407 0 43.9
57 4 408 0 59.3
58 4 409 0 51.1
59 4 409 13 691.4
5.4 Summary statistics
R
tumor_subset %>%
  filter(Day == 0) %>%
  pull(Size)
[1] 41.8 79.4 44.8 67.7 54.7 60.0 46.8 49.4 49.1 60.6 41.5 46.8 39.5 53.5 43.5
[16] 64.4 47.5 71.7 44.1 42.1 42.5 56.9 46.7 51.2 44.0 59.8 40.7 58.2 41.3 53.5
[31] 45.8 48.2 47.7 69.2 43.9 59.3 51.1
tumor_subset %>%
  filter(Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 51.59189
Python
tumor_subset.loc[tumor_subset["Day"] == 0, "Size"]
0 41.8
2 79.4
4 44.8
5 67.7
6 54.7
8 60.0
9 46.8
11 49.4
13 49.1
15 60.6
16 41.5
17 46.8
18 39.5
20 53.5
21 43.5
23 64.4
24 47.5
25 71.7
27 44.1
29 42.1
31 42.5
33 56.9
35 46.7
37 51.2
39 44.0
41 59.8
43 40.7
45 58.2
47 41.3
48 53.5
49 45.8
51 48.2
52 47.7
54 69.2
56 43.9
57 59.3
58 51.1
Name: Size, dtype: float64
tumor_subset.loc[tumor_subset["Day"] == 0, "Size"].mean()
51.59189189189189
(tumor_subset
    .loc[tumor_subset["Day"] == 0, "Size"]
    .mean()
)
51.59189189189189
R
tumor_subset %>%
  filter(Group == 1, Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 55.575
tumor_subset %>%
  filter(Group == 2, Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 51.81
tumor_subset %>%
  filter(Group == 3, Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 48.62
tumor_subset %>%
  filter(Group == 4, Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 51.11111
tumor_subset %>%
  pull(Group) %>%
  unique()
[1] 1 2 3 4
Python
(tumor_subset
    .loc[tumor_subset["Group"] == 1, "Size"]
    .mean()
)
648.5846153846154
(tumor_subset
    .loc[tumor_subset["Group"] == 2, "Size"]
    .mean()
)
166.57142857142858
(tumor_subset
    .loc[tumor_subset["Group"] == 3, "Size"]
    .mean()
)
491.14000000000004
(tumor_subset
    .loc[tumor_subset["Group"] == 4, "Size"]
    .mean()
)
271.56153846153853
(tumor_subset
    .loc[:, "Group"]
    .unique()
)
array([1, 2, 3, 4])
5.5 Groupby operations
R
tumor_subset %>%
  filter(Day == 0) %>%
  group_by(Group) %>%
  summarize(avg_size = mean(Size))
# A tibble: 4 × 2
Group avg_size
<dbl> <dbl>
1 1 55.6
2 2 51.8
3 3 48.6
4 4 51.1
Python
(tumor_subset
    .loc[tumor_subset["Day"] == 0]
    .groupby("Group")
    .mean()
)
ID Day Size
Group
1 104.5 0.0 55.575000
2 205.5 0.0 51.810000
3 305.5 0.0 48.620000
4 405.0 0.0 51.111111
(tumor_subset
    .loc[tumor_subset["Day"] == 0]
    .groupby("Group")
    ["Size"]
    .mean()
)
Group
1 55.575000
2 51.810000
3 48.620000
4 51.111111
Name: Size, dtype: float64
import numpy as np
(tumor_subset
    .loc[tumor_subset["Day"] == 0]
    .groupby("Group")
    ["Size"]
    .agg(avg_size = np.mean)
)
avg_size
Group
1 55.575000
2 51.810000
3 48.620000
4 51.111111
(tumor_subset
    .loc[tumor_subset["Day"] == 0]
    .groupby("Group")
    .agg(avg_size = ("Size", np.mean))
)
avg_size
Group
1 55.575000
2 51.810000
3 48.620000
4 51.111111
R
tumor_subset %>%
  group_by(Group, Day) %>%
  summarize(avg_size = mean(Size),
            sd_size = sd(Size),
            q1 = quantile(Size, probs = .25)
  )
`summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.
# A tibble: 8 × 5
# Groups: Group [4]
Group Day avg_size sd_size q1
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 55.6 12.9 46.3
2 1 13 1597. 764. 1030.
3 2 0 51.8 10.6 44.3
4 2 13 453. 287. 357.
5 3 0 48.6 7.30 42.9
6 3 13 934. 510. 571.
7 4 0 51.1 8.64 45.8
8 4 13 768. 314. 600.
Python
(tumor_subset
    .groupby(["Group", "Day"])
    .agg(avg_size = ("Size", np.mean),
         std_size = ("Size", np.std),
         q25_size = ("Size", lambda x: np.quantile(x, q=.25)))
)
avg_size std_size q25_size
Group Day
1 0 55.575000 12.850653 46.300
13 1597.400000 763.668099 1030.400
2 0 51.810000 10.604763 44.325
13 453.475000 286.858971 357.225
3 0 48.620000 7.304002 42.875
13 933.660000 509.875839 571.425
4 0 51.111111 8.635312 45.800
13 767.575000 314.075865 599.850
(tumor_subset
    .groupby(["Group", "Day"])
    .agg(avg_size = ("Size", np.mean),
         std_size = ("Size", np.std),
         q25_size = ("Size", lambda x: np.quantile(x, q=.25)))
    .reset_index()
)
Group Day avg_size std_size q25_size
0 1 0 55.575000 12.850653 46.300
1 1 13 1597.400000 763.668099 1030.400
2 2 0 51.810000 10.604763 44.325
3 2 13 453.475000 286.858971 357.225
4 3 0 48.620000 7.304002 42.875
5 3 13 933.660000 509.875839 571.425
6 4 0 51.111111 8.635312 45.800
7 4 13 767.575000 314.075865 599.850
5.6 Summary
5.7 Additional Resources
knitr::opts_chunk$set(comment = "")