Chapter 5 Descriptive Calculations

5.1 Introduction

5.2 Learning Objectives

5.3 Building the pipeline

Start a new script, clear your environment, and re-load your libraries.

R
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.0.2     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
# Load data/tumor_filtered.csv into a tibble named tumor_subset.
tumor_subset <- read_csv("data/tumor_filtered.csv")
Rows: 60 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (4): Group, ID, Day, Size

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tumor_subset
# A tibble: 60 × 4
   Group    ID   Day   Size
   <dbl> <dbl> <dbl>  <dbl>
 1     1   101     0   41.8
 2     1   101    13 1030. 
 3     1   102     0   79.4
 4     1   102    13  619. 
 5     1   103     0   44.8
 6     1   104     0   67.7
 7     1   105     0   54.7
 8     1   105    13 1699. 
 9     1   106     0   60  
10     1   107     0   46.8
# … with 50 more rows
Python
import pandas as pd

# Load data/tumor_filtered.csv into a pandas DataFrame, then display it.
tumor_subset = pd.read_csv("data/tumor_filtered.csv")
tumor_subset
    Group   ID  Day    Size
0       1  101    0    41.8
1       1  101   13  1030.4
2       1  102    0    79.4
3       1  102   13   618.8
4       1  103    0    44.8
5       1  104    0    67.7
6       1  105    0    54.7
7       1  105   13  1699.3
8       1  106    0    60.0
9       1  107    0    46.8
10      1  107   13  2342.6
11      1  108    0    49.4
12      1  108   13  2295.9
13      2  201    0    49.1
14      2  201   13   455.5
15      2  202    0    60.6
16      2  203    0    41.5
17      2  204    0    46.8
18      2  205    0    39.5
19      2  205   13   552.5
20      2  206    0    53.5
21      2  207    0    43.5
22      2  207   13    62.4
23      2  208    0    64.4
24      2  209    0    47.5
25      2  210    0    71.7
26      2  210   13   743.5
27      3  301    0    44.1
28      3  301   13  2058.7
29      3  302    0    42.1
30      3  302   13   589.5
31      3  303    0    42.5
32      3  303   13   442.4
33      3  304    0    56.9
34      3  304   13  1066.4
35      3  305    0    46.7
36      3  305   13   455.3
37      3  306    0    51.2
38      3  306   13  1177.5
39      3  307    0    44.0
40      3  307   13   651.8
41      3  308    0    59.8
42      3  308   13   956.9
43      3  309    0    40.7
44      3  309   13   565.4
45      3  310    0    58.2
46      3  310   13  1372.7
47      4  401    0    41.3
48      4  402    0    53.5
49      4  403    0    45.8
50      4  403   13   624.1
51      4  404    0    48.2
52      4  405    0    47.7
53      4  405   13   527.1
54      4  406    0    69.2
55      4  406   13  1227.7
56      4  407    0    43.9
57      4  408    0    59.3
58      4  409    0    51.1
59      4  409   13   691.4

5.4 Summary statistics

R
# Keep only the Day 0 rows, then extract the Size column as a plain vector.
tumor_subset %>%
  filter(Day == 0) %>%
  pull(Size)
 [1] 41.8 79.4 44.8 67.7 54.7 60.0 46.8 49.4 49.1 60.6 41.5 46.8 39.5 53.5 43.5
[16] 64.4 47.5 71.7 44.1 42.1 42.5 56.9 46.7 51.2 44.0 59.8 40.7 58.2 41.3 53.5
[31] 45.8 48.2 47.7 69.2 43.9 59.3 51.1
# Same pipeline, with the extracted vector passed on to mean() to get the
# average Day 0 size across all groups.
tumor_subset %>%
  filter(Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 51.59189
Python
# Boolean-mask the rows where Day is 0 and select the Size column (a Series).
tumor_subset.loc[tumor_subset["Day"] == 0, "Size"]
0     41.8
2     79.4
4     44.8
5     67.7
6     54.7
8     60.0
9     46.8
11    49.4
13    49.1
15    60.6
16    41.5
17    46.8
18    39.5
20    53.5
21    43.5
23    64.4
24    47.5
25    71.7
27    44.1
29    42.1
31    42.5
33    56.9
35    46.7
37    51.2
39    44.0
41    59.8
43    40.7
45    58.2
47    41.3
48    53.5
49    45.8
51    48.2
52    47.7
54    69.2
56    43.9
57    59.3
58    51.1
Name: Size, dtype: float64
# Chain .mean() onto the selection to average the Day 0 sizes.
tumor_subset.loc[tumor_subset["Day"] == 0, "Size"].mean()
51.59189189189189
# Same computation; wrapping the expression in parentheses lets the method
# chain be split across lines, one step per line.
(tumor_subset
  .loc[tumor_subset["Day"] == 0, "Size"]
  .mean()
)
51.59189189189189
R
# Average Day 0 size for each group in turn: the same pipeline is repeated
# with only the Group value inside filter() changed.
tumor_subset %>%
  filter(Group == 1, Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 55.575
tumor_subset %>%
  filter(Group == 2, Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 51.81
tumor_subset %>%
  filter(Group == 3, Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 48.62
tumor_subset %>%
  filter(Group == 4, Day == 0) %>%
  pull(Size) %>%
  mean()
[1] 51.11111
# List the distinct values in the Group column.
tumor_subset %>%
  pull(Group) %>%
  unique()
[1] 1 2 3 4
Python
# Mean Size for each group in turn, repeating the same chain with a
# different Group value.
# NOTE(review): unlike the R example, the mask here does not include
# Day == 0, so each mean below is taken over all Day values for the group
# rather than over the Day 0 rows only.
(tumor_subset
  .loc[tumor_subset["Group"] == 1, "Size"]
  .mean()
)
648.5846153846154
(tumor_subset
  .loc[tumor_subset["Group"] == 2, "Size"]
  .mean()
)
166.57142857142858
(tumor_subset
  .loc[tumor_subset["Group"] == 3, "Size"]
  .mean()
)
491.14000000000004
(tumor_subset
  .loc[tumor_subset["Group"] == 4, "Size"]
  .mean()
)
271.56153846153853
# List the distinct values in the Group column (returned as a NumPy array).
(tumor_subset
  .loc[:, "Group"]
  .unique()
)
array([1, 2, 3, 4])

5.5 Groupby operations

R
# Compute every group's Day 0 mean in one step: group_by() splits the rows
# by Group and summarize() reduces each group to a single avg_size value.
tumor_subset %>%
  filter(Day == 0) %>%
  group_by(Group) %>%
  summarize(avg_size = mean(Size))
# A tibble: 4 × 2
  Group avg_size
  <dbl>    <dbl>
1     1     55.6
2     2     51.8
3     3     48.6
4     4     51.1
Python
# Group the Day 0 rows by Group and average; with no column selected,
# .mean() is applied to every remaining column (ID, Day and Size).
(tumor_subset
  .loc[tumor_subset["Day"] == 0]
  .groupby("Group")
  .mean()
)
          ID  Day       Size
Group                       
1      104.5  0.0  55.575000
2      205.5  0.0  51.810000
3      305.5  0.0  48.620000
4      405.0  0.0  51.111111
# Select the Size column from the grouped object before averaging so that
# only Size is summarized; the result is a Series indexed by Group.
(tumor_subset
  .loc[tumor_subset["Day"] == 0]
  .groupby("Group")
  ["Size"]
  .mean()
)
Group
1    55.575000
2    51.810000
3    48.620000
4    51.111111
Name: Size, dtype: float64
import numpy as np

# Named aggregation on the grouped Size column: the keyword (avg_size)
# becomes the name of the output column. The reduction is given by name
# ("mean") so pandas uses its own groupby mean; passing NumPy callables
# such as np.mean to .agg() raises a FutureWarning in recent pandas releases.
(tumor_subset
  .loc[tumor_subset["Day"] == 0]
  .groupby("Group")
  ["Size"]
  .agg(avg_size = "mean")
)
        avg_size
Group           
1      55.575000
2      51.810000
3      48.620000
4      51.111111
# Named aggregation on the grouped DataFrame: each keyword is an output
# column name and each value is a (source column, reduction) pair. The
# reduction is given by name ("mean") rather than as np.mean, which
# raises a FutureWarning inside .agg() in recent pandas releases.
(tumor_subset
  .loc[tumor_subset["Day"] == 0]
  .groupby("Group")
  .agg(avg_size = ("Size", "mean"))
)
        avg_size
Group           
1      55.575000
2      51.810000
3      48.620000
4      51.111111
R
# Summarize Size for every Group/Day combination: mean, standard deviation
# and first quartile. `.groups = "drop"` returns an ungrouped tibble, so
# later verbs are not silently applied per Group, and it suppresses the
# "grouped output" message summarize() would otherwise print.
tumor_subset %>%
  group_by(Group, Day) %>%
  summarize(avg_size = mean(Size),
            sd_size = sd(Size),
            q1 = quantile(Size, probs = .25),
            .groups = "drop"
            )
# A tibble: 8 × 5
  Group   Day avg_size sd_size     q1
  <dbl> <dbl>    <dbl>   <dbl>  <dbl>
1     1     0     55.6   12.9    46.3
2     1    13   1597.   764.   1030. 
3     2     0     51.8   10.6    44.3
4     2    13    453.   287.    357. 
5     3     0     48.6    7.30   42.9
6     3    13    934.   510.    571. 
7     4     0     51.1    8.64   45.8
8     4    13    768.   314.    600. 
Python
# Summarize Size for every Group/Day combination: mean, standard deviation
# and first quartile. "mean" and "std" are passed by name so pandas uses
# its own groupby reductions; "std" is the sample standard deviation
# (ddof=1), which is what the printed values and R's sd() use. Passing
# np.mean / np.std instead is deprecated inside .agg(): pandas currently
# remaps them to its own methods, but a future version will call the NumPy
# function directly, and np.std defaults to the population formula (ddof=0).
(tumor_subset
  .groupby(["Group", "Day"])
  .agg(avg_size = ("Size", "mean"),
       std_size = ("Size", "std"),
       q25_size = ("Size", lambda x: np.quantile(x, q=.25)))
)
              avg_size    std_size  q25_size
Group Day                                   
1     0      55.575000   12.850653    46.300
      13   1597.400000  763.668099  1030.400
2     0      51.810000   10.604763    44.325
      13    453.475000  286.858971   357.225
3     0      48.620000    7.304002    42.875
      13    933.660000  509.875839   571.425
4     0      51.111111    8.635312    45.800
      13    767.575000  314.075865   599.850
# Same Group/Day summary, followed by reset_index() to turn the Group and
# Day index levels back into ordinary columns. "mean" and "std" are passed
# by name so pandas uses its own groupby reductions ("std" is the sample
# standard deviation, ddof=1); passing np.mean / np.std to .agg() is
# deprecated and np.std would switch to the population formula (ddof=0)
# once pandas stops remapping it.
(tumor_subset
  .groupby(["Group", "Day"])
  .agg(avg_size = ("Size", "mean"),
       std_size = ("Size", "std"),
       q25_size = ("Size", lambda x: np.quantile(x, q=.25)))
  .reset_index()
)
   Group  Day     avg_size    std_size  q25_size
0      1    0    55.575000   12.850653    46.300
1      1   13  1597.400000  763.668099  1030.400
2      2    0    51.810000   10.604763    44.325
3      2   13   453.475000  286.858971   357.225
4      3    0    48.620000    7.304002    42.875
5      3   13   933.660000  509.875839   571.425
6      4    0    51.111111    8.635312    45.800
7      4   13   767.575000  314.075865   599.850

5.6 Summary

5.7 Additional Resources

# Print chunk output without a comment prefix (knitr's default prefix is "##").
knitr::opts_chunk$set(comment = "")