download.file(url="https://ndownloader.figshare.com/files/2292169",
              destfile = "data/portal_data_joined.csv")

Challenges

1

surveys <- read_csv("data/portal_data_joined.csv")
## Parsed with column specification:
## cols(
##   record_id = col_double(),
##   month = col_double(),
##   day = col_double(),
##   year = col_double(),
##   plot_id = col_double(),
##   species_id = col_character(),
##   sex = col_character(),
##   hindfoot_length = col_double(),
##   weight = col_double(),
##   genus = col_character(),
##   species = col_character(),
##   taxa = col_character(),
##   plot_type = col_character()
## )
surveys %>%
  filter(year < 1995) %>%
  select(year, sex, weight)
## # A tibble: 21,486 x 3
##     year sex   weight
##    <dbl> <chr>  <dbl>
##  1  1977 M         NA
##  2  1977 M         NA
##  3  1977 <NA>      NA
##  4  1977 <NA>      NA
##  5  1977 <NA>      NA
##  6  1977 <NA>      NA
##  7  1977 <NA>      NA
##  8  1978 <NA>      NA
##  9  1978 M        218
## 10  1978 <NA>      NA
## # … with 21,476 more rows

2

up_surveys=surveys %>% 
  mutate(hindfoot_half=hindfoot_length/2) %>% 
  filter(hindfoot_half<30,
         !is.na(hindfoot_half)) %>% 
  select(species_id, hindfoot_half)
head(up_surveys)
## # A tibble: 6 x 2
##   species_id hindfoot_half
##   <chr>              <dbl>
## 1 NL                  16  
## 2 NL                  15.5
## 3 NL                  16  
## 4 NL                  17  
## 5 NL                  16  
## 6 NL                  16.5

3.1 How many animals were caught in each plot_type surveyed?

surveys %>%
    group_by(plot_type) %>%
    summarize(count = n())
## # A tibble: 5 x 2
##   plot_type                 count
##   <chr>                     <int>
## 1 Control                   15611
## 2 Long-term Krat Exclosure   5118
## 3 Rodent Exclosure           4233
## 4 Short-term Krat Exclosure  5906
## 5 Spectab exclosure          3918

3.2

surveys %>%
    group_by(species_id) %>%
    filter(!is.na(hindfoot_length)) %>% 
    summarize(mean_hindfoot = mean(hindfoot_length),
              min_hindfoot = min(hindfoot_length),
              max_hindfoot= max(hindfoot_length),
              count = n())
## # A tibble: 25 x 5
##    species_id mean_hindfoot min_hindfoot max_hindfoot count
##    <chr>              <dbl>        <dbl>        <dbl> <int>
##  1 AH                  33             31           35     2
##  2 BA                  13              6           16    45
##  3 DM                  36.0           16           50  9972
##  4 DO                  35.6           26           64  2887
##  5 DS                  49.9           39           58  2132
##  6 NL                  32.3           21           70  1074
##  7 OL                  20.5           12           39   920
##  8 OT                  20.3           13           50  2139
##  9 OX                  19.1           13           21     8
## 10 PB                  26.1            2           47  2864
## # … with 15 more rows

3.3

surveys %>%
    filter(!is.na(weight)) %>%
    group_by(year) %>%
    select(year, genus, species_id, weight) %>% 
    filter(weight==max(weight)) %>% 
    arrange(year)
## # A tibble: 27 x 4
## # Groups:   year [26]
##     year genus     species_id weight
##    <dbl> <chr>     <chr>       <dbl>
##  1  1977 Dipodomys DS            149
##  2  1978 Neotoma   NL            232
##  3  1978 Neotoma   NL            232
##  4  1979 Neotoma   NL            274
##  5  1980 Neotoma   NL            243
##  6  1981 Neotoma   NL            264
##  7  1982 Neotoma   NL            252
##  8  1983 Neotoma   NL            256
##  9  1984 Neotoma   NL            259
## 10  1985 Neotoma   NL            225
## # … with 17 more rows

4.1

surveys_gen <- surveys %>%
  group_by(year, plot_id) %>%
  summarize(genera = n_distinct(genus))

surveys_gen=surveys_gen %>% 
  spread(key=year, value=genera)
head(surveys_gen)
## # A tibble: 6 x 27
##   plot_id `1977` `1978` `1979` `1980` `1981` `1982` `1983` `1984` `1985`
##     <dbl>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
## 1       1      2      3      4      7      5      6      7      6      4
## 2       2      6      6      6      8      5      9      9      9      6
## 3       3      5      6      4      6      6      8     10     11      7
## 4       4      4      4      3      4      5      4      6      3      4
## 5       5      4      3      2      5      4      6      7      7      3
## 6       6      3      4      3      4      5      9      9      7      5
## # … with 17 more variables: `1986` <int>, `1987` <int>, `1988` <int>,
## #   `1989` <int>, `1990` <int>, `1991` <int>, `1992` <int>, `1993` <int>,
## #   `1994` <int>, `1995` <int>, `1996` <int>, `1997` <int>, `1998` <int>,
## #   `1999` <int>, `2000` <int>, `2001` <int>, `2002` <int>

4.2

surveys_gen %>% 
  gather(key=year, value=genera, -plot_id)
## # A tibble: 624 x 3
##    plot_id year  genera
##      <dbl> <chr>  <int>
##  1       1 1977       2
##  2       2 1977       6
##  3       3 1977       5
##  4       4 1977       4
##  5       5 1977       4
##  6       6 1977       3
##  7       7 1977       3
##  8       8 1977       2
##  9       9 1977       3
## 10      10 1977       1
## # … with 614 more rows

4.3

surveys_meas=surveys %>% 
  gather(key=measurement, value=value, hindfoot_length, weight)

4.4

surveys_meas %>% 
  filter(!is.na(value)) %>%
  group_by(year, plot_type, measurement) %>% 
  summarize(value=mean(value)) %>% 
  spread(key=measurement, value=value)
## # A tibble: 130 x 4
## # Groups:   year, plot_type [130]
##     year plot_type                 hindfoot_length weight
##    <dbl> <chr>                               <dbl>  <dbl>
##  1  1977 Control                              36.1   50.4
##  2  1977 Long-term Krat Exclosure             33.7   34.8
##  3  1977 Rodent Exclosure                     39.1   48.2
##  4  1977 Short-term Krat Exclosure            35.8   41.3
##  5  1977 Spectab exclosure                    37.2   47.1
##  6  1978 Control                              38.1   70.8
##  7  1978 Long-term Krat Exclosure             22.6   35.9
##  8  1978 Rodent Exclosure                     37.8   67.3
##  9  1978 Short-term Krat Exclosure            36.9   63.8
## 10  1978 Spectab exclosure                    42.3   80.1
## # … with 120 more rows

Exercise 1

SNPs <- read_tsv("data/23andMe_complete.txt", skip = 14, col_types = 
                   cols(chromosome = col_factor()))
SNPs %>% 
  select(position, genotype)
## # A tibble: 960,614 x 2
##    position genotype
##       <dbl> <chr>   
##  1    82154 AA      
##  2   752566 AA      
##  3   752721 GG      
##  4   776546 AG      
##  5   798959 AG      
##  6   800007 CC      
##  7   838555 AC      
##  8   846808 CT      
##  9   854250 AG      
## 10   861808 GG      
## # … with 960,604 more rows

Exercise 2

SNPs %>% 
  filter(chromosome=="MT", genotype!=c("A","T"))
## # A tibble: 1,794 x 4
##    rsid     chromosome position genotype
##    <chr>    <fct>         <dbl> <chr>   
##  1 i4001358 MT                9 G       
##  2 i4000553 MT               26 C       
##  3 i4001190 MT               41 C       
##  4 i4000964 MT               43 C       
##  5 i4001177 MT               46 T       
##  6 i4000987 MT               49 A       
##  7 i4000979 MT               51 T       
##  8 i3001932 MT               53 G       
##  9 i3002014 MT               54 G       
## 10 i3002101 MT               55 T       
## # … with 1,784 more rows

Exercise 3

pos_SNPs=SNPs %>% 
  group_by(chromosome) %>% 
  summarize(min_position=min(position),
            max_position=max(position),
            tot_positions=n_distinct(position))
head(pos_SNPs)
## # A tibble: 6 x 4
##   chromosome min_position max_position tot_positions
##   <fct>             <dbl>        <dbl>         <int>
## 1 1                 82154    249218992         76906
## 2 2                 18674    243048760         77344
## 3 3                 61495    197838262         63285
## 4 4                 71566    190963766         55017
## 5 5                 27564    180696889         56019
## 6 6                155815    170919470         63244

Exercise 4

pos_SNPs=pos_SNPs %>% 
  mutate(pos_density=(max_position-min_position)/tot_positions)
head(pos_SNPs)
## # A tibble: 6 x 5
##   chromosome min_position max_position tot_positions pos_density
##   <fct>             <dbl>        <dbl>         <int>       <dbl>
## 1 1                 82154    249218992         76906       3239.
## 2 2                 18674    243048760         77344       3142.
## 3 3                 61495    197838262         63285       3125.
## 4 4                 71566    190963766         55017       3470.
## 5 5                 27564    180696889         56019       3225.
## 6 6                155815    170919470         63244       2700.

Exercise 5

pos_SNPs=pos_SNPs %>% 
  arrange(pos_density)
head(pos_SNPs)
## # A tibble: 6 x 5
##   chromosome min_position max_position tot_positions pos_density
##   <fct>             <dbl>        <dbl>         <int>       <dbl>
## 1 MT                    3        16567          2458        6.74
## 2 22             16114244     51211392         14098     2490.  
## 3 20                63244     62912463         23834     2637.  
## 4 13             19058717    115103529         36078     2662.  
## 5 10                98087    135508269         50321     2691.  
## 6 6                155815    170919470         63244     2700.

Exercise 6

write_csv(pos_SNPs, path = "data/pos_SNPs.csv")