download.file(url="https://ndownloader.figshare.com/files/2292169",
destfile = "data/portal_data_joined.csv")
1
surveys <- read_csv("data/portal_data_joined.csv")
## Parsed with column specification:
## cols(
## record_id = col_double(),
## month = col_double(),
## day = col_double(),
## year = col_double(),
## plot_id = col_double(),
## species_id = col_character(),
## sex = col_character(),
## hindfoot_length = col_double(),
## weight = col_double(),
## genus = col_character(),
## species = col_character(),
## taxa = col_character(),
## plot_type = col_character()
## )
surveys %>%
filter(year < 1995) %>%
select(year, sex, weight)
## # A tibble: 21,486 x 3
## year sex weight
## <dbl> <chr> <dbl>
## 1 1977 M NA
## 2 1977 M NA
## 3 1977 <NA> NA
## 4 1977 <NA> NA
## 5 1977 <NA> NA
## 6 1977 <NA> NA
## 7 1977 <NA> NA
## 8 1978 <NA> NA
## 9 1978 M 218
## 10 1978 <NA> NA
## # … with 21,476 more rows
2
up_surveys=surveys %>%
mutate(hindfoot_half=hindfoot_length/2) %>%
filter(hindfoot_half<30,
!is.na(hindfoot_half)) %>%
select(species_id, hindfoot_half)
head(up_surveys)
## # A tibble: 6 x 2
## species_id hindfoot_half
## <chr> <dbl>
## 1 NL 16
## 2 NL 15.5
## 3 NL 16
## 4 NL 17
## 5 NL 16
## 6 NL 16.5
3.1 How many animals were caught in each plot_type surveyed?
surveys %>%
group_by(plot_type) %>%
summarize(count = n())
## # A tibble: 5 x 2
## plot_type count
## <chr> <int>
## 1 Control 15611
## 2 Long-term Krat Exclosure 5118
## 3 Rodent Exclosure 4233
## 4 Short-term Krat Exclosure 5906
## 5 Spectab exclosure 3918
3.2
surveys %>%
group_by(species_id) %>%
filter(!is.na(hindfoot_length)) %>%
summarize(mean_hindfoot = mean(hindfoot_length),
min_hindfoot = min(hindfoot_length),
max_hindfoot= max(hindfoot_length),
count = n())
## # A tibble: 25 x 5
## species_id mean_hindfoot min_hindfoot max_hindfoot count
## <chr> <dbl> <dbl> <dbl> <int>
## 1 AH 33 31 35 2
## 2 BA 13 6 16 45
## 3 DM 36.0 16 50 9972
## 4 DO 35.6 26 64 2887
## 5 DS 49.9 39 58 2132
## 6 NL 32.3 21 70 1074
## 7 OL 20.5 12 39 920
## 8 OT 20.3 13 50 2139
## 9 OX 19.1 13 21 8
## 10 PB 26.1 2 47 2864
## # … with 15 more rows
3.3
surveys %>%
filter(!is.na(weight)) %>%
group_by(year) %>%
select(year, genus, species_id, weight) %>%
filter(weight==max(weight)) %>%
arrange(year)
## # A tibble: 27 x 4
## # Groups: year [26]
## year genus species_id weight
## <dbl> <chr> <chr> <dbl>
## 1 1977 Dipodomys DS 149
## 2 1978 Neotoma NL 232
## 3 1978 Neotoma NL 232
## 4 1979 Neotoma NL 274
## 5 1980 Neotoma NL 243
## 6 1981 Neotoma NL 264
## 7 1982 Neotoma NL 252
## 8 1983 Neotoma NL 256
## 9 1984 Neotoma NL 259
## 10 1985 Neotoma NL 225
## # … with 17 more rows
4.1
surveys_gen <- surveys %>%
group_by(year, plot_id) %>%
summarize(genera = n_distinct(genus))
surveys_gen=surveys_gen %>%
spread(key=year, value=genera)
head(surveys_gen)
## # A tibble: 6 x 27
## plot_id `1977` `1978` `1979` `1980` `1981` `1982` `1983` `1984` `1985`
## <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 2 3 4 7 5 6 7 6 4
## 2 2 6 6 6 8 5 9 9 9 6
## 3 3 5 6 4 6 6 8 10 11 7
## 4 4 4 4 3 4 5 4 6 3 4
## 5 5 4 3 2 5 4 6 7 7 3
## 6 6 3 4 3 4 5 9 9 7 5
## # … with 17 more variables: `1986` <int>, `1987` <int>, `1988` <int>,
## # `1989` <int>, `1990` <int>, `1991` <int>, `1992` <int>, `1993` <int>,
## # `1994` <int>, `1995` <int>, `1996` <int>, `1997` <int>, `1998` <int>,
## # `1999` <int>, `2000` <int>, `2001` <int>, `2002` <int>
4.2
surveys_gen %>%
gather(key=year, value=genera, -plot_id)
## # A tibble: 624 x 3
## plot_id year genera
## <dbl> <chr> <int>
## 1 1 1977 2
## 2 2 1977 6
## 3 3 1977 5
## 4 4 1977 4
## 5 5 1977 4
## 6 6 1977 3
## 7 7 1977 3
## 8 8 1977 2
## 9 9 1977 3
## 10 10 1977 1
## # … with 614 more rows
4.3
surveys_meas=surveys %>%
gather(key=measurement, value=value, hindfoot_length, weight)
4.4
surveys_meas %>%
filter(!is.na(value)) %>%
group_by(year, plot_type, measurement) %>%
summarize(value=mean(value)) %>%
spread(key=measurement, value=value)
## # A tibble: 130 x 4
## # Groups: year, plot_type [130]
## year plot_type hindfoot_length weight
## <dbl> <chr> <dbl> <dbl>
## 1 1977 Control 36.1 50.4
## 2 1977 Long-term Krat Exclosure 33.7 34.8
## 3 1977 Rodent Exclosure 39.1 48.2
## 4 1977 Short-term Krat Exclosure 35.8 41.3
## 5 1977 Spectab exclosure 37.2 47.1
## 6 1978 Control 38.1 70.8
## 7 1978 Long-term Krat Exclosure 22.6 35.9
## 8 1978 Rodent Exclosure 37.8 67.3
## 9 1978 Short-term Krat Exclosure 36.9 63.8
## 10 1978 Spectab exclosure 42.3 80.1
## # … with 120 more rows
SNPs <- read_tsv("data/23andMe_complete.txt", skip = 14, col_types =
cols(chromosome = col_factor()))
SNPs %>%
select(position, genotype)
## # A tibble: 960,614 x 2
## position genotype
## <dbl> <chr>
## 1 82154 AA
## 2 752566 AA
## 3 752721 GG
## 4 776546 AG
## 5 798959 AG
## 6 800007 CC
## 7 838555 AC
## 8 846808 CT
## 9 854250 AG
## 10 861808 GG
## # … with 960,604 more rows
SNPs %>%
filter(chromosome=="MT", genotype!=c("A","T"))
## # A tibble: 1,794 x 4
## rsid chromosome position genotype
## <chr> <fct> <dbl> <chr>
## 1 i4001358 MT 9 G
## 2 i4000553 MT 26 C
## 3 i4001190 MT 41 C
## 4 i4000964 MT 43 C
## 5 i4001177 MT 46 T
## 6 i4000987 MT 49 A
## 7 i4000979 MT 51 T
## 8 i3001932 MT 53 G
## 9 i3002014 MT 54 G
## 10 i3002101 MT 55 T
## # … with 1,784 more rows
pos_SNPs=SNPs %>%
group_by(chromosome) %>%
summarize(min_position=min(position),
max_position=max(position),
tot_positions=n_distinct(position))
head(pos_SNPs)
## # A tibble: 6 x 4
## chromosome min_position max_position tot_positions
## <fct> <dbl> <dbl> <int>
## 1 1 82154 249218992 76906
## 2 2 18674 243048760 77344
## 3 3 61495 197838262 63285
## 4 4 71566 190963766 55017
## 5 5 27564 180696889 56019
## 6 6 155815 170919470 63244
pos_SNPs=pos_SNPs %>%
mutate(pos_density=(max_position-min_position)/tot_positions)
head(pos_SNPs)
## # A tibble: 6 x 5
## chromosome min_position max_position tot_positions pos_density
## <fct> <dbl> <dbl> <int> <dbl>
## 1 1 82154 249218992 76906 3239.
## 2 2 18674 243048760 77344 3142.
## 3 3 61495 197838262 63285 3125.
## 4 4 71566 190963766 55017 3470.
## 5 5 27564 180696889 56019 3225.
## 6 6 155815 170919470 63244 2700.
pos_SNPs=pos_SNPs %>%
arrange(pos_density)
head(pos_SNPs)
## # A tibble: 6 x 5
## chromosome min_position max_position tot_positions pos_density
## <fct> <dbl> <dbl> <int> <dbl>
## 1 MT 3 16567 2458 6.74
## 2 22 16114244 51211392 14098 2490.
## 3 20 63244 62912463 23834 2637.
## 4 13 19058717 115103529 36078 2662.
## 5 10 98087 135508269 50321 2691.
## 6 6 155815 170919470 63244 2700.
write_csv(pos_SNPs, path = "data/pos_SNPs.csv")