-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunclassified_realm.R
70 lines (59 loc) · 2.47 KB
/
unclassified_realm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
source("blackflies.R")
unclassified_seq <- taxonomy_df %>%
#mutate_at(vars(Realm), ~replace_na(., "Unclassified")) %>%
filter(is.na(Realm))
unclassified_seq <- unclassified_seq %>%
mutate(length=as.integer(str_split_i(rowname, "_", 4)),
name=sub("^(?:[^_]*_){6}(.*)$", "\\1", rowname))
#unclassified_seq %>%
# select(rowname) %>%
# write_delim("data/unclassified.txt")
clusters <- read_delim("data/unclassified_clusters.tsv",
col_names = c("representative", "cluster"))
unclassified_joined <- left_join(unclassified_seq, clusters, by = join_by(rowname==representative))
unclassified_joined %>%
filter(!is.na(Family)) %>%
group_by(Realm, Kingdom, Phylum, Class, Order, Family, Genus, Species) %>%
summarise(rownames=paste0(rowname, collapse = ","))
unclassified_joined %>%
ggplot(aes(x=as.integer(length), fill=Family))+
geom_bar()+
scale_x_binned(limits = c(1000, 11000))+
labs(x="length (bp)")+
theme_bw()
family_joined <- unclassified_joined %>%
filter(!is.na(Family)) %>%
tidyr::expand(nesting(select(., -cluster)), cluster) %>%
left_join(unclassified_joined2 %>% select(-name), by="cluster") %>%
mutate(name_in_cluster = str_detect(cluster, name)) %>%
filter(name_in_cluster==T,
Family.x == Family.y) %>%
group_by(rowname.x) %>%
mutate(keep_row = !rowname.x %in% lead(rowname.y)) %>%
ungroup() %>%
filter(keep_row) %>%
select(-keep_row, -ends_with(".y"), -starts_with("name"), -cluster) %>%
distinct() %>%
rename_with(~str_remove(., "\\.x$"), ends_with(".x"))
species_joined <- unclassified_joined %>%
filter(!is.na(Species) & is.na(Family)) %>%
tidyr::expand(nesting(select(., -cluster)), cluster) %>%
left_join(unclassified_joined2 %>% select(-name), by="cluster") %>%
mutate(name_in_cluster = str_detect(cluster, name)) %>%
filter(name_in_cluster==T,
Species.x == Species.y) %>%
group_by(rowname.x) %>%
mutate(keep_row = !rowname.x %in% lead(rowname.y)) %>%
ungroup() %>%
filter(keep_row) %>%
select(-keep_row, -ends_with(".y"), -starts_with("name"), -cluster) %>%
distinct() %>%
rename_with(~str_remove(., "\\.x$"), ends_with(".x"))
all_unclassified <- full_join(family_joined, species_joined) %>%
full_join(unclassified_joined %>%
filter(is.na(Species) & is.na(Family)) %>%
select(-name, -cluster)) %>%
select(-length)
taxonomy_df_unique_species <- taxonomy_df %>%
filter(!is.na(Realm)) %>%
bind_rows(all_unclassified)