Taxonomy
assignment
- This is Part 2 of the microbiome data analysis workshop, focusing on
taxonomic classification and downstream analysis.
- We will use the
seqtab_nochim.rds file generated in
Part 1; alternatively, you may download here:
seqtab_nochim.rds.
Load R packages
library(phyloseq)
library(dplyr)
library(tidyr)
library(tibble)
library(htmlwidgets)
library(htmltools)
library(sankeyD3)
DADA2’s taxonomy
assignment
- The DADA2 function
assignTaxonomy() provides a built-in
implementation of the naïve Bayesian classifier for taxonomic
assignment.
- DADA2-formatted training fasta file was downloaded from
Taxonomic
reference data
- SILVA: Broad coverage across Bacteria, Archaea, and
Eukaryota (SSU rRNA (16S/18S) and LSU rRNA)
- RDP: rRNA sequences plus the RDP Classifier
ecosystem (16S (Bacteria/Archaea) and fungal 28S)
- GreenGenes2: Unifies 16S and genomic data in one
reference tree (16S integrated with genome phylogeny)
# Set the file paths to your taxonomy reference files
tax_train_fasta <- "/home/dbs/silva_nr99_v138.2_toGenus_trainset.fa.gz"
tax_species_fasta <- "/home/dbs/silva_v138.2_assignSpecies.fa.gz"
rdp_train_fasta <- "/home/dbs/rdp_19_toGenus_trainset.fa.gz"
gg2_train_fasta <- "/home/dbs/gg2_2024_09_toGenus_trainset.fa.gz"
tax_mat <- assignTaxonomy(
seqtab_nochim,
refFasta = tax_train_fasta
)
tax_mat <- addSpecies(tax_mat, refFasta = tax_species_fasta, allowMultiple = TRUE)
saveRDS(tax_mat, file.path(base_dir, "DADA2_taxonomy_matrix_SILVA.rds"))
to_preview <- tax_mat # Removing sequence rownames for display only
rownames(to_preview) <- NULL
head(to_preview)
## Kingdom Phylum Class Order Family
## [1,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales" "Muribaculaceae"
## [2,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales" "Muribaculaceae"
## [3,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales" "Bacteroidaceae"
## [4,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales" "Muribaculaceae"
## [5,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales" "Rikenellaceae"
## [6,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales" "Muribaculaceae"
## Genus Species
## [1,] NA NA
## [2,] NA NA
## [3,] "Bacteroides" "acidifaciens/caecimuris"
## [4,] NA NA
## [5,] "Alistipes" NA
## [6,] NA NA
rdp_mat <- assignTaxonomy(
seqtab_nochim,
refFasta = rdp_train_fasta
)
saveRDS(rdp_mat, file.path(base_dir, "DADA2_taxonomy_matrix_RDP.rds"))
gg2_mat <- assignTaxonomy(
seqtab_nochim,
refFasta = gg2_train_fasta
)
saveRDS(gg2_mat, base::file.path(base_dir, "DADA2_taxonomy_matrix_GG2.rds"))
Analysis
- The phyloseq R package provides a robust framework for downstream
microbiome analysis and visualization, and the scripts used here were
adapted from
the
phyloseq tutorial.
- Results and visualizations are reported based on SILVA for
consistency; however, taxonomy assignments from the other databases may
be loaded to reproduce the same workflow using the scripts provided
below.
Alpha
diversity over time
- Visualize within-sample diversity using Shannon and Simpson
indices across sampling Day, with points colored by Group
(Early/Late) and shaped by Sex (Male/Female).
Group highlighted
by color
group_cols <- c("E" = "#009988", "L" = "#EE7733") # teal and orange
rich_plot <- plot_richness(phylo_obj,
x = "Day", measures = c("Shannon", "Simpson"),
color = "Group", shape = "Sex"
) +
theme_bw() +
scale_color_manual(values = group_cols) +
geom_point(size = 3)
pdf(file.path(base_dir, "07_alpha_diversity_by_group.pdf"), width = 6, height = 6)
print(rich_plot)
dev.off()

Sex highlighted by
color
sex_cols <- c("M" = "#0077BB", "F" = "#BB5566") # blue and red
rich_plot <- plot_richness(phylo_obj,
x = "Day", measures = c("Shannon", "Simpson"),
shape = "Group", color = "Sex"
) +
theme_bw() +
scale_color_manual(values = sex_cols) +
geom_point(size = 3)
pdf(file.path(base_dir, "08_alpha_diversity_by_sex.pdf"), width = 6, height = 6)
print(rich_plot)
dev.off()

Beta diversity
- Assess between-sample community differences using
Bray–Curtis distances on relative-abundance data and
visualize them with NMDS, with points colored by Group (Early/Late) and
shaped by Sex (Male/Female)
ps_nz <- prune_samples(sample_sums(phylo_obj) > 0, phylo_obj)
otu <- as(otu_table(ps_nz), "matrix")
if (taxa_are_rows(ps_nz)) otu <- t(otu)
otu[!is.finite(otu)] <- 0
otu_table(ps_nz) <- otu_table(otu, taxa_are_rows = FALSE)
ps_prop <- transform_sample_counts(ps_nz, function(x) x / sum(x))
nmds_bray <- ordinate(ps_prop, method = "NMDS", distance = "bray", trace = 0)
ordi_plot <- plot_ordination(ps_prop, nmds_bray, color = "Group", shape = "Sex") +
theme_bw() +
scale_color_manual(values = group_cols)
pdf(file.path(base_dir, "09_beta_diversity.pdf"), width = 6, height = 6)
print(ordi_plot)
dev.off()

Taxonomic relative
abundance
- Plot the top 20 most abundant ASVs as relative
abundance bar charts, aggregated and colored by Taxa, and faceted by
Group (Early/Late) and Sex (Male/Female) across sampling days.
top <- names(sort(taxa_sums(phylo_obj), decreasing = TRUE))[1:20]
ps_top <- transform_sample_counts(phylo_obj, function(OTU) OTU / sum(OTU))
ps_top <- prune_taxa(top, ps_top)
df_plot <- psmelt(ps_top)
df_plot$Family <- as.character(df_plot$Family)
df_plot$Family[is.na(df_plot$Family) | df_plot$Family == ""] <- "Unassigned"
dt <- as.data.table(df_plot)
dt_sum <- dt[, .(Abundance = mean(Abundance, na.rm = TRUE)), by = .(Category, Family)]
dt_sum[, Abundance := Abundance / sum(Abundance), by = Category]
top20_taxa <- ggplot(dt_sum, aes(x = Category, y = Abundance, fill = Family)) +
geom_col(color = NA) +
scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
scale_fill_brewer(palette = "Set2", drop = FALSE) +
theme_bw() +
theme(
legend.position = "bottom",
axis.text.x = element_text(angle = 90, vjust = 0, hjust = 1)
) +
coord_flip()
pdf(file.path(base_dir, "10_top20_taxa_by_group.pdf"), width = 8, height = 5)
print(top20_taxa)
dev.off()

Taxonomic lineage
profiles by category
- Taxonomic composition is summarized as lineage-level flows, i.e.,
Sankey chart, from higher to lower ranks for each
category, enabling rapid comparison of dominant clades and where
group-specific differences emerge along the taxonomic hierarchy.
source("/home/rstudio/R/generate_sankey_plot.R")
meta_df <- data.frame(phyloseq::sample_data(phylo_obj), stringsAsFactors = FALSE)
meta_df$Sample <- rownames(meta_df)
tt_cols <- colnames(phyloseq::tax_table(phylo_obj))
# E_M (Early + Male), E_F (Early + Female), L_M (Late + Male), L_F (Late + Female)
cat_value <- "E_M"
dat <- build_sankey_data(ps = phylo_obj, meta_df = meta_df, category_value = cat_value)
title_text <- sprintf("%s(#Samples=%d, #Taxa=%d, #Reads=%d)", cat_value, dat$nsamples, dat$ntaxa, dat$total_reads)
w <- render_one_sankey(dat, title_text)
htmlwidgets::saveWidget(w, file = file.path(base_dir, paste0("sankey_", cat_value, ".html")))
