1 Introduction

The Bioinformatics team generated this report includes stats and diagnostic plots of the Oxford Nanopore Sequencing data.

An analytical platform designed to quantify and compare Nanopore sequencing data quality across multiple experimental conditions, enabling detection of batch-specific variability and ensuring consistency in sequencing performance.

2 Prerequisite

2.1 Environment

  • OS: macOS 26.0.1
  • Platform: aarch64-apple-darwin20
  • Software: R (v4.5.1)
  • Pakcages: dplyr (v1.1.4), ggplot2 (v4.0.0), ggrepel (v0.9.6), ggridges (v0.5.7), ggsankey (v0.0.99999), stringr (v1.5.2), reshape2 (v1.4.4), RColorBrewer (v1.1-3), viridis (0.4.2), cowplot (v1.2.0)

2.2 Metadata

  • Project: Factors affecting microbiota transfer to mice
  • Description: Combined cecum, colon, and fecal samples from mice.
  • Generated: from June 09 to August 12, 2025

2.3 Run report

3 Quality assessment

3.1 Preparation

  • Set working directory
# Do not run
baseDir <- "WORKING_DIRECTORY" # where the Nanopore stats files are located
setwd(baseDir)
outDir <- file.path(baseDir, "Results")
# Load libraries
library(dplyr)
library(ggplot2)
library(ggrepel)
library(ggridges)
library(ggsankey)
library(stringr)
library(reshape2)
library(RColorBrewer)
library(viridis)
library(cowplot)

3.2 Sequence summary

  • Sequencing output summary files were generated using Dorado with the summary command.
  • Pore activity files could not be regenerated when MinKNOW failed and only the available files were included.

3.2.1 Import data

tsvFiles <- list.files(path = file.path(baseDir, "Data"), pattern = "tsv", full.names = TRUE)
# sequencing summary
summaryL <- lapply(seq_along(tsvFiles), function(idx) {
        tsvFile <- tsvFiles[idx]
        tsv <- read.table(tsvFile, header = TRUE, stringsAsFactor = FALSE, sep = "\t")
        tsv$sample <- gsub(".tsv", "", basename(tsvFile))
        return(tsv)
})
names(summaryL) <- gsub(".tsv", "", basename(tsvFiles))
summaryL <- summaryL[sampleNames] # reorder

summary <- do.call(rbind, summaryL)
summary$sample <- factor(summary$sample, levels = sampleNames)

# pore activity
csvFiles <- list.files(path = file.path(baseDir, "Data"), pattern = "csv", full.names = TRUE)

poreL <- lapply(seq_along(csvFiles), function(idx) {
        csvFile <- csvFiles[idx]
        csv <- read.csv(csvFile, header = TRUE)
        csv$sample <- gsub(".csv", "", basename(csvFile))
        return(csv)
})
names(poreL) <- gsub(".csv", "", basename(csvFiles))

basename(tsvFiles)
##  [1] "Donors.tsv"  "Plate01.tsv" "Plate02.tsv" "Plate03.tsv" "Plate04.tsv"
##  [6] "Plate05.tsv" "Plate06.tsv" "Plate07.tsv" "Plate08.tsv" "Plate09.tsv"
## [11] "Plate10.tsv" "Plate11.tsv" "Plate12.tsv" "Plate13.tsv" "Plate14.tsv"

3.2.2 Sequence output

  • Legend:
readLengthMax <- max(summary$sequence_length_template)
readQualityMax <- max(summary$mean_qscore_template)

for (sampleName in sampleNames) {
        stat <- summaryL[[sampleName]]
        readDf <- data.frame(
                readLength = stat$sequence_length_template,
                readQuality = stat$mean_qscore_template
        )
        readDf <- densityColors(df = readDf, cols = heatCols)

        cat("#### ", sampleName, "\n")
        cat("\n")
        cat("  * Number of reads: ", format(nrow(readDf), big.mark=",",scientific = FALSE), "\n")
        plot(log10(readDf$readLength), readDf$readQuality, col = readDf$Col, xlab = "Length, log10", ylab = "Quality, phred", pch = 20, xlim = c(0, log10(readLengthMax)), ylim = c(0, readQualityMax))
        cat("\n\n")
}

3.2.2.1 Donors

  • Number of reads: 71,196,543

3.2.2.2 Plate01

  • Number of reads: 61,058,307

3.2.2.3 Plate02

  • Number of reads: 107,947,949

3.2.2.4 Plate03

  • Number of reads: 34,346,079

3.2.2.5 Plate04

  • Number of reads: 68,717,387

3.2.2.6 Plate05

  • Number of reads: 64,711,517

3.2.2.7 Plate06

  • Number of reads: 61,159,790

3.2.2.8 Plate07

  • Number of reads: 42,141,210

3.2.2.9 Plate08

  • Number of reads: 69,663,459

3.2.2.10 Plate09

  • Number of reads: 61,819,183

3.2.2.11 Plate10

  • Number of reads: 99,209,083

3.2.2.12 Plate11

  • Number of reads: 39,390,159

3.2.2.13 Plate12

  • Number of reads: 121,194,224

3.2.2.14 Plate13

  • Number of reads: 116,074,554

3.2.2.15 Plate14

  • Number of reads: 58,361,605

3.2.3 Read quality

  • Read lengths were log10-transformed according to the formula log10(read length + 1)
Min. 1st Qu. Median Mean 3rd Qu. Max.
Donors 1 17.83 22.90 22.83 27.62 50
Plate01 1 17.31 21.95 22.24 26.49 50
Plate02 1 16.76 21.40 22.12 26.52 50
Plate03 1 14.00 19.26 20.63 25.70 50
Plate04 1 15.38 20.23 20.78 25.10 50
Plate05 1 16.08 20.12 20.55 24.47 50
Plate06 1 16.87 21.24 22.22 26.65 50
Plate07 1 15.08 19.63 20.09 24.31 50
Plate08 1 16.86 21.61 22.50 27.35 50
Plate09 1 15.67 19.96 20.48 24.27 50
Plate10 1 16.53 21.20 21.87 26.17 50
Plate11 1 15.99 20.62 21.91 26.67 50
Plate12 1 16.63 21.15 22.05 26.33 50
Plate13 1 16.45 21.03 22.03 26.79 50
Plate14 1 16.11 21.54 22.40 27.89 50
ridgeQuality <- ggplot(summary, aes(x = mean_qscore_template, y = sample, fill = ..x..)) +
        geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
        scale_fill_viridis(name = "Quality, phred", option = "C") +
        labs(
                x = "Quality, phred", 
                y = "Plate"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                text = element_text(size = 12)
        ) +
        geom_vline(xintercept = 20, linetype = "dashed", color = "red", size = 1)
print(ridgeQuality)

3.2.4 Read length

Min. 1st Qu. Median Mean 3rd Qu. Max.
Donors 5 312 575 1,647.89 1,673 1,625,012
Plate01 5 247 514 1,712.78 1,730 1,348,333
Plate02 5 210 336 968.91 716 2,490,141
Plate03 5 186 283 652.25 547 1,436,315
Plate04 5 230 383 1,224.34 943 1,415,156
Plate05 5 226 321 650.05 565 1,380,165
Plate06 5 216 360 1,035.02 852 1,606,542
Plate07 5 226 376 1,120.82 910 1,457,070
Plate08 5 202 317 911.83 675 2,165,164
Plate09 5 240 418 1,242.15 1,094 1,482,838
Plate10 5 225 412 1,415.85 1,240 1,617,295
Plate11 5 196 298 713.64 591 1,435,036
Plate12 5 211 343 976.20 771 1,843,158
Plate13 5 216 344 854.97 703 1,540,083
Plate14 5 206 311 786.36 600 1,387,738
ridgeLength <- ggplot(summary, aes(x = log10(sequence_length_template + 1), y = sample, fill = ..x..)) +
        geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
        scale_fill_viridis(name = "Length, log10") +
        labs(
                x = "Length, log10", 
                y = "Plate"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                text = element_text(size = 12)
        )
print(ridgeLength)

3.2.5 Pore activity

for (sampleName in sampleNames) {
        if (sampleName %in% names(poreL) & sampleName %in% names(summaryL)) {
                summDf <- summaryL[[sampleName]]
                sec2HalfDayBreaks <- seq(-1, (max(summDf$start_time) + 43200), 43200)
                summDf$TimeInGroup <- cut(summDf$start_time, breaks = sec2HalfDayBreaks, labels = as.character(seq(0.5, (length(sec2HalfDayBreaks)-1)/2, 0.5)))
                tsvDf <- data.frame(
                        TimeInGroup = summDf$TimeInGroup,
                        Value = summDf$mean_qscore_template / 50 * 100,
                        Category = "Quality"
                )

                poreDf <- poreL[[sampleName]]
                poreDf$Active <- "No"
                poreDf$Active[which(poreDf$Channel.State %in% c("strand", "adapter"))] <- "Yes"
                poreAct <- poreDf %>% group_by(Experiment.Time..minutes.) %>%  summarise(Total = sum(State.Time..samples.), Live = sum(State.Time..samples.[Active == "Yes"]))
                min2HalfDayBreaks <- seq(-1, (max(poreAct$Experiment.Time..minutes.) + 720), 720)
                poreAct$TimeInGroup <- cut(poreAct$Experiment.Time..minutes., breaks = min2HalfDayBreaks, labels = as.character(seq(0.5, (length(min2HalfDayBreaks)-1)/2, 0.5)))
                poreAct <- poreAct %>% mutate(Prop = Live / Total)
                csvDf <- data.frame(
                        TimeInGroup = poreAct$TimeInGroup,
                        Value = poreAct$Prop * 100,
                        Category = "Active pores"
                )
                
                poreActivityPlot <- poreActivity(df = rbind(csvDf, tsvDf), cols = c("#1B9E77", "#D95F02"))
                
                cat("#### ", sampleName, "\n")
                cat("\n")
                cat("  * Read quality and pore activity over time.\n")
                print(poreActivityPlot)
                cat("\n\n")
        } else {
                summDf <- summaryL[[sampleName]]
                sec2HalfDayBreaks <- seq(-1, (max(summDf$start_time) + 43200), 43200)
                summDf$TimeInGroup <- cut(summDf$start_time, breaks = sec2HalfDayBreaks, labels = as.character(seq(0.5, (length(sec2HalfDayBreaks)-1)/2, 0.5)))
                tsvDf <- data.frame(
                        TimeInGroup = summDf$TimeInGroup,
                        Value = summDf$mean_qscore_template / 50 * 100,
                        Category = "Quality"
                )
                csvDf <- data.frame( # dummy data for consistent visualization
                        TimeInGroup = as.character(seq(0.5, (length(sec2HalfDayBreaks)-1)/2, 0.5)),
                        Value = rep(0, length(as.character(seq(0.5, (length(sec2HalfDayBreaks)-1)/2, 0.5)))),
                        Category = "Active pores"
                )
                poreActivityPlot <- poreActivity(df = rbind(csvDf, tsvDf), cols = c("#1B9E77", "#D95F02"))

                cat("#### ", sampleName, "\n")
                cat("\n")
                cat("  * _MinKNOW_ failed to produce a pore activity report.\n")
                print(poreActivityPlot)
                cat("\n\n")
        }
}

3.2.5.1 Donors

  • MinKNOW failed to produce a pore activity report.

3.2.5.2 Plate01

  • Read quality and pore activity over time.

3.2.5.3 Plate02

  • Read quality and pore activity over time.

3.2.5.4 Plate03

  • MinKNOW failed to produce a pore activity report.

3.2.5.5 Plate04

  • Read quality and pore activity over time.

3.2.5.6 Plate05

  • Read quality and pore activity over time.

3.2.5.7 Plate06

  • Read quality and pore activity over time.

3.2.5.8 Plate07

  • Read quality and pore activity over time.

3.2.5.9 Plate08

  • Read quality and pore activity over time.

3.2.5.10 Plate09

  • MinKNOW failed to produce a pore activity report.

3.2.5.11 Plate10

  • Read quality and pore activity over time.

3.2.5.12 Plate11

  • Read quality and pore activity over time.

3.2.5.13 Plate12

  • Read quality and pore activity over time.

3.2.5.14 Plate13

  • Read quality and pore activity over time.

3.2.5.15 Plate14

  • MinKNOW failed to produce a pore activity report.

3.3 Barcode-level summary

  • At the barcode-level data, only reads with a quality score of 10 or higher were retained.
  • Consequently, the number of reads shown in the Figures below is lower than that presented in the Figures above.

3.3.1 Import data

data <- read.table(file.path(baseDir, "Data", "NanoStats_summary.txt"), header = T, stringsAsFactors = F, sep = "\t")
data$Plate <- sapply(str_split(data$Plate_Barcode, "_"), "[[", 1)
data$Plate <- factor(data$Plate, levels = c("Donors", "Plate01", "Plate02", "Plate03", "Plate04", "Plate05", "Plate06", "Plate07", "Plate08", "Plate09", "Plate10", "Plate11", "Plate12", "Plate13", "Plate14"))
data$Barcode <- sapply(str_split(data$Plate_Barcode, "_"), "[[", 2)
head(data[,c("Plate", "Barcode", "number_of_reads", "n50")])
##    Plate   Barcode number_of_reads  n50
## 1 Donors barcode09         3581458 4913
## 2 Donors barcode10         4325293 4940
## 3 Donors barcode11         7151873 3846
## 4 Donors barcode12         8050965 2692
## 5 Donors barcode13         6104907 4512
## 6 Donors barcode14         4970546 3838

3.3.2 Number of reads

ggplot(data = data, aes(x = Plate, y = log10(number_of_reads + 1), fill = Plate)) +
        geom_violin(position = dodge, size = 0) +
        geom_boxplot(width = 0.1, position = dodge, fill="white") +
        scale_fill_manual(values = sampleCols) +
        labs(
                x = "Plate", 
                y = "Number of reads, log10"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                legend.position = "none", 
                text = element_text(size = 12)
        )

3.3.3 Number of bases per read

ggplot(data = data, aes(x = Plate, y = number_of_bases/number_of_reads, fill = Plate)) +
        geom_violin(position = dodge, size = 0) +
        geom_boxplot(width = 0.1, position = dodge, fill="white") +
        scale_fill_manual(values = sampleCols) +
        labs(
                x = "Plate", 
                y = "Number of bases per read"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                legend.position = "none", 
                text = element_text(size = 12)
        )

3.3.4 N50

ggplot(data, aes(x = n50, y = Plate, fill = ..x..)) +
        geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
        scale_fill_viridis(name = "N50", option = "F") +
        labs(
                x = "N50", 
                y = "Plate"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                text = element_text(size = 12)
        )

3.3.5 Read quality

3.3.5.1 Proportion

qualDf <- data.frame(
        Plate = data$Plate,
        Barcode = data$Barcode,
        Lt10 = data$number_of_reads - data$Reads..Q10._count,
        Gt10Lt15 = data$Reads..Q10._count - data$Reads..Q15._count,
        Gt15lt20 = data$Reads..Q15._count - data$Reads..Q20._count,
        Gt20lt25 = data$Reads..Q20._count - data$Reads..Q25._count,
        Gt25lt30 = data$Reads..Q25._count - data$Reads..Q30._count,
        Gt30 = data$Reads..Q30._count
)
qDf <- melt(qualDf, value.name = "Count")
colnames(qDf) <- c("Plate", "Barcode", "Quality", "Count")
qDf$Quality <- factor(qDf$Quality, levels = c("Gt30", "Gt25lt30", "Gt20lt25", "Gt15lt20", "Gt10Lt15", "Lt10"), labels = c(">=30", ">=25 and <30", ">=20 and <25", ">=15 and <20", ">=10 and <15", "<10"))

ggplot(qDf, aes(x = Plate, y = Count, fill = Quality)) + 
        geom_bar(position="fill", stat="identity") +
        labs(
                x = "Plate", 
                y = "Proportion"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                text = element_text(size = 12)
        ) + scale_fill_grey()

3.3.5.2 Median

ggplot(data, aes(x = median_qual, y = Plate, fill = ..x..)) +
        geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
        scale_fill_viridis(name = "Phred", option = "C") +
        labs(
                x = "Quality, median", 
                y = "Plate"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                text = element_text(size = 12)
        )

3.3.5.3 Mean

ggplot(data, aes(x = mean_qual, y = Plate, fill = ..x..)) +
        geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
        scale_fill_viridis(name = "Phred", option = "C") +
        labs(
                x = "Quality, mean", 
                y = "Plate"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                text = element_text(size = 12)
        )

3.3.6 Read length

3.3.6.1 Median

ggplot(data, aes(x = median_read_length, y = Plate, fill = ..x..)) +
        geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
        scale_fill_viridis(name = "bp", option = "D") +
        labs(
                x = "Length, median", 
                y = "Plate"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                text = element_text(size = 12)
        )

3.3.6.2 Mean

ggplot(data, aes(x = mean_read_length, y = Plate, fill = ..x..)) +
        geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
        scale_fill_viridis(name = "bp", option = "D") +
        labs(
                x = "Length, mean", 
                y = "Plate"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                text = element_text(size = 12)
        )

4 Study design

  • The Sankey plot was generated using 1,210 mouse samples after excluding 19 donor samples, 12 OMM12 controls, and 8 negative controls.
    • 1,249 = 1,210 + 19 + 12 + 8
  • One sample, F1_F23_6_F, was sequenced twice; therefore, we annotated the replicate libraries as F1_F23-1_6_F and F1_F23-2_6_F corresponding to Plate10_barcode08 and Plate14_barcode12, respectively.
sampleAnnot <- read.table(file.path(baseDir, "Data", "sample_annot.txt"), header = T, stringsAsFactors = F, sep = "\t")
annotDf <- sampleAnnot %>% make_long(DonorID, MouseSex, Week, Type)
ggplot(annotDf, aes(x = x, next_x = next_x, node = node, next_node = next_node, fill = factor(node), label = node)) +
        geom_sankey(flow.alpha = .6, node.color = "gray30") +
        geom_sankey_label(size = 3, color = "black", fill = "white") +
        scale_fill_viridis_d(option = "A", alpha = 0.95) +
        theme_sankey(base_size = 18) +
        labs(
                x = NULL, 
                y = NULL
        ) +
        theme_bw() +
        theme(
                axis.line = element_blank(),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                axis.ticks.x = element_blank(),
                axis.text.y = element_blank(),
                axis.ticks.y = element_blank(),
                legend.position = "none", 
                text = element_text(size = 12)
        )

5 SessionInfo

sessioninfo::session_info(pkgs = "attached")
## ─ Session info ───────────────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.5.1 (2025-06-13)
##  os       macOS Tahoe 26.0.1
##  system   aarch64, darwin20
##  ui       X11
##  language (EN)
##  collate  en_CA.UTF-8
##  ctype    en_CA.UTF-8
##  tz       America/Edmonton
##  date     2025-11-06
##  pandoc   3.8.2 @ /opt/homebrew/bin/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────────────
##  package      * version   date (UTC) lib source
##  cowplot      * 1.2.0     2025-07-07 [1] CRAN (R 4.5.0)
##  dplyr        * 1.1.4     2023-11-17 [1] CRAN (R 4.5.0)
##  ggplot2      * 4.0.0     2025-09-11 [1] CRAN (R 4.5.0)
##  ggrepel      * 0.9.6     2024-09-07 [1] CRAN (R 4.5.0)
##  ggridges     * 0.5.7     2025-08-27 [1] CRAN (R 4.5.0)
##  ggsankey     * 0.0.99999 2025-11-04 [1] Github (davidsjoberg/ggsankey@b675d0d)
##  knitr        * 1.50      2025-03-16 [1] CRAN (R 4.5.0)
##  RColorBrewer * 1.1-3     2022-04-03 [1] CRAN (R 4.5.0)
##  reshape2     * 1.4.4     2020-04-09 [1] CRAN (R 4.5.0)
##  sessioninfo  * 1.2.3     2025-02-05 [1] CRAN (R 4.5.0)
##  stringr      * 1.5.2     2025-09-08 [1] CRAN (R 4.5.0)
##  viridis      * 0.6.5     2024-01-29 [1] CRAN (R 4.5.0)
##  viridisLite  * 0.4.2     2023-05-02 [1] CRAN (R 4.5.0)
##  yaml         * 2.3.10    2024-07-26 [1] CRAN (R 4.5.0)
## 
##  [1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library
##  * ── Packages attached to the search path.
## 
## ──────────────────────────────────────────────────────────────────────────────────────