microbiome · raivo-otus · Oct 3, 2025 · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025
diff --git a/inst/assets/bibliography.bib b/inst/assets/bibliography.bib
@@ -3069,6 +3069,25 @@ @article{Robinson2010
    year = {2010}
 }
 
+@article{Gamboa-Tuz2025,
+  author = {Gamboa-Tuz, Samuel D. and Ramos, Marcel and Franzosa, Eric and Huttenhower, Curtis and Segata, Nicola and Oh, Sehyun and Waldron, Levi},
+  title = {Commonly used compositional data analysis implementations are not advantageous in microbial differential abundance analyses benchmarked against biological ground truth},
+  elocation-id = {2025.02.13.638109},
+  year = {2025},
+  doi = {10.1101/2025.02.13.638109},
+  publisher = {Cold Spring Harbor Laboratory},
+  abstract = {Previous benchmarking of differential abundance (DA) analysis methods in microbiome studies have employed synthetic data, simulations, and {\textquotedblleft}real data{\textquotedblright} examples, but to the best of our knowledge, none have yet employed experimental data with known {\textquotedblleft}ground truth{\textquotedblright} differential abundance. A key debate in the field centers on whether compositional methods are necessary for DA analysis, which is challenging to answer due to the lack of ground truth data. To address this gap, we created the Bioconductor data package MicrobiomeBenchmarkData, featuring three microbiome datasets with established biological ground truths: 1) diverse oral microbiomes from supragingival and subgingival plaques, expected to favor aerobic and anaerobic bacteria, respectively, 2) low-diversity microbiomes from healthy vaginas and bacterial vaginosis, conditions that have been well-characterized through cell culture and microscopy, and 3) a spike-in dataset with constant, known absolute abundances of three bacteria. We benchmarked 17 DA approaches and demonstrated that compositional DA methods are not beneficial but rather lack sensitivity, show increased variability in constant-abundance spike-ins, and, most surprisingly, more frequently produce paradoxical results with DA in the wrong direction for the low-diversity microbiome. Conversely, commonly used methods in microbiome literature, such as LEfSe, the Wilcoxon test, and RNA-seq-derived methods, performed best. We conclude that researchers continue using widely adopted non-parametric or RNA-seq DA methods and that further development of compositional methods includes benchmarking against datasets with known biological ground truth. Competing Interest Statement: The authors have declared no competing interest.},
+  URL = {https://www.biorxiv.org/content/early/2025/02/17/2025.02.13.638109},
+  eprint = {https://www.biorxiv.org/content/early/2025/02/17/2025.02.13.638109.full.pdf},
+  journal = {bioRxiv}
+}
+
+@manual{HMP2data,
+  author = {Stansfield, John and Smirnova, Ekaterina and Zhao, Ni and Fettweis, Jennifer and Waldron, Levi and Dozmorov, Mikhail},
+  title = {{HMP2data}: 16s rRNA sequencing data from the Human Microbiome Project 2},
+  year = {2025},
+  note = {R package version 1.22.0},
+  doi = {10.18129/B9.bioc.HMP2Data}
+}
+
 @article{Medearis2026,
   abstract = {The human gut microbiome encodes rich information about host health, yet current analysis pipelines remain narrowly optimized for individual tasks. This limits our ability to gain a thorough view of how the microbiome impacts health and disease. Here we introduce BiomeGPT, a transformer-based foundation model pretrained on over 13,300 human gut metagenomes spanning 32 phenotypes—including healthy and 31 diverse diseases—to learn context-aware, species-level gut microbiome representations. The model captures quantitative compositional structure and intricate cross-species dependencies embedded within community profiles. When fine-tuned for predicting host health status, BiomeGPT accurately distinguishes healthy from diseased microbiomes and resolves individual disease states across a broad clinical spectrum. Furthermore, its attention patterns reveal biologically plausible microbial signatures, highlighting both shared and disease-specific microbial species linked to host phenotypes. By providing a unified, scalable framework for species-level gut microbiome representation learning and prediction, BiomeGPT enables new avenues for biomarker discovery, disease stratification, and microbiome-driven precision medicine.},
   author = {Medearis, Nicholas A. and Zhu, Siyao and Zomorrodi, Ali R.},

diff --git a/inst/pages/agglomeration.qmd b/inst/pages/agglomeration.qmd
@@ -88,7 +88,7 @@
 
 Since we specified `update.tree = TRUE`, the phylogenetic tree has also been
 agglomerated. This is evident from the tree, which now has only
-`r length(rowTree(tse_phylum)$tip.label)` tips, each corresponding to a single
+`r length(TreeSummarizedExperiment::rowTree(tse_phylum)$tip.label)` tips, each corresponding to a single
 row in the dataset.
 
 ```{r}
@@ -141,7 +141,7 @@

 # Add another assay
 assay(tse, "another_assay", withDimnames = FALSE) <- matrix(
    runif(ncol(tse) * nrow(tse), 0, 1),
    ncol = ncol(tse), nrow = nrow(tse)
 )

@@ -243,7 +243,7 @@

 # Agglomerate
 tse_prev <- agglomerateByPrevalence(
    tse,
    assay.type = "relabundance",
    prevalence = 20 / 100,
    detection = 0.1 / 100

diff --git a/inst/pages/import.qmd b/inst/pages/import.qmd
@@ -525,20 +525,43 @@ sequencing data, all processed using the same pipeline and reference
 database. For more use examples in R/Bioconductor, see the 
 [MicroBioMap vignette](https://blekhmanlab.github.io/MicroBioMap/articles/overview.html).
 
+### Integrative human microbiome project
+
+Datasets from the second phase of the Human Microbiome Project, also known as
+the Integrative Human Microbiome Project (iHMP), are made available with
+`r BiocStyle::Biocpkg("HMP2Data")` [@HMP2data].
+The datasets are offered as `TreeSE` objects. Additional data, *e.g.* cytokines,
+is included when available.
+
+### Microbiome benchmark data
+
+The `r BiocStyle::Biocpkg("MicrobiomeBenchmarkData")` package provides access
+to datasets with a known biological ground truth [@Gamboa-Tuz2025]. These
+datasets were compiled for benchmarking differential abundance methods, but the
+ground truth may also be useful for other benchmarking applications. Datasets
+are offered directly as `TreeSE` objects.
+
 ### Other data sources
 
 The current collections provide access to vast microbiome data
-resources. The output has to be converted into `TreeSE/MAE` separately.
+resources. The output might need to be converted into `TreeSE/MAE` separately.
 
 - `r BiocStyle::Biocpkg("MGnifyR")` provides access to
 [EBI/MGnify](https://www.ebi.ac.uk/metagenomics/)
 - `r BiocStyle::Biocpkg("HoloFoodR")` provides access to
-[EBI/HoloFood](https://www.holofooddata.org/)
+[EBI/HoloFood](https://www.holofooddata.org/) datasets directly as
+`TreeSE` objects
 - `r BiocStyle::Githubpkg("cran/qiitr")` provides access to
 [QIITA](https://qiita.com/about)
 - `r BiocStyle::Githubpkg("jbisanz/qiime2R")` provides access to
 [QIIME2](https://amplicon-docs.qiime2.org/en/latest//)
 
+| Package | Resource | Direct `TreeSE` output |
+| ------- | -------- | ------------- |
+| `r BiocStyle::Biocpkg("MGnifyR")` | [EBI/MGnify](https://www.ebi.ac.uk/metagenomics/) | |
+| `r BiocStyle::Biocpkg("HoloFoodR")` | [EBI/HoloFood](https://www.holofooddata.org/) | yes |
+| `r BiocStyle::Githubpkg("cran/qiitr")` | [QIITA](https://qiita.com/about) | |
+| `r BiocStyle::Githubpkg("jbisanz/qiime2R")` | [QIIME2](https://amplicon-docs.qiime2.org/en/latest//) | |
+
 ::: {.callout-tip icon="false"}
 ## Exercises