@comment{
  Maistrenko2021 -- PhD thesis (school and year as given in the fields below).
  Topic per title: pangenome analysis of bacteria applied to metagenomics.
  The abstract writes the species-count exponent in math mode and escapes
  percent signs so that the field typesets safely under LaTeX.
}
@phdthesis{Maistrenko2021,
  author   = {Maistrenko, Oleksandr},
  title    = {Pangenome analysis of bacteria and its application in metagenomics},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2021},
  doi      = {10.25972/OPUS-21499},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-214996},
  subject  = {Pangenom},
  language = {en},
  abstract = {The biosphere harbors a large quantity and diversity of microbial organisms that can thrive in all environments. Estimates of the total number of microbial species reach up to $10^{12}$, of which less than 15,000 have been characterized to date. It has been challenging to delineate phenotypically, evolutionary and ecologically meaningful lineages such as for example, species, subspecies and strains. Even within recognized species, gene content can vary considerably between sublineages (for example strains), a problem that can be addressed by analyzing pangenomes, defined as the non-redundant set of genes within a phylogenetic clade, as evolutionary units. Species considered to be ecologically and evolutionary coherent units, however to date it is still not fully understood what are primary habitats and ecological niches of many prokaryotic species and how environmental preferences drive their genomic diversity. Majority of comparative genomics studies focused on a single prokaryotic species in context of clinical relevance and ecology. With accumulation of sequencing data due to genomics and metagenomics, it is now possible to investigate trends across many species, which will facilitate understanding of pangenome evolution, species and subspecies delineation.
    The major aims of this thesis were 1) to annotate habitat preferences of prokaryotic species and strains; 2) investigate to what extent these environmental preferences drive genomic diversity of prokaryotes and to what extent phylogenetic constraints limit this diversification; 3) explore natural nucleotide identity thresholds to delineate species in bacteria in metagenomics gene catalogs; 4) explore species delineation for applications in subspecies and strain delineation in metagenomics. The first part of the thesis describes methods to infer environmental preferences of microbial species. This data is a prerequisite for the analyses performed in the second part of the thesis which explores how the structure of bacterial pangenomes is predetermined by past evolutionary history and how is it linked to environmental preferences of the species. The main finding in this subchapter that habitat preferences explained up to 49\% of the variance for pangenome structure, compared to 18\% by phylogenetic inertia. In general, this trend indicates that phylogenetic inertia does not limit evolution of pangenome size and diversity, but that convergent evolution may overcome phylogenetic constraints. In this project we show that core genome size is associated with higher environmental ubiquity of species. It is likely this is due to the fact that species need to have more versatile genomes and most necessary genes need to be present in majority of genomes of that species to be highly prevalent. Taken together these findings may be useful for future predictive analyses of ecological niches in newly discovered species. The third part of the thesis explores data-driven, operational species boundaries. I show that homologous genes from the same species from different genomes tend to share at least 95\% of nucleotide identity, while different species within the same genus have lower nucleotide identity.
    This is in line with other studies showing that genome-wide natural species boundary might be in range of 90--95\% of nucleotide identity. Finally, the fourth part of the thesis discusses how challenges in species delineation are relevant for the identification of meaningful within-species groups, followed by a discussion on how advancements in species delineation can be applied for classification of within-species genomic diversity in the age of metagenomics.},
}

@comment{
  Costea2016 -- PhD thesis (school and year as given in the fields below).
  Topic per title: stratification and variation of the human gut microbiota.
  Quotations inside the abstract use LaTeX opening and closing quote pairs
  rather than the ASCII double-quote character.
}
@phdthesis{Costea2016,
  author   = {Costea, Paul Igor},
  title    = {Stratification and variation of the human gut microbiota},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2016},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-139649},
  subject  = {Mensch},
  language = {en},
  abstract = {The microbial communities that live inside the human gastrointestinal tract -the human gut microbiome- are important for host health and wellbeing. Characterizing this new ``organ'', made up of as many cells as the human body itself, has recently become possible through technological advances. Metagenomics, the high-throughput sequencing of DNA directly from microbial communities, enables us to take genomic snapshots of thousands of microbes living together in this complex ecosystem, without the need for isolating and growing them. Quantifying the composition of the human gut microbiome allows us to investigate its properties and connect it to host physiology and disease. The wealth of such connections was unexpected and is probably still underestimated. Due to the fact that most of our dietary as well as medicinal intake affects the microbiome and that the microbiome itself interacts with our immune system through a multitude of pathways, many mechanisms have been proposed to explain the observed correlations, though most have yet to be understood in depth. An obvious prerequisite to characterizing the microbiome and its interactions with the host is the accurate quantification of its composition, i.e. determining which microbes are present and in what numbers they occur.
    Historically, standard practices have existed for sample handling, DNA extraction and data analysis for many years. However, these were generally developed for single microbe cultures and it is not always feasible to implement them in large scale metagenomic studies. Partly because of this and partly because of the excitement that new technology brings about, the first metagenomic studies each took the liberty to define their own approach and protocols. From early meta-analysis of these studies it became clear that the differences in sample handling, as well as differences in computational approaches, made comparisons across studies very difficult. This restricts our ability to cross-validate findings of individual studies and to pool samples from larger cohorts. To address the pressing need for standardization, we undertook an extensive comparison of 21 different DNA extraction methods as well as a series of other sample manipulations that affect quantification. We developed a number of criteria for determining the measurement quality in the absence of a mock community and used these to propose best practices for sampling, DNA extraction and library preparation. If these were to be accepted as standards in the field, it would greatly improve comparability across studies, which would dramatically increase the power of our inferences and our ability to draw general conclusions about the microbiome. Most metagenomics studies involve comparisons between microbial communities, for example between fecal samples from cases and controls. A multitude of approaches have been proposed to calculate community dissimilarities (beta diversity) and they are often combined with various preprocessing techniques. Direct metagenomics quantification usually counts sequencing reads mapped to specific taxonomic units, which can be species, genera, etc.
    Due to technology-inherent differences in sampling depth, normalizing counts is necessary, for instance by dividing each count by the sum of all counts in a sample (i.e. total sum scaling), or by subsampling. To derive a single value for community (dis-)similarity, multiple distance measures have been proposed. Although it is theoretically difficult to benchmark these approaches, we developed a biologically motivated framework in which distance measures can be evaluated. This highlights the importance of data transformations and their impact on the measured distances. Building on our experience with accurate abundance estimation and data preprocessing techniques, we can now try and understand some of the basic properties of microbial communities. In 2011, it was proposed that the space of genus level variation of the human gut microbial community is structured into three basic types, termed enterotypes. These were described in a multi-country cohort, so as to be independent of geography, age and other host properties. Operationally defined through a clustering approach, they are ``densely populated areas in a multidimensional space of community composition''(source) and were proposed as a general stratifier for the human population. Later studies that applied this concept to other datasets raised concerns about the optimum number of clusters and robustness of the clustering approach. This heralded a long standing debate about the existence of structure and the best ways to determine and capture it. Here, we reconsider the concept of enterotypes, in the context of the vastly increased amounts of available data. We propose a refined framework in which the different types should be thought of as weak attractors in compositional space and we try to implement an approach to determining which attractor a sample is closest to. To this end, we train a classifier on a reference dataset to assign membership to new samples.
    This way, enterotypes assignment is no longer dataset dependent and effects due to biased sampling are minimized. Using a model in which we assume the existence of three enterotypes characterized by the same driver genera, as originally postulated, we show the relevance of this stratification and propose it to be used in a clinical setting as a potential marker for disease development. Moreover, we believe that these attractors underline different rules of community assembly and we recommend they be accounted for when analyzing gut microbiome samples. While enterotypes describe structure in the community at genus level, metagenomic sequencing can in principle achieve single-nucleotide resolution, allowing us to identify single nucleotide polymorphisms (SNPs) and other genomic variants in the gut microbiome. Analysis methodology for this level of resolution has only recently been developed and little exploration has been done to date. Assessing SNPs in a large, multinational cohort, we discovered that the landscape of genomic variation seems highly structured even beyond species resolution, indicating that clearly distinguishable subspecies are prevalent among gut microbes. In several cases, these subspecies exhibit geo-stratification, with some subspecies only found in the Chinese population. Generally however, they present only minor dispersion limitations and are seen across most of our study populations. Within one individual, one subspecies is commonly found to dominate and only rarely are several subspecies observed to co-occur in the same ecosystem. Analysis of longitudinal data indicates that the dominant subspecies remains stable over periods of more than three years. When interrogating their functional properties we find many differences, with specific ones appearing relevant to the host.
    For example, we identify a subspecies of E. rectale that is lacking the flagellum operon and find its presence to be significantly associated with lower body mass index and lower insulin resistance of their hosts; it also correlates with higher microbial community diversity. These associations could not be seen at the species level (where multiple subspecies are convoluted), which illustrates the importance of this increased resolution for a more comprehensive understanding of microbial interactions within the microbiome and with the host. Taken together, our results provide a rigorous basis for performing comparative metagenomics of the human gut, encompassing recommendations for both experimental sample processing and computational analysis. We furthermore refine the concept of community stratification into enterotypes, develop a reference-based approach for enterotype assignment and provide compelling evidence for their relevance. Lastly, by harnessing the full resolution of metagenomics, we discover a highly structured genomic variation landscape below the microbial species level and identify common subspecies of the human gut microbiome. By developing these high-precision metagenomics analysis tools, we thus hope to contribute to a greatly improved understanding of the properties and dynamics of the human gut microbiome.},
}

@comment{
  Arumugam2010 -- PhD thesis (school and year as given in the fields below).
  Topic per title: comparative metagenomic analysis of the human intestinal
  microbiota.
}
@phdthesis{Arumugam2010,
  author   = {Arumugam, Manimozhiyan},
  title    = {Comparative metagenomic analysis of the human intestinal microbiota},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2010},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-55903},
  subject  = {Darmflora},
  language = {en},
  abstract = {The human gut is home for thousands of microbes that are important for human life. As most of these cannot be cultivated, metagenomics is an important means to understand this important community. To perform comparative metagenomic analysis of the human gut microbiome, I have developed SMASH (Simple metagenomic analysis shell), a computational pipeline.
    SMASH can also be used to assemble and analyze single genomes, and has been successfully applied to the bacterium Mycoplasma pneumoniae and the fungus Chaetomium thermophilum. In the context of the MetaHIT (Metagenomics of the human intestinal tract) consortium our group is participating in, I used SMASH to validate the assembly and to estimate the assembly error rate of 576.7 Gb metagenome sequence obtained using Illumina Solexa technology from fecal DNA of 124 European individuals. I also estimated the completeness of the gene catalogue containing 3.3 million open reading frames obtained from these metagenomes. Finally, I used SMASH to analyze human gut metagenomes of 39 individuals from 6 countries encompassing a wide range of host properties such as age, body mass index and disease states. We find that the variation in the gut microbiome is not continuous but stratified into enterotypes. Enterotypes are complex host-microbial symbiotic states that are not explained by host properties, nutritional habits or possible technical biases. The concept of enterotypes might have far reaching implications, for example, to explain different responses to diet or drug intake. We also find several functional markers in the human gut microbiome that correlate with a number of host properties such as body mass index, highlighting the need for functional analysis and raising hopes for the application of microbial markers as diagnostic or even prognostic tools for microbiota-associated human disorders.},
}