% Doctoral thesis records (school: Universit{\"a}t W{\"u}rzburg), years 2022-2024.
%
% Conventions used for the entries below:
%   * One field per line; order is author, title, school, year, identifiers,
%     abstract, then the repository-specific fields (subject, language).
%   * Acronyms in titles (CRISPR-Cas, RNA) are brace-protected so that
%     sentence-casing bibliography styles do not lower-case them.
%   * Non-ASCII characters are written as LaTeX escapes, matching the
%     existing {\"a}/{\"u} usage, so the file stays usable with 8-bit BibTeX.
%   * "subject" and "language" are not standard BibTeX fields; standard styles
%     ignore unknown fields, so they are kept as exported.
%   * Abstracts are quoted from the source records; only encoding/export
%     artefacts and an evident misprint were corrected, wording is untouched.

@phdthesis{Yu2024,
  author   = {Yu, Yanying},
  title    = {Applied machine learning for the analysis of {CRISPR-Cas} systems},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2024},
  doi      = {10.25972/OPUS-32021},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-320219},
  abstract = {Among the defense strategies developed in microbes over millions of years, the innate adaptive CRISPR-Cas immune systems have spread across most of bacteria and archaea. The flexibility, simplicity, and specificity of CRISPR-Cas systems have laid the foundation for CRISPR-based genetic tools. Yet, the efficient administration of CRISPR-based tools demands rational designs to maximize the on-target efficiency and off-target specificity. Specifically, the selection of guide RNAs (gRNAs), which play a crucial role in the target recognition of CRISPR-Cas systems, is non-trivial. Despite the fact that the emerging machine learning techniques provide a solution to aid in gRNA design with prediction algorithms, design rules for many CRISPR-Cas systems are ill-defined, hindering their broader applications. CRISPR interference (CRISPRi), an alternative gene silencing technique using a catalytically dead Cas protein to interfere with transcription, is a leading technique in bacteria for functional interrogation, pathway manipulation, and genome-wide screens. Although the application is promising, it also is hindered by under-investigated design rules. Therefore, in this work, I develop a state-of-art predictive machine learning model for guide silencing efficiency in bacteria leveraging the advantages of feature engineering, data integration, interpretable AI, and automated machine learning. I first systematically investigate the influential factors that attribute to the extent of depletion in multiple CRISPRi genome-wide essentiality screens in Escherichia coli and demonstrate the surprising dominant contribution of gene-specific effects, such as gene expression level. These observations allowed me to segregate the confounding gene-specific effects using a mixed-effect random forest (MERF) model to provide a better estimate of guide efficiency, together with the improvement led by integrating multiple screens. The MERF model outperformed existing tools in an independent high-throughput saturating screen. I next interpret the predictive model to extract the design rules for robust gene silencing, such as the preference for cytosine and disfavoring for guanine and thymine within and around the protospacer adjacent motif (PAM) sequence. I further incorporated the MERF model in a web-based tool that is freely accessible at www.ciao.helmholtz-hiri.de. When comparing the MERF model with existing tools, the performance of the alternative gRNA design tool optimized for CRISPRi in eukaryotes when applied to bacteria was far from satisfying, questioning the robustness of prediction algorithms across organisms. In addition, the CRISPR-Cas systems exhibit diverse mechanisms albeit with some similarities. The captured predictive patterns from one dataset thereby are at risk of poor generalization when applied across organisms and CRISPR-Cas techniques. To fill the gap, the machine learning approach I present here for CRISPRi could serve as a blueprint for the effective development of prediction algorithms for specific organisms or CRISPR-Cas systems of interest. The explicit workflow includes three principle steps: 1) accommodating the feature set for the CRISPR-Cas system or technique; 2) optimizing a machine learning model using automated machine learning; 3) explaining the model using interpretable AI. To illustrate the applicability of the workflow and diversity of results when applied across different bacteria and CRISPR-Cas systems, I have applied this workflow to analyze three distinct CRISPR-Cas genome-wide screens. From the CRISPR base editor essentiality screen in E. coli, I have determined the PAM preference and sequence context in the editing window for efficient editing, such as A at the 2nd position of PAM, A/TT/TG downstream of PAM, and TC at the 4th to 5th position of gRNAs. From the CRISPR-Cas13a screen in E. coli, in addition to the strong correlation with the guide depletion, the target expression level is the strongest predictor in the model, supporting it as a main determinant of the activation of Cas13-induced immunity and better characterizing the CRISPR-Cas13 system. From the CRISPR-Cas12a screen in Klebsiella pneumoniae, I have extracted the design rules for robust antimicrobial activity across K. pneumoniae strains and provided a predictive algorithm for gRNA design, facilitating CRISPR-Cas12a as an alternative technique to tackle antibiotic resistance. Overall, this thesis presents an accurate prediction algorithm for CRISPRi guide efficiency in bacteria, providing insights into the determinants of efficient silencing and guide designs. The systematic exploration has led to a robust machine learning approach for effective model development in other bacteria and CRISPR-Cas systems. Applying the approach in the analysis of independent CRISPR-Cas screens not only sheds light on the design rules but also the mechanisms of the CRISPR-Cas systems. Together, I demonstrate that applied machine learning paves the way to a deeper understanding and a broader application of CRISPR-Cas systems.},
  subject  = {Maschinelles Lernen},
  language = {en},
}

@phdthesis{Ye2023,
  author   = {Ye, Liqing},
  title    = {{RNA-RNA} interactions in viral genome packaging},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2023},
  doi      = {10.25972/OPUS-29636},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-296361},
  abstract = {RNA is one of the most abundant macromolecules and plays essential roles in numerous biological processes. This doctoral thesis consists of two projects focusing on RNA structure and RNA-RNA interactions in viral genome packaging. In the first project I developed a method called Functional Analysis of RNA Structure (FARS-seq) to investigate structural features regulating genome dimerization within the HIV-1 5'UTR. Genome dimerization is a conserved feature of retroviral replication and is thought to be a prerequisite for binding to the viral structural protein Pr55Gag during genome packaging. It also plays a role in genome integrity and evolution through recombination, and is linked to a structural switch that may regulate genome packaging and translation within cells. Despite its importance for HIV-1 replication, the RNA signals regulating genome dimerization, and the molecular mechanism leading to the selection of the genome dimer over the monomer for packaging are incompletely understood. The FARS-seq method combines RNA structural information obtained by chemical probing with single nucleotide resolution profiles of RNA function obtained by mutational interference. In this way, we found nucleotides that were critical for dimerization, especially within the well-characterized dimerization motif within stem-loop 1 (SL1). We also found stretches of nucleotides that enhanced genome dimerization upon mutation, suggesting their role in negatively regulating dimerization. A structural analysis identified distinct structural signatures within monomeric and dimeric RNA. The dimeric conformation displayed the canonical transactivation response (TAR), PolyA, primer binding site (PBS), and SL1-SL3 stem-loops, and contained a long range U5-AUG interaction. Unexpectedly, in monomeric RNA, SL1 was reconfigured into long- and short-range base-pairings with PolyA and PBS, respectively. Intriguingly, these base pairings concealed the palindromic sequence needed for dimerization and disrupted the internal loop in SL1 previously shown to contain the major packaging motif for Pr55Gag. We therefore rationally introduced mutations into PolyA and PBS, and showed how these regions regulate genome dimerization, and the binding of Pr55Gag in vitro, as well as genome packaging into virions. These findings give insights into late stages of the HIV-1 life cycle and a mechanistic explanation for the link between RNA dimerization and packaging. In the second project, I developed a proximity ligation and high-throughput sequencing-based method, RNA-RNA seq, which can measure direct (RNA-RNA) and indirect (protein-mediated) interactions. In contrast to existing methods, RNA-RNA seq is not limited by specific protein or RNA baits, nor to a particular crosslinking reagent. The genome of influenza A virus contains eight segments, which assemble into a ``7+1'' supramolecular complex. However, the molecular details of genome assembly are poorly understood. Our goal is to use RNA-RNA seq to identify the sites of interaction between the eight genomic RNAs of influenza, and to use this information to define the quaternary RNA architecture of the genome. We showed that RNA-RNA seq worked on model substrates, like the HIV-1 Dimerization Initiation Site (DIS) RNA and purified ribosome, as well as influenza A virus infected cells.},
  subject  = {RNS-Viren},
  language = {en},
}

@phdthesis{Vafadarnejad2022,
  author   = {Vafadarnejad, Ehsan},
  title    = {Implementation and application of bioinformatics methods to analyze and visualize single-cell {RNA}-sequencing data},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2022},
  doi      = {10.25972/OPUS-26925},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-269258},
  abstract = {RNA sequencing (RNA-seq) has become a transformative method to profile genome-wide gene expression and whole transcriptome analysis over the last decade. In recent years, with the development of new technologies, it has become possible to study gene expression at single-cell level. This new advances in single-cell RNA-sequencing has revolutionized the way scientists study biological processes. Single-cell RNA-sequencing has been used in different areas to better understand the underlying mechanisms of biological processes. In particular, single-RNA-sequencing is a suitable method to study infectious diseases. Infection is composed of heterogeneous mechanisms on either the host or pathogen side and the best way to understand the heterogeneity of these mechanisms and how they interact with each other is to study infectious diseases at the single-cell level. Studying infection processes at the single-cell level can reveal not only the heterogeneity but also the dynamics of infection and the interplay between the host and pathogen at the molecular level. In this thesis, we implemented and applied different single-cell RNA-seq technologies to better understand infectious diseases. In the present work, we conducted four independent but related research works to shed light on different aspects of infection biology: \textbullet{} We took advantage of this novel technology to study the consequences of RSV infection on primary human epithelial cells. The primary human epithelial cells were collected from six donors and cultured in air liquid interface (ALI) cell culture inoculated with respiratory syncytial virus (RSV). In this project, we discovered ciliated cells as the susceptible cell types in RSV infection. We applied viral load as an indicator of infection progression and used it to reconstruct the dynamics of host response to RSV infection. Reconstruction of the dynamics of infection revealed many host genes and pathways that were suppressed or induced as a result of RSV infection. Pathways related to innate immune response and interferon response were suppressed during the progression of infection and on the other hand pathways like protein targeting to endoplasmic reticulum and apoptosis were induced. \textbullet{} We developed a new method which is capable of sequencing the transcriptome of a bacterium at the single-cell level and potentially can help us to characterize the bacterial heterogeneity during the course of infection. In this research project, bacteria were cultured in three different culture conditions namely Late stationary phase, Anaerobic shock and NaCl shock and we used a poly(A)-independent single-cell RNA-sequencing protocol to sequence bacteria at the single-cell level. In this work, we report the faithful capture of growth-dependent gene expression patterns in individual Salmonella and Pseudomonas bacteria. The results of our analysis showed that not only we could capture transcripts across different RNA classes but also our method is capable of discerning the transcriptome of bacteria across different culture conditions. \textbullet{} We used single-cell RNA-sequencing technology to characterize the immune cells landscape over the course of atherosclerosis. Atherosclerosis is considered a cardiac disease which is highly related to infections and previous infections with bacteria or viruses is considered as a risk factor for atherosclerosis. We performed single-cell RNA sequencing of aortic CD45+ cells extracted from healthy and atherosclerotic aorta of mice. We managed to find certain cell populations which were specifically present in atherosclerotic mice. One of the atherosclerotic populations was previously undescribed TREM2high macrophages showing enrichment in Trem2 gene expression. This population of macrophages seemed to be involved in functions like lipid metabolism and catabolism and lesion calcification. This work revealed the phenotypic heterogeneity and immune cells landscape of different immune cell populations at different stages of atherosclerosis. Our work paves the way to better describe the relation between different infectious diseases and cardiovascular diseases. \textbullet{} We developed a web-based platform called Infection Atlas to browse and visualize single-cell RNA-sequencing data. Infection Atlas platform provides a user-friendly interface to study different aspects of infectious diseases at the single-cell level and can potentially promote targeted approaches to intervene in infectious diseases. This platform which is available at infection-atlas.org in the short term provides a user-friendly interface to browse and visualize different aspects of infectious diseases and in the long-term is expected to be a comprehensive atlas of infection in human and mouse across different tissues and different pathogens. Overall, in this thesis we provide a framework to study infectious diseases at the single cell level with providing novel data analysis methods and this thesis paves the way for future studies to study host-pathogen encounters at the single-cell level.},
  subject  = {Einzelzellanalyse},
  language = {en},
}

@phdthesis{Pekarek2024,
  author   = {Pek{\'a}rek, Luk{\'a}{\v{s}}},
  title    = {Single-Molecule Approaches To Study Frameshifting Mechanisms},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2024},
  doi      = {10.25972/OPUS-34611},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-346112},
  abstract = {The RNAs of many viruses contain a frameshift stimulatory element (FSE) that grants access to an alternate reading frame via -1 programmed ribosomal frameshifting (PRF). This -1PRF is essential for effective viral replication. The -1PRF efficiency relies on the presence of conserved RNA elements within the FSE, such as a slippery sequence, spacer, and a downstream secondary structure - often a hairpin or a pseudoknot. The PRF efficiency is also affected by trans-acting factors such as proteins, miRNAs and metabolites. The interactions of these factors with the RNA and the translation machinery have not yet been completely understood. Traditional ensemble methods used previously to study these events focus on the whole population of molecular species. This results in innate averaging of the molecular behavior and a loss of heterogeneity information. Here, we first established the experimental workflow to study the RNA structures and the effect of potential trans-acting factors using single-molecule force spectroscopy technique, optical tweezers. Additionally, to streamline the data analysis, we developed an algorithm for automatized data processing. Next, we harnessed this knowledge to study viral RNA elements responsible for stimulation of PRF and how the presence of trans-acting factors affects the RNA behavior. We further complemented these single-molecule structural data with ensemble functional assays to gain a complex view on the dynamics behind the programmed ribosomal frameshifting. Specifically, two different viral RNA elements have been studied in the presented work. First, the dynamics of SARS-CoV-2 FSE and the role of extended sequences have been explored. Then, the mode of action of the host-encoded trans-acting factor ZAP-S inhibition of SARS-CoV-2 PRF has been examined. Finally, the mechanism of the trans-acting viral factor induced PRF in Encephalomyocarditis virus (EMCV) has been uncovered.},
  language = {en},
}

@phdthesis{MikaGospodorz2022,
  author   = {Mika-Gospodorz, Bozena},
  title    = {Development and application of bioinformatics tools for analysis of dual {RNA}-seq experiments},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2022},
  doi      = {10.25972/OPUS-28126},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-281264},
  abstract = {Dual RNA-seq captures both host and pathogen transcriptomes at the site of infection, facilitating an exploration of processes that play an essential role in pathogenesis and the host defense. This work presents an application of this technique to explore processes occurring during the infection of the human endothelial cells with two clinical isolates of Orientia tsutsugamushi (Ot) --- the causative agent of scrub typhus. Combining comparative genomics, transcriptomics, and proteomics, we investigated the transcriptional architecture of Ot and identified non-coding RNAs, operon structures, and widespread antisense transcription, that may have a role in regulation of repetitive genes that are abundant in the Ot genome. In addition, the comparative analysis of bacterial and eukaryotic transcriptomes allowed us to investigate factors that drive the difference in virulence between Karp and UT176 and the host response to these two Ot strains. The host and pathogen transcriptional profiles in each dual RNA-seq study are obtained in-silico by adopting tools developed for RNA-seq data analysis. The Dualrnaseq pipeline presented in the second part of this work is the first publicly available, highly reproducible, scalable, and user-friendly workflow developed for processing dual RNA-seq data of any eukaryotic and bacterial organisms with a reference genome and annotation. It provides three mapping and quantification strategies: (i) alignment-based mapping of reads onto the chimeric genome with STAR followed by counting of uniquely mapped reads with HTSeq; (ii) a fast transcriptome quantification method handling multi-mapped reads (Salmon with Selective Alignment); (iii) and Salmon alignment-based mode which uses a STAR-derived alignment combined with Salmon quantification. Performing an initial benchmark analysis of the employed methods we provided recommendations ensuring accurate estimation of host and pathogen transcript expression.},
  subject  = {Transkriptomanalyse},
  language = {en},
}

@phdthesis{Imdahl2023,
  author   = {Imdahl, Fabian Dominik},
  title    = {Development of novel experimental approaches to decipher host-pathogen interaction at the single-cell level},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2023},
  doi      = {10.25972/OPUS-28943},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-289435},
  abstract = {COVID-19 has impressively shown how quickly an emerging pathogen can have a massive impact on our entire lives and show how infectious diseases spread regardless of national borders and economic stability. We find ourselves in a post-antibiotic era and have rested too long on the laurels of past research, so today more and more people are dying from infections with multi-resistant germs. Infections are highly plastic and heterogeneous processes that are strongly dependent on the individual, whether on the host or pathogen side. Improving our understanding of the pathogenicity of microorganisms and finding potential targets for a completely new class of drugs is a declared goal of current basic research. To tackle this challenge, single-cell RNA sequencing (scRNA-seq) is our most accurate tool. In this thesis we implemented different state of the art scRNA-seq technologies to better understand infectious diseases. Furthermore, we developed a new method which is capable to resolve the transcriptome of a single bacterium. Applying a poly(A)-independent scRNA-seq protocol to three different, infection relevant growth conditions we can report the faithful detection of growth-dependent gene expression patterns in individual Salmonella Typhimurium and Pseudomonas aeruginosa bacteria. The data analysis shows that this method not only allows the differentiation of various culture conditions but can also capture transcripts across different RNA species. Furthermore, using state of the art imaging and single-cell RNA sequencing technologies, we comprehensively characterized a human intestinal tissue model which in further course of the project was used as a Salmonella enterica serovar Typhimurium infection model. While most infection studies are conducted in mice, lacking a human intestinal physiology, the in vitro human tissue model allows us to directly infer in vivo pathogenesis. Combining immunofluorescent imaging, deep single-cell RNA sequencing and HCR-FISH, applied in time course experiments, allows an unseen resolution for studying heterogeneity and the dynamics of Salmonella infection which reveals details of pathogenicity contrary to the general scientific opinion.},
  subject  = {Salmonella},
  language = {en},
}

@phdthesis{Dietrich2024,
  author   = {Dietrich, Oliver},
  title    = {Integrating single-cell multi-omics to decipher host-pathogen interactions},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2024},
  doi      = {10.25972/OPUS-36013},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-360138},
  abstract = {Interactions between host and pathogen determine the development, progression and outcomes of disease. Medicine benefits from better descriptions of these interactions through increased precision of prevention, diagnosis and treatment of diseases. Single-cell genomics is a disruptive technology revolutionizing science by increasing the resolution with which we study diseases. Cell type specific changes in abundance or gene expression are now routinely investigated in diseases. Meanwhile, detecting cellular phenotypes across diseases can connect scientific fields and fuel discovery. Insights acquired through systematic analysis of high resolution data will soon be translated into clinical practice and improve decision making. Therefore, the continued use of single-cell technologies and their application towards clinical samples will improve molecular interpretation, patient stratification, and the prediction of outcomes. In the past years, I was fortunate to participate in interdisciplinary research groups bridging biology, clinical research and data science. I was able to contribute to diverse projects through computational analysis and biological interpretation of sequencing data. Together, we were able to discover cellular phenotypes that influence disease progression and outcomes as well as the response to treatment. Here, I will present four studies that I have conducted in my PhD. First, we performed a case study of relapse from cell-based immunotherapy in Multiple Myeloma. We identified genomic deletion of the epitope as mechanism of immune escape and implicate heterozygosity or monosomy of the genomic locus at baseline as a potential risk factor. Second, we investigated the pathomechanisms of severe COVID-19 at the earliest stage of the COVID-19 pandemic in Germany in March 2020. We discovered that profibrotic macrophages and lung fibrosis can be caused by SARS-CoV-2 infection. Third, we used a mouse model of chronic infection with Staphylococcus aureus that causes Osteomyelitis similar to the human disease. We were able to identify dysregulated immunometabolism associated with the generation of myeloid-derived suppressor cells (MDSC). Fourth, we investigated Salmonella infection of the human small intestine in an in vitro model and describe features of pathogen invasion and host response. Overall, I have been able to successfully employ single-cell sequencing to discover important aspects of diseases ranging from development to treatment and outcome. I analyzed samples from the clinics, human donors, mouse models and organoid models to investigate different aspects of diseases and managed to integrate data across sample types, technologies and diseases. Based on successful studies, we increased our efforts to combine data from multiple sources to build comprehensive references for the integration of large collections of clinical samples. Our findings exemplify how single-cell sequencing can improve clinical research and highlights the potential of mechanistic discoveries to drive precision medicine.},
  subject  = {Einzelzellanalyse},
  language = {en},
}