% PhD thesis record (school: Universitaet Wuerzburg, year 2022).
% {RNA-seq} is braced in the title so that styles which sentence-case titles
% keep its capitalization. The abstract uses ASCII/LaTeX punctuation only
% ("---" and "-"), matching the LaTeX-escaped umlauts in the school field.
@phdthesis{MikaGospodorz2022,
  author   = {Mika-Gospodorz, Bozena},
  title    = {Development and application of bioinformatics tools for analysis of dual {RNA-seq} experiments},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2022},
  doi      = {10.25972/OPUS-28126},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-281264},
  abstract = {Dual RNA-seq captures both host and pathogen transcriptomes at the site of infection, facilitating an exploration of processes that play an essential role in pathogenesis and the host defense. This work presents an application of this technique to explore processes occurring during the infection of the human endothelial cells with two clinical isolates of Orientia tsutsugamushi (Ot) --- the causative agent of scrub typhus. Combining comparative genomics, transcriptomics, and proteomics, we investigated the transcriptional architecture of Ot and identified non-coding RNAs, operon structures, and widespread antisense transcription, that may have a role in regulation of repetitive genes that are abundant in the Ot genome. In addition, the comparative analysis of bacterial and eukaryotic transcriptomes allowed us to investigate factors that drive the difference in virulence between Karp and UT176 and the host response to these two Ot strains. The host and pathogen transcriptional profiles in each dual RNA-seq study are obtained in-silico by adopting tools developed for RNA-seq data analysis. The Dualrnaseq pipeline presented in the second part of this work is the first publicly available, highly reproducible, scalable, and user-friendly workflow developed for processing dual RNA-seq data of any eukaryotic and bacterial organisms with a reference genome and annotation.
    It provides three mapping and quantification strategies: (i) alignment-based mapping of reads onto the chimeric genome with STAR followed by counting of uniquely mapped reads with HTSeq; (ii) a fast transcriptome quantification method handling multi-mapped reads (Salmon with Selective Alignment); (iii) and Salmon alignment-based mode which uses a STAR-derived alignment combined with Salmon quantification. Performing an initial benchmark analysis of the employed methods we provided recommendations ensuring accurate estimation of host and pathogen transcript expression.},
  subject  = {Transkriptomanalyse},
  language = {en}
}

% PhD thesis record (school: Universitaet Wuerzburg, year 2024).
% {CRISPR-Cas} is braced in the title so that styles which sentence-case
% titles keep its capitalization.
@phdthesis{Yu2024,
  author   = {Yu, Yanying},
  title    = {Applied machine learning for the analysis of {CRISPR-Cas} systems},
  school   = {Universit{\"a}t W{\"u}rzburg},
  year     = {2024},
  doi      = {10.25972/OPUS-32021},
  url      = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-320219},
  abstract = {Among the defense strategies developed in microbes over millions of years, the innate adaptive CRISPR-Cas immune systems have spread across most of bacteria and archaea. The flexibility, simplicity, and specificity of CRISPR-Cas systems have laid the foundation for CRISPR-based genetic tools. Yet, the efficient administration of CRISPR-based tools demands rational designs to maximize the on-target efficiency and off-target specificity. Specifically, the selection of guide RNAs (gRNAs), which play a crucial role in the target recognition of CRISPR-Cas systems, is non-trivial. Despite the fact that the emerging machine learning techniques provide a solution to aid in gRNA design with prediction algorithms, design rules for many CRISPR-Cas systems are ill-defined, hindering their broader applications. CRISPR interference (CRISPRi), an alternative gene silencing technique using a catalytically dead Cas protein to interfere with transcription, is a leading technique in bacteria for functional interrogation, pathway manipulation, and genome-wide screens.
    Although the application is promising, it also is hindered by under-investigated design rules. Therefore, in this work, I develop a state-of-art predictive machine learning model for guide silencing efficiency in bacteria leveraging the advantages of feature engineering, data integration, interpretable AI, and automated machine learning. I first systematically investigate the influential factors that attribute to the extent of depletion in multiple CRISPRi genome-wide essentiality screens in Escherichia coli and demonstrate the surprising dominant contribution of gene-specific effects, such as gene expression level. These observations allowed me to segregate the confounding gene-specific effects using a mixed-effect random forest (MERF) model to provide a better estimate of guide efficiency, together with the improvement led by integrating multiple screens. The MERF model outperformed existing tools in an independent high-throughput saturating screen. I next interpret the predictive model to extract the design rules for robust gene silencing, such as the preference for cytosine and disfavoring for guanine and thymine within and around the protospacer adjacent motif (PAM) sequence. I further incorporated the MERF model in a web-based tool that is freely accessible at www.ciao.helmholtz-hiri.de. When comparing the MERF model with existing tools, the performance of the alternative gRNA design tool optimized for CRISPRi in eukaryotes when applied to bacteria was far from satisfying, questioning the robustness of prediction algorithms across organisms. In addition, the CRISPR-Cas systems exhibit diverse mechanisms albeit with some similarities. The captured predictive patterns from one dataset thereby are at risk of poor generalization when applied across organisms and CRISPR-Cas techniques.
    To fill the gap, the machine learning approach I present here for CRISPRi could serve as a blueprint for the effective development of prediction algorithms for specific organisms or CRISPR-Cas systems of interest. The explicit workflow includes three principle steps: 1) accommodating the feature set for the CRISPR-Cas system or technique; 2) optimizing a machine learning model using automated machine learning; 3) explaining the model using interpretable AI. To illustrate the applicability of the workflow and diversity of results when applied across different bacteria and CRISPR-Cas systems, I have applied this workflow to analyze three distinct CRISPR-Cas genome-wide screens. From the CRISPR base editor essentiality screen in E. coli, I have determined the PAM preference and sequence context in the editing window for efficient editing, such as A at the 2nd position of PAM, A/TT/TG downstream of PAM, and TC at the 4th to 5th position of gRNAs. From the CRISPR-Cas13a screen in E. coli, in addition to the strong correlation with the guide depletion, the target expression level is the strongest predictor in the model, supporting it as a main determinant of the activation of Cas13-induced immunity and better characterizing the CRISPR-Cas13 system. From the CRISPR-Cas12a screen in Klebsiella pneumoniae, I have extracted the design rules for robust antimicrobial activity across K. pneumoniae strains and provided a predictive algorithm for gRNA design, facilitating CRISPR-Cas12a as an alternative technique to tackle antibiotic resistance. Overall, this thesis presents an accurate prediction algorithm for CRISPRi guide efficiency in bacteria, providing insights into the determinants of efficient silencing and guide designs. The systematic exploration has led to a robust machine learning approach for effective model development in other bacteria and CRISPR-Cas systems.
    Applying the approach in the analysis of independent CRISPR-Cas screens not only sheds light on the design rules but also the mechanisms of the CRISPR-Cas systems. Together, I demonstrate that applied machine learning paves the way to a deeper understanding and a broader application of CRISPR-Cas systems.},
  subject  = {Maschinelles Lernen},
  language = {en}
}