% PhD thesis record (school: Universitaet Wuerzburg, 2023). Accented letters use brace-wrapped LaTeX escapes such as {\'e} and {\"a} so the entry stays ASCII-only.
@phdthesis{Marquardt2023,
  author    = {Marquardt, Andr{\'e}},
  title     = {Machine-Learning-Based Identification of Tumor Entities, Tumor Subgroups, and Therapy Options},
  doi       = {10.25972/OPUS-32954},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-329548},
  school    = {Universit{\"a}t W{\"u}rzburg},
  year      = {2023},
  abstract  = {Molecular genetic analyses, such as mutation analyses, are becoming increasingly important in the tumor field, especially in the context of therapy stratification. The identification of the underlying tumor entity is crucial, but can sometimes be difficult, for example in the case of metastases or the so-called Cancer of Unknown Primary (CUP) syndrome. In recent years, methylome and transcriptome utilizing machine learning (ML) approaches have been developed to enable fast and reliable tumor and tumor subtype identification. However, so far only methylome analysis have become widely used in routine diagnostics. The present work addresses the utility of publicly available RNA-sequencing data to determine the underlying tumor entity, possible subgroups, and potential therapy options. Identification of these by ML - in particular random forest (RF) models - was the first task. The results with test accuracies of up to 99\% provided new, previously unknown insights into the trained models and the corresponding entity prediction. Reducing the input data to the top 100 mRNA transcripts resulted in a minimal loss of prediction quality and could potentially enable application in clinical or real-world settings. By introducing the ratios of these top 100 genes to each other as a new database for RF models, a novel method was developed enabling the use of trained RF models on data from other sources. Further analysis of the transcriptomic differences of metastatic samples by visual clustering showed that there were no differences specific for the site of metastasis. Similarly, no distinct clusters were detectable when investigating primary tumors and metastases of cutaneous skin melanoma (SKCM). Subsequently, more than half of the validation datasets had a prediction accuracy of at least 80\%, with many datasets even achieving a prediction accuracy of - or close to - 100\%. 
To investigate the applicability of the used methods for subgroup identification, the TCGA-KIPAN dataset, consisting of the three major kidney cancer subgroups, was used. The results revealed a new, previously unknown subgroup consisting of all histopathological groups with clinically relevant characteristics, such as significantly different survival. Based on significant differences in gene expression, potential therapeutic options of the identified subgroup could be proposed. Concludingly, in exploring the potential applicability of RNA-sequencing data as a basis for therapy prediction, it was shown that this type of data is suitable to predict entities as well as subgroups with high accuracy. Clinical relevance was also demonstrated for a novel subgroup in renal cell carcinoma. The reduction of the number of genes required for entity prediction to 100 genes, enables panel sequencing and thus demonstrates potential applicability in a real-life setting.},
  subject   = {Maschinelles Lernen},
  language  = {en}
}