% PhD thesis (Universitaet Wuerzburg, 2023) on deep learning methods for
% geospatial environmental regression tasks. `subject` is a non-standard
% field that standard styles ignore.
@phdthesis{Steininger2023,
  author    = {Steininger, Michael},
  title     = {Deep Learning for Geospatial Environmental Regression},
  school    = {Universit{\"a}t W{\"u}rzburg},
  year      = {2023},
  doi       = {10.25972/OPUS-31312},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-313121},
  abstract  = {Environmental issues have emerged especially since humans burned fossil fuels, which led to air pollution and climate change that harm the environment. These issues' substantial consequences evoked strong efforts towards assessing the state of our environment. Various environmental machine learning (ML) tasks aid these efforts. These tasks concern environmental data but are common ML tasks otherwise, i.e., datasets are split (training, validation, test), hyperparameters are optimized on validation data, and test set metrics measure a model's generalizability. This work focuses on the following environmental ML tasks: Regarding air pollution, land use regression (LUR) estimates air pollutant concentrations at locations where no measurements are available based on measured locations and each location's land use (e.g., industry, streets). For LUR, this work uses data from London (modeled) and Zurich (measured). Concerning climate change, a common ML task is model output statistics (MOS), where a climate model's output for a study area is altered to better fit Earth observations and provide more accurate climate data. This work uses the regional climate model (RCM) REMO and Earth observations from the E-OBS dataset for MOS. Another task regarding climate is grain size distribution interpolation where soil properties at locations without measurements are estimated based on the few measured locations. This can provide climate models with soil information, that is important for hydrology. For this task, data from Lower Franconia is used. Such environmental ML tasks commonly have a number of properties: (i) geospatiality, i.e., their data refers to locations relative to the Earth's surface. (ii) The environmental variables to estimate or predict are usually continuous. (iii) Data can be imbalanced due to relatively rare extreme events (e.g., extreme precipitation).
(iv) Multiple related potential target variables can be available per location, since measurement devices often contain different sensors. (v) Labels are spatially often only sparsely available since conducting measurements at all locations of interest is usually infeasible. These properties present challenges but also opportunities when designing ML methods for such tasks. In the past, environmental ML tasks have been tackled with conventional ML methods, such as linear regression or random forests (RFs). However, the field of ML has made tremendous leaps beyond these classic models through deep learning (DL). In DL, models use multiple layers of neurons, producing increasingly higher-level feature representations with growing layer depth. DL has made previously infeasible ML tasks feasible, improved the performance for many tasks in comparison to existing ML models significantly, and eliminated the need for manual feature engineering in some domains due to its ability to learn features from raw data. To harness these advantages for environmental domains it is promising to develop novel DL methods for environmental ML tasks. This thesis presents methods for dealing with special challenges and exploiting opportunities inherent to environmental ML tasks in conjunction with DL. To this end, the proposed methods explore the following techniques: (i) Convolutions as in convolutional neural networks (CNNs) to exploit reoccurring spatial patterns in geospatial data. (ii) Posing the problems as regression tasks to estimate the continuous variables. (iii) Density-based weighting to improve estimation performance for rare and extreme events. (iv) Multi-task learning to make use of multiple related target variables. (v) Semi-supervised learning to cope with label sparsity. Using these techniques, this thesis considers four research questions: (i) Can air pollution be estimated without manual feature engineering?
This is answered positively by the introduction of the CNN-based LUR model MapLUR as well as the off-the-shelf LUR solution OpenLUR. (ii) Can colocated pollution data improve spatial air pollution models? Multi-task learning for LUR is developed for this, showing potential for improvements with colocated data. (iii) Can DL models improve the quality of climate model outputs? The proposed DL climate MOS architecture ConvMOS demonstrates this. Additionally, semi-supervised training of multilayer perceptrons (MLPs) for grain size distribution interpolation is presented, which can provide improved input data. (iv) Can DL models be taught to better estimate climate extremes? To this end, density-based weighting for imbalanced regression (DenseLoss) is proposed and applied to the DL architecture ConvMOS, improving climate extremes estimation. These methods show how especially DL techniques can be developed for environmental ML tasks with their special characteristics in mind. This allows for better models than previously possible with conventional ML, leading to more accurate assessment and better understanding of the state of our environment.},
  subject   = {Deep learning},
  language  = {en}
}
% PhD thesis (Universitaet Wuerzburg, 2021) on seasonal rainfall prediction
% for East Africa using multiple linear regression and regression with
% ARIMA errors. Case-sensitive words in the title are brace-protected, and
% LaTeX specials in the abstract (\%, \_) are escaped so it can be typeset.
@phdthesis{Karama2021,
  author    = {Karama, Alphonse},
  title     = {{East African} Seasonal Rainfall prediction using multiple linear regression and regression with {ARIMA} errors models},
  school    = {Universit{\"a}t W{\"u}rzburg},
  year      = {2021},
  doi       = {10.25972/OPUS-25183},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-251831},
  abstract  = {The detrimental impacts of climate variability on water, agriculture, and food resources in East Africa underscore the importance of reliable seasonal climate prediction. To overcome this difficulty RARIMAE method were evolved. Applications RARIMAE in the literature shows that amalgamating different methods can be an efficient and effective way to improve the forecasts of time series under consideration. With these motivations, attempt have been made to develop a multiple linear regression model (MLR) and a RARIMAE models for forecasting seasonal rainfall in east Africa under the following objectives: 1. To develop MLR model for seasonal rainfall prediction in East Africa. 2. To develop a RARIMAE model for seasonal rainfall prediction in East Africa. 3. Comparison of model's efficiency under consideration. In order to achieve the above objectives, the monthly precipitation data covering the period from 1949 to 2000 was obtained from Climate Research Unit (CRU). Next to that, the first differenced climate indices were used as predictors. In the first part of this study, the analyses of the rainfall fluctuation in whole Central-East Africa region which span over a longitude of 15 degrees East to 55 degrees East and a latitude of 15 degrees South to 15 degrees North was done by the help of maps. For models' comparison, the R-squared values for the MLR model are subtracted from the R-squared values of RARIMAE model. The results show positive values which indicates that R-squared is improved by RARIMAE model. On the other side, the root mean square errors (RMSE) values of the RARIMAE model are subtracted from the RMSE values of the MLR model and the results show negative value which indicates that RMSE is reduced by RARIMAE model for training and testing datasets. For the second part of this study, the area which is considered covers a longitude of 31.5 degrees East to 41 degrees East and a latitude of 3.5 degrees South to 0.5 degrees South.
This region covers Central-East of the Democratic Republic of Congo (DRC), north of Burundi, south of Uganda, Rwanda, north of Tanzania and south of Kenya. Considering a model constructed based on the average rainfall time series in this region, the long rainfall season counts the nine months lead of the first principal component of Indian sea level pressure (SLP\_PC19) and the nine months lead of Dipole Mode Index (DMI\_LR9) as selected predictors for both statistical and predictive model. On the other side, the short rainfall season counts the three months lead of the first principal component of Indian sea surface temperature (SST\_PC13) and the three months lead of Southern Oscillation Index (SOI\_SR3) as predictors for predictive model. For short rainfall season statistical model SAOD current time series (SAOD\_SR0) was added on the two predictors in predictive model. By applying a MLR model it is shown that the forecast can explain 27.4\% of the total variation and has a RMSE of 74.2 mm/season for long rainfall season while for the RARIMAE the forecast explains 53.6\% of the total variation and has a RMSE of 59.4 mm/season. By applying a MLR model it is shown that the forecast can explain 22.8\% of the total variation and has a RMSE of 106.1 mm/season for short rainfall season predictive model while for the RARIMAE the forecast explains 55.1\% of the total variation and has a RMSE of 81.1 mm/season. From such comparison, a significant rise in R-squared, a decrease of RMSE values were observed in RARIMAE models for both short rainfall and long rainfall season averaged time series. In terms of reliability, RARIMAE outperformed its MLR counterparts with better efficiency and accuracy. Therefore, whenever the data suffer from autocorrelation, we can go for MLR with ARIMA error, the ARIMA error part is more to correct the autocorrelation thereby improving the variance and productiveness of the model.},
  subject   = {Regression},
  language  = {en}
}
% Journal article (Journal of Translational Medicine, vol. 9, 2011) on
% replication efficiency of oncolytic vaccinia virus strains.
% NOTE(review): `number` looks like the article number rather than an issue
% number (it matches the DOI suffix 9-164) -- confirm against the publisher.
@article{ChenYuZhangetal.2011,
  author    = {Chen, Nanhai G. and Yu, Yong A. and Zhang, Qian and Szalay, Aladar A.},
  title     = {Replication efficiency of oncolytic vaccinia virus in cell cultures prognosticates the virulence and antitumor efficacy in mice},
  journal   = {Journal of Translational Medicine},
  volume    = {9},
  number    = {164},
  pages     = {1--11},
  year      = {2011},
  doi       = {10.1186/1479-5876-9-164},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-142268},
  abstract  = {Background: We have shown that insertion of the three vaccinia virus (VACV) promoter-driven foreign gene expression cassettes encoding Renilla luciferase-Aequorea GFP fusion protein, beta-galactosidase, and beta-glucuronidase into the F14.5L, J2R, and A56R loci of the VACV LIVP genome, respectively, results in a highly attenuated mutant strain GLV-1h68. This strain shows tumor specific replication and is capable of eradicating tumors with little or no virulence in mice. This study aimed to distinguish the contribution of added VACV promoter-driven transcriptional units as inserts from the effects of insertional inactivation of three viral genes, and to determine the correlation between replication efficiency of oncolytic vaccinia virus in cell cultures and the virulence and antitumor efficacy in mice. Methods: A series of recombinant VACV strains was generated by replacing one, two, or all three of the expression cassettes in GLV-1h68 with short non-coding DNA sequences. The replication efficiency and tumor cell killing capacity of these newly generated VACV strains were compared with those of the parent virus GLV-1h68 in cell cultures. The virus replication efficiency in tumors and antitumor efficacy as well as the virulence were evaluated in nu/nu (nude) mice bearing human breast tumor xenografts. Results: We found that virus replication efficiency increased with removal of each of the expression cassettes. The increase in virus replication efficiency was proportionate to the strength of removed VACV promoters linked to foreign genes. The replication efficiency of the new VACV strains paralleled their cytotoxicity in cell cultures. The increased replication efficiency in tumor xenografts resulted in enhanced antitumor efficacy in nude mice. Similarly, the enhanced virus replication efficiency was indicative of increased virulence in nude mice.
Conclusions: These data demonstrated that insertion of VACV promoter-driven transcriptional units into the viral genome for the purpose of insertional mutagenesis did modulate the efficiency of virus replication together with antitumor efficacy as well as virulence. Replication efficiency of oncolytic VACV in cell cultures can predict the virulence and therapeutic efficacy in nude mice. These findings may be essential for rational design of safe and potent VACV strains for vaccination and virotherapy of cancer in humans and animals.},
  language  = {en}
}
% Journal article (BMC Medical Research Methodology, vol. 11, 2011) on a
% simulation-based view of competing risks.
% NOTE(review): `number` looks like the article number rather than an issue
% number (it matches the DOI suffix 11-86) -- confirm against the publisher.
@article{AllignolSchumacherWanneretal.2011,
  author    = {Allignol, Arthur and Schumacher, Martin and Wanner, Christoph and Drechsler, Christiane and Beyersmann, Jan},
  title     = {Understanding competing risks: a simulation point of view},
  journal   = {BMC Medical Research Methodology},
  volume    = {11},
  number    = {86},
  pages     = {1--13},
  year      = {2011},
  doi       = {10.1186/1471-2288-11-86},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-142811},
  abstract  = {Background: Competing risks methodology allows for an event-specific analysis of the single components of composite time-to-event endpoints. A key feature of competing risks is that there are as many hazards as there are competing risks. This is not always well accounted for in the applied literature. Methods: We advocate a simulation point of view for understanding competing risks. The hazards are envisaged as momentary event forces. They jointly determine the event time. Their relative magnitude determines the event type. 'Empirical simulations' using data from a recent study on cardiovascular events in diabetes patients illustrate subsequent interpretation. The method avoids concerns on identifiability and plausibility known from the latent failure time approach. Results: The 'empirical simulations' served as a proof of concept. Additionally manipulating baseline hazards and treatment effects illustrated both scenarios that require greater care for interpretation and how the simulation point of view aids the interpretation. The simulation algorithm applied to real data also provides for a general tool for study planning. Conclusions: There are as many hazards as there are competing risks. All of them should be analysed. This includes estimation of baseline hazards. Study planning must equally account for these aspects.},
  language  = {en}
}