@phdthesis{NavarroBullock2015, author = {Navarro Bullock, Beate}, title = {Privacy aware social information retrieval and spam filtering using folksonomies}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-120941}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2015}, abstract = {Social interactions as introduced by Web 2.0 applications during the last decade have changed the way the Internet is used. Today, it is part of our daily lives to maintain contacts through social networks, to comment on the latest developments in microblogging services or to save and share information snippets such as photos or bookmarks online. Social bookmarking systems are part of this development. Users can share links to interesting web pages by publishing bookmarks and providing descriptive keywords for them. The structure which evolves from the collection of annotated bookmarks is called a folksonomy. The sharing of interesting and relevant posts enables new ways of retrieving information from the Web. Users can search or browse the folksonomy, looking at resources related to specific tags or users. Ranking methods known from search engines have been adapted to facilitate retrieval in social bookmarking systems. Hence, social bookmarking systems have become an alternative or complement to search engines. In order to better understand the commonalities and differences of social bookmarking systems and search engines, this thesis compares several aspects of the two systems' structure, usage behaviour and content. This includes the use of tags and query terms, the composition of the document collections and the rankings of bookmarks and search engine URLs. Searchers (recorded via session ids), their search terms and the URLs they clicked on can be extracted from a search engine query logfile. They form links similar to those found in folksonomies, where a user annotates a resource with tags. We use this analogy to build a tripartite hypergraph from query logfiles (a logsonomy), and compare structural and semantic properties of logsonomies and folksonomies. Overall, we have found similar behavioural, structural and semantic characteristics in both systems. Driven by this insight, we investigate whether folksonomy data can be used in web information retrieval in a similar way to query log data: we construct training data from query logs and a folksonomy to build models for a learning-to-rank algorithm. First experiments show a positive correlation between the ranking results generated from the models of both systems. The research is based on various data collections from the social bookmarking systems BibSonomy and Delicious, Microsoft's search engine MSN (now Bing) and Google data. To maintain social bookmarking systems as a good source for information retrieval, providers need to fight spam. This thesis introduces and analyses different features derived from the specific characteristics of social bookmarking systems for use in spam classification algorithms. The best results are obtained from a combination of profile, activity, semantic and location-based features. Based on these experiments, a spam detection framework which identifies and eliminates spam activities has been developed for the social bookmarking system BibSonomy. The storage and publication of user-related bookmarks and profile information raise questions about user data privacy. What kinds of personal information are collected, and how do systems handle user-related items?
In order to answer these questions, the thesis looks into the handling of data privacy in the social bookmarking system BibSonomy. Legal guidelines on how to deal with the private data collected and processed in social bookmarking systems are also presented. Experiments show that considering user data privacy in the process of feature design can be a first step towards strengthening data privacy.}, subject = {Information Retrieval}, language = {en} } @phdthesis{Schloer2022, author = {Schl{\"o}r, Daniel}, title = {Detecting Anomalies in Transaction Data}, doi = {10.25972/OPUS-29856}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-298569}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2022}, abstract = {Detecting anomalies in transaction data is an important task with a high potential to avoid financial losses caused by irregularities carried out deliberately or inadvertently, such as credit card fraud, occupational fraud in companies or ordering and accounting errors. With the ongoing digitization of our world, data-driven approaches, including machine learning, can benefit from data with less manual effort and feature engineering. A large variety of machine learning-based anomaly detection methods approach this by learning a precise model of normality from which anomalies can be distinguished. Modeling normality in transactional data, however, requires capturing distributions and dependencies within the data precisely, with special attention to numerical dependencies such as quantities, prices or amounts. To implicitly model numerical dependencies, Neural Arithmetic Logic Units have been proposed as a neural architecture. In practice, however, these have stability and precision issues. Therefore, we first develop an improved neural network architecture, iNALU, which is designed to better model numerical dependencies as found in transaction data. We compare this architecture to the previous approach and show in several experiments of varying complexity that our novel architecture provides better precision and stability. We integrate this architecture into two generative neural network models adapted for transaction data and investigate how well normal behavior is modeled. We show that both architectures can successfully model normal transaction data, with our neural architecture improving generative performance for one model. Since categorical and numerical variables are common in transaction data, but many machine learning methods only process numerical representations, we explore different representation learning techniques to transform categorical transaction data into dense numerical vectors. We extend this approach by proposing an outlier-aware discretization, thus incorporating numerical attributes into the computation of categorical embeddings, and investigate latent spaces as well as quantitative performance for anomaly detection. Next, we evaluate different scenarios for anomaly detection on transaction data. We extend our iNALU architecture to a neural layer that can model both numerical and non-numerical dependencies and evaluate it in a supervised and a one-class setting. We investigate the stability and generalizability of our approach and show that it outperforms a variety of models in the balanced supervised setting and performs comparably in the one-class setting.
Finally, we evaluate three approaches to using a generative model as an anomaly detector and compare their anomaly detection performance.}, subject = {Anomalieerkennung}, language = {en} } @phdthesis{Becker2018, author = {Becker, Martin}, title = {Understanding Human Navigation using Bayesian Hypothesis Comparison}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-163522}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2018}, abstract = {Understanding human navigation behavior has implications for a wide range of application scenarios. For example, insights into geo-spatial navigation in urban areas can impact city planning or public transport. Similarly, knowledge about navigation on the web can help to improve website structures or service experience. In this work, we focus on a hypothesis-driven approach to address the task of understanding human navigation: We aim to formulate and compare ideas — for example stemming from existing theory, literature, intuition, or previous experiments — based on a given set of navigational observations. For example, we may compare whether tourists exploring a city walk "short distances" before taking their next photo vs. "travel long distances between points of interest", or whether users browsing Wikipedia "navigate semantically" vs. "click randomly". For this, the Bayesian method HypTrails has recently been proposed. However, while HypTrails is a straightforward and flexible approach, several major challenges remain: i) HypTrails does not account for heterogeneity (e.g., incorporating differently behaving user groups such as tourists and locals is not possible), ii) HypTrails does not support the user in conceiving novel hypotheses when confronted with a large set of possibly relevant background information or influence factors, e.g., points of interest, popularity of locations, time of day, or user properties, and finally iii) formulating hypotheses can be technically challenging depending on the application scenario (e.g., due to continuous observations or temporal constraints). In this thesis, we address these limitations by introducing various novel methods and tools and explore a wide range of case studies. In particular, our main contributions are the methods MixedTrails and SubTrails, which specifically address the first two limitations: MixedTrails is an approach for hypothesis comparison that extends the previously proposed HypTrails method to allow formulating and comparing heterogeneous hypotheses (e.g., incorporating differently behaving user groups). SubTrails is a method that supports hypothesis conception by automatically discovering interpretable subgroups with exceptional navigation behavior. In addition, our methodological contributions also include several tools consisting of a distributed implementation of HypTrails, a web application for visualizing geo-spatial human navigation in the context of background information, as well as a system for collecting, analyzing, and visualizing mobile participatory sensing data. Furthermore, we conduct case studies in many application domains, which encompass — among others — geo-spatial navigation based on photos from the photo-sharing platform Flickr, browsing behavior on the social tagging system BibSonomy, and task-choosing behavior on a commercial crowdsourcing platform. In the process, we develop approaches to cope with application-specific subtleties (like continuous observations and temporal constraints).
The corresponding studies illustrate the variety of domains and facets in which navigation behavior can be studied and thus showcase the expressiveness, applicability, and flexibility of our methods. Using these methods, we present new aspects of navigational phenomena which ultimately help to better understand the multi-faceted characteristics of human navigation behavior.}, subject = {Bayes-Verfahren}, language = {en} } @phdthesis{Ring2021, author = {Ring, Markus}, title = {Detektion sicherheitskritischer Ereignisse in Unternehmensnetzwerken mittels Data Mining}, doi = {10.25972/OPUS-21956}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-219561}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2021}, abstract = {E-mail, online banking and video conferencing have become an integral part of our everyday lives. In all of these activities, numerous pieces of personal information and confidential data are transmitted and stored digitally. A wide variety of concepts, methods and techniques exist to protect digital data against unauthorized access and manipulation; they can be summarized under the term IT security. Classic security solutions in the field of IT security are firewalls and virus scanners. Such approaches are usually rule-based and check files or incoming network traffic against a list of known attack signatures. Consequently, these systems can only detect attack scenarios that are already known and offer no protection against novel attacks. This results in a race in IT security between hackers and IT security experts, in which hackers constantly search for new ways to overcome existing security solutions, while IT security experts continuously improve their protection mechanisms. This thesis is dedicated to the detection of attack scenarios in enterprise networks using data mining methods. These methods are able to learn and generalize the structures contained in representative data. Consequently, data mining methods are in principle suitable for detecting new attack scenarios, provided that these scenarios overlap with known attack scenarios or differ substantially from known normal behavior. In this work, network-based data in NetFlow format are analyzed, as they provide an aggregated overview of the activity in the network. Network data can often not be published due to data protection concerns, which argues for the generation of synthetic but realistic network data. Furthermore, the nature of network data requires analyzing a combination of continuous and categorical attributes, which in particular makes comparing data points with respect to their similarity difficult. This thesis provides methodological contributions to each of these three challenges. In the area of distance computation for categorical values, two different approaches, ConDist and IP2Vec, are developed. ConDist is a universally applicable distance measure for computing distances between data points consisting of continuous and categorical attributes. IP2Vec is specialized to network data and transforms categorical values into continuous vectors.
In the area of generating realistic network data, two different approaches are presented in addition to an extensive literature review. First, a simulation-based approach for generating flow-based datasets is developed. This approach builds on a test environment and simulates typical user activities through automated Python scripts. In parallel, a second approach is developed that synthetically generates flow-based network data by modeling them with Generative Adversarial Networks. This approach learns the underlying properties of the network data and is then able to generate new network data with the same properties. While the first approach is suitable for creating new datasets, the second approach can be used to enrich existing datasets. Finally, this thesis makes two further contributions to the detection of attack scenarios. The first contribution develops a concept for the detection of attack scenarios which is oriented towards the typical phases of an attack scenario. The second contribution presents a supervised and an unsupervised method for detecting slow port scans.}, subject = {Data Mining}, language = {de} } @phdthesis{Steininger2023, author = {Steininger, Michael}, title = {Deep Learning for Geospatial Environmental Regression}, doi = {10.25972/OPUS-31312}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-313121}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2023}, abstract = {Environmental issues have emerged especially since humans began burning fossil fuels, which has led to air pollution and climate change that harm the environment. The substantial consequences of these issues have evoked strong efforts towards assessing the state of our environment. Various environmental machine learning (ML) tasks aid these efforts. These tasks concern environmental data but are otherwise common ML tasks, i.e., datasets are split (training, validation, test), hyperparameters are optimized on validation data, and test set metrics measure a model's generalizability. This work focuses on the following environmental ML tasks: Regarding air pollution, land use regression (LUR) estimates air pollutant concentrations at locations where no measurements are available based on measured locations and each location's land use (e.g., industry, streets). For LUR, this work uses data from London (modeled) and Zurich (measured). Concerning climate change, a common ML task is model output statistics (MOS), where a climate model's output for a study area is altered to better fit Earth observations and provide more accurate climate data. This work uses the regional climate model (RCM) REMO and Earth observations from the E-OBS dataset for MOS. Another climate-related task is grain size distribution interpolation, where soil properties at locations without measurements are estimated based on the few measured locations. This can provide climate models with soil information that is important for hydrology. For this task, data from Lower Franconia is used. Such environmental ML tasks commonly have a number of properties: (i) geospatiality, i.e., their data refers to locations relative to the Earth's surface. (ii) The environmental variables to estimate or predict are usually continuous. (iii) Data can be imbalanced due to relatively rare extreme events (e.g., extreme precipitation).
(iv) Multiple related potential target variables can be available per location, since measurement devices often contain different sensors. (v) Labels are often only sparsely available in space, since conducting measurements at all locations of interest is usually infeasible. These properties present challenges but also opportunities when designing ML methods for such tasks. In the past, environmental ML tasks have been tackled with conventional ML methods, such as linear regression or random forests (RFs). However, the field of ML has made tremendous leaps beyond these classic models through deep learning (DL). In DL, models use multiple layers of neurons, producing increasingly higher-level feature representations with growing layer depth. DL has made previously infeasible ML tasks feasible, significantly improved the performance for many tasks in comparison to existing ML models, and eliminated the need for manual feature engineering in some domains due to its ability to learn features from raw data. To harness these advantages for environmental domains, it is promising to develop novel DL methods for environmental ML tasks. This thesis presents methods for dealing with special challenges and exploiting opportunities inherent to environmental ML tasks in conjunction with DL. To this end, the proposed methods explore the following techniques: (i) Convolutions as in convolutional neural networks (CNNs) to exploit reoccurring spatial patterns in geospatial data. (ii) Posing the problems as regression tasks to estimate the continuous variables. (iii) Density-based weighting to improve estimation performance for rare and extreme events. (iv) Multi-task learning to make use of multiple related target variables. (v) Semi-supervised learning to cope with label sparsity. Using these techniques, this thesis considers four research questions: (i) Can air pollution be estimated without manual feature engineering? This is answered positively by the introduction of the CNN-based LUR model MapLUR as well as the off-the-shelf LUR solution OpenLUR. (ii) Can colocated pollution data improve spatial air pollution models? Multi-task learning for LUR is developed for this, showing potential for improvements with colocated data. (iii) Can DL models improve the quality of climate model outputs? The proposed DL climate MOS architecture ConvMOS demonstrates this. Additionally, semi-supervised training of multilayer perceptrons (MLPs) for grain size distribution interpolation is presented, which can provide improved input data. (iv) Can DL models be taught to better estimate climate extremes? To this end, density-based weighting for imbalanced regression (DenseLoss) is proposed and applied to the DL architecture ConvMOS, improving climate extremes estimation. These methods show how DL techniques in particular can be developed for environmental ML tasks with their special characteristics in mind. This allows for better models than previously possible with conventional ML, leading to more accurate assessment and better understanding of the state of our environment.}, subject = {Deep learning}, language = {en} } @phdthesis{Niebler2019, author = {Niebler, Thomas}, title = {Extracting and Learning Semantics from Social Web Data}, doi = {10.25972/OPUS-17866}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-178666}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2019}, abstract = {Making machines understand natural language is a dream of mankind that has existed for a very long time.
Early attempts at programming machines to converse with humans in a supposedly intelligent way relied on phrase lists and simple keyword matching. However, such approaches cannot provide semantically adequate answers, as they do not consider the specific meaning of the conversation. Thus, if we want to enable machines to actually understand language, we need to be able to access semantically relevant background knowledge. For this, it is possible to query so-called ontologies, which are large networks containing knowledge about real-world entities and their semantic relations. However, creating such ontologies is a tedious task, as often extensive expert knowledge is required. Thus, we need to find ways to automatically construct and update ontologies that fit human intuition of semantics and semantic relations. More specifically, we need to determine semantic entities and find relations between them. While this is usually done on large corpora of unstructured text, previous work has shown that we can at least facilitate the first issue of extracting entities by considering special data such as tagging data or human navigational paths. Here, we do not need to detect the actual semantic entities, as they are already provided because of the way those data are collected. Thus, we can mainly focus on the problem of assessing the degree of semantic relatedness between tags or web pages. However, several issues need to be overcome if we want to approximate human intuition of semantic relatedness. For this, it is necessary to represent words and concepts in a way that allows easy and highly precise semantic characterization. This also largely depends on the quality of the data from which these representations are constructed. In this thesis, we extract semantic information from both tagging data created by users of social tagging systems and human navigation data in different semantic-driven social web systems. Our main goal is to construct high-quality and robust vector representations of words which can then be used to measure the relatedness of semantic concepts. First, we show that navigation in the social media systems Wikipedia and BibSonomy is driven by a semantic component. After this, we discuss and extend methods to model the semantic information in tagging data as low-dimensional vectors. Furthermore, we show that tagging pragmatics influences different facets of tagging semantics. We then investigate the usefulness of human navigational paths in several different settings on Wikipedia and BibSonomy for measuring semantic relatedness. Finally, we propose a metric-learning-based algorithm to adapt pre-trained word embeddings to datasets containing human judgments of semantic relatedness. This work contributes to the field of studying semantic relatedness between words by proposing methods to extract semantic relatedness from web navigation, to learn high-quality and low-dimensional word representations from tagging data, and to learn semantic relatedness from any kind of vector representation by exploiting human feedback.
Applications first and foremost lie in ontology learning for the Semantic Web, but also in semantic search and query expansion.}, subject = {Semantik}, language = {en} } @phdthesis{Kobs2024, author = {Kobs, Konstantin}, title = {Think outside the Black Box: Model-Agnostic Deep Learning with Domain Knowledge}, doi = {10.25972/OPUS-34968}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-349689}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2024}, abstract = {Deep Learning (DL) models are trained on a downstream task by feeding (potentially preprocessed) input data through a trainable Neural Network (NN) and updating its parameters to minimize the loss function between the predicted and the desired output. While this general framework has mainly remained unchanged over the years, the architectures of the trainable models have greatly evolved. Even though it is undoubtedly important to choose the right architecture, we argue that it is also beneficial to develop methods that address other components of the training process. We hypothesize that utilizing domain knowledge can be helpful to improve DL models in terms of performance and/or efficiency. Such model-agnostic methods can be applied to any existing or future architecture. Furthermore, the black box nature of DL models motivates the development of techniques to understand their inner workings. Considering the rapid advancement of DL architectures, it is again crucial to develop model-agnostic methods. In this thesis, we explore six principles that incorporate domain knowledge to understand or improve models. They are applied either on the input or output side of the trainable model. Each principle is applied to at least two DL tasks, leading to task-specific implementations. To understand DL models, we propose to use Generated Input Data coming from a controllable generation process requiring knowledge about the data properties. This way, we can understand the model's behavior by analyzing how it changes when one specific high-level input feature changes in the generated data. On the output side, Gradient-Based Attribution methods create a gradient at the end of the NN and then propagate it back to the input, indicating which low-level input features have a large influence on the model's prediction. The resulting input features can be interpreted by humans using domain knowledge. To improve the trainable model in terms of downstream performance, data and compute efficiency, or robustness to unwanted features, we explore principles that each address one of the training components besides the trainable model. Input Masking and Augmentation directly modifies the training input data, integrating knowledge about the data and its impact on the model's output. We also explore the use of Feature Extraction using Pretrained Multimodal Models, which can be seen as a beneficial preprocessing step to extract useful features. When no training data is available for the downstream task, using such features and domain knowledge expressed in other modalities can result in a Zero-Shot Learning (ZSL) setting, completely eliminating the trainable model. The Weak Label Generation principle produces new desired outputs using knowledge about the labels, yielding either a good pretraining dataset or even an exclusive training dataset for solving the downstream task. Finally, improving and choosing the right Loss Function is another principle we explore in this thesis.
Here, we enrich existing loss functions with knowledge about label interactions or utilize and combine multiple task-specific loss functions in a multitask setting. We apply the principles to classification, regression, and representation tasks as well as to image and text modalities. We propose, apply, and evaluate existing and novel methods to understand and improve the model. Overall, this thesis introduces and evaluates methods that complement the development and choice of DL model architectures.}, subject = {Deep learning}, language = {en} }