@phdthesis{NavarroBullock2015, author = {Navarro Bullock, Beate}, title = {Privacy aware social information retrieval and spam filtering using folksonomies}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-120941}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2015}, abstract = {Social interactions as introduced by Web 2.0 applications during the last decade have changed the way the Internet is used. Today, it is part of our daily lives to maintain contacts through social networks, to comment on the latest developments in microblogging services or to save and share information snippets such as photos or bookmarks online. Social bookmarking systems are part of this development. Users can share links to interesting web pages by publishing bookmarks and providing descriptive keywords for them. The structure which evolves from the collection of annotated bookmarks is called a folksonomy. The sharing of interesting and relevant posts enables new ways of retrieving information from the Web. Users can search or browse the folksonomy looking at resources related to specific tags or users. Ranking methods known from search engines have been adjusted to facilitate retrieval in social bookmarking systems. Hence, social bookmarking systems have become an alternative or addendum to search engines. In order to better understand the commonalities and differences of social bookmarking systems and search engines, this thesis compares several aspects of the two systems' structure, usage behaviour and content. This includes the use of tags and query terms, the composition of the document collections and the rankings of bookmarks and search engine URLs. Searchers (recorded via session ids), their search terms and the clicked on URLs can be extracted from a search engine query logfile. They form similar links as can be found in folksonomies where a user annotates a resource with tags. We use this analogy to build a tripartite hypergraph from query logfiles (a logsonomy), and compare structural and semantic properties of log- and folksonomies. Overall, we have found similar behavioural, structural and semantic characteristics in both systems. Driven by this insight, we investigate, if folksonomy data can be of use in web information retrieval in a similar way to query log data: we construct training data from query logs and a folksonomy to build models for a learning-to-rank algorithm. First experiments show a positive correlation of ranking results generated from the ranking models of both systems. The research is based on various data collections from the social bookmarking systems BibSonomy and Delicious, Microsoft's search engine MSN (now Bing) and Google data. To maintain social bookmarking systems as a good source for information retrieval, providers need to fight spam. This thesis introduces and analyses different features derived from the specific characteristics of social bookmarking systems to be used in spam detection classification algorithms. Best results can be derived from a combination of profile, activity, semantic and location-based features. Based on the experiments, a spam detection framework which identifies and eliminates spam activities for the social bookmarking system BibSonomy has been developed. The storing and publication of user-related bookmarks and profile information raises questions about user data privacy. What kinds of personal information is collected and how do systems handle user-related items? In order to answer these questions, the thesis looks into the handling of data privacy in the social bookmarking system BibSonomy. Legal guidelines about how to deal with the private data collected and processed in social bookmarking systems are also presented. Experiments will show that the consideration of user data privacy in the process of feature design can be a first step towards strengthening data privacy.}, subject = {Information Retrieval}, language = {en} } @phdthesis{Schoeneberg2014, author = {Sch{\"o}neberg, Hendrik}, title = {Semiautomatische Metadaten-Extraktion und Qualit{\"a}tsmanagement in Workflow-Systemen zur Digitalisierung historischer Dokumente}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-104878}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2014}, abstract = {Performing Named Entity Recognition on ancient documents is a time-consuming, complex and error-prone manual task. It is a prerequisite though to being able to identify related documents and correlate between named entities in distinct sources, helping to precisely recreate historic events. In order to reduce the manual effort, automated classification approaches could be leveraged. Classifying terms in ancient documents in an automated manner poses a difficult task due to the sources' challenging syntax and poor conservation states. This thesis introduces and evaluates approaches that can cope with complex syntactial environments by using statistical information derived from a term's context and combining it with domain-specific heuristic knowledge to perform a classification. Furthermore this thesis demonstrates how metadata generated by these approaches can be used as error heuristics to greatly improve the performance of workflow systems for digitizations of early documents.}, subject = {Klassifikation}, language = {de} } @phdthesis{Selbach2011, author = {Selbach, Stefan}, title = {Hybride bitparallele Volltextsuche}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-66476}, school = {Universit{\"a}t W{\"u}rzburg}, year = {2011}, abstract = {Der große Vorteil eines q-Gramm Indexes liegt darin, dass es m{\"o}glich ist beliebige Zeichenketten in einer Dokumentensammlung zu suchen. Ein Nachteil jedoch liegt darin, dass bei gr{\"o}ßer werdenden Datenmengen dieser Index dazu neigt, sehr groß zu werden, was mit einem deutlichem Leistungsabfall verbunden ist. In dieser Arbeit wird eine neuartige Technik vorgestellt, die die Leistung eines q-Gramm Indexes mithilfe zus{\"a}tzlicher M-Matrizen f{\"u}r jedes q-Gramm und durch die Kombination mit einem invertierten Index erh{\"o}ht. Eine M-Matrix ist eine Bit-Matrix, die Informationen {\"u}ber die Positionen eines q-Gramms enth{\"a}lt. Auch bei der Kombination von zwei oder mehreren Q-Grammen bieten diese M-Matrizen Informationen {\"u}ber die Positionen der Kombination. Dies kann verwendet werden, um die Komplexit{\"a}t der Zusammenf{\"u}hrung der q-Gramm Trefferlisten f{\"u}r eine gegebene Suchanfrage zu reduzieren und verbessert die Leistung des n-Gramm-invertierten Index. Die Kombination mit einem termbasierten invertierten Index beschleunigt die durchschnittliche Suchzeit zus{\"a}tzlich und vereint die Vorteile beider Index-Formate. Redundante Informationen werden in dem q-Gramm Index reduziert und weitere Funktionalit{\"a}t hinzugef{\"u}gt, wie z.B. die Bewertung von Treffern nach Relevanz, die M{\"o}glichkeit, nach Konzepten zu suchen oder Indexpartitionierungen nach Wichtigkeit der enthaltenen Terme zu erstellen.}, subject = {Information Retrieval}, language = {de} }