@article{ToepferCorovicFetteetal.2015, author = {Toepfer, Martin and Corovic, Hamo and Fette, Georg and Kl{\"u}gl, Peter and St{\"o}rk, Stefan and Puppe, Frank}, title = {Fine-grained information extraction from German transthoracic echocardiography reports}, series = {BMC Medical Informatics and Decision Making}, volume = {15}, journal = {BMC Medical Informatics and Decision Making}, number = {91}, doi = {doi:10.1186/s12911-015-0215-x}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-125509}, year = {2015}, abstract = {Background Information extraction techniques that get structured representations out of unstructured data make a large amount of clinically relevant information about patients accessible for semantic applications. These methods typically rely on standardized terminologies that guide this process. Many languages and clinical domains, however, lack appropriate resources and tools, as well as evaluations of their applications, especially if detailed conceptualizations of the domain are required. For instance, German transthoracic echocardiography reports have not been targeted sufficiently before, despite of their importance for clinical trials. This work therefore aimed at development and evaluation of an information extraction component with a fine-grained terminology that enables to recognize almost all relevant information stated in German transthoracic echocardiography reports at the University Hospital of W{\"u}rzburg. Methods A domain expert validated and iteratively refined an automatically inferred base terminology. The terminology was used by an ontology-driven information extraction system that outputs attribute value pairs. The final component has been mapped to the central elements of a standardized terminology, and it has been evaluated according to documents with different layouts. Results The final system achieved state-of-the-art precision (micro average.996) and recall (micro average.961) on 100 test documents that represent more than 90 \% of all reports. In particular, principal aspects as defined in a standardized external terminology were recognized with f 1=.989 (micro average) and f 1=.963 (macro average). As a result of keyword matching and restraint concept extraction, the system obtained high precision also on unstructured or exceptionally short documents, and documents with uncommon layout. Conclusions The developed terminology and the proposed information extraction system allow to extract fine-grained information from German semi-structured transthoracic echocardiography reports with very high precision and high recall on the majority of documents at the University Hospital of W{\"u}rzburg. Extracted results populate a clinical data warehouse which supports clinical research.}, language = {en} } @phdthesis{Kluegl2015, author = {Kl{\"u}gl, Peter}, title = {Context-specific Consistencies in Information Extraction: Rule-based and Probabilistic Approaches}, publisher = {W{\"u}rzburg University Press}, address = {W{\"u}rzburg}, isbn = {978-3-95826-018-4 (print)}, doi = {10.25972/WUP-978-3-95826-019-1}, url = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-108352}, school = {W{\"u}rzburg University Press}, year = {2015}, abstract = {Large amounts of communication, documentation as well as knowledge and information are stored in textual documents. Most often, these texts like webpages, books, tweets or reports are only available in an unstructured representation since they are created and interpreted by humans. In order to take advantage of this huge amount of concealed information and to include it in analytic processes, it needs to be transformed into a structured representation. Information extraction considers exactly this task. It tries to identify well-defined entities and relations in unstructured data and especially in textual documents. Interesting entities are often consistently structured within a certain context, especially in semi-structured texts. However, their actual composition varies and is possibly inconsistent among different contexts. Information extraction models stay behind their potential and return inferior results if they do not consider these consistencies during processing. This work presents a selection of practical and novel approaches for exploiting these context-specific consistencies in information extraction tasks. The approaches direct their attention not only to one technique, but are based on handcrafted rules as well as probabilistic models. A new rule-based system called UIMA Ruta has been developed in order to provide optimal conditions for rule engineers. This system consists of a compact rule language with a high expressiveness and strong development support. Both elements facilitate rapid development of information extraction applications and improve the general engineering experience, which reduces the necessary efforts and costs when specifying rules. The advantages and applicability of UIMA Ruta for exploiting context-specific consistencies are illustrated in three case studies. They utilize different engineering approaches for including the consistencies in the information extraction task. Either the recall is increased by finding additional entities with similar composition, or the precision is improved by filtering inconsistent entities. Furthermore, another case study highlights how transformation-based approaches are able to correct preliminary entities using the knowledge about the occurring consistencies. The approaches of this work based on machine learning rely on Conditional Random Fields, popular probabilistic graphical models for sequence labeling. They take advantage of a consistency model, which is automatically induced during processing the document. The approach based on stacked graphical models utilizes the learnt descriptions as feature functions that have a static meaning for the model, but change their actual function for each document. The other two models extend the graph structure with additional factors dependent on the learnt model of consistency. They include feature functions for consistent and inconsistent entities as well as for additional positions that fulfill the consistencies. The presented approaches are evaluated in three real-world domains: segmentation of scientific references, template extraction in curricula vitae, and identification and categorization of sections in clinical discharge letters. They are able to achieve remarkable results and provide an error reduction of up to 30\% compared to usually applied techniques.}, subject = {Information Extraction}, language = {en} }