@article{CaliskanDangwalDandekar2023,
  author    = {Caliskan, Aylin and Dangwal, Seema and Dandekar, Thomas},
  title     = {Metadata integrity in bioinformatics: bridging the gap between data and knowledge},
  journal   = {Computational and Structural Biotechnology Journal},
  volume    = {21},
  pages     = {4895--4913},
  year      = {2023},
  issn      = {2001-0370},
  doi       = {10.1016/j.csbj.2023.10.006},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-349990},
  abstract  = {In the fast-evolving landscape of biomedical research, the emergence of big data has presented researchers with extraordinary opportunities to explore biological complexities. In biomedical research, big data imply also a big responsibility. This is not only due to genomics data being sensitive information but also due to genomics data being shared and re-analysed among the scientific community. This saves valuable resources and can even help to find new insights in silico. To fully use these opportunities, detailed and correct metadata are imperative. This includes not only the availability of metadata but also their correctness. Metadata integrity serves as a fundamental determinant of research credibility, supporting the reliability and reproducibility of data-driven findings. Ensuring metadata availability, curation, and accuracy are therefore essential for bioinformatic research. Not only must metadata be readily available, but they must also be meticulously curated and ideally error-free. Motivated by an accidental discovery of a critical metadata error in patient data published in two high-impact journals, we aim to raise awareness for the need of correct, complete, and curated metadata. We describe how the metadata error was found, addressed, and present examples for metadata-related challenges in omics research, along with supporting measures, including tools for checking metadata and software to facilitate various steps from data analysis to published research. Highlights • Data awareness and data integrity underpins the trustworthiness of results and subsequent further analysis. • Big data and bioinformatics enable efficient resource use by repurposing publicly available RNA-Sequencing data. • Manual checks of data quality and integrity are insufficient due to the overwhelming volume and rapidly growing data. • Automation and artificial intelligence provide cost-effective and efficient solutions for data integrity and quality checks. 
• FAIR data management, various software solutions and analysis tools assist metadata maintenance.},
  language  = {en}
}
@article{KrenzerMakowskiHekaloetal.2022,
  author    = {Krenzer, Adrian and Makowski, Kevin and Hekalo, Amar and Fitting, Daniel and Troya, Joel and Zoller, Wolfram G. and Hann, Alexander and Puppe, Frank},
  title     = {Fast machine learning annotation in the medical domain: a semi-automated video annotation tool for gastroenterologists},
  journal   = {BioMedical Engineering OnLine},
  volume    = {21},
  number    = {1},
  year      = {2022},
  doi       = {10.1186/s12938-022-01001-x},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-300231},
  abstract  = {Background Machine learning, especially deep learning, is becoming more and more relevant in research and development in the medical domain. For all the supervised deep learning applications, data is the most critical factor in securing successful implementation and sustaining the progress of the machine learning model. Especially gastroenterological data, which often involves endoscopic videos, are cumbersome to annotate. Domain experts are needed to interpret and annotate the videos. To support those domain experts, we generated a framework. With this framework, instead of annotating every frame in the video sequence, experts are just performing key annotations at the beginning and the end of sequences with pathologies, e.g., visible polyps. Subsequently, non-expert annotators supported by machine learning add the missing annotations for the frames in-between. Methods In our framework, an expert reviews the video and annotates a few video frames to verify the object's annotations for the non-expert. In a second step, a non-expert has visual confirmation of the given object and can annotate all following and preceding frames with AI assistance. After the expert has finished, relevant frames will be selected and passed on to an AI model. This information allows the AI model to detect and mark the desired object on all following and preceding frames with an annotation. Therefore, the non-expert can adjust and modify the AI predictions and export the results, which can then be used to train the AI model. Results Using this framework, we were able to reduce workload of domain experts on average by a factor of 20 on our data. This is primarily due to the structure of the framework, which is designed to minimize the workload of the domain expert. Pairing this framework with a state-of-the-art semi-automated AI model enhances the annotation speed further. 
Through a prospective study with 10 participants, we show that semi-automated annotation using our tool doubles the annotation speed of non-expert annotators compared to a well-known state-of-the-art annotation tool. Conclusion In summary, we introduce a framework for fast expert annotation for gastroenterologists, which reduces the workload of the domain expert considerably while maintaining a very high annotation quality. The framework incorporates a semi-automated annotation system utilizing trained object detection models. The software and framework are open-source.},
  language  = {en}
}
@article{GuptaSrivastavaOsmanogluetal.2020,
  author    = {Gupta, Shishir K. and Srivastava, Mugdha and Osmanoglu, Oezge and Dandekar, Thomas},
  title     = {Genome-wide inference of the {Camponotus} floridanus protein-protein interaction network using homologous mapping and interacting domain profile pairs},
  journal   = {Scientific Reports},
  volume    = {10},
  number    = {1},
  year      = {2020},
  doi       = {10.1038/s41598-020-59344-1},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-229406},
  abstract  = {Apart from some model organisms, the interactome of most organisms is largely unidentified. High-throughput experimental techniques to determine protein-protein interactions (PPIs) are resource intensive and highly susceptible to noise. Computational methods of PPI determination can accelerate biological discovery by identifying the most promising interacting pairs of proteins and by assessing the reliability of identified PPIs. Here we present a first in-depth study describing a global view of the ant Camponotus floridanus interactome. Although several ant genomes have been sequenced in the last eight years, studies exploring and investigating PPIs in ants are lacking. Our study attempts to fill this gap and the presented interactome will also serve as a template for determining PPIs in other ants in future. Our C. floridanus interactome covers 51,866 non-redundant PPIs among 6,274 proteins, including 20,544 interactions supported by domain-domain interactions (DDIs), 13,640 interactions supported by DDIs and subcellular localization, and 10,834 high confidence interactions mediated by 3,289 proteins. These interactions involve and cover 30.6\% of the entire C. floridanus proteome.},
  language  = {en}
}
@article{HornKellerHildebrandtetal.2016,
  author    = {Horn, Hannes and Keller, Alexander and Hildebrandt, Ulrich and K{\"a}mpfer, Peter and Riederer, Markus and Hentschel, Ute},
  title     = {Draft genome of the \textit{Arabidopsis thaliana} phyllosphere bacterium, \textit{Williamsia} sp. {ARP1}},
  journal   = {Standards in Genomic Sciences},
  volume    = {11},
  number    = {8},
  year      = {2016},
  doi       = {10.1186/s40793-015-0122-x},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-146008},
  abstract  = {The Gram-positive actinomycete \textit{Williamsia} sp. ARP1 was originally isolated from the \textit{Arabidopsis thaliana} phyllosphere. Here we describe the general physiological features of this microorganism together with the draft genome sequence and annotation. The 4,745,080 bp long genome contains 4434 protein-coding genes and 70 RNA genes. To our knowledge, this is only the second reported genome from the genus \textit{Williamsia} and the first sequenced strain from the phyllosphere. The presented genomic information is interpreted in the context of an adaptation to the phyllosphere habitat.},
  language  = {en}
}
@article{WagnerVolkmerSharanetal.2014,
  author    = {Wagner, Ines and Volkmer, Michael and Sharan, Malvika and Villaveces, Jose M. and Oswald, Felix and Surendranath, Vineeth and Habermann, Bianca H.},
  title     = {{morFeus}: a web-based program to detect remotely conserved orthologs using symmetrical best hits and orthology network scoring},
  journal   = {BMC Bioinformatics},
  volume    = {15},
  number    = {263},
  year      = {2014},
  doi       = {10.1186/1471-2105-15-263},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-115590},
  abstract  = {Background: Searching the orthologs of a given protein or DNA sequence is one of the most important and most commonly used Bioinformatics methods in Biology. Programs like BLAST or the orthology search engine Inparanoid can be used to find orthologs when the similarity between two sequences is sufficiently high. They however fail when the level of conservation is low. The detection of remotely conserved proteins oftentimes involves sophisticated manual intervention that is difficult to automate. Results: Here, we introduce morFeus, a search program to find remotely conserved orthologs. Based on relaxed sequence similarity searches, morFeus selects sequences based on the similarity of their alignments to the query, tests for orthology by iterative reciprocal BLAST searches and calculates a network score for the resulting network of orthologs that is a measure of orthology independent of the E-value. Detecting remotely conserved orthologs of a protein using morFeus thus requires no manual intervention. We demonstrate the performance of morFeus by comparing it to state-of-the-art orthology resources and methods. We provide an example of remotely conserved orthologs, which were experimentally shown to be functionally equivalent in the respective organisms and therefore meet the criteria of the orthology-function conjecture. Conclusions: Based on our results, we conclude that morFeus is a powerful and specific search method for detecting remotely conserved orthologs.},
  language  = {en}
}