@phdthesis{Wick2020,
  author    = {Wick, Christoph},
  title     = {Optical Medieval Music Recognition},
  doi       = {10.25972/OPUS-21434},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-214348},
  school      = {Universit{\"a}t W{\"u}rzburg},
  year      = {2020},
  abstract  = {In recent years, great progress has been made in the area of Artificial Intelligence (AI) due to the possibilities of Deep Learning which steadily yielded new state-of-the-art results especially in many image recognition tasks. Currently, in some areas, human performance is achieved or already exceeded. This great development already had an impact on the area of Optical Music Recognition (OMR) as several novel methods relying on Deep Learning succeeded in specific tasks. Musicologists are interested in large-scale musical analysis and in publishing digital transcriptions in a collection enabling to develop tools for searching and data retrieving. The application of OMR promises to simplify and thus speed-up the transcription process by either providing fully-automatic or semi-automatic approaches. This thesis focuses on the automatic transcription of Medieval music with a focus on square notation which poses a challenging task due to complex layouts, highly varying handwritten notations, and degradation. However, since handwritten music notations are quite complex to read, even for an experienced musicologist, it is to be expected that even with new techniques of OMR manual corrections are required to obtain the transcriptions. This thesis presents several new approaches and open source software solutions for layout analysis and Automatic Text Recognition (ATR) for early documents and for OMR of Medieval manuscripts providing state-of-the-art technology. Fully Convolutional Networks (FCN) are applied for the segmentation of historical manuscripts and early printed books, to detect staff lines, and to recognize neume notations. The ATR engine Calamari is presented which allows for ATR of early prints and also the recognition of lyrics. Configurable CNN/LSTM-network architectures which are trained with the segmentation-free CTC-loss are applied to the sequential recognition of text but also monophonic music. Finally, a syllable-to-neume assignment algorithm is presented which represents the final step to obtain a complete transcription of the music. The evaluations show that the performances of any algorithm is highly depending on the material at hand and the number of training instances. The presented staff line detection correctly identifies staff lines and staves with an \$F_1\$-score of above \$99.5\\%\$. The symbol recognition yields a diplomatic Symbol Accuracy Rate (dSAR) of above \$90\\%\$ by counting the number of correct predictions in the symbols sequence normalized by its length. The ATR of lyrics achieved a Character Error Rate (CAR) (equivalently the number of correct predictions normalized by the sentence length) of above \$93\\%\$ trained on 771 lyric lines of Medieval manuscripts and of 99.89\\% when training on around 3.5 million lines of contemporary printed fonts. The assignment of syllables and their corresponding neumes reached \$F_1\$-scores of up to \$99.2\\%\$. A direct comparison to previously published performances is difficult due to different materials and metrics. However, estimations show that the reported values of this thesis exceed the state-of-the-art in the area of square notation. A further goal of this thesis is to enable musicologists without technical background to apply the developed algorithms in a complete workflow by providing a user-friendly and comfortable Graphical User Interface (GUI) encapsulating the technical details. For this purpose, this thesis presents the web-application OMMR4all. Its fully-functional workflow includes the proposed state-of-the-art machine-learning algorithms and optionally allows for a manual intervention at any stage to correct the output preventing error propagation. To simplify the manual (post-) correction, OMMR4all provides an overlay-editor that superimposes the annotations with a scan of the original manuscripts so that errors can easily be spotted. The workflow is designed to be iteratively improvable by training better models as soon as new Ground Truth (GT) is available.},
  subject      = {Neumenschrift},
  language  = {en}
}