@phdthesis{Schuebel2003,
  author    = {Sch{\"u}bel, Niels},
  title     = {Evaluation von Beobachtungsskalen zur Beurteilung musiktherapeutischer Improvisationen},
  school    = {Universit{\"a}t W{\"u}rzburg},
  year      = {2003},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-12612},
  language  = {de},
  abstract  = {Die Arbeit besch{\"a}ftigt sich mit der Beurteilung von Musiktherapie. Hierzu wurden Skalen entwickelt die musiktherapeutische Improvisation abbilden sollen. Zur Bewertung dieser Skalen wurde die Interraterreliabilit{\"a}t berechnet. Unterschiedliche Spielarten zeigten deutlich unterschiedlich gute {\"U}bereinstimmungen. Diese wurden herausgearbeitet und Vorschl{\"a}ge erarbeitet zur weiteren Optimierung dieser Skalen.},
}
@phdthesis{Pretzell2005,
  author    = {Pretzell, Constance Barberine},
  title     = {Chirurgische Therapie des prim{\"a}ren Hyperparathyreoidismus: Untersuchungen zu Effizienz und Sicherheit der offenen minimal invasiven Parathyreoidektomie},
  school    = {Universit{\"a}t W{\"u}rzburg},
  year      = {2005},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-16445},
  abstract  = {Minimal invasive chirurgische Techniken zur Therapie des prim{\"a}ren Hyperparathyreoidismus (pHPT) konkurrieren mit dem bisherigen Standardverfahren, der bilateralen zervikalen Exploration, sofern eine lokalisierte Eindr{\"u}senerkrankung vorliegt. Zus{\"a}tzlich vorhandene, operationspflichtige Schilddr{\"u}senknoten k{\"o}nnen jedoch ein minimal invasives Vorgehen verhindern. Ziel der hier vorliegenden Untersuchung war es f{\"u}r das eigene Krankengut prospektiv zu analysieren, ob die minimal invasive offene Parathyreoidektomie (MIOP) zur Therapie des pHPT bei Eindr{\"u}senerkrankungen sicher durchf{\"u}hrbar war, ob die postoperativen Ergebnisse denen des konventionellen Vorgehens entsprechen und mit welcher Genauigkeit der intraoperative Parathormonschnelltest (PTH-Quick-Test) die biochemische Heilung des Patienten vorhersagen bzw. eine Mehrdr{\"u}senerkrankung ausschlie{\ss}en konnte.},
  language  = {de},
}
@article{BorchersMuellerSynofziketal.2013,
  author    = {Borchers, Svenja and M{\"u}ller, Laura and Synofzik, Matthis and Himmelbach, Marc},
  title     = {Guidelines and quality measures for the diagnosis of optic ataxia},
  journal   = {Frontiers in Human Neuroscience},
  volume    = {7},
  pages     = {324},
  year      = {2013},
  issn      = {1662-5161},
  doi       = {10.3389/fnhum.2013.00324},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-122439},
  abstract  = {Since the first description of a systematic mis-reaching by Balint in 1909, a reasonable number of patients showing a similar phenomenology, later termed optic ataxia (OA), has been described. However, there is surprising inconsistency regarding the behavioral measures that are used to detect OA in experimental and clinical reports, if the respective measures are reported at all. A typical screening method that was presumably used by most researchers and clinicians, reaching for a target object in the peripheral visual space, has never been evaluated. We developed a set of instructions and evaluation criteria for the scoring of a semi-standardized version of this reaching task. We tested 36 healthy participants, a group of 52 acute and chronic stroke patients, and 24 patients suffering from cerebellar ataxia. We found a high interrater reliability and a moderate test-retest reliability comparable to other clinical instruments in the stroke sample. The calculation of cut-off thresholds based on healthy control and cerebellar patient data showed an unexpected high number of false positives in these samples due to individual outliers that made a considerable number of errors in peripheral reaching. This study provides first empirical data from large control and patient groups for a screening procedure that seems to be widely used but rarely explicitly reported and prepares the grounds for its use as a standard tool for the description of patients who are included in single case or group studies addressing optic ataxia similar to the use of neglect, extinction, or apraxia screening tools.},
  language  = {en},
}
@article{GrubeKoenneckeWalteretal.2013,
  author    = {Grube, Maike Miriam and Koennecke, Hans-Christian and Walter, Georg and Meisel, Andreas and Sobesky, Jan and Nolte, Christian Hans and Wellwood, Ian and Heuschmann, Peter Ulrich},
  title     = {Influence of Acute Complications on Outcome 3 Months after Ischemic Stroke},
  journal   = {PLOS ONE},
  volume    = {8},
  number    = {9},
  pages     = {e75719},
  year      = {2013},
  issn      = {1932-6203},
  doi       = {10.1371/journal.pone.0075719},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-128362},
  abstract  = {Background: Early medical complications are potentially modifiable factors influencing in-hospital outcome. We investigated the influence of acute complications on mortality and poor outcome 3 months after ischemic stroke. Methods: Data were obtained from patients admitted to one of 13 stroke units of the Berlin Stroke Registry (BSR) who participated in a 3-months-follow up between June 2010 and September 2012. We examined the influence of the cumulative number of early in-hospital complications on mortality and poor outcome (death, disability or institutionalization) 3 months after stroke using multivariable logistic regression analyses and calculated attributable fractions to determine the impact of early complications on mortality and poor outcome. Results: A total of 2349 ischemic stroke patients alive at discharge from acute care were included in the analysis. Older age, stroke severity, pre-stroke dependency and early complications were independent predictors of mortality 3 months after stroke. Poor outcome was independently associated with older age, stroke severity, pre-stroke dependency, previous stroke and early complications. More than 60\% of deaths and poor outcomes were attributed to age, pre-stroke dependency and stroke severity and in-hospital complications contributed to 12.3\% of deaths and 9.1\% of poor outcomes 3 months after stroke. Conclusion: The majority of deaths and poor outcomes after stroke were attributed to non-modifiable factors. However, early in-hospital complications significantly affect outcome in patients who survived the acute phase after stroke, underlining the need to improve prevention and treatment of complications in hospital.},
  language  = {en},
}
@article{ManchiaAdliAkulaetal.2013,
  author    = {Manchia, Mirko and Adli, Mazda and Akula, Nirmala and Arda, Raffaella and Aubry, Jean-Michel and Backlund, Lena and Banzato, Claudio E. M. and Baune, Bernhard T. and Bellivier, Frank and Bengesser, Susanne and Biernacka, Joanna M. and Brichant-Petitjean, Clara and Bui, Elise and Calkin, Cynthia V. and Cheng, Andrew Tai Ann and Chillotti, Caterina and Cichon, Sven and Clark, Scott and Czerski, Piotr M. and Dantas, Clarissa and Del Zompo, Maria and DePaulo, J. Raymond and Detera-Wadleigh, Sevilla D. and Etain, Bruno and Falkai, Peter and Fris{\'e}n, Louise and Frye, Mark A. and Fullerton, Jan and Gard, S{\'e}bastien and Garnham, Julie and Goes, Fernando S. and Grof, Paul and Gruber, Oliver and Hashimoto, Ryota and Hauser, Joanna and Heilbronner, Urs and Hoban, Rebecca and Hou, Liping and Jamain, St{\'e}phane and Kahn, Jean-Pierre and Kassem, Layla and Kato, Tadafumi and Kelsoe, John R. and Kittel-Schneider, Sarah and Kliwicki, Sebastian and Kuo, Po-Hsiu and Kusumi, Ichiro and Laje, Gonzalo and Lavebratt, Catharina and Leboyer, Marion and Leckband, Susan G. and L{\'o}pez Jaramillo, Carlos A. and Maj, Mario and Malafosse, Alain and Martinsson, Lina and Masui, Takuya and Mitchell, Philip B. and Mondimore, Frank and Monteleone, Palmiero and Nallet, Audrey and Neuner, Maria and Nov{\'a}k, Tom{\'a}s and O'Donovan, Claire and {\"O}sby, Urban and Ozaki, Norio and Perlis, Roy H. and Pfennig, Andrea and Potash, James B. and Reich-Erkelenz, Daniela and Reif, Andreas and Reininghaus, Eva and Richardson, Sara and Rouleau, Guy A. and Rybakowski, Janusz K. and Schalling, Martin and Schofield, Peter R. and Schubert, Oliver K. and Schweizer, Barbara and Seem{\"u}ller, Florian and Grigoroiu-Serbanescu, Maria and Severino, Giovanni and Seymour, Lisa R. and Slaney, Claire and Smoller, Jordan W. and Squassina, Alessio and Stamm, Thomas and Steele, Jo and Stopkova, Pavla and Tighe, Sarah K. and Tortorella, Alfonso and Turecki, Gustavo and Wray, Naomi R. and Wright, Adam and Zandi, Peter P. and Zilles, David and Bauer, Michael and Rietschel, Marcella and McMahon, Francis J. and Schulze, Thomas G. and Alda, Martin},
  title     = {Assessment of Response to Lithium Maintenance Treatment in Bipolar Disorder: A {Consortium on Lithium Genetics} ({ConLiGen}) Report},
  journal   = {PLOS ONE},
  volume    = {8},
  number    = {6},
  pages     = {e65636},
  year      = {2013},
  issn      = {1932-6203},
  doi       = {10.1371/journal.pone.0065636},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-130938},
  abstract  = {Objective: The assessment of response to lithium maintenance treatment in bipolar disorder (BD) is complicated by variable length of treatment, unpredictable clinical course, and often inconsistent compliance. Prospective and retrospective methods of assessment of lithium response have been proposed in the literature. In this study we report the key phenotypic measures of the ``Retrospective Criteria of Long-Term Treatment Response in Research Subjects with Bipolar Disorder'' scale currently used in the Consortium on Lithium Genetics (ConLiGen) study. Materials and Methods: Twenty-nine ConLiGen sites took part in a two-stage case-vignette rating procedure to examine inter-rater agreement [Kappa (\(\kappa\))] and reliability [intra-class correlation coefficient (ICC)] of lithium response. Annotated first-round vignettes and rating guidelines were circulated to expert research clinicians for training purposes between the two stages. Further, we analyzed the distributional properties of the treatment response scores available for 1,308 patients using mixture modeling. Results: Substantial and moderate agreement was shown across sites in the first and second sets of vignettes (\(\kappa\) = 0.66 and \(\kappa\) = 0.54, respectively), without significant improvement from training. However, definition of response using the A score as a quantitative trait and selecting cases with B criteria of 4 or less showed an improvement between the two stages (\(ICC_1 = 0.71\) and \(ICC_2 = 0.75\), respectively). Mixture modeling of score distribution indicated three subpopulations (full responders, partial responders, non responders). Conclusions: We identified two definitions of lithium response, one dichotomous and the other continuous, with moderate to substantial inter-rater agreement and reliability. Accurate phenotypic measurement of lithium response is crucial for the ongoing ConLiGen pharmacogenomic study.},
  language  = {en},
}
@article{MeuleHermannKuebler2014,
  author    = {Meule, Adrian and Hermann, Tina and K{\"u}bler, Andrea},
  title     = {A short version of the {Food Cravings Questionnaire---Trait}: the {FCQ-T}-reduced},
  journal   = {Frontiers in Psychology},
  volume    = {5},
  pages     = {190},
  year      = {2014},
  doi       = {10.3389/fpsyg.2014.00190},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-112748},
  abstract  = {One of the most often used instruments for the assessment of food cravings is the Food Cravings Questionnaire (FCQ), which consists of a trait (FCQ-T; 39 items) and state (FCQ-S; 15 items) version. Scores on the FCQ-T have been found to be positively associated with eating pathology, body mass index (BMI), low dieting success and increases in state food craving during cognitive tasks involving appealing food stimuli. The current studies evaluated reliability and validity of a reduced version of the FCQ-T consisting of 15 items only (FCQ-T-r). Study 1 was a questionnaire study conducted online among students (N = 323). In study 2, female students (N = 70) performed a working memory task involving food and neutral pictures. Study 1 indicated a one-factorial structure and high internal consistency (\(\alpha\) = 0.94) of the FCQ-T-r. Scores of the FCQ-T-r were positively correlated with BMI and negatively correlated with dieting success. In study 2, participants reported higher state food craving after the task compared to before. This increase was positively correlated with the FCQ-T-r. Hours since the last meal positively predicted food craving before the task when controlling for FCQ-T-r scores and the interaction of both variables. Contrarily, FCQ-T-r scores positively predicted food craving after the task when controlling for food deprivation and the interaction term. Thus, trait food craving was specifically associated with state food craving triggered by palatable food-cues, but not with state food craving related to plain hunger. Results indicate high reliability of the FCQ-T-r. Replicating studies that used the long version, small-to-medium correlations with BMI and dieting success could be found. Finally, scores on the FCQ-T-r predicted cue-elicited food craving, providing further support of its validity. The FCQ-T-r constitutes a succinct, valid and reliable self-report measure to efficiently assess experiences of food craving as a trait.},
  language  = {en},
}
@article{SmithBrayHoffmanetal.2015,
  author    = {Smith, Craig J. and Bray, Benjamin D. and Hoffman, Alex and Meisel, Andreas and Heuschmann, Peter U. and Wolfe, Charles D. A. and Tyrrell, Pippa J. and Rudd, Anthony G.},
  title     = {Can a novel clinical risk score improve pneumonia prediction in acute stroke care? {A} {UK} multicenter cohort study},
  journal   = {Journal of the American Heart Association},
  volume    = {4},
  number    = {1},
  pages     = {e001307},
  year      = {2015},
  doi       = {10.1161/JAHA.114.001307},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-144602},
  abstract  = {Background Pneumonia frequently complicates stroke and has a major impact on outcome. We derived and internally validated a simple clinical risk score for predicting stroke-associated pneumonia (SAP), and compared the performance with an existing score (A\(^{2}\)DS\(^{2}\)). Methods and Results We extracted data for patients with ischemic stroke or intracerebral hemorrhage from the Sentinel Stroke National Audit Programme multicenter UK registry. The data were randomly allocated into derivation (n=11 551) and validation (n=11 648) samples. A multivariable logistic regression model was fitted to the derivation data to predict SAP in the first 7 days of admission. The characteristics of the score were evaluated using receiver operating characteristics (discrimination) and by plotting predicted versus observed SAP frequency in deciles of risk (calibration). Prevalence of SAP was 6.7\% overall. The final 22-point score (ISAN: prestroke Independence [modified Rankin scale], Sex, Age, National Institutes of Health Stroke Scale) exhibited good discrimination in the ischemic stroke derivation (C-statistic 0.79; 95\% CI 0.77 to 0.81) and validation (C-statistic 0.78; 95\% CI 0.76 to 0.80) samples. It was well calibrated in ischemic stroke and was further classified into meaningful risk groups (low 0 to 5, medium 6 to 10, high 11 to 14, and very high \(\geq\) 15) associated with SAP frequencies of 1.6\%, 4.9\%, 12.6\%, and 26.4\%, respectively, in the validation sample. Discrimination for both scores was similar, although they performed less well in the intracerebral hemorrhage patients with an apparent ceiling effect. Conclusions The ISAN score is a simple tool for predicting SAP in clinical practice. External validation is required in ischemic and hemorrhagic stroke cohorts.},
  language  = {en},
}
@article{StrahlGerlichAlpersetal.2019,
  author    = {Strahl, Andr{\'e} and Gerlich, Christian and Alpers, Georg W. and Gehrke, J{\"o}rg and M{\"u}ller-Garnn, Annette and Vogel, Heiner},
  title     = {An instrument for quality assurance in work capacity evaluation: development, evaluation, and inter-rater reliability},
  journal   = {BMC Health Services Research},
  volume    = {19},
  pages     = {556},
  year      = {2019},
  doi       = {10.1186/s12913-019-4387-4},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-200289},
  abstract  = {Background: Employees insured in pension insurance, who are incapable of working due to ill health, are entitled to a disability pension. To assess whether an individual meets the medical requirements to be considered as disabled, a work capacity evaluation is conducted. However, there are no official guidelines on how to perform an external quality assurance for this evaluation process. Furthermore, the quality of medical reports in the field of insurance medicine can vary substantially, and systematic evaluations are scarce. Reliability studies using peer review have repeatedly shown insufficient ability to distinguish between high, moderate and low quality. Considering literature recommendations, we developed an instrument to examine the quality of medical experts' reports. Methods: The peer review manual developed contains six quality domains (formal structure, clarity, transparency, completeness, medical-scientific principles, and efficiency) comprising 22 items. In addition, a superordinate criterion (survey confirmability) rank the overall quality and usefulness of a report. This criterion evaluates problems of inner logic and reasoning. Development of the manual was assisted by experienced physicians in a pre-test. We examined the observable variance in peer judgements and reliability as the most important outcome criteria. To evaluate inter-rater reliability, 20 anonymous experts' reports detailing the work capacity evaluation were reviewed by 19 trained raters (peers). Percentage agreement and Kendall's W, a reliability measure of concordance between two or more peers, were calculated. A total of 325 reviews were conducted. Results: Agreement of peer judgements with respect to the superordinate criterion ranged from 29.2 to 87.5\%. Kendall's W for the quality domain items varied greatly, ranging from 0.09 to 0.88. With respect to the superordinate criterion, Kendall's W was 0.39, which indicates fair agreement. The results of the percentage agreement revealed systemic peer preferences for certain deficit scale categories. Conclusion: The superordinate criterion was not sufficiently reliable. However, in comparison to other reliability studies, this criterion showed an equivalent reliability value. This report aims to encourage further efforts to improve evaluation instruments. To reduce disagreement between peer judgments, we propose the revision of the peer review instrument and the development and implementation of a standardized rater training to improve reliability.},
  language  = {en},
}
@article{GuptaSrivastavaOsmanogluetal.2020,
  author    = {Gupta, Shishir K. and Srivastava, Mugdha and Osmanoglu, Oezge and Dandekar, Thomas},
  title     = {Genome-wide inference of the {Camponotus floridanus} protein-protein interaction network using homologous mapping and interacting domain profile pairs},
  journal   = {Scientific Reports},
  volume    = {10},
  number    = {1},
  year      = {2020},
  doi       = {10.1038/s41598-020-59344-1},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-229406},
  abstract  = {Apart from some model organisms, the interactome of most organisms is largely unidentified. High-throughput experimental techniques to determine protein-protein interactions (PPIs) are resource intensive and highly susceptible to noise. Computational methods of PPI determination can accelerate biological discovery by identifying the most promising interacting pairs of proteins and by assessing the reliability of identified PPIs. Here we present a first in-depth study describing a global view of the ant Camponotus floridanus interactome. Although several ant genomes have been sequenced in the last eight years, studies exploring and investigating PPIs in ants are lacking. Our study attempts to fill this gap and the presented interactome will also serve as a template for determining PPIs in other ants in future. Our C. floridanus interactome covers 51,866 non-redundant PPIs among 6,274 proteins, including 20,544 interactions supported by domain-domain interactions (DDIs), 13,640 interactions supported by DDIs and subcellular localization, and 10,834 high confidence interactions mediated by 3,289 proteins. These interactions involve and cover 30.6\% of the entire C. floridanus proteome.},
  language  = {en},
}
@article{MayrKleinRutzingeretal.2021,
  author    = {Mayr, Stefan and Klein, Igor and Rutzinger, Martin and Kuenzer, Claudia},
  title     = {Determining temporal uncertainty of a global inland surface water time series},
  journal   = {Remote Sensing},
  volume    = {13},
  number    = {17},
  pages     = {3454},
  year      = {2021},
  issn      = {2072-4292},
  doi       = {10.3390/rs13173454},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-245234},
  abstract  = {Earth observation time series are well suited to monitor global surface dynamics. However, data products that are aimed at assessing large-area dynamics with a high temporal resolution often face various error sources (e.g., retrieval errors, sampling errors) in their acquisition chain. Addressing uncertainties in a spatiotemporal consistent manner is challenging, as extensive high-quality validation data is typically scarce. Here we propose a new method that utilizes time series inherent information to assess the temporal interpolation uncertainty of time series datasets. For this, we utilized data from the DLR-DFD Global WaterPack (GWP), which provides daily information on global inland surface water. As the time series is primarily based on optical MODIS (Moderate Resolution Imaging Spectroradiometer) images, the requirement of data gap interpolation due to clouds constitutes the main uncertainty source of the product. With a focus on different temporal and spatial characteristics of surface water dynamics, seven auxiliary layers were derived. Each layer provides probability and reliability estimates regarding water observations at pixel-level. This enables the quantification of uncertainty corresponding to the full spatiotemporal range of the product. Furthermore, the ability of temporal layers to approximate unknown pixel states was evaluated for stratified artificial gaps, which were introduced into the original time series of four climatologic diverse test regions. Results show that uncertainty is quantified accurately (\(>\)90\%), consequently enhancing the product's quality with respect to its use for modeling and the geoscientific community.},
  language  = {en},
}
@article{HuflageFieberFaerberetal.2022,
  author    = {Huflage, Henner and Fieber, Tabea and F{\"a}rber, Christian and Knarr, Jonas and Veldhoen, Simon and Jordan, Martin C. and Gilbert, Fabian and Bley, Thorsten Alexander and Meffert, Rainer H. and Grunz, Jan-Peter and Schmalzl, Jonas},
  title     = {Interobserver reliability of scapula fracture classifications in intra- and extra-articular injury patterns},
  journal   = {BMC Musculoskeletal Disorders},
  volume    = {23},
  number    = {1},
  year      = {2022},
  doi       = {10.1186/s12891-022-05146-7},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-299795},
  abstract  = {Background Morphology and glenoid involvement determine the necessity of surgical management in scapula fractures. While being present in only a small share of patients with shoulder trauma, numerous classification systems have been in use over the years for categorization of scapula fractures. The purpose of this study was to evaluate the established AO/OTA classification in comparison to the classification system of Euler and R{\"u}edi (ER) with regard to interobserver reliability and confidence in clinical practice. Methods Based on CT imaging, 149 patients with scapula fractures were retrospectively categorized by two trauma surgeons and two radiologists using the classification systems of ER and AO/OTA. To measure the interrater reliability, Fleiss kappa (\(\kappa\)) was calculated independently for both fracture classifications. Rater confidence was stated subjectively on a five-point scale and compared with Wilcoxon signed rank tests. Additionally, we computed the intraclass correlation coefficient (ICC) based on absolute agreement in a two-way random effects model to assess the diagnostic confidence agreement between observers. Results In scapula fractures involving the glenoid fossa, interrater reliability was substantial (\(\kappa\) = 0.722; 95\% confidence interval [CI] 0.676-0.769) for the AO/OTA classification in contrast to moderate agreement (\(\kappa\) = 0.579; 95\% CI 0.525-0.634) for the ER classification system. Diagnostic confidence for intra-articular fracture patterns was superior using the AO/OTA classification compared to ER (\(p < 0.001\)) with higher confidence agreement (ICC: 0.882 versus 0.831). For extra-articular fractures, ER (\(\kappa\) = 0.817; 95\% CI 0.771-0.863) provided better interrater reliability compared to AO/OTA (\(\kappa\) = 0.734; 95\% CI 0.692-0.776) with higher diagnostic confidence (\(p < 0.001\)) and superior agreement between confidence ratings (ICC: 0.881 versus 0.912). Conclusions The AO/OTA classification is most suitable to categorize intra-articular scapula fractures with glenoid involvement, whereas the classification system of Euler and R{\"u}edi appears to be superior in extra-articular injury patterns with fractures involving only the scapula body, spine, acromion and coracoid process.},
  language  = {en},
}
@article{WaltmannSchlagenhaufDeserno2022,
  author    = {Waltmann, Maria and Schlagenhauf, Florian and Deserno, Lorenz},
  title     = {Sufficient reliability of the behavioral and computational readouts of a probabilistic reversal learning task},
  journal   = {Behavior Research Methods},
  volume    = {54},
  number    = {6},
  pages     = {2993--3014},
  year      = {2022},
  issn      = {1554-3528},
  doi       = {10.3758/s13428-021-01739-7},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-324246},
  abstract  = {Task-based measures that capture neurocognitive processes can help bridge the gap between brain and behavior. To transfer tasks to clinical application, reliability is a crucial benchmark because it imposes an upper bound to potential correlations with other variables (e.g., symptom or brain data). However, the reliability of many task readouts is low. In this study, we scrutinized the retest reliability of a probabilistic reversal learning task (PRLT) that is frequently used to characterize cognitive flexibility in psychiatric populations. We analyzed data from N = 40 healthy subjects, who completed the PRLT twice. We focused on how individual metrics are derived, i.e., whether data were partially pooled across participants and whether priors were used to inform estimates. We compared the reliability of the resulting indices across sessions, as well as the internal consistency of a selection of indices. We found good to excellent reliability for behavioral indices as derived from mixed-effects models that included data from both sessions. The internal consistency was good to excellent. For indices derived from computational modeling, we found excellent reliability when using hierarchical estimation with empirical priors and including data from both sessions. Our results indicate that the PRLT is well equipped to measure individual differences in cognitive flexibility in reinforcement learning. However, this depends heavily on hierarchical modeling of the longitudinal data (whether sessions are modeled separately or jointly), on estimation methods, and on the combination of parameters included in computational models. We discuss implications for the applicability of PRLT indices in psychiatric research and as diagnostic tools.},
  language  = {en},
}