@article{HentschelKobsHotho2022,
  author    = {Hentschel, Simon and Kobs, Konstantin and Hotho, Andreas},
  title     = {CLIP knows image aesthetics},
  series = {Frontiers in Artificial Intelligence},
  volume    = {5},
  journal   = {Frontiers in Artificial Intelligence},
  issn      = {2624-8212},
  doi       = {10.3389/frai.2022.976235},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-297150},
  year      = {2022},
  abstract  = {Most Image Aesthetic Assessment (IAA) methods use a pretrained ImageNet classification model as a base to fine-tune. We hypothesize that content classification is not an optimal pretraining task for IAA, since the task discourages the extraction of features that are useful for IAA, e.g., composition, lighting, or style. On the other hand, we argue that the Contrastive Language-Image Pretraining (CLIP) model is a better base for IAA models, since it has been trained using natural language supervision. Due to the rich nature of language, CLIP needs to learn a broad range of image features that correlate with sentences describing the image content, composition, environments, and even subjective feelings about the image. While it has been shown that CLIP extracts features useful for content classification tasks, its suitability for tasks that require the extraction of style-based features like IAA has not yet been shown. We test our hypothesis by conducting a three-step study, investigating the usefulness of features extracted by CLIP compared to features obtained from the last layer of a comparable ImageNet classification model. In each step, we get more computationally expensive. First, we engineer natural language prompts that let CLIP assess an image's aesthetic without adjusting any weights in the model. To overcome the challenge that CLIP's prompting only is applicable to classification tasks, we propose a simple but effective strategy to convert multiple prompts to a continuous scalar as required when predicting an image's mean aesthetic score. Second, we train a linear regression on the AVA dataset using image features obtained by CLIP's image encoder. The resulting model outperforms a linear regression trained on features from an ImageNet classification model. It also shows competitive performance with fully fine-tuned networks based on ImageNet, while only training a single layer. Finally, by fine-tuning CLIP's image encoder on the AVA dataset, we show that CLIP only needs a fraction of training epochs to converge, while also performing better than a fine-tuned ImageNet model. Overall, our experiments suggest that CLIP is better suited as a base model for IAA methods than ImageNet pretrained networks.},
  language  = {en}
}
@article{WienrichCarolusRothIsigkeitetal.2022,
  author    = {Wienrich, Carolin and Carolus, Astrid and Roth-Isigkeit, David and Hotho, Andreas},
  title     = {Inhibitors and enablers to explainable AI success: a systematic examination of explanation complexity and individual characteristics},
  series = {Multimodal Technologies and Interaction},
  volume    = {6},
  journal   = {Multimodal Technologies and Interaction},
  number    = {12},
  issn      = {2414-4088},
  doi       = {10.3390/mti6120106},
  url       = {http://nbn-resolving.de/urn:nbn:de:bvb:20-opus-297288},
  year      = {2022},
  abstract  = {With the increasing adaptability and complexity of advisory artificial intelligence (AI)-based agents, the topics of explainable AI and human-centered AI are moving close together. Variations in the explanation itself have been widely studied, with some contradictory results. These could be due to users' individual differences, which have rarely been systematically studied regarding their inhibiting or enabling effect on the fulfillment of explanation objectives (such as trust, understanding, or workload). This paper aims to shed light on the significance of human dimensions (gender, age, trust disposition, need for cognition, affinity for technology, self-efficacy, attitudes, and mind attribution) as well as their interplay with different explanation modes (no, simple, or complex explanation). Participants played the game Deal or No Deal while interacting with an AI-based agent. The agent gave advice to the participants on whether they should accept or reject the deals offered to them. As expected, giving an explanation had a positive influence on the explanation objectives. However, the users' individual characteristics particularly reinforced the fulfillment of the objectives. The strongest predictor of objective fulfillment was the degree of attribution of human characteristics. The more human characteristics were attributed, the more trust was placed in the agent, advice was more likely to be accepted and understood, and important needs were satisfied during the interaction. Thus, the current work contributes to a better understanding of the design of explanations of an AI-based agent system that takes into account individual characteristics and meets the demand for both explainable and human-centered agent systems.},
  language  = {en}
}