# Bibliography metadata update for doc/references.bib (text before the first "diff --git" header is ignored by patch tools).
#   inie2025summon  : journal respelled "PLOS ONE"; volume, number and pages added; url moved from arXiv to the DOI resolver.
#   zong2024vlguard : @article becomes @inproceedings; the arXiv "journal" value is replaced by booktitle, pages and publisher; url now points at proceedings.mlr.press.
#   lopez2024pyrit  : "Lopez Munoz" is braced so BibTeX keeps it together as a single surname.
#   darkbench2025   : @misc becomes @inproceedings; title, author, url and note replaced; booktitle added.
# NOTE(review): line breaks were restored from a whitespace-collapsed copy of this patch. Hunk line counts match the @@ headers,
# but the two-space field indentation is an assumption -- confirm against doc/references.bib (or apply with --ignore-whitespace).
diff --git a/doc/references.bib b/doc/references.bib
index 43604ab42c..62d56a1673 100644
--- a/doc/references.bib
+++ b/doc/references.bib
@@ -99,9 +99,12 @@ @misc{odin2024
 @article{inie2025summon,
   title = {Summon a Demon and Bind it: A Grounded Theory of {LLM} Red Teaming},
   author = {Nanna Inie and Jonathan Stray and Leon Derczynski},
-  journal = {PLoS ONE},
+  journal = {PLOS ONE},
+  volume = {20},
+  number = {1},
+  pages = {e0314658},
   year = {2025},
-  url = {https://arxiv.org/abs/2311.06237},
+  url = {https://doi.org/10.1371/journal.pone.0314658},
 }
 
 @misc{vantaylor2024socialbias,
@@ -633,17 +636,19 @@ @article{rottger2025msts
   url = {https://arxiv.org/abs/2501.10057},
 }
 
-@article{zong2024vlguard,
+@inproceedings{zong2024vlguard,
   title = {Safety Fine-Tuning at (Almost) No Cost: A Baseline for Vision Large Language Models},
   author = {Yongshuo Zong and Ondrej Bohdal and Tingyang Yu and Yongxin Yang and Timothy Hospedales},
-  journal = {arXiv preprint arXiv:2402.02207},
+  booktitle = {Proceedings of the 41st International Conference on Machine Learning (ICML)},
+  pages = {62867--62891},
   year = {2024},
-  url = {https://arxiv.org/abs/2402.02207},
+  publisher = {PMLR},
+  url = {https://proceedings.mlr.press/v235/zong24a.html},
 }
 
 @article{lopez2024pyrit,
   title = {{PyRIT}: A Framework for Security Risk Identification and Red Teaming in Generative {AI} Systems},
-  author = {Gary D. Lopez Munoz and Amanda J. Minnich and Roman Lutz and Richard Lundeen and Raja Sekhar Rao Dheekonda and Nina Chikanov and Bolor-Erdene Jagdagdorj and Martin Pouliot and Shiven Chawla and Whitney Maxwell and Blake Bullwinkel and Katherine Pratt and Joris de Gruyter and Charlotte Siska and Pete Bryan and Tori Westerhoff and Chang Kawaguchi and Christian Seifert and Ram Shankar Siva Kumar and Yonatan Zunger},
+  author = {Gary D. {Lopez Munoz} and Amanda J. Minnich and Roman Lutz and Richard Lundeen and Raja Sekhar Rao Dheekonda and Nina Chikanov and Bolor-Erdene Jagdagdorj and Martin Pouliot and Shiven Chawla and Whitney Maxwell and Blake Bullwinkel and Katherine Pratt and Joris de Gruyter and Charlotte Siska and Pete Bryan and Tori Westerhoff and Chang Kawaguchi and Christian Seifert and Ram Shankar Siva Kumar and Yonatan Zunger},
   journal = {arXiv preprint arXiv:2410.02828},
   year = {2024},
   url = {https://arxiv.org/abs/2410.02828},
@@ -667,12 +672,13 @@ @inproceedings{wang2025siuo
   note = {Introduces the {SIUO} (Safe Inputs but Unsafe Output) benchmark},
 }
 
-@misc{darkbench2025,
-  title = {{DarkBench}: A Comprehensive Benchmark for Dark Design Patterns in Large Language Models},
-  author = {{Apart Research}},
+@inproceedings{darkbench2025,
+  title = {{DarkBench}: Benchmarking Dark Patterns in Large Language Models},
+  author = {Esben Kran and Hieu Minh Nguyen and Akash Kundu and Sami Jawhar and Jinsuk Park and Mateusz Maria Jurewicz},
+  booktitle = {International Conference on Learning Representations (ICLR)},
   year = {2025},
-  url = {https://darkbench.ai/},
-  note = {OpenReview: https://openreview.net/forum?id=odjMSBSWRt},
+  url = {https://arxiv.org/abs/2503.10728},
+  note = {Oral presentation at ICLR 2025},
 }
 
 @misc{embracethered2025sneakybits,