
Commit 5546ca4

update
committed · 1 parent 7500bde · commit 5546ca4

File tree

1 file changed: +8 -6 lines changed


_bibliography/papers.bib

Lines changed: 8 additions & 6 deletions
@@ -33,7 +33,8 @@ @inproceedings{Lu2024WildVisionEV
  abstract={Recent breakthroughs in vision-language models (VLMs) emphasize the necessity of benchmarking human preferences in real-world multimodal interactions. To address this gap, we launched WildVision-Arena (WV-Arena), an online platform that collects human preferences to evaluate VLMs. We curated WV-Bench by selecting 500 high-quality samples from 8,000 user submissions in WV-Arena. WV-Bench uses GPT-4 as the judge to compare each VLM with Claude-3-Sonnet, achieving a Spearman correlation of 0.94 with the WV-Arena Elo. This significantly outperforms other benchmarks like MMVet, MMMU, and MMStar.

  Our comprehensive analysis of 20K real-world interactions reveals important insights into the failure cases of top-performing VLMs. For example, we find that although GPT-4V surpasses many other models like Reka-Flash, Opus, and Yi-VL-Plus in simple visual recognition and reasoning tasks, it still faces challenges with subtle contextual cues, spatial reasoning, visual imagination, and expert domain knowledge. Additionally, current VLMs exhibit issues with hallucinations and safety when intentionally provoked. We are releasing our chat and feedback data to further advance research in the field of VLMs.},
- abbr={NeurIPS 2024 (D/B)},
+ abbr={NeurIPS 2024},
+ github={WildVision-AI/WildVision-Arena},
  preview={wildvision.png},
  arxiv={2406.11069},
  twitter = "https://twitter.com/billyuchenlin/status/1755207605537120513",
@@ -51,8 +52,9 @@ @inproceedings{Jiang2024GenAIAA
  month={Dec},
  year={2024},
  url={https://openreview.net/forum?id=0Gmi8TkUC7#discussion},
+ abstract = {Generative AI has made remarkable strides to revolutionize fields such as image and video generation. These advancements are driven by innovative algorithms, architecture, and data. However, the rapid proliferation of generative models has highlighted a critical gap: the absence of trustworthy evaluation metrics. Current automatic assessments such as FID, CLIP, FVD, etc often fail to capture the nuanced quality and user satisfaction associated with generative outputs. This paper proposes an open platform \arena to evaluate different image and video generative models, where users can actively participate in evaluating these models. By leveraging collective user feedback and votes, \arena aims to provide a more democratic and accurate measure of model performance. It covers three arenas for text-to-image generation, text-to-video generation, and image editing respectively. Currently, we cover a total of 27 open-source generative models. \arena has been operating for four months, amassing over 6000 votes from the community. We describe our platform, analyze the data, and explain the statistical methods for ranking the models. To further promote the research in building model-based evaluation metrics, we release a cleaned version of our preference data for the three tasks, namely GenAI-Bench. We prompt the existing multi-modal models like Gemini, GPT-4o to mimic human voting. We compute the correlation between model voting with human voting to understand their judging abilities. Our results show existing multimodal models are still lagging in assessing the generated visual content, even the best model GPT-4o only achieves a Pearson correlation of 0.22 in quality subscore, and behave like random guessing in others.},
  github = "TIGER-AI-Lab/GenAI-Arena",
- abbr={NeurIPS 2024 (D/B)},
+ abbr={NeurIPS 2024},
  preview={genai-arena.png},
  arxiv={2406.04485},
  huggingface="https://huggingface.co/spaces/TIGER-Lab/GenAI-Arena",
@@ -86,7 +88,7 @@ @article{jiang2024mantis
  @inproceedings{Ku2023VIEScoreTE,
  title={VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation},
  author={Max W.F. Ku and Dongfu Jiang and Cong Wei and Xiang Yue and Wenhu Chen},
- booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+ booktitle = "Proceedings of ACL",
  publisher = "Association for Computational Linguistics",
  month = aug,
  year={2024},
@@ -112,7 +114,7 @@ @inproceedings{Yue2023MMMUAM
  address = {Seattle, US},
  abstract = {We introduce MMMU: a new benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously collected multimodal questions from college exams quizzes and textbooks covering six core disciplines: Art & Design Business Science Health & Medicine Humanities & Social Science and Tech & Engineering. These questions span 30 subjects and 183 subfields comprising 30 highly heterogeneous image types such as charts diagrams maps tables music sheets and chemical structures. Unlike existing benchmarks MMMU focuses on advanced perception and reasoning with domain-specific knowledge challenging models to perform tasks akin to those faced by experts. The evaluation of 28 open-source LMMs as well as the proprietary GPT-4V(ision) and Gemini highlights the substantial challenges posed by MMMU. Even the advanced GPT-4V and Gemini Ultra only achieve accuracies of 56% and 59% respectively indicating significant room for improvement. We believe MMMU will stimulate the community to build next-generation multimodal foundation models towards expert artificial general intelligence.},
  year={2023},
- booktitle = {CVPR <span style="color: red; font-weight: bold;">oral</span>},
+ booktitle = {Proceedings of CVPR <span style="color: red; font-weight: bold;">oral</span>},
  url={https://arxiv.org/abs/2311.16502},
  website = "https://mmmu-benchmark.github.io",
  github = "MMMU-Benchmark/MMMU",
@@ -130,7 +132,7 @@ @inproceedings{Yue2023MMMUAM
  @article{jiang2024tigerscore,
  title={{TIGERS}core: Towards Building Explainable Metric for All Text Generation Tasks},
  author={Dongfu Jiang and Yishan Li and Ge Zhang and Wenhao Huang and Bill Yuchen Lin and Wenhu Chen},
- journal={Transactions on Machine Learning Research},
+ journal={Transactions on Machine Learning Research (TMLR)},
  year={2024},
  month={May},
  bibtex_show={true},
@@ -158,7 +160,7 @@ @inproceedings{jiang-etal-2023-llm
  author = "Jiang, Dongfu and
  Ren, Xiang and
  Lin, Bill Yuchen",
- booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+ booktitle = "Proceedings of ACL",
  month = jul,
  year = "2023",
  address = "Toronto, Canada",
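
Note: the WildVision-Arena and GenAI-Arena abstracts above both quote agreement statistics between automated judgments and human preferences (a Spearman correlation of 0.94 with the WV-Arena Elo; a Pearson correlation of 0.22 for GPT-4o's quality subscore on GenAI-Bench). As a rough, hypothetical sketch of how such agreement numbers are typically computed with scipy, one could do the following; the model names and scores are made-up placeholders, and the actual evaluation pipelines live in the linked repositories, not in this bibliography.

# Hypothetical sketch: correlating automated judge scores with human-derived
# rankings (e.g., Arena Elo). All numbers below are placeholders, not real data.
from scipy.stats import spearmanr, pearsonr

# Placeholder per-model scores from an automated judge
# (e.g., GPT-4-as-judge win rate on a curated benchmark) ...
judge_scores = {"model_a": 0.81, "model_b": 0.74, "model_c": 0.62, "model_d": 0.55}
# ... and human-preference Elo ratings for the same models from an arena.
arena_elo = {"model_a": 1210, "model_b": 1165, "model_c": 1098, "model_d": 1042}

models = sorted(judge_scores)
x = [judge_scores[m] for m in models]
y = [arena_elo[m] for m in models]

rho, rho_p = spearmanr(x, y)  # rank agreement, as reported for WV-Bench vs. WV-Arena Elo
r, r_p = pearsonr(x, y)       # linear agreement, as reported for GenAI-Bench subscores

print(f"Spearman rho = {rho:.2f} (p = {rho_p:.3f})")
print(f"Pearson  r   = {r:.2f} (p = {r_p:.3f})")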
