diff --git a/swift/llm/dataset/dataset/mllm.py b/swift/llm/dataset/dataset/mllm.py
index d08494eeff..8b26627f02 100644
--- a/swift/llm/dataset/dataset/mllm.py
+++ b/swift/llm/dataset/dataset/mllm.py
@@ -989,6 +989,14 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
         preprocess_func=ScienceQAPreprocessor(),
         tags=['multi-modal', 'science', 'vqa', 'quality']))
 
+register_dataset(
+    DatasetMeta(
+        ms_dataset_id='lmms-lab/ScienceQA-IMG',
+        hf_dataset_id='lmms-lab/ScienceQA-IMG',
+        split=['train', 'validation'],
+        preprocess_func=ScienceQAPreprocessor(),
+        tags=['multi-modal', 'science', 'vqa', 'quality']))
+
 
 class GritPreprocessor(RowPreprocessor, GroundingMixin):
 
@@ -1213,3 +1221,21 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
         hf_dataset_id='leonardPKU/clevr_cogen_a_train',
         preprocess_func=ClevrPreprocessor(),
         tags=['qa', 'math', 'vision', 'grpo']))
+
+
+class OpenVLPreprocessor(ResponsePreprocessor):
+
+    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        row['query'] = 'What is the caption of this image?'
+        return super().preprocess(row)
+
+
+register_dataset(
+    DatasetMeta(
+        ms_dataset_id='swift/Open-Qwen2VL-Data',
+        hf_dataset_id='weizhiwang/Open-Qwen2VL-Data',
+        preprocess_func=OpenVLPreprocessor(columns={
+            'caption': 'response',
+            'url': 'images'
+        }),
+        tags=['caption', 'pretrain', 'vision']))
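
Reviewer note: a minimal, hypothetical smoke test for the two new registrations (not part of the diff). It assumes ms-swift's swift.llm.load_dataset entry point accepts registered dataset ids and returns a (train, val) pair; exact arguments may differ across versions.

# Hypothetical smoke test; assumes swift.llm.load_dataset is available
# and resolves ids registered via register_dataset.
from swift.llm import load_dataset

# ScienceQA-IMG reuses the existing ScienceQAPreprocessor; both the
# 'train' and 'validation' splits are registered above.
train_dataset, _ = load_dataset(['lmms-lab/ScienceQA-IMG'])
print(train_dataset[0])  # expect a VQA-style query plus an images field

# Open-Qwen2VL-Data: OpenVLPreprocessor injects a fixed captioning query,
# while the columns mapping routes 'caption' -> response and 'url' -> images.
caption_dataset, _ = load_dataset(['swift/Open-Qwen2VL-Data'])
print(caption_dataset[0])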