
Commit 277d94d

dizcology authored and The TensorFlow Datasets Authors committed
Add COVR to TFDS
PiperOrigin-RevId: 777797279
1 parent 56fea7b commit 277d94d

File tree: 18 files changed, +225 -0 lines changed

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
@inproceedings{bogin-etal-2021-covr,
    title = "{COVR}: A Test-Bed for Visually Grounded Compositional Generalization with Real Images",
    author = "Bogin, Ben and
      Gupta, Shivanshu and
      Gardner, Matt and
      Berant, Jonathan",
    editor = "Moens, Marie-Francine and
      Huang, Xuanjing and
      Specia, Lucia and
      Yih, Scott Wen-tau",
    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2021",
    address = "Online and Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.emnlp-main.774/",
    doi = "10.18653/v1/2021.emnlp-main.774",
    pages = "9824--9846",
    abstract = "While interest in models that generalize at test time to new compositions has risen in recent years, benchmarks in the visually-grounded domain have thus far been restricted to synthetic images. In this work, we propose COVR, a new test-bed for visually-grounded compositional generalization with real images. To create COVR, we use real images annotated with scene graphs, and propose an almost fully automatic procedure for generating question-answer pairs along with a set of context images. COVR focuses on questions that require complex reasoning, including higher-order operations such as quantification and aggregation. Due to the automatic generation process, COVR facilitates the creation of compositional splits, where models at test time need to generalize to new concepts and compositions in a zero- or few-shot setting. We construct compositional splits using COVR and demonstrate a myriad of cases where state-of-the-art pre-trained language-and-vision models struggle to compositionally generalize."
}

@inproceedings{yatskar2016,
    title = {Situation Recognition: Visual Semantic Role Labeling for Image Understanding},
    author = {Yatskar, Mark and Zettlemoyer, Luke and Farhadi, Ali},
    booktitle = {Conference on Computer Vision and Pattern Recognition},
    year = {2016}
}

@article{krishna2017,
    title = {Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations},
    author = {Krishna, Ranjay and Zhu, Yuke and Groth, Oliver and Johnson, Justin and Hata, Kenji and Kravitz, Joshua and Chen, Stephanie and Kalantidis, Yannis and Li, Li-Jia and Shamma, David A. and Bernstein, Michael S. and Fei-Fei, Li},
    journal = {International Journal of Computer Vision},
    volume = {123},
    number = {1},
    pages = {32--73},
    year = {2017},
    issn = {1573-1405},
    doi = {10.1007/s11263-016-0981-7},
    url = {https://doi.org/10.1007/s11263-016-0981-7},
    abstract = {Despite progress in perceptual tasks such as image classification, computers still perform poorly on cognitive tasks such as image description and question answering. Cognition is core to tasks that involve not just recognizing, but reasoning about our visual world. However, models used to tackle the rich content in images for cognitive tasks are still being trained using the same datasets designed for perceptual tasks. To achieve success at cognitive tasks, models need to understand the interactions and relationships between objects in an image. When asked ``What vehicle is the person riding?'', computers will need to identify the objects in an image as well as the relationships riding(man, carriage) and pulling(horse, carriage) to answer correctly that ``the person is riding a horse-drawn carriage.'' In this paper, we present the Visual Genome dataset to enable the modeling of such relationships. We collect dense annotations of objects, attributes, and relationships within each image to learn these models. Specifically, our dataset contains over 108K images where each image has an average of 35 objects, 26 attributes, and 21 pairwise relationships between objects. We canonicalize the objects, attributes, relationships, and noun phrases in region descriptions and question answer pairs to WordNet synsets. Together, these annotations represent the densest and largest dataset of image descriptions, objects, attributes, relationships, and question answer pairs.}
}
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
[COVR](https://covr-dataset.github.io/) dataset with [imSitu](https://github.com/my89/imSitu) and [Visual Genome](https://homes.cs.washington.edu/~ranjay/visualgenome/index.html) images.
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
content.data-type.image # Contains image data.
content.data-type.text # Contains text data.
content.language.en # Contains text in language English / en.
content.monolingual # Contains text in 1 natural language.
ml.task.common-sense-reasoning # Relates to Common Sense Reasoning, a machine learning task.
ml.task.natural-language-inference # Relates to Natural Language Inference, a machine learning task.
ml.task.natural-language-understanding # Relates to Natural Language Understanding, a machine learning task.
ml.task.object-detection # Relates to Object Detection, a machine learning task.
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2025 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip 9731705982 51c682d2721f880150720bb416e0346a4c787e4c55d7f80dfd1bd3f73ba81646 images.zip
https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip 5471658058 99da1a0ddf87011319ff3b05cf9176ffee2731cc3c52951162d9ef0d68e3cfb5 images2.zip
https://drive.google.com/uc?export=download&id=10xlQ6isRdGX94BypoqN6klniGeqdLBJA 21964401 83443ffd6493cdc807aaab8c559a38ad757d47e40d4d6f27b8c65efd4d889091 covr_v1_0.zip
https://s3.amazonaws.com/my89-frame-annotation/public/of500_images.tar 36690524160 94dee93095d0325fb9aef1e8d956b6be297ab13bf2e62d6027fd5dcc782e8f61 of500_images.tar
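Each line of this checksums file records a source URL, the expected size in bytes, the SHA-256 digest, and the destination file name, which TFDS uses to validate downloads. As a minimal sketch (not part of the commit), a manually downloaded archive can be checked against the first entry like this; the local path 'images.zip' is an assumption:

import hashlib
import os

EXPECTED_SIZE = 9731705982  # bytes, from the checksums entry for images.zip
EXPECTED_SHA256 = '51c682d2721f880150720bb416e0346a4c787e4c55d7f80dfd1bd3f73ba81646'

def verify(path: str) -> bool:
  """Returns True if the file matches the recorded size and SHA-256 digest."""
  if os.path.getsize(path) != EXPECTED_SIZE:
    return False
  digest = hashlib.sha256()
  with open(path, 'rb') as f:
    # Hash in 1 MiB chunks so the ~9 GB archive is never fully in memory.
    for chunk in iter(lambda: f.read(1 << 20), b''):
      digest.update(chunk)
  return digest.hexdigest() == EXPECTED_SHA256

print(verify('images.zip'))  # hypothetical local path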
Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
# coding=utf-8
# Copyright 2025 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""covr dataset."""

import json

import tensorflow_datasets.public_api as tfds


class Builder(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for covr dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    return self.dataset_info_from_configs(
        features=tfds.features.FeaturesDict({
            'utterance': tfds.features.Text(),
            'scenes': tfds.features.Sequence(
                feature=tfds.features.Text(),
            ),
            'properties': tfds.features.Sequence(
                feature=tfds.features.Text(),
            ),
            'pattern_name': tfds.features.Text(),
            'program': tfds.features.Text(),
            'label': tfds.features.Text(),
            'images': tfds.features.Sequence(
                feature=tfds.features.Image(),
            ),
        }),
        supervised_keys=None,
        homepage='https://covr-dataset.github.io/',
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    extracted_dirs = dl_manager.download_and_extract({
        'covr': (
            'https://drive.google.com/uc?export=download&'
            'id=10xlQ6isRdGX94BypoqN6klniGeqdLBJA'
        ),
        'imsitu': (
            'https://s3.amazonaws.com/my89-frame-annotation'
            '/public/of500_images.tar'
        ),
        'vg1': 'https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip',
        'vg2': 'https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip',
    })

    # Each name is the image file name without the ".jpg" extension, which is
    # also used as the scene id in COVR.
    filepath_by_name: dict[str, str] = {}
    globs = [
        extracted_dirs['vg1'].glob('*/*.jpg'),
        extracted_dirs['vg2'].glob('*/*.jpg'),
        extracted_dirs['imsitu'].glob('of500_images/*/*.jpg'),
    ]
    for glob in globs:
      for fp in glob:
        name = fp.with_suffix('').name
        filepath_by_name[name] = str(fp)
    path = extracted_dirs['covr']
    return {
        'train': self._generate_examples(
            path / 'train.jsonl', filepath_by_name
        ),
        'test': self._generate_examples(path / 'test.jsonl', filepath_by_name),
        'validation': self._generate_examples(
            path / 'val.jsonl', filepath_by_name
        ),
    }

  def _generate_examples(self, path, filepath_by_name):
    """Yields examples."""
    with open(path) as f:
      for line in f:
        item = json.loads(line)
        images = []
        for scene_id in item['scenes']:
          images.append(filepath_by_name[scene_id])
        yield item['qid'], {
            'utterance': item['utterance'],
            'scenes': item['scenes'],
            'properties': item['properties'],
            'pattern_name': item['pattern_name'],
            'program': str(item['program']),
            'label': str(item.get('answer')),
            'images': images,
        }
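Assuming the builder registers under the TFDS name 'covr' (consistent with the test module's import path below) and the source archives are reachable, loading the dataset would follow the standard TFDS API. A minimal sketch, not part of the commit:

import tensorflow_datasets as tfds

# The first call downloads and prepares the source archives
# (roughly 50 GB in total, per the checksums file above).
ds = tfds.load('covr', split='validation')
for example in ds.take(1):
  # 'utterance' is the question text; 'label' stores the answer as a string
  # (boolean answers arrive as 'True'/'False', since the builder applies
  # str() to the parsed JSON value).
  print(example['utterance'].numpy().decode('utf-8'))
  print(example['label'].numpy().decode('utf-8'))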
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# coding=utf-8
# Copyright 2025 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""covr dataset."""

from tensorflow_datasets.datasets.covr import covr_dataset_builder
import tensorflow_datasets.public_api as tfds


class CovrTest(tfds.testing.DatasetBuilderTestCase):
  """Tests for covr dataset."""

  DATASET_CLASS = covr_dataset_builder.Builder
  SPLITS = {
      'train': 1,
      'test': 1,
      'validation': 1,
  }

  DL_EXTRACT_RESULT = {
      'covr': 'covr',
      'imsitu': 'imsitu',
      'vg1': 'vg1',
      'vg2': 'vg2',
  }


if __name__ == '__main__':
  tfds.testing.test_main()
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
{"qid": "val_test_299485", "utterance": "There are more people that are wearing hat than cats that are wearing hat", "scenes": ["3", "4", "5", "6", "7"], "properties": ["has_compare", "has_compare_more", "lexical_3", "program_3", "program_2", "has_count"], "pattern_name": "compare_count", "program": [{"operation": "find", "arguments": ["hat"]}, {"operation": "find", "arguments": ["person"]}, {"operation": "with_relation", "arguments": ["wearing"], "dependencies": [1, 0]}, {"operation": "count", "dependencies": [2]}, {"operation": "find", "arguments": ["hat"]}, {"operation": "find", "arguments": ["cat"]}, {"operation": "with_relation", "arguments": ["wearing"], "dependencies": [5, 4]}, {"operation": "count", "dependencies": [6]}, {"operation": "gt", "dependencies": [3, 7]}]}
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
{"qid": "train_648732", "utterance": "Is the tree that is next to a fence narrow or large?", "scenes": ["1", "2", "3", "4", "5"], "properties": ["program_2"], "pattern_name": "choose_attr", "program": [{"operation": "find", "arguments": ["fence"]}, {"operation": "find", "arguments": ["tree"]}, {"operation": "with_relation", "arguments": ["next to"], "dependencies": [1, 0]}, {"operation": "unique", "dependencies": [2]}, {"operation": "choose_attr", "dependencies": [3], "arguments": ["narrow", "large"]}], "answer": "narrow"}
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
{"qid": "val_test_747468", "utterance": "Do all phones that are on a table have the same color?", "scenes": ["7", "8"], "properties": ["program_1", "has_same_attribute_color", "has_quantifier", "has_complex_quantifier_scope", "has_quantifier_all", "tpl_verify_quantifier_attribute"], "pattern_name": "quantifier_same_attr", "program": [{"operation": "find", "arguments": ["table"]}, {"operation": "find", "arguments": ["phone"]}, {"operation": "with_relation", "arguments": ["on"], "dependencies": [1, 0]}, {"operation": "all_same", "dependencies": [2], "arguments": ["color"]}], "answer": true}
