Skip to content

Commit 560af73

Browse files
authored
Add Dataset small_coralnet (#240)
* Improve messages and translations with 'split' information * Initial addition of whoi_small_coralnet * fix md5 values and add tests * update translations
1 parent da2dd3d commit 560af73

File tree

11 files changed

+166
-31
lines changed

11 files changed

+166
-31
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ export(transform_to_tensor)
189189
export(transform_vflip)
190190
export(vision_make_grid)
191191
export(whoi_plankton_dataset)
192+
export(whoi_small_coralnet_dataset)
192193
export(whoi_small_plankton_dataset)
193194
importFrom(grDevices,dev.off)
194195
importFrom(graphics,polygon)

NEWS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* Added `lfw_people_dataset()` and `lfw_pairs_dataset()` for loading Labelled Faces in the Wild (LFW) datasets (@DerrickUnleashed, #203).
66
* Added `places365_dataset()`for loading the Places365 dataset (@koshtiakanksha, #196).
77
* Added `pascal_segmentation_dataset()`, and `pascal_detection_dataset()` for loading the Pascal Visual Object Classes datasets (@DerrickUnleashed, #209).
8-
* Added `whoi_plankton_dataset()`, and `whoi_small_plankton_dataset()` (@cregouby, #236).
8+
* Added `whoi_plankton_dataset()`, `whoi_small_plankton_dataset()`, and `whoi_small_coralnet_dataset()` (@cregouby, #236).
99

1010
## New models
1111

R/dataset-flowers.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ flowers102_dataset <- dataset(
9999
meta <- readRDS(file.path(self$processed_folder, glue::glue("{self$split}.rds")))
100100
self$img_path <- meta$img_path
101101
self$labels <- meta$labels
102-
cli_inform("Split {.val {self$split}} of dataset {.cls {class(self)[[1]]}} loaded with {length(self$img_path)} samples.")
102+
cli_inform("Split {.val {self$split}} of dataset {.cls {class(self)[[1]]}} loaded with {self$.length()} samples.")
103103
},
104104

105105
.getitem = function(index) {

R/dataset-plankton.R

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,13 @@
22
#' WHOI-Plankton Dataset
33
#'
44
#' The WHOI-Plankton and WHOI-Plankton small are **image classification** datasets
5-
#' of submarine plankton small grayscale images of varying size, classified into 100 classes.
5+
#' from the Woods Hole Oceanographic Institution (WHOI) of microscopic marine plankton.
6+
#' <https://hdl.handle.net/10.1575/1912/7341>
7+
#' Images were collected in situ by automated submersible imaging-in-flow cytometry
8+
#' with an instrument called Imaging FlowCytobot (IFCB). They are small grayscale images
9+
#' of varying size.
10+
#' Images are classified into 100 classes, with an overview available in
11+
#' [project Wiki page](https://whoigit.github.io/whoi-plankton/)
612
#' Dataset size is 957k and 58k respectively, and each provides a train / val / test split.
713
#'
814
#' @inheritParams eurosat_dataset
@@ -66,6 +72,7 @@ whoi_small_plankton_dataset <- torch::dataset(
6672
install.packages("prettyunits")
6773
}
6874

75+
self$split <- match.arg(split, c("train", "val", "test"))
6976
self$transform <- transform
7077
self$target_transform <- target_transform
7178
self$archive_url <- self$resources[self$resources$split == split,]$url
@@ -74,7 +81,7 @@ whoi_small_plankton_dataset <- torch::dataset(
7481
self$split_file <- sapply(self$archive_url, \(x) file.path(rappdirs::user_cache_dir("torch"), class(self)[1], sub("\\?download=.*", "", basename(x))))
7582

7683
if (download) {
77-
cli_inform("Dataset {.cls {class(self)[[1]]}} (~{.emph {self$archive_size}}) will be downloaded and processed if not already available.")
84+
cli_inform("Split {.val {self$split}} of dataset {.cls {class(self)[[1]]}} (~{.emph {self$archive_size}}) will be downloaded and processed if not already available.")
7885
self$download()
7986
}
8087

@@ -106,7 +113,12 @@ whoi_small_plankton_dataset <- torch::dataset(
106113

107114
.getitem = function(index) {
108115
df <- self$.data[index,]$to_data_frame()
109-
x <- df$image$bytes %>% unlist() %>% as.raw() %>% png::readPNG()
116+
x_raw <- df$image$bytes %>% unlist() %>% as.raw()
117+
if (tolower(tools::file_ext(df$image$path)) == "jpg") {
118+
x <- jpeg::readJPEG(x_raw)
119+
} else {
120+
x <- png::readPNG(x_raw)
121+
}
110122
y <- df$label + 1L
111123

112124
if (!is.null(self$transform))
@@ -127,7 +139,7 @@ whoi_small_plankton_dataset <- torch::dataset(
127139

128140
#' WHOI-Plankton Dataset
129141
#'
130-
#' @inheritParams whoi_plankton_dataset#'
142+
#' @inheritParams whoi_plankton_dataset
131143
#' @rdname whoi_plankton_dataset
132144
#' @export
133145
whoi_plankton_dataset <- torch::dataset(
@@ -162,3 +174,32 @@ whoi_plankton_dataset <- torch::dataset(
162174
size = c(rep(450e6, 4), rep(490e6, 13), rep(450e6, 2))
163175
)
164176
)
177+
178+
179+
#' Coralnet Dataset
180+
#'
181+
#' Small Coralnet dataset is an **image classification** dataset
182+
#' of very large submarine coral reef images annotated into 3 classes
183+
#' and produced by [CoralNet](https://coralnet.ucsd.edu),
184+
#' a resource for benthic images classification.
185+
#'
186+
#' @inheritParams whoi_plankton_dataset
187+
#' @export
188+
whoi_small_coralnet_dataset <- torch::dataset(
189+
name = "whoi_small_coralnet",
190+
inherit = whoi_small_plankton_dataset,
191+
archive_size = "2.1 GB",
192+
resources = data.frame(
193+
split = c("test", rep("train", 4), "val"),
194+
url = c("https://huggingface.co/datasets/nf-whoi/coralnet-small/resolve/main/data/test-00000-of-00001.parquet?download=true",
195+
paste0("https://huggingface.co/datasets/nf-whoi/coralnet-small/resolve/main/data/train-0000",0:3,"-of-00004.parquet?download=true"),
196+
"https://huggingface.co/datasets/nf-whoi/coralnet-small/resolve/main/data/validation-00000-of-00001.parquet?download=true"),
197+
md5 = c("f9a3ce864fdbeb5f1f3d243fe1285186",
198+
"82269e2251db22ef213e438126198afd",
199+
"82d2cafbad7740e476310565a2bcd44e",
200+
"f4dd2d2effc1f9c02918e3ee614b85d3",
201+
"d66ec691a4c5c63878a9cfff164a6aaf",
202+
"7ea146b9b2f7b6cee99092bd44182d06"),
203+
size = c(430e6, rep(380e6, 4), 192e6)
204+
)
205+
)

_pkgdown.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ reference:
5353
- ends_with("1_dataset")
5454
- ends_with("2_dataset")
5555
- ends_with("5_dataset")
56+
- ends_with("kton_dataset")
- ends_with("coralnet_dataset")
5657
- subtitle: for Object Detection
5758
descr: >
5859
Dataset having items with "y" as a named list of bounding-box and
1.25 KB
Binary file not shown.

man/whoi_plankton_dataset.Rd

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/whoi_small_coralnet_dataset.Rd

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

po/R-fr.po

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
msgid ""
22
msgstr ""
33
"Project-Id-Version: torchvision 0.7.0.9000\n"
4-
"POT-Creation-Date: 2025-08-10 20:26+0200\n"
5-
"PO-Revision-Date: 2025-08-10 20:35+0200\n"
4+
"POT-Creation-Date: 2025-08-15 10:15+0200\n"
5+
"PO-Revision-Date: 2025-08-15 10:18+0200\n"
66
"Last-Translator: Christophe Regouby <[email protected]>\n"
77
"Language-Team: \n"
88
"Language: fr\n"
@@ -46,12 +46,12 @@ msgstr ""
4646
#: dataset-lfw.R:246 dataset-mnist.R:86 dataset-mnist.R:234 dataset-mnist.R:421
4747
#: dataset-oxfordiiitpet.R:71 dataset-oxfordiiitpet.R:283
4848
#: dataset-oxfordiiitpet.R:347 dataset-pascal.R:136 dataset-pascal.R:294
49-
#: dataset-places365.R:98 dataset-plankton.R:71
49+
#: dataset-places365.R:98
5050
msgid ""
5151
"Dataset {.cls {class(self)[[1]]}} (~{.emph {self$archive_size}}) will be "
5252
"downloaded and processed if not already available."
5353
msgstr ""
54-
"Le jeu de données {.cls {class(self)[[1]]}} de taille (~{.emph "
54+
"Le jeu de données {.cls {class(self)[[1]]}} (de taille ~{.emph "
5555
"{self$archive_size}}) sera téléchargé et traité s'il n'est pas déjà "
5656
"disponible."
5757

@@ -60,7 +60,7 @@ msgstr ""
6060
#: dataset-flowers.R:97 dataset-lfw.R:113 dataset-lfw.R:251 dataset-mnist.R:91
6161
#: dataset-mnist.R:239 dataset-mnist.R:426 dataset-oxfordiiitpet.R:76
6262
#: dataset-oxfordiiitpet.R:288 dataset-oxfordiiitpet.R:352 dataset-pascal.R:141
63-
#: dataset-pascal.R:299 dataset-places365.R:103 dataset-plankton.R:76
63+
#: dataset-pascal.R:299 dataset-places365.R:103 dataset-plankton.R:89
6464
msgid "Dataset not found. You can use `download = TRUE` to download it."
6565
msgstr ""
6666
"Jeu de données introuvable. Veuillez ajouter `download = TRUE` pour le "
@@ -69,7 +69,7 @@ msgstr ""
6969
#: dataset-caltech.R:84 dataset-caltech.R:208 dataset-lfw.R:135
7070
#: dataset-lfw.R:286 dataset-oxfordiiitpet.R:95 dataset-oxfordiiitpet.R:302
7171
#: dataset-oxfordiiitpet.R:366 dataset-pascal.R:149 dataset-pascal.R:317
72-
#: dataset-plankton.R:83
72+
#: dataset-plankton.R:93
7373
msgid ""
7474
"{.cls {class(self)[[1]]}} dataset loaded with {self$.length()} images across "
7575
"{length(self$classes)} classes."
@@ -79,15 +79,15 @@ msgstr ""
7979

8080
#: dataset-caltech.R:117 dataset-coco.R:187 dataset-eurosat.R:81
8181
#: dataset-flickr.R:136 dataset-lfw.R:146 dataset-oxfordiiitpet.R:107
82-
#: dataset-pascal.R:158 dataset-plankton.R:90
82+
#: dataset-pascal.R:158 dataset-plankton.R:100
8383
msgid "Downloading {.cls {class(self)[[1]]}}..."
8484
msgstr "Téléchargement de {.cls {class(self)[[1]]}}..."
8585

8686
#: dataset-caltech.R:125 dataset-cifar.R:111 dataset-coco.R:193
8787
#: dataset-eurosat.R:88 dataset-fer.R:129 dataset-flowers.R:136
8888
#: dataset-lfw.R:157 dataset-lfw.R:172 dataset-mnist.R:121 dataset-mnist.R:264
8989
#: dataset-mnist.R:453 dataset-oxfordiiitpet.R:115 dataset-pascal.R:168
90-
#: dataset-places365.R:182 dataset-plankton.R:95 models-vit.R:49
90+
#: dataset-places365.R:182 dataset-plankton.R:105 models-vit.R:49
9191
msgid "Corrupt file! Delete the file in {archive} and try again."
9292
msgstr "Fichier corrompu. Supprimez le fichier {archive} et recommencez."
9393

@@ -160,7 +160,7 @@ msgstr "Extraction de l'archive {.cls {class(self)[[1]]}} terminée..."
160160
#: dataset-eurosat.R:98
161161
msgid "Downloading {.cls {class(self)[[1]]}} split file: {self$split_url}"
162162
msgstr ""
163-
"Téléchargement du fichier de split de {.cls {class(self)[[1]]}} depuis "
163+
"Téléchargement du fichier de partition de {.cls {class(self)[[1]]}} depuis "
164164
"{self$split_url} ..."
165165

166166
#: dataset-eurosat.R:102
@@ -214,18 +214,18 @@ msgstr ""
214214
#: dataset-flowers.R:102
215215
msgid ""
216216
"Split {.val {self$split}} of dataset {.cls {class(self)[[1]]}} loaded with "
217-
"{length(self$img_path)} samples."
217+
"{self$.length()} samples."
218218
msgstr ""
219-
"Le sous-ensemble {.val {self$split}} du jeu de données {.cls {class(self)"
220-
"[[1]]}} chargé avec {length(self$img_path)} images."
219+
"La partition {.val {self$split}} du jeu de données {.cls {class(self)[[1]]}} "
220+
"est chargée avec {self$.length()} images."
221221

222222
#: dataset-flowers.R:126
223223
msgid ""
224224
"Split {.val {self$split}} of dataset {.cls {class(self)[[1]]}} is already "
225225
"processed and cached."
226226
msgstr ""
227-
"Le sous-ensemble {.val {self$split}} du jeu de données {.cls {class(self)"
228-
"[[1]]}} a déjà été préparé et mis en cache."
227+
"La partition {.val {self$split}} du jeu de données {.cls {class(self)[[1]]}} "
228+
"a déjà été préparée et mise en cache."
229229

230230
#: dataset-flowers.R:140
231231
msgid "{.cls {class(self)[[1]]}} Extracting images and processing dataset..."
@@ -262,9 +262,19 @@ msgstr "Partage non valide : {self$split}"
262262
#: dataset-places365.R:163
263263
msgid "Downloading {.cls {class(self)[[1]]}} split '{self$split}'..."
264264
msgstr ""
265-
"Téléchargement du fichier de partage '{self$split}' de {.cls {class(self)"
265+
"Téléchargement du fichier de partition '{self$split}' de {.cls {class(self)"
266266
"[[1]]} ..."
267267

268+
#: dataset-plankton.R:84
269+
msgid ""
270+
"Split {.val {self$split}} of dataset {.cls {class(self)[[1]]}} (~{.emph "
271+
"{self$archive_size}}) will be downloaded and processed if not already "
272+
"available."
273+
msgstr ""
274+
"La partition {.val {self$split}} du jeu de données {.cls {class(self)[[1]]}} "
275+
"(de taille ~{.emph {self$archive_size}}) sera téléchargée et traitée si elle "
276+
"n'est pas déjà disponible."
277+
268278
#: extension.R:2
269279
msgid ""
270280
"has_ops() Not implemented yet. https://github.com/pytorch/vision/blob/"

po/R-torchvision.pot

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
msgid ""
22
msgstr ""
33
"Project-Id-Version: torchvision 0.7.0.9000\n"
4-
"POT-Creation-Date: 2025-08-10 20:26+0200\n"
4+
"POT-Creation-Date: 2025-08-15 10:15+0200\n"
55
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
66
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
77
"Language-Team: LANGUAGE <[email protected]>\n"
@@ -38,27 +38,27 @@ msgstr ""
3838
msgid "deprecated"
3939
msgstr ""
4040

41-
#: dataset-caltech.R:61 dataset-cifar.R:52 dataset-coco.R:85 dataset-coco.R:305 dataset-eurosat.R:57 dataset-fer.R:63 dataset-fgvc.R:91 dataset-flickr.R:69 dataset-flickr.R:231 dataset-flowers.R:92 dataset-lfw.R:108 dataset-lfw.R:246 dataset-mnist.R:86 dataset-mnist.R:234 dataset-mnist.R:421 dataset-oxfordiiitpet.R:71 dataset-oxfordiiitpet.R:283 dataset-oxfordiiitpet.R:347 dataset-pascal.R:136 dataset-pascal.R:294 dataset-places365.R:98 dataset-plankton.R:71
41+
#: dataset-caltech.R:61 dataset-cifar.R:52 dataset-coco.R:85 dataset-coco.R:305 dataset-eurosat.R:57 dataset-fer.R:63 dataset-fgvc.R:91 dataset-flickr.R:69 dataset-flickr.R:231 dataset-flowers.R:92 dataset-lfw.R:108 dataset-lfw.R:246 dataset-mnist.R:86 dataset-mnist.R:234 dataset-mnist.R:421 dataset-oxfordiiitpet.R:71 dataset-oxfordiiitpet.R:283 dataset-oxfordiiitpet.R:347 dataset-pascal.R:136 dataset-pascal.R:294 dataset-places365.R:98
4242
msgid ""
4343
"Dataset {.cls {class(self)[[1]]}} (~{.emph {self$archive_size}}) will be "
4444
"downloaded and processed if not already available."
4545
msgstr ""
4646

47-
#: dataset-caltech.R:66 dataset-caltech.R:187 dataset-coco.R:90 dataset-coco.R:310 dataset-eurosat.R:64 dataset-fer.R:70 dataset-flowers.R:97 dataset-lfw.R:113 dataset-lfw.R:251 dataset-mnist.R:91 dataset-mnist.R:239 dataset-mnist.R:426 dataset-oxfordiiitpet.R:76 dataset-oxfordiiitpet.R:288 dataset-oxfordiiitpet.R:352 dataset-pascal.R:141 dataset-pascal.R:299 dataset-places365.R:103 dataset-plankton.R:76
47+
#: dataset-caltech.R:66 dataset-caltech.R:187 dataset-coco.R:90 dataset-coco.R:310 dataset-eurosat.R:64 dataset-fer.R:70 dataset-flowers.R:97 dataset-lfw.R:113 dataset-lfw.R:251 dataset-mnist.R:91 dataset-mnist.R:239 dataset-mnist.R:426 dataset-oxfordiiitpet.R:76 dataset-oxfordiiitpet.R:288 dataset-oxfordiiitpet.R:352 dataset-pascal.R:141 dataset-pascal.R:299 dataset-places365.R:103 dataset-plankton.R:89
4848
msgid "Dataset not found. You can use `download = TRUE` to download it."
4949
msgstr ""
5050

51-
#: dataset-caltech.R:84 dataset-caltech.R:208 dataset-lfw.R:135 dataset-lfw.R:286 dataset-oxfordiiitpet.R:95 dataset-oxfordiiitpet.R:302 dataset-oxfordiiitpet.R:366 dataset-pascal.R:149 dataset-pascal.R:317 dataset-plankton.R:83
51+
#: dataset-caltech.R:84 dataset-caltech.R:208 dataset-lfw.R:135 dataset-lfw.R:286 dataset-oxfordiiitpet.R:95 dataset-oxfordiiitpet.R:302 dataset-oxfordiiitpet.R:366 dataset-pascal.R:149 dataset-pascal.R:317 dataset-plankton.R:93
5252
msgid ""
5353
"{.cls {class(self)[[1]]}} dataset loaded with {self$.length()} images across "
5454
"{length(self$classes)} classes."
5555
msgstr ""
5656

57-
#: dataset-caltech.R:117 dataset-coco.R:187 dataset-eurosat.R:81 dataset-flickr.R:136 dataset-lfw.R:146 dataset-oxfordiiitpet.R:107 dataset-pascal.R:158 dataset-plankton.R:90
57+
#: dataset-caltech.R:117 dataset-coco.R:187 dataset-eurosat.R:81 dataset-flickr.R:136 dataset-lfw.R:146 dataset-oxfordiiitpet.R:107 dataset-pascal.R:158 dataset-plankton.R:100
5858
msgid "Downloading {.cls {class(self)[[1]]}}..."
5959
msgstr ""
6060

61-
#: dataset-caltech.R:125 dataset-cifar.R:111 dataset-coco.R:193 dataset-eurosat.R:88 dataset-fer.R:129 dataset-flowers.R:136 dataset-lfw.R:157 dataset-lfw.R:172 dataset-mnist.R:121 dataset-mnist.R:264 dataset-mnist.R:453 dataset-oxfordiiitpet.R:115 dataset-pascal.R:168 dataset-places365.R:182 dataset-plankton.R:95 models-vit.R:49
61+
#: dataset-caltech.R:125 dataset-cifar.R:111 dataset-coco.R:193 dataset-eurosat.R:88 dataset-fer.R:129 dataset-flowers.R:136 dataset-lfw.R:157 dataset-lfw.R:172 dataset-mnist.R:121 dataset-mnist.R:264 dataset-mnist.R:453 dataset-oxfordiiitpet.R:115 dataset-pascal.R:168 dataset-places365.R:182 dataset-plankton.R:105 models-vit.R:49
6262
msgid "Corrupt file! Delete the file in {archive} and try again."
6363
msgstr ""
6464

@@ -158,7 +158,7 @@ msgstr ""
158158
#: dataset-flowers.R:102
159159
msgid ""
160160
"Split {.val {self$split}} of dataset {.cls {class(self)[[1]]}} loaded with "
161-
"{length(self$img_path)} samples."
161+
"{self$.length()} samples."
162162
msgstr ""
163163

164164
#: dataset-flowers.R:126
@@ -198,6 +198,13 @@ msgstr ""
198198
msgid "Downloading {.cls {class(self)[[1]]}} split '{self$split}'..."
199199
msgstr ""
200200

201+
#: dataset-plankton.R:84
202+
msgid ""
203+
"Split {.val {self$split}} of dataset {.cls {class(self)[[1]]}} (~{.emph "
204+
"{self$archive_size}}) will be downloaded and processed if not already "
205+
"available."
206+
msgstr ""
207+
201208
#: extension.R:2
202209
msgid ""
203210
"has_ops() Not implemented yet. https://github.com/pytorch/vision/blob/"

0 commit comments

Comments
 (0)