From ebdf5fa8f59bd534e28c3cafbf1c96121325c43e Mon Sep 17 00:00:00 2001
From: "Abhishek P (VMware)"
Date: Wed, 31 Mar 2021 20:12:55 +0530
Subject: [PATCH] Add HuggingfaceDatasetSplitReader for using Huggingface
 datasets

Added a new reader to allow reading Huggingface datasets as AllenNLP instances
Mapped a limited set of `datasets.features` to `allennlp.data.fields`
Verified for a selection of datasets and dataset configurations
New dependency - "datasets==1.5.0"

Signed-off-by: Abhishek P (VMware)
---
 CHANGELOG.md                                      |   4 +-
 .../huggingface_datasets_reader.py                | 183 ++++++++++++++++++
 setup.py                                          |   1 +
 .../huggingface_datasets_test.py                  |  47 +++++
 4 files changed, 233 insertions(+), 2 deletions(-)
 create mode 100644 allennlp/data/dataset_readers/huggingface_datasets_reader.py
 create mode 100644 tests/data/dataset_readers/huggingface_datasets_test.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 87c5d6f9cb8..46dca6f253c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## Unreleased
 
 ### Added
-
+- Added `HuggingfaceDatasetSplitReader` for using Huggingface datasets in AllenNLP, with limited feature support
 - Ported the following Huggingface `LambdaLR`-based schedulers: `ConstantLearningRateScheduler`, `ConstantWithWarmupLearningRateScheduler`, `CosineWithWarmupLearningRateScheduler`, `CosineHardRestartsWithWarmupLearningRateScheduler`.
 
 ### Changed
@@ -264,7 +264,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added sampler class and parameter in beam search for non-deterministic search, with several implementations, including `MultinomialSampler`, `TopKSampler`, `TopPSampler`, and `GumbelSampler`. Utilizing `GumbelSampler` will give [Stochastic Beam Search](https://api.semanticscholar.org/CorpusID:76662039).
-
+
 ### Changed
 
 - Pass batch metrics to `BatchCallback`.
diff --git a/allennlp/data/dataset_readers/huggingface_datasets_reader.py b/allennlp/data/dataset_readers/huggingface_datasets_reader.py
new file mode 100644
index 00000000000..088c7507aac
--- /dev/null
+++ b/allennlp/data/dataset_readers/huggingface_datasets_reader.py
@@ -0,0 +1,183 @@
+from typing import Iterable, Optional
+
+from allennlp.data import DatasetReader, Token
+from allennlp.data.fields import TextField, LabelField, ListField
+from allennlp.data.instance import Instance
+from datasets import load_dataset
+from datasets.features import ClassLabel, Sequence, Translation, TranslationVariableLanguages
+from datasets.features import Value
+
+
+class HuggingfaceDatasetSplitReader(DatasetReader):
+    """
+    This reader implementation wraps the huggingface datasets package
+    to utilize its dataset management functionality and load the information in AllenNLP-friendly formats.
+    Note: the reader works with only one split of the dataset,
+    i.e. you need to create a separate reader for each split.
+
+    The following datasets and configurations have been verified and work with this reader:
+
+    Dataset                    Dataset Configuration
+    `xnli`                     `ar`
+    `xnli`                     `en`
+    `xnli`                     `de`
+    `xnli`                     `all_languages`
+    `glue`                     `cola`
+    `glue`                     `mrpc`
+    `glue`                     `sst2`
+    `glue`                     `qqp`
+    `glue`                     `mnli`
+    `glue`                     `mnli_matched`
+    `universal_dependencies`   `en_lines`
+    `universal_dependencies`   `ko_kaist`
+    `universal_dependencies`   `af_afribooms`
+    `afrikaans_ner_corpus`     `NA`
+    `swahili`                  `NA`
+    `conll2003`                `NA`
+    `dbpedia_14`               `NA`
+    `trec`                     `NA`
+    `emotion`                  `NA`
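+
+    A minimal usage sketch, using the verified `xnli`/`en` pair from the table above
+    (constructing the reader loads the split via `load_dataset`, and since `_read`
+    ignores its file path argument, `None` can be passed to `read`):
+
+        reader = HuggingfaceDatasetSplitReader(
+            dataset_name="xnli", config_name="en", split="train"
+        )
+        instances = list(reader.read(None))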
+    """
+
+    def __init__(
+        self,
+        max_instances: Optional[int] = None,
+        manual_distributed_sharding: bool = False,
+        manual_multiprocess_sharding: bool = False,
+        serialization_dir: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        split: str = "train",
+        config_name: Optional[str] = None,
+    ) -> None:
+        super().__init__(
+            max_instances,
+            manual_distributed_sharding,
+            manual_multiprocess_sharding,
+            serialization_dir,
+        )
+
+        # It would be cleaner to create a separate reader object for each different dataset
+        self.dataset = None
+        self.dataset_name = dataset_name
+        self.config_name = config_name
+        self.index = -1
+
+        if config_name:
+            self.dataset = load_dataset(self.dataset_name, self.config_name, split=split)
+        else:
+            self.dataset = load_dataset(self.dataset_name, split=split)
+
+    def _read(self, file_path) -> Iterable[Instance]:
+        """
+        Reads the dataset and converts each entry to an AllenNLP-friendly Instance.
+        """
+        for entry in self.dataset:
+            yield self.text_to_instance(entry)
+
+    def text_to_instance(self, *inputs) -> Instance:
+        """
+        Takes care of converting a dataset entry into an AllenNLP-friendly Instance.
+        Currently it is implemented in an ad hoc, catch-up manner:
+        only the datasets.features required by the supported datasets are converted.
+        Ideally we would cleanly deliberate and decide how each dataset.feature maps to an
+        allennlp.data.field and then go ahead with converting it.
+        Doing that would provide the best chance of covering the largest possible number of datasets.
+
+        Currently this is how datasets.features types are mapped to AllenNLP Fields:
+
+        dataset.feature type            allennlp.data.fields
+        `ClassLabel`                    `LabelField` in feature name namespace
+        `Value.string`                  `TextField` with value as Token
+        `Value.*`                       `LabelField` with value being label in feature name namespace
+        `Sequence.string`               `ListField` of `TextField` with individual string as token
+        `Sequence.ClassLabel`           `ListField` of `LabelField` in feature name namespace
+        `Translation`                   `ListField` of 2 ListField (LabelField and TextField)
+        `TranslationVariableLanguages`  `ListField` of 2 ListField (LabelField and TextField)
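+
+        For example (an assumed illustration shaped like a `glue`/`cola` entry), an entry such as
+        `{"sentence": "the cat sat", "label": 1, "idx": 0}` would roughly become a `TextField`
+        for "sentence" and a `LabelField` each for "label" and "idx", with every label placed in
+        the namespace of its feature name.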
+        """
+
+        # features indicate the different information available in each entry from the dataset
+        # feature types decide what kind of information they hold
+        # e.g. in a sentiment dataset an entry could have one feature (of type text/string) holding the text
+        # and another indicating the sentiment (of type int32/ClassLabel)
+        features = self.dataset.features
+        fields = dict()
+
+        # TODO we need to support all different datasets features described
+        # in https://huggingface.co/docs/datasets/features.html
+        for feature in features:
+            value = features[feature]
+
+            # datasets ClassLabel maps to LabelField
+            if isinstance(value, ClassLabel):
+                field = LabelField(inputs[0][feature], label_namespace=feature, skip_indexing=True)
+
+            # datasets Value can be of different types
+            elif isinstance(value, Value):
+
+                # String value maps to TextField
+                if value.dtype == "string":
+                    # Since TextField has to be made of Tokens, add the whole text as a single token
+                    # TODO Should we use simple heuristics to identify what is a token and what is not?
+                    field = TextField([Token(inputs[0][feature])])
+
+                else:
+                    field = LabelField(
+                        inputs[0][feature], label_namespace=feature, skip_indexing=True
+                    )
+
+            elif isinstance(value, Sequence):
+                # datasets Sequence of strings to ListField of TextField
+                if value.feature.dtype == "string":
+                    field_list = list()
+                    for item in inputs[0][feature]:
+                        item_field = TextField([Token(item)])
+                        field_list.append(item_field)
+                    if len(field_list) == 0:
+                        continue
+                    field = ListField(field_list)
+
+                # datasets Sequence of ClassLabel to ListField of LabelField
+                elif isinstance(value.feature, ClassLabel):
+                    field_list = list()
+                    for item in inputs[0][feature]:
+                        item_field = LabelField(
+                            label=item, label_namespace=feature, skip_indexing=True
+                        )
+                        field_list.append(item_field)
+                    if len(field_list) == 0:
+                        continue
+                    field = ListField(field_list)
+
+            # datasets.Translation cannot be mapped directly
+            # but its dict structure can be mapped to a ListField of 2 ListField
+            elif isinstance(value, Translation):
+                if value.dtype == "dict":
+                    input_dict = inputs[0][feature]
+                    langs = list(input_dict.keys())
+                    field_langs = [LabelField(lang, label_namespace="languages") for lang in langs]
+                    langs_field = ListField(field_langs)
+                    texts = list()
+                    for lang in langs:
+                        texts.append(TextField([Token(input_dict[lang])]))
+                    field = ListField([langs_field, ListField(texts)])
+
+            # datasets.TranslationVariableLanguages
+            # is functionally a pair of Lists and hence mapped to a ListField of 2 ListField
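+            # As an illustration (assumed shape, per the datasets feature documentation), such an
+            # entry may look like {"language": ["en", "fr"], "translation": ["the cat", "le chat"]},
+            # i.e. two parallel lists aligned by index, which is what the index lookup below relies on.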
+            elif isinstance(value, TranslationVariableLanguages):
+                if value.dtype == "dict":
+                    input_dict = inputs[0][feature]
+                    langs = input_dict["language"]
+                    field_langs = [LabelField(lang, label_namespace="languages") for lang in langs]
+                    langs_field = ListField(field_langs)
+                    texts = list()
+                    for lang in langs:
+                        index = langs.index(lang)
+                        texts.append(TextField([Token(input_dict["translation"][index])]))
+                    field = ListField([langs_field, ListField(texts)])
+
+            else:
+                raise ValueError(f"Datasets feature type {type(value)} is not supported yet.")
+
+            fields[feature] = field
+
+        return Instance(fields)
diff --git a/setup.py b/setup.py
index 40655093c83..8ca267749d6 100644
--- a/setup.py
+++ b/setup.py
@@ -73,6 +73,7 @@
         "lmdb",
         "more-itertools",
         "wandb>=0.10.0,<0.11.0",
+        "datasets==1.5.0",
     ],
     entry_points={"console_scripts": ["allennlp=allennlp.__main__:run"]},
     include_package_data=True,
diff --git a/tests/data/dataset_readers/huggingface_datasets_test.py b/tests/data/dataset_readers/huggingface_datasets_test.py
new file mode 100644
index 00000000000..2a9be7ea435
--- /dev/null
+++ b/tests/data/dataset_readers/huggingface_datasets_test.py
@@ -0,0 +1,47 @@
+import pytest
+
+from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetSplitReader
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+# TODO: these UTs actually download the datasets and will be very slow
+# TODO: add a UT where we compare the huggingface-wrapped reader with an explicitly coded builder
+class HuggingfaceDatasetSplitReaderTest:
+
+    SUPPORTED_DATASETS_WITHOUT_CONFIG = [
+        "afrikaans_ner_corpus",
+        "dbpedia_14",
+        "trec",
+        "swahili",
+        "conll2003",
+        "emotion",
+    ]
+
+    """
+    Running the tests for supported datasets which do not require a config name to be specified
+    """
+
+    @pytest.mark.parametrize("dataset", SUPPORTED_DATASETS_WITHOUT_CONFIG)
+    def test_read_for_datasets_without_config(self, dataset):
+        huggingface_reader = HuggingfaceDatasetSplitReader(dataset_name=dataset)
+        instances = list(huggingface_reader.read(None))
+        assert len(instances) == len(huggingface_reader.dataset)
+
+    # Not testing for all configurations, only some
+    SUPPORTED_DATASET_CONFIGURATION = (
+        ("glue", "cola"),
+        ("universal_dependencies", "af_afribooms"),
+        ("xnli", "all_languages"),
+    )
+
+    """
+    Running the tests for supported datasets which require a config name to be specified
+    """
+
+    @pytest.mark.parametrize("dataset, config", SUPPORTED_DATASET_CONFIGURATION)
+    def test_read_for_datasets_requiring_config(self, dataset, config):
+        huggingface_reader = HuggingfaceDatasetSplitReader(dataset_name=dataset, config_name=config)
+        instances = list(huggingface_reader.read(None))
+        assert len(instances) == len(huggingface_reader.dataset)