
Commit

Add HuggingfaceDatasetReader for using Huggingface datasets
Introduced new dependency - "datasets>=1.5.0,<1.6.0"
Added a new reader to allow reading huggingface datasets as instances
Mapped a limited set of `datasets.features` to `allennlp.data.fields`
Added tests for the same
Verified against selected datasets and/or dataset configurations
Added `test-with-cov-html` target to provide a contributor-friendly HTML coverage report

Signed-off-by: Abhishek P (VMware) <[email protected]>
Abhishek-P committed Apr 15, 2021
1 parent 0c7d60b commit 9007258
Showing 6 changed files with 472 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -45,6 +45,7 @@ __pycache__
.coverage
.pytest_cache/
.benchmarks
htmlcov/

# documentation build artifacts

5 changes: 3 additions & 2 deletions CHANGELOG.md
@@ -8,14 +8,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased

### Added

- Added `HuggingfaceDatasetReader` for using huggingface datasets in AllenNLP, with known support for a limited set of datasets.
- The test for distributed metrics now takes a parameter specifying how often you want to run it.


## [v2.3.0](https://github.com/allenai/allennlp/releases/tag/v2.3.0) - 2021-04-14

### Added

- Ported the following Huggingface `LambdaLR`-based schedulers: `ConstantLearningRateScheduler`, `ConstantWithWarmupLearningRateScheduler`, `CosineWithWarmupLearningRateScheduler`, `CosineHardRestartsWithWarmupLearningRateScheduler`.
- Added new `sub_token_mode` parameter to `pretrained_transformer_mismatched_embedder` class to support first sub-token embedding
- Added a way to run a multi task model with a dataset reader as part of `allennlp predict`.
7 changes: 7 additions & 0 deletions Makefile
@@ -66,6 +66,13 @@ test-with-cov :
		--cov=$(SRC) \
		--cov-report=xml

.PHONY : test-with-cov-html
test-with-cov-html :
	pytest --color=yes -rf --durations=40 \
		--cov-config=.coveragerc \
		--cov=$(SRC) \
		--cov-report=html
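# With pytest-cov, --cov-report=html writes the report to htmlcov/ by default
# (ignored via the .gitignore change above); open htmlcov/index.html in a browser to view it.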

.PHONY : gpu-test
gpu-test : check-for-cuda
	pytest --color=yes -v -rf -m gpu
276 changes: 276 additions & 0 deletions allennlp/data/dataset_readers/huggingface_datasets_reader.py
@@ -0,0 +1,276 @@
import typing
from typing import Iterable, Optional

from allennlp.data import DatasetReader, Token, Field, Tokenizer
from allennlp.data.fields import TextField, LabelField, ListField
from allennlp.data.instance import Instance
from datasets import load_dataset, DatasetDict, Split, list_datasets
from datasets.features import ClassLabel, Sequence, Translation, TranslationVariableLanguages
from datasets.features import Value


@DatasetReader.register("huggingface-datasets")
class HuggingfaceDatasetReader(DatasetReader):
"""
Reads instances from the given huggingface supported dataset
This reader implementation wraps the huggingface datasets package
Following dataset and configurations have been verified and work with this reader
Dataset Dataset Configuration
`xnli` `ar`
`xnli` `en`
`xnli` `de`
`xnli` `all_languages`
`glue` `cola`
`glue` `mrpc`
`glue` `sst2`
`glue` `qqp`
`glue` `mnli`
`glue` `mnli_matched`
`universal_dependencies` `en_lines`
`universal_dependencies` `ko_kaist`
`universal_dependencies` `af_afribooms`
`swahili` `NA`
`conll2003` `NA`
`dbpedia_14` `NA`
`trec` `NA`
`emotion` `NA`
Note: universal_dependencies will require you to install `conllu` package separately
Registered as a `DatasetReader` with name `huggingface-datasets`
# Parameters
dataset_name : `str`
Name of the dataset from huggingface datasets the reader will be used for.
config_name : `str`, optional (default=`None`)
Configuration(mandatory for some datasets) of the dataset.
preload : `bool`, optional (default=`False`)
If `True` all splits for the dataset is loaded(includes download etc) as part of the initialization,
otherwise each split is loaded on when `read()` is used for the same for the first time.
tokenizer : `Tokenizer`, optional (default=`None`)
If specified is used for tokenization of string and text fields from the dataset.
This is useful since text in allennlp is dealt with as a series of tokens.
"""

    SUPPORTED_SPLITS = [Split.TRAIN, Split.TEST, Split.VALIDATION]

    def __init__(
        self,
        dataset_name: str = None,
        config_name: Optional[str] = None,
        preload: Optional[bool] = False,
        tokenizer: Optional[Tokenizer] = None,
        **kwargs,
    ) -> None:
        super().__init__(
            manual_distributed_sharding=True,
            manual_multiprocess_sharding=True,
            **kwargs,
        )

        # It would be cleaner to create a separate reader object for each different dataset
        if dataset_name not in list_datasets():
            raise ValueError(f"Dataset {dataset_name} not available in huggingface datasets")
        self.dataset: DatasetDict = DatasetDict()
        self.dataset_name = dataset_name
        self.config_name = config_name
        self.tokenizer = tokenizer

        if preload:
            self.load_dataset()

    def load_dataset(self):
        if self.config_name is not None:
            self.dataset = load_dataset(self.dataset_name, self.config_name)
        else:
            self.dataset = load_dataset(self.dataset_name)

    def load_dataset_split(self, split: str):
        # TODO add support for datasets.split.NamedSplit
        if split in self.SUPPORTED_SPLITS:
            if self.config_name is not None:
                self.dataset[split] = load_dataset(self.dataset_name, self.config_name, split=split)
            else:
                self.dataset[split] = load_dataset(self.dataset_name, split=split)
        else:
            raise ValueError(
                f"Only default splits:{self.SUPPORTED_SPLITS} are currently supported."
            )

    def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Reads the dataset and converts each entry into an AllenNLP friendly instance.
        """
        if file_path is None:
            raise ValueError("parameter split cannot be None")

        # If the split is not loaded yet, load that specific split
        if file_path not in self.dataset:
            self.load_dataset_split(file_path)

        # TODO see if use of Dataset.select() is better
        for entry in self.shard_iterable(self.dataset[file_path]):
            yield self.text_to_instance(file_path, entry)

    def raise_feature_not_supported_value_error(self, value):
        raise ValueError(f"Datasets feature type {type(value)} is not supported yet.")

    def text_to_instance(self, *inputs) -> Instance:
        """
        Converts a dataset entry into an AllenNLP friendly instance.

        Currently this is implemented in an ad hoc, catch-up fashion: it only converts the
        `datasets.features` types that are required for the supported datasets. Ideally we would
        deliberately decide how each `datasets` feature maps to an `allennlp.data.fields` field
        and then convert accordingly; that would give the best chance of covering the largest
        possible number of datasets.

        Currently `datasets.features` types are mapped to AllenNLP fields as follows:

        dataset.feature type            allennlp.data.fields
        `ClassLabel`                    `LabelField` in the feature-name namespace
        `Value.string`                  `TextField` with the value as a single `Token`
        `Value.*`                       `LabelField` with the value as the label, in the feature-name namespace
        `Sequence.string`               `ListField` of `TextField`, one per string
        `Sequence.ClassLabel`           `ListField` of `LabelField` in the feature-name namespace
        `Translation`                   `ListField` of 2 `ListField`s (`LabelField`s and `TextField`s)
        `TranslationVariableLanguages`  `ListField` of 2 `ListField`s (`LabelField`s and `TextField`s)
        """

        # Features indicate the different kinds of information available in each dataset entry;
        # feature types say what type that information is.
        # e.g. in a sentiment dataset an entry could have one feature (of type text/string) holding
        # the text and another (of type int32/ClassLabel) holding the sentiment.

        split = inputs[0]
        features = self.dataset[split].features
        fields = dict()

        # TODO we need to support all the different datasets features described
        # in https://huggingface.co/docs/datasets/features.html
        for feature in features:
            fields_to_be_added: typing.Dict[str, Field] = dict()
            item_field: Field
            field_list: list
            value = features[feature]

            # datasets ClassLabel maps to LabelField
            if isinstance(value, ClassLabel):
                fields_to_be_added[feature] = LabelField(
                    inputs[1][feature], label_namespace=feature, skip_indexing=True
                )

            # datasets Value can be of different types
            elif isinstance(value, Value):

                # String value maps to TextField
                if value.dtype == "string":
                    # datasets.Value[string] maps to TextField
                    # If a tokenizer is provided, use it to split the text into tokens;
                    # else put the whole text in as a single token
                    if self.tokenizer is not None:
                        fields_to_be_added[feature] = TextField(
                            self.tokenizer.tokenize(inputs[1][feature])
                        )

                    else:
                        fields_to_be_added[feature] = TextField([Token(inputs[1][feature])])

                else:
                    fields_to_be_added[feature] = LabelField(
                        inputs[1][feature], label_namespace=feature, skip_indexing=True
                    )

            elif isinstance(value, Sequence):
                # We do not know whether each string is a token or a full text; assume text
                # and make each item a TextField
                # datasets.features.Sequence of strings maps to ListField of TextField
                if hasattr(value.feature, "dtype") and value.feature.dtype == "string":
                    field_list2: typing.List[TextField] = list()
                    for item in inputs[1][feature]:
                        # If a tokenizer is provided, use it to split the text into tokens;
                        # else put the whole text in as a single token
                        tokens: typing.List[Token]
                        if self.tokenizer is not None:
                            tokens = self.tokenizer.tokenize(item)

                        else:
                            tokens = [Token(item)]

                        item_field = TextField(tokens)
                        field_list2.append(item_field)

                    fields_to_be_added[feature] = ListField(field_list2)

                # datasets Sequence of ClassLabel maps to ListField of LabelField
                elif isinstance(value.feature, ClassLabel):
                    field_list = list()
                    for item in inputs[1][feature]:
                        item_field = LabelField(
                            label=item, label_namespace=feature, skip_indexing=True
                        )
                        field_list.append(item_field)

                    fields_to_be_added[feature] = ListField(field_list)

                else:
                    self.raise_feature_not_supported_value_error(value)

            # datasets.Translation cannot be mapped directly,
            # but its dict structure can be mapped to a ListField of 2 ListFields
            elif isinstance(value, Translation):
                if value.dtype == "dict":
                    input_dict = inputs[1][feature]
                    langs = list(input_dict.keys())
                    texts = list()
                    for lang in langs:
                        if self.tokenizer is not None:
                            tokens = self.tokenizer.tokenize(input_dict[lang])

                        else:
                            tokens = [Token(input_dict[lang])]
                        texts.append(TextField(tokens))

                    fields_to_be_added[feature + "-languages"] = ListField(
                        [LabelField(lang, label_namespace="languages") for lang in langs]
                    )
                    fields_to_be_added[feature + "-texts"] = ListField(texts)

                else:
                    raise ValueError(f"Datasets feature type {type(value)} is not supported yet.")

            # datasets.TranslationVariableLanguages
            # is functionally a pair of lists and hence mapped to a ListField of 2 ListFields
            elif isinstance(value, TranslationVariableLanguages):
                if value.dtype == "dict":
                    input_dict = inputs[1][feature]
                    fields_to_be_added[feature + "-language"] = ListField(
                        [
                            LabelField(lang, label_namespace=feature + "-language")
                            for lang in input_dict["language"]
                        ]
                    )

                    if self.tokenizer is not None:
                        fields_to_be_added[feature + "-translation"] = ListField(
                            [
                                TextField(self.tokenizer.tokenize(text))
                                for text in input_dict["translation"]
                            ]
                        )
                    else:
                        fields_to_be_added[feature + "-translation"] = ListField(
                            [TextField([Token(text)]) for text in input_dict["translation"]]
                        )

                else:
                    raise ValueError(f"Datasets feature type {type(value)} is not supported yet.")

            else:
                raise ValueError(f"Datasets feature type {type(value)} is not supported yet.")

            for field_key in fields_to_be_added:
                fields[field_key] = fields_to_be_added[field_key]

        return Instance(fields)
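
For orientation, a minimal usage sketch of the new reader, assuming the new `datasets` dependency is installed; `glue`/`cola` is one of the verified configurations, and `read()` takes the split name rather than a file path:

from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader
from allennlp.data.tokenizers import WhitespaceTokenizer

# Construct the reader for a verified dataset/configuration; with preload=False (the default),
# each split is downloaded and loaded the first time read() asks for it.
reader = HuggingfaceDatasetReader(
    dataset_name="glue",
    config_name="cola",
    tokenizer=WhitespaceTokenizer(),
)

# read() takes the split name ("train", "validation", or "test") in place of a file path.
for instance in reader.read("validation"):
    print(instance)  # e.g. a TextField for "sentence" and LabelFields for "label" and "idx"
    break

Because the reader is registered as `huggingface-datasets`, the same setup can also be referenced by that type name from an AllenNLP configuration file.
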
1 change: 1 addition & 0 deletions setup.py
@@ -72,6 +72,7 @@
"lmdb",
"more-itertools",
"wandb>=0.10.0,<0.11.0",
"datasets>=1.5.0,<1.6.0",
],
entry_points={"console_scripts": ["allennlp=allennlp.__main__:run"]},
include_package_data=True,