This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Fix Doc mistake and the dataset availability check
Signed-off-by: Abhishek P (VMware) <[email protected]>
Abhishek-P committed Apr 10, 2021
1 parent edf2681 commit d082e55
Showing 3 changed files with 26 additions and 25 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased

### Added
- Add `HuggingfaceDatasetSplitReader` for using huggingface datasets in AllenNLP with limited support
- Add `HuggingfaceDatasetReader` for using huggingface datasets in AllenNLP with limited support
- Ported the following Huggingface `LambdaLR`-based schedulers: `ConstantLearningRateScheduler`, `ConstantWithWarmupLearningRateScheduler`, `CosineWithWarmupLearningRateScheduler`, `CosineHardRestartsWithWarmupLearningRateScheduler`.

### Changed
39 changes: 20 additions & 19 deletions allennlp/data/dataset_readers/huggingface_datasets_reader.py
@@ -1,9 +1,10 @@
import typing
from typing import Iterable, Optional

from allennlp.data import DatasetReader, Token, Field, Tokenizer
from allennlp.data.fields import TextField, LabelField, ListField
from allennlp.data.instance import Instance
from datasets import load_dataset, DatasetDict, Split
from datasets import load_dataset, DatasetDict, Split, list_datasets
from datasets.features import ClassLabel, Sequence, Translation, TranslationVariableLanguages
from datasets.features import Value

@@ -43,15 +44,15 @@ class HuggingfaceDatasetReader(DatasetReader):
# Parameters
dataset_name : `str`
Name of the dataset from huggingface datasets the reader will be used for
config_name : `str`, optional (default=`None`)
Configuration(mandatory for some datasets) of the dataset
pre_load : `bool`, optional (default='False`)
Name of the dataset from huggingface datasets the reader will be used for.
config_name : `str`, optional (default=`None`)
Configuration (mandatory for some datasets) of the dataset.
preload : `bool`, optional (default=`False`)
If `True`, all splits of the dataset are loaded (including download) as part of initialization,
otherwise each split is loaded on when `read()` is used for the same for the first time
tokenizer : `Tokenizer`, optional (default=`None`)
If specified is used for tokenization of string and text fields from the dataset
This is useful since Text in allennlp is dealt with as a series of tokens.
otherwise each split is loaded the first time `read()` is called for it.
tokenizer : `Tokenizer`, optional (default=`None`)
If specified, it is used to tokenize string and text fields from the dataset.
This is useful since text in AllenNLP is handled as a sequence of tokens.
"""

SUPPORTED_SPLITS = [Split.TRAIN, Split.TEST, Split.VALIDATION]
@@ -60,7 +61,7 @@ def __init__(
self,
dataset_name: str = None,
config_name: Optional[str] = None,
pre_load: Optional[bool] = False,
preload: Optional[bool] = False,
tokenizer: Optional[Tokenizer] = None,
**kwargs,
) -> None:
@@ -71,17 +72,17 @@ def __init__(
)

# It would be cleaner to create a separate reader object for different datasets
if dataset_name not in load_dataset():
raise NotImplementedError(
if dataset_name not in list_datasets():
raise ValueError(
f"Dataset {dataset_name} does not seem to available in huggingface datasets"
)
self.dataset: DatasetDict = DatasetDict()
self.dataset_name = dataset_name
self.config_name = config_name
self.tokenizer = tokenizer

if pre_load:
load_dataset()
if preload:
self.load_dataset()

def load_dataset(self):
if self.config_name is not None:
@@ -152,7 +153,7 @@ def text_to_instance(self, *inputs) -> Instance:
# TODO we need to support all different datasets features described
# in https://huggingface.co/docs/datasets/features.html
for feature in features:
fields_to_be_added = dict[str, Field]()
fields_to_be_added: typing.Dict[str, Field] = dict()
item_field: Field
field_list: list
value = features[feature]
@@ -188,21 +189,21 @@ def text_to_instance(self, *inputs) -> Instance:
# We do not know if the string is a token or text; we will assume text and make each a TextField
# datasets.features.Sequence of strings maps to ListField of TextField
if value.feature.dtype == "string":
field_list = list[TextField]()
field_list2: typing.List[TextField] = list()
for item in inputs[1][feature]:
# If tokenizer is provided we will use it to split it to tokens
# Else put whole text as a single token
tokens: list[Token]
tokens: typing.List[Token]
if self.tokenizer is not None:
tokens = self.tokenizer.tokenize(item)

else:
tokens = [Token(item)]

item_field = TextField(tokens)
field_list.append(item_field)
field_list2.append(item_field)

fields_to_be_added[feature] = ListField(field_list)
fields_to_be_added[feature] = ListField(field_list2)

# datasets Sequence of strings to ListField of LabelField
elif isinstance(value.feature, ClassLabel):
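
For reference, here is a minimal usage sketch of the reader after this change. It is illustrative only and not part of the commit: the `glue`/`cola` names and the split string passed to `read()` mirror the tests below, and `preload` is the renamed constructor argument documented above.

# Illustrative sketch only; not part of this commit.
from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader

# preload=True downloads and loads every supported split at construction time;
# with the default preload=False, each split is loaded lazily on its first read().
reader = HuggingfaceDatasetReader(dataset_name="glue", config_name="cola", preload=True)
instances = list(reader.read("train"))  # split names follow datasets.Split: train/test/validation
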
10 changes: 5 additions & 5 deletions tests/data/dataset_readers/huggingface_datasets_test.py
@@ -15,11 +15,7 @@ class HuggingfaceDatasetReaderTest:

@pytest.mark.parametrize(
"dataset, config, split",
(
("glue", "cola", "train"),
("glue", "cola", "test"),
("universal_dependencies", "en_lines", "validation"),
),
(("glue", "cola", "train"), ("glue", "cola", "test")),
)
def test_read(self, dataset, config, split):
huggingface_reader = HuggingfaceDatasetReader(dataset_name=dataset, config_name=config)
@@ -75,3 +71,7 @@ def test_xnli_all_languages(self):
# datasets.features.TranslationVariableLanguages into two fields each
# For XNLI that means 3 fields become 5
assert len(instance.fields) == 5

def test_non_available_dataset(self):
with pytest.raises(ValueError):
HuggingfaceDatasetReader(dataset_name="surely-such-a-dataset-cannot-exist")
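
The new test pins down the availability check introduced in the reader's constructor; roughly, the behaviour it asserts looks like the following illustrative sketch (not part of the commit):

# The dataset name is intentionally bogus, so it is absent from list_datasets()
# and the constructor now raises ValueError instead of NotImplementedError.
try:
    HuggingfaceDatasetReader(dataset_name="surely-such-a-dataset-cannot-exist")
except ValueError as err:
    print(err)  # "Dataset surely-such-a-dataset-cannot-exist does not seem to be available ..."
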
