This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Abhishek P (VMware) <[email protected]>
Converted HFDatasetSplitReader to HFDatasetReader. Now all splits can be used in the same reader. Support for both pre-load of all splits and on-demand load of a split. Reduced tests to the glue-cola dataset:config, which is a ~0.36 MB download. Updated the dataset dependency to the range >=1.5.0 and <1.6.0.
- Loading branch information
1 parent
6e613b9
commit f77cfa3
Showing
2 changed files
with
51 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,23 @@ | ||
import pytest | ||
|
||
from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetSplitReader | ||
from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader | ||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
# TODO these UTs are actually downloading the datasets and will be very very slow | ||
# TODO add UT where we compare huggingface wrapped reader with an explicitly coded builder | ||
# TODO add UT where we compare huggingface wrapped reader with an explicitly coded dataset | ||
class HuggingfaceDatasetSplitReaderTest: | ||
|
||
""" | ||
Running the tests for supported datasets which require config name to be specified | ||
Running the tests for supported datasets which require config name to be specified | ||
""" | ||
@pytest.mark.parametrize("dataset, config, split", (("glue", "cola", "train"), ("glue", "cola", "test"))) | ||
|
||
@pytest.mark.parametrize( | ||
"dataset, config, split", (("glue", "cola", "train"), ("glue", "cola", "test")) | ||
) | ||
def test_read_for_datasets_requiring_config(self, dataset, config, split): | ||
huggingface_reader = HuggingfaceDatasetSplitReader(dataset_name=dataset, config_name=config) | ||
huggingface_reader = HuggingfaceDatasetReader(dataset_name=dataset, config_name=config) | ||
instances = list(huggingface_reader.read(split)) | ||
assert len(instances) == len(huggingface_reader.datasets[split]) | ||
print(instances[0], print(huggingface_reader.datasets[split][0])) |