from datasets import load_dataset
# 加载整个数据集
dataset = load_dataset("/chatgpt_nas/xxxx/Phantom-data-Koala36M")
# 查看一个样本
print(dataset)
print(dataset['train'][0])
Generating train split: 651031 examples [00:07, 87257.26 examples/s]Failed to read file '/chatgpt_nas/xxxx/Phantom-data-Koala36M/koala36M_multi_ref_meta_info_merged.parquet' with error <class 'datasets.table.CastError'>: Couldn't cast
vid: string
width: int32
height: int32
duration: float
fps: float
youtube_url: string
timestamp: string
video_caption: string
aesthetic_score: double
clarity_score: double
motion_score: double
suitability_score: double
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1505
to
{'video_id': Value(dtype='string', id=None), 'video_caption': Value(dtype='string', id=None), 'cross_pair': Value(dtype='string', id=None)}
because column names don't match
Generating train split: 651031 examples [00:11, 56002.79 examples/s]
Traceback (most recent call last):
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/builder.py", line 1855, in _prepare_split_single
for _, table in generator:
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/packaged_modules/parquet/parquet.py", line 106, in _generate_tables
yield f"{file_idx}_{batch_idx}", self._cast_table(pa_table)
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/packaged_modules/parquet/parquet.py", line 73, in _cast_table
pa_table = table_cast(pa_table, self.info.features.arrow_schema)
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/table.py", line 2293, in table_cast
return cast_table_to_schema(table, schema)
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/table.py", line 2241, in cast_table_to_schema
raise CastError(
datasets.table.CastError: Couldn't cast
vid: string
width: int32
height: int32
duration: float
fps: float
youtube_url: string
timestamp: string
video_caption: string
aesthetic_score: double
clarity_score: double
motion_score: double
suitability_score: double
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1505
to
{'video_id': Value(dtype='string', id=None), 'video_caption': Value(dtype='string', id=None), 'cross_pair': Value(dtype='string', id=None)}
because column names don't match
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/chatgpt_nas/xxx/data.py", line 3, in <module>
dataset = load_dataset("/chatgpt_nas/xxxx/Phantom-data-Koala36M")
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/load.py", line 2084, in load_dataset
builder_instance.download_and_prepare(
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/builder.py", line 925, in download_and_prepare
self._download_and_prepare(
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/builder.py", line 1001, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/builder.py", line 1742, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "/root/miniforge3/envs/diffusion-pipe/lib/python3.10/site-packages/datasets/builder.py", line 1898, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
Hi, I download the dataset on huggingface and load use this code
但是出现了下面error
@LeoniusChen Can you review this dataset? Thank you!