Skip to content

Commit 419ca89

Browse files
authored
GH-36628: [Python][Parquet] Fail when instantiating internal Parquet metadata classes (#45549)
### Rationale for this change As described in #36628, we want to raise an exception instead of showing segfaults when users try to instantiate some internal Parquet metadata classes. ### What changes are included in this PR? This PR is a carry-on of #44245. I modified a few things and covered more test cases. This PR includes these changes: * Raise an exception when users try to instantiate `Statistics`, `ParquetLogicalType`, `ColumnChunkMetaData`, `RowGroupMetaData`, `FileMetaData` ### Are these changes tested? Yes, added unit tests ### Are there any user-facing changes? Yes, after this change, users can't instantiate these classes anymore using the regular `__init__` workflow * GitHub Issue: #36628 Authored-by: Tim Nguyen <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent c124bb5 commit 419ca89

File tree

4 files changed

+58
-15
lines changed

4 files changed

+58
-15
lines changed

python/pyarrow/_dataset_parquet.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ cdef class ParquetFileFormat(FileFormat):
164164
metadata = deref(
165165
deref(parquet_file_writer).parquet_writer()).metadata()
166166
if metadata:
167-
parquet_metadata = FileMetaData()
167+
parquet_metadata = FileMetaData.__new__(FileMetaData)
168168
parquet_metadata.init(metadata)
169169
parquet_metadata.set_file_path(os.path.relpath(path, base_dir))
170170

@@ -390,7 +390,7 @@ cdef class ParquetFileFragment(FileFragment):
390390
@property
391391
def metadata(self):
392392
self.ensure_complete_metadata()
393-
cdef FileMetaData metadata = FileMetaData()
393+
cdef FileMetaData metadata = FileMetaData.__new__(FileMetaData)
394394
metadata.init(self.parquet_file_fragment.metadata())
395395
return metadata
396396

python/pyarrow/_parquet.pxd

+9
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,15 @@ cdef class RowGroupMetaData(_Weakrefable):
631631
CRowGroupMetaData* metadata
632632
FileMetaData parent
633633

634+
cdef inline init(self, FileMetaData parent, int index):
635+
if index < 0 or index >= parent.num_row_groups:
636+
raise IndexError('{0} out of bounds'.format(index))
637+
self.up_metadata = parent._metadata.RowGroup(index)
638+
self.metadata = self.up_metadata.get()
639+
self.parent = parent
640+
self.index = index
641+
642+
634643
cdef class ColumnChunkMetaData(_Weakrefable):
635644
cdef:
636645
unique_ptr[CColumnChunkMetaData] up_metadata

python/pyarrow/_parquet.pyx

+27-13
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ _MAX_ROW_GROUP_SIZE = 64*1024*1024
5252
cdef class Statistics(_Weakrefable):
5353
"""Statistics for a single column in a single row group."""
5454

55+
def __init__(self):
56+
raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")
57+
5558
def __cinit__(self):
5659
pass
5760

@@ -217,6 +220,10 @@ cdef class Statistics(_Weakrefable):
217220

218221
cdef class ParquetLogicalType(_Weakrefable):
219222
"""Logical type of parquet type."""
223+
224+
def __init__(self):
225+
raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")
226+
220227
cdef:
221228
shared_ptr[const CParquetLogicalType] type
222229

@@ -253,7 +260,7 @@ cdef class ParquetLogicalType(_Weakrefable):
253260

254261

255262
cdef wrap_logical_type(const shared_ptr[const CParquetLogicalType]& type):
256-
cdef ParquetLogicalType out = ParquetLogicalType()
263+
cdef ParquetLogicalType out = ParquetLogicalType.__new__(ParquetLogicalType)
257264
out.init(type)
258265
return out
259266

@@ -315,6 +322,9 @@ cdef _box_flba(ParquetFLBA val, uint32_t len):
315322
cdef class ColumnChunkMetaData(_Weakrefable):
316323
"""Column metadata for a single row group."""
317324

325+
def __init__(self):
326+
raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")
327+
318328
def __cinit__(self):
319329
pass
320330

@@ -436,7 +446,7 @@ cdef class ColumnChunkMetaData(_Weakrefable):
436446
"""Statistics for column chunk (:class:`Statistics`)."""
437447
if not self.metadata.is_stats_set():
438448
return None
439-
statistics = Statistics()
449+
cdef Statistics statistics = Statistics.__new__(Statistics)
440450
statistics.init(self.metadata.statistics(), self)
441451
return statistics
442452

@@ -739,13 +749,11 @@ cdef class SortingColumn:
739749
cdef class RowGroupMetaData(_Weakrefable):
740750
"""Metadata for a single row group."""
741751

742-
def __cinit__(self, FileMetaData parent, int index):
743-
if index < 0 or index >= parent.num_row_groups:
744-
raise IndexError('{0} out of bounds'.format(index))
745-
self.up_metadata = parent._metadata.RowGroup(index)
746-
self.metadata = self.up_metadata.get()
747-
self.parent = parent
748-
self.index = index
752+
def __init__(self):
753+
raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")
754+
755+
def __cinit__(self):
756+
pass
749757

750758
def __reduce__(self):
751759
return RowGroupMetaData, (self.parent, self.index)
@@ -787,7 +795,7 @@ cdef class RowGroupMetaData(_Weakrefable):
787795
"""
788796
if i < 0 or i >= self.num_columns:
789797
raise IndexError('{0} out of bounds'.format(i))
790-
chunk = ColumnChunkMetaData()
798+
cdef ColumnChunkMetaData chunk = ColumnChunkMetaData.__new__(ColumnChunkMetaData)
791799
chunk.init(self, i)
792800
return chunk
793801

@@ -866,6 +874,9 @@ def _reconstruct_filemetadata(Buffer serialized):
866874
cdef class FileMetaData(_Weakrefable):
867875
"""Parquet metadata for a single file."""
868876

877+
def __init__(self):
878+
raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")
879+
869880
def __cinit__(self):
870881
pass
871882

@@ -1027,7 +1038,9 @@ cdef class FileMetaData(_Weakrefable):
10271038
-------
10281039
row_group_metadata : RowGroupMetaData
10291040
"""
1030-
return RowGroupMetaData(self, i)
1041+
cdef RowGroupMetaData row_group = RowGroupMetaData.__new__(RowGroupMetaData)
1042+
row_group.init(self, i)
1043+
return row_group
10311044

10321045
def set_file_path(self, path):
10331046
"""
@@ -1516,7 +1529,8 @@ cdef class ParquetReader(_Weakrefable):
15161529
# Set up metadata
15171530
with nogil:
15181531
c_metadata = builder.raw_reader().metadata()
1519-
self._metadata = result = FileMetaData()
1532+
cdef FileMetaData result = FileMetaData.__new__(FileMetaData)
1533+
self._metadata = result
15201534
result.init(c_metadata)
15211535

15221536
if read_dictionary is not None:
@@ -2259,7 +2273,7 @@ cdef class ParquetWriter(_Weakrefable):
22592273
with nogil:
22602274
metadata = self.writer.get().metadata()
22612275
if metadata:
2262-
result = FileMetaData()
2276+
result = FileMetaData.__new__(FileMetaData)
22632277
result.init(metadata)
22642278
return result
22652279
raise RuntimeError(

python/pyarrow/tests/parquet/test_metadata.py

+20
Original file line numberDiff line numberDiff line change
@@ -794,3 +794,23 @@ def test_column_chunk_key_value_metadata(parquet_test_datadir):
794794
assert key_value_metadata1 == {b'foo': b'bar', b'thisiskeywithoutvalue': b''}
795795
key_value_metadata2 = metadata.row_group(0).column(1).metadata
796796
assert key_value_metadata2 is None
797+
798+
799+
def test_internal_class_instantiation():
800+
def msg(c):
801+
return f"Do not call {c}'s constructor directly"
802+
803+
with pytest.raises(TypeError, match=msg("Statistics")):
804+
pq.Statistics()
805+
806+
with pytest.raises(TypeError, match=msg("ParquetLogicalType")):
807+
pq.ParquetLogicalType()
808+
809+
with pytest.raises(TypeError, match=msg("ColumnChunkMetaData")):
810+
pq.ColumnChunkMetaData()
811+
812+
with pytest.raises(TypeError, match=msg("RowGroupMetaData")):
813+
pq.RowGroupMetaData()
814+
815+
with pytest.raises(TypeError, match=msg("FileMetaData")):
816+
pq.FileMetaData()

0 commit comments

Comments
 (0)