Skip to content

Commit 8eca76d

Browse files
authored
Move ParquetMetadata decoder state machine into ParquetMetadataPushDecoder (#8340)
# Which issue does this PR close? - Part of #8000 - Follow-on to #8080 - Closes #8439 # Rationale for this change The current ParquetMetadataDecoder intermixes three things: 1. The state machine for decoding parquet metadata (footer, then metadata, then (optional) indexes) 2. Orchestrating IO (i.e. calling read, etc.) 3. Decoding thrift-encoded bytes into objects This makes it almost impossible to add features like "only decode a subset of the columns in the ColumnIndex" and other potentially advanced use cases. Now that we have a "push" style API for metadata decoding that avoids IO, the next step is to extract the actual work into this API so that the existing ParquetMetadataDecoder just calls into the push decoder. # What changes are included in this PR? 1. Extract the decoding state machine into ParquetMetadataPushDecoder 2. Extract thrift parsing into its own `parser` module 3. Update ParquetMetadataDecoder to use the ParquetMetadataPushDecoder 4. Extract the bytes --> object code into its own module This will almost certainly conflict with @etseidl's plans in thrift-remodel. # Are these changes tested? Yes, by existing tests. # Are there any user-facing changes? Not really -- this is an internal change that will make it easier to add features like "only decode a subset of the columns in the ColumnIndex", for example.
1 parent 07ae1dd commit 8eca76d

File tree

4 files changed

+472
-175
lines changed

4 files changed

+472
-175
lines changed

parquet/src/file/metadata/parser.rs

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,86 @@ use crate::encryption::{
4343
#[cfg(feature = "encryption")]
4444
use crate::format::EncryptionAlgorithm;
4545

46+
/// Helper struct for metadata parsing
47+
///
48+
/// This structure parses thrift-encoded bytes into the correct Rust structs,
49+
/// such as [`ParquetMetaData`], handling decryption if necessary.
50+
//
51+
// Note this structure is used to minimize the number of
52+
// places that need `#[cfg(feature = "encryption")]` checks.
53+
pub(crate) use inner::MetadataParser;
54+
55+
#[cfg(feature = "encryption")]
56+
mod inner {
57+
use super::*;
58+
use crate::encryption::decrypt::FileDecryptionProperties;
59+
use crate::errors::Result;
60+
61+
/// API for decoding metadata that may be encrypted
62+
#[derive(Debug, Default)]
63+
pub(crate) struct MetadataParser {
64+
// the credentials and keys needed to decrypt metadata
65+
file_decryption_properties: Option<Arc<FileDecryptionProperties>>,
66+
}
67+
68+
impl MetadataParser {
69+
pub(crate) fn new() -> Self {
70+
MetadataParser::default()
71+
}
72+
73+
pub(crate) fn with_file_decryption_properties(
74+
mut self,
75+
file_decryption_properties: Option<Arc<FileDecryptionProperties>>,
76+
) -> Self {
77+
self.file_decryption_properties = file_decryption_properties;
78+
self
79+
}
80+
81+
pub(crate) fn decode_metadata(
82+
&self,
83+
buf: &[u8],
84+
encrypted_footer: bool,
85+
) -> Result<ParquetMetaData> {
86+
decode_metadata_with_encryption(
87+
buf,
88+
encrypted_footer,
89+
self.file_decryption_properties.as_deref(),
90+
)
91+
}
92+
}
93+
}
94+
95+
#[cfg(not(feature = "encryption"))]
96+
mod inner {
97+
use super::*;
98+
use crate::errors::Result;
99+
/// Parallel implementation used when the `encryption` feature is not enabled
100+
///
101+
/// This has the same API as the encryption-enabled version
102+
#[derive(Debug, Default)]
103+
pub(crate) struct MetadataParser;
104+
105+
impl MetadataParser {
106+
pub(crate) fn new() -> Self {
107+
MetadataParser
108+
}
109+
110+
pub(crate) fn decode_metadata(
111+
&self,
112+
buf: &[u8],
113+
encrypted_footer: bool,
114+
) -> Result<ParquetMetaData> {
115+
if encrypted_footer {
116+
Err(general_err!(
117+
"Parquet file has an encrypted footer but the encryption feature is disabled"
118+
))
119+
} else {
120+
decode_metadata(buf)
121+
}
122+
}
123+
}
124+
}
125+
46126
/// Decodes [`ParquetMetaData`] from the provided bytes.
47127
///
48128
/// Typically this is used to decode the metadata from the end of a parquet
@@ -79,7 +159,7 @@ pub(crate) fn decode_metadata(buf: &[u8]) -> crate::errors::Result<ParquetMetaDa
79159

80160
/// Parses column orders from Thrift definition.
81161
/// If no column orders are defined, returns `None`.
82-
pub(crate) fn parse_column_orders(
162+
fn parse_column_orders(
83163
t_column_orders: Option<Vec<crate::format::ColumnOrder>>,
84164
schema_descr: &SchemaDescriptor,
85165
) -> crate::errors::Result<Option<Vec<ColumnOrder>>> {
@@ -288,7 +368,7 @@ fn parse_single_offset_index(
288368
/// [Parquet Spec]: https://github.com/apache/parquet-format#metadata
289369
/// [Parquet Encryption Spec]: https://parquet.apache.org/docs/file-format/data-pages/encryption/
290370
#[cfg(feature = "encryption")]
291-
pub(crate) fn decode_metadata_with_encryption(
371+
fn decode_metadata_with_encryption(
292372
buf: &[u8],
293373
encrypted_footer: bool,
294374
file_decryption_properties: Option<&FileDecryptionProperties>,

0 commit comments

Comments
 (0)