Skip to content

modules/zstd: add frame header parser #1168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions dependency_support/com_github_facebook_zstd/bundled.BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ cc_library(
]),
hdrs = [
"lib/zstd.h",
"lib/common/zstd_errors.h",
],
strip_include_prefix = "lib",
local_defines = [
Expand All @@ -48,3 +49,42 @@ cc_library(
],
visibility = ["//visibility:public"],
)

# NOTE: Required because of direct zstd_compress.c include in decodecorpus sources.
# The .c file is published through `hdrs` so that a target listing this library
# in its `deps` is able to #include it; it is not compiled here as a source.
cc_library(
name = "decodecorpus_includes",
hdrs = [
# Included textually by the decodecorpus sources (see note above the rule).
"lib/compress/zstd_compress.c",
],
)

# Builds the decodecorpus binary from tests/decodecorpus.c plus the sources and
# headers found under programs/ (with programs/zstdcli.c excluded).
cc_binary(
name = "decodecorpus",
srcs = [
"tests/decodecorpus.c",
] + glob(
[
"programs/*.c",
"programs/*.h",
],
exclude = [
# NOTE(review): presumably excluded because it provides the zstd CLI's own
# main() — confirm.
"programs/zstdcli.c",
],
),
deps = [
":zstd",
# Makes lib/compress/zstd_compress.c available for inclusion.
":decodecorpus_includes",
],
# Directories added to the include search path for this target.
includes = [
"lib/",
"lib/common/",
"lib/compress/",
"lib/dictBuilder/",
"lib/deprecated/",
"programs/",
],
# Preprocessor define applied only when compiling this target.
local_defines = [
"XXH_NAMESPACE=ZSTD_",
],
visibility = ["//visibility:public"],
)
152 changes: 152 additions & 0 deletions dependency_support/com_github_facebook_zstd/decodecorpus.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
diff --git tests/decodecorpus.c tests/decodecorpus.c
index 50935d31..522b3769 100644
--- tests/decodecorpus.c
+++ tests/decodecorpus.c
@@ -240,6 +240,12 @@ typedef enum {
gt_block, /* generate compressed blocks without block/frame headers */
} genType_e;

+typedef enum {
+ lt_raw,
+ lt_rle,
+ lt_compressed,
+} literalType_e;
+
/*-*******************************************************
* Global variables (set from command line)
*********************************************************/
@@ -252,7 +258,11 @@ U32 g_maxBlockSize = ZSTD_BLOCKSIZE_MAX; /* <= 128 KB */

struct {
int contentSize; /* force the content size to be present */
-} opts; /* advanced options on generation */
+ blockType_e *blockType; /* force specific block type */
+ literalType_e *literalType; /* force specific literals type */
+ int frame_header_only; /* generate only frame header */
+ int no_magic; /* do not generate magic number */
+} opts;

/* Generate and write a random frame header */
static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info)
@@ -317,8 +327,10 @@ static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info)
}

/* write out the header */
- MEM_writeLE32(op + pos, ZSTD_MAGICNUMBER);
- pos += 4;
+ if (!opts.no_magic) {
+ MEM_writeLE32(op + pos, ZSTD_MAGICNUMBER);
+ pos += 4;
+ }

{
/*
@@ -363,8 +375,10 @@ static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info)
/* Write a literal block in either raw or RLE form, return the literals size */
static size_t writeLiteralsBlockSimple(U32* seed, frame_t* frame, size_t contentSize)
{
+ int force_literal_type = opts.literalType != NULL;
+ int const type = (force_literal_type) ? *(opts.literalType) : RAND(seed) % 2;
+
BYTE* op = (BYTE*)frame->data;
- int const type = RAND(seed) % 2;
int const sizeFormatDesc = RAND(seed) % 8;
size_t litSize;
size_t maxLitSize = MIN(contentSize, g_maxBlockSize);
@@ -612,8 +626,15 @@ static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t con

static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize)
{
- /* only do compressed for larger segments to avoid compressibility issues */
- if (RAND(seed) & 7 && contentSize >= 64) {
+ int select_compressed = 0;
+ if (opts.literalType) {
+ select_compressed = *(opts.literalType) == lt_compressed;
+ } else {
+ /* only do compressed for larger segments to avoid compressibility issues */
+ select_compressed = RAND(seed) & 7 && contentSize >= 64;
+ }
+
+ if (select_compressed) {
return writeLiteralsBlockCompressed(seed, frame, contentSize);
} else {
return writeLiteralsBlockSimple(seed, frame, contentSize);
@@ -1030,7 +1051,8 @@ static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize
static void writeBlock(U32* seed, frame_t* frame, size_t contentSize,
int lastBlock, dictInfo info)
{
- int const blockTypeDesc = RAND(seed) % 8;
+ int force_block_type = opts.blockType != NULL;
+ int const blockTypeDesc = (force_block_type) ? *(opts.blockType) : RAND(seed) % 8;
size_t blockSize;
int blockType;

@@ -1069,7 +1091,7 @@ static void writeBlock(U32* seed, frame_t* frame, size_t contentSize,

frame->data = op;
compressedSize = writeCompressedBlock(seed, frame, contentSize, info);
- if (compressedSize >= contentSize) { /* compressed block must be strictly smaller than uncompressed one */
+ if (compressedSize >= contentSize && !force_block_type) { /* compressed block must be strictly smaller than uncompressed one */
blockType = 0;
memcpy(op, frame->src, contentSize);

@@ -1240,7 +1262,11 @@ static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info)
DISPLAYLEVEL(3, "frame seed: %u\n", (unsigned)seed);
initFrame(fr);

+
writeFrameHeader(&seed, fr, info);
+ if (opts.frame_header_only)
+ return seed;
+
writeBlocks(&seed, fr, info);
writeChecksum(fr);

@@ -1768,6 +1794,9 @@ static void advancedUsage(const char* programName)
DISPLAY( " --max-block-size-log=# : max block size log, must be in range [2, 17]\n");
DISPLAY( " --max-content-size-log=# : max content size log, must be <= 20\n");
DISPLAY( " (this is ignored with gen-blocks)\n");
+ DISPLAY( " --block-type=# : force certain block type (raw=0, rle=1, compressed=2)\n");
+ DISPLAY( " --frame-header-only : dump only frame header\n");
+ DISPLAY( " --no-magic : do not add magic number\n");
}

/*! readU32FromChar() :
@@ -1889,6 +1918,18 @@ int main(int argc, char** argv)
U32 value = readU32FromChar(&argument);
g_maxDecompressedSizeLog =
MIN(MAX_DECOMPRESSED_SIZE_LOG, value);
+ } else if (longCommandWArg(&argument, "block-type=")) {
+ U32 value = readU32FromChar(&argument);
+ opts.blockType = malloc(sizeof(blockType_e));
+ *(opts.blockType) = value;
+ } else if (longCommandWArg(&argument, "literal-type=")) {
+ U32 value = readU32FromChar(&argument);
+ opts.literalType = malloc(sizeof(literalType_e));
+ *(opts.literalType) = value;
+ } else if (strcmp(argument, "frame-header-only") == 0) {
+ opts.frame_header_only = 1;
+ } else if (strcmp(argument, "no-magic") == 0) {
+ opts.no_magic = 1;
} else {
advancedUsage(argv[0]);
return 1;
@@ -1900,6 +1941,18 @@ int main(int argc, char** argv)
return 1;
} } } } /* for (argNb=1; argNb<argc; argNb++) */

+ if (opts.blockType) {
+ if ((opts.contentSize == 0) && (*(opts.blockType) == bt_rle)) {
+ DISPLAY("Error: content-size has to be used together with blockType=1 (rle block)\n");
+ return 1;
+ }
+
+ if (opts.literalType && (*(opts.blockType) != bt_compressed)) {
+ DISPLAY("Error: literal-type can be used only with blockType=2 (compressed block)\n");
+ return 1;
+ }
+ }
+
if (!seedset) {
seed = makeSeed();
}
2 changes: 2 additions & 0 deletions dependency_support/load_external.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -297,4 +297,6 @@ def load_external_repositories():
strip_prefix = "zstd-1.4.7",
urls = ["https://github.com/facebook/zstd/releases/download/v1.4.7/zstd-1.4.7.tar.gz"],
build_file = "@//dependency_support/com_github_facebook_zstd:bundled.BUILD.bazel",
# Modify decodecorpus to allow generation of zstd frame headers only
patches = ["@com_google_xls//dependency_support/com_github_facebook_zstd:decodecorpus.patch"],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment about the patch? Has it been submitted upstream?

Copy link
Contributor

@lpawelcz lpawelcz Nov 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the comment.
The changes have not been submitted upstream yet. The patch modifies the decodecorpus utility from the zstd library. The unmodified tool generates valid zstd frames with randomized contents and size. With our modifications it is possible to generate only some parts of a zstd frame. In this case we modify decodecorpus so that it can generate frame headers only.

)
Loading