Skip to content

Commit 2b79e09

Browse files
committed
Improved decompression speed
1 parent 4cfe92f commit 2b79e09

File tree

1 file changed

+138
-20
lines changed

1 file changed

+138
-20
lines changed

src/ZstdSharp/Unsafe/ZstdDecompressBlock.cs

Lines changed: 138 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -917,18 +917,21 @@ private static nuint ZSTD_execSequenceEndSplitLitBuffer(byte* op, byte* oend, by
917917
[MethodImpl(MethodImplOptions.AggressiveInlining)]
918918
private static nuint ZSTD_execSequence(byte* op, byte* oend, seq_t sequence, byte** litPtr, byte* litLimit, byte* prefixStart, byte* virtualStart, byte* dictEnd)
919919
{
920-
byte* oLitEnd = op + sequence.litLength;
921-
nuint sequenceLength = sequence.litLength + sequence.matchLength;
920+
var sequence_litLength = sequence.litLength;
921+
var sequence_matchLength = sequence.matchLength;
922+
var sequence_offset = sequence.offset;
923+
byte* oLitEnd = op + sequence_litLength;
924+
nuint sequenceLength = sequence_litLength + sequence_matchLength;
922925
/* risk : address space overflow (32-bits) */
923926
byte* oMatchEnd = op + sequenceLength;
924927
/* risk : address space underflow on oend=NULL */
925928
byte* oend_w = oend - 32;
926-
byte* iLitEnd = *litPtr + sequence.litLength;
927-
byte* match = oLitEnd - sequence.offset;
929+
byte* iLitEnd = *litPtr + sequence_litLength;
930+
byte* match = oLitEnd - sequence_offset;
928931
assert(op != null);
929932
assert(oend_w < oend);
930933
if (iLitEnd > litLimit || oMatchEnd > oend_w || MEM_32bits && (nuint)(oend - op) < sequenceLength + 32)
931-
return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
934+
return ZSTD_execSequenceEnd(op, oend, new seq_t { litLength = sequence_litLength, matchLength = sequence_matchLength, offset = sequence_offset }, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
932935
assert(op <= oLitEnd);
933936
assert(oLitEnd < oMatchEnd);
934937
assert(oMatchEnd <= oend);
@@ -937,52 +940,52 @@ private static nuint ZSTD_execSequence(byte* op, byte* oend, seq_t sequence, byt
937940
assert(oMatchEnd <= oend_w);
938941
assert(32 >= 16);
939942
ZSTD_copy16(op, *litPtr);
940-
if (sequence.litLength > 16)
943+
if (sequence_litLength > 16)
941944
{
942-
ZSTD_wildcopy(op + 16, *litPtr + 16, (nint)(sequence.litLength - 16), ZSTD_overlap_e.ZSTD_no_overlap);
945+
ZSTD_wildcopy(op + 16, *litPtr + 16, (nint)(sequence_litLength - 16), ZSTD_overlap_e.ZSTD_no_overlap);
943946
}
944947

945948
op = oLitEnd;
946949
*litPtr = iLitEnd;
947-
if (sequence.offset > (nuint)(oLitEnd - prefixStart))
950+
if (sequence_offset > (nuint)(oLitEnd - prefixStart))
948951
{
949-
if (sequence.offset > (nuint)(oLitEnd - virtualStart))
952+
if (sequence_offset > (nuint)(oLitEnd - virtualStart))
950953
{
951954
return unchecked((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_corruption_detected));
952955
}
953956

954957
match = dictEnd + (match - prefixStart);
955-
if (match + sequence.matchLength <= dictEnd)
958+
if (match + sequence_matchLength <= dictEnd)
956959
{
957-
memmove(oLitEnd, match, sequence.matchLength);
960+
memmove(oLitEnd, match, sequence_matchLength);
958961
return sequenceLength;
959962
}
960963

961964
{
962965
nuint length1 = (nuint)(dictEnd - match);
963966
memmove(oLitEnd, match, length1);
964967
op = oLitEnd + length1;
965-
sequence.matchLength -= length1;
968+
sequence_matchLength -= length1;
966969
match = prefixStart;
967970
}
968971
}
969972

970973
assert(op <= oMatchEnd);
971974
assert(oMatchEnd <= oend_w);
972975
assert(match >= prefixStart);
973-
assert(sequence.matchLength >= 1);
974-
if (sequence.offset >= 16)
976+
assert(sequence_matchLength >= 1);
977+
if (sequence_offset >= 16)
975978
{
976-
ZSTD_wildcopy(op, match, (nint)sequence.matchLength, ZSTD_overlap_e.ZSTD_no_overlap);
979+
ZSTD_wildcopy(op, match, (nint)sequence_matchLength, ZSTD_overlap_e.ZSTD_no_overlap);
977980
return sequenceLength;
978981
}
979982

980-
assert(sequence.offset < 16);
981-
ZSTD_overlapCopy8(&op, &match, sequence.offset);
982-
if (sequence.matchLength > 8)
983+
assert(sequence_offset < 16);
984+
ZSTD_overlapCopy8(ref op, ref match, sequence_offset);
985+
if (sequence_matchLength > 8)
983986
{
984987
assert(op < oMatchEnd);
985-
ZSTD_wildcopy(op, match, (nint)sequence.matchLength - 8, ZSTD_overlap_e.ZSTD_overlap_src_before_dst);
988+
ZSTD_wildcopy(op, match, (nint)sequence_matchLength - 8, ZSTD_overlap_e.ZSTD_overlap_src_before_dst);
986989
}
987990

988991
return sequenceLength;
@@ -1319,6 +1322,8 @@ private static nuint ZSTD_decompressSequences_bodySplitLitBuffer(ZSTD_DCtx_s* dc
13191322
[MethodImpl(MethodImplOptions.AggressiveInlining)]
13201323
private static nuint ZSTD_decompressSequences_body(ZSTD_DCtx_s* dctx, void* dst, nuint maxDstSize, void* seqStart, nuint seqSize, int nbSeq, ZSTD_longOffset_e isLongOffset, int frame)
13211324
{
1325+
// HACK: force nbSeq onto the stack by passing it by ref (better register usage)
1326+
System.Threading.Thread.VolatileRead(ref nbSeq);
13221327
byte* ip = (byte*)seqStart;
13231328
byte* iend = ip + seqSize;
13241329
byte* ostart = (byte*)dst;
@@ -1352,7 +1357,88 @@ private static nuint ZSTD_decompressSequences_body(ZSTD_DCtx_s* dctx, void* dst,
13521357
for (; ; )
13531358
{
13541359
seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1355-
nuint oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
1360+
nuint oneSeqSize;
1361+
{
1362+
var sequence_litLength = sequence.litLength;
1363+
var sequence_matchLength = sequence.matchLength;
1364+
var sequence_offset = sequence.offset;
1365+
byte* oLitEnd = op + sequence_litLength;
1366+
oneSeqSize = sequence_litLength + sequence_matchLength;
1367+
/* risk : address space overflow (32-bits) */
1368+
byte* oMatchEnd = op + oneSeqSize;
1369+
/* risk : address space underflow on oend=NULL */
1370+
byte* oend_w = oend - 32;
1371+
byte* iLitEnd = litPtr + sequence_litLength;
1372+
byte* match = oLitEnd - sequence_offset;
1373+
assert(op != null);
1374+
assert(oend_w < oend);
1375+
if (iLitEnd > litEnd || oMatchEnd > oend_w || MEM_32bits && (nuint)(oend - op) < oneSeqSize + 32)
1376+
{
1377+
oneSeqSize = ZSTD_execSequenceEnd(op, oend, new seq_t { litLength = sequence_litLength, matchLength = sequence_matchLength, offset = sequence_offset }, &litPtr, litEnd, prefixStart, vBase, dictEnd);
1378+
goto returnOneSeqSize;
1379+
}
1380+
1381+
assert(op <= oLitEnd);
1382+
assert(oLitEnd < oMatchEnd);
1383+
assert(oMatchEnd <= oend);
1384+
assert(iLitEnd <= litEnd);
1385+
assert(oLitEnd <= oend_w);
1386+
assert(oMatchEnd <= oend_w);
1387+
assert(32 >= 16);
1388+
ZSTD_copy16(op, litPtr);
1389+
if (sequence_litLength > 16)
1390+
{
1391+
ZSTD_wildcopy(op + 16, litPtr + 16, (nint)(sequence_litLength - 16), ZSTD_overlap_e.ZSTD_no_overlap);
1392+
}
1393+
1394+
byte* opInner = oLitEnd;
1395+
litPtr = iLitEnd;
1396+
if (sequence_offset > (nuint)(oLitEnd - prefixStart))
1397+
{
1398+
if (sequence_offset > (nuint)(oLitEnd - vBase))
1399+
{
1400+
oneSeqSize = unchecked((nuint)(-(int)ZSTD_ErrorCode.ZSTD_error_corruption_detected));
1401+
goto returnOneSeqSize;
1402+
}
1403+
1404+
match = dictEnd + (match - prefixStart);
1405+
if (match + sequence_matchLength <= dictEnd)
1406+
{
1407+
memmove(oLitEnd, match, sequence_matchLength);
1408+
goto returnOneSeqSize;
1409+
}
1410+
1411+
{
1412+
nuint length1 = (nuint)(dictEnd - match);
1413+
memmove(oLitEnd, match, length1);
1414+
opInner = oLitEnd + length1;
1415+
sequence_matchLength -= length1;
1416+
match = prefixStart;
1417+
}
1418+
}
1419+
1420+
assert(opInner <= oMatchEnd);
1421+
assert(oMatchEnd <= oend_w);
1422+
assert(match >= prefixStart);
1423+
assert(sequence_matchLength >= 1);
1424+
if (sequence_offset >= 16)
1425+
{
1426+
ZSTD_wildcopy(opInner, match, (nint)sequence_matchLength, ZSTD_overlap_e.ZSTD_no_overlap);
1427+
goto returnOneSeqSize;
1428+
}
1429+
1430+
assert(sequence_offset < 16);
1431+
ZSTD_overlapCopy8(ref opInner, ref match, sequence_offset);
1432+
if (sequence_matchLength > 8)
1433+
{
1434+
assert(opInner < oMatchEnd);
1435+
ZSTD_wildcopy(opInner, match, (nint)sequence_matchLength - 8, ZSTD_overlap_e.ZSTD_overlap_src_before_dst);
1436+
}
1437+
1438+
returnOneSeqSize:
1439+
;
1440+
}
1441+
13561442
if (ERR_isError(oneSeqSize))
13571443
return oneSeqSize;
13581444
op += oneSeqSize;
@@ -1797,5 +1883,37 @@ public static nuint ZSTD_decompressBlock(ZSTD_DCtx_s* dctx, void* dst, nuint dst
17971883
dctx->previousDstEnd = (sbyte*)dst + dSize;
17981884
return dSize;
17991885
}
1886+
1887+
/*! ZSTD_overlapCopy8() :
1888+
* Copies 8 bytes from ip to op and updates op and ip where ip <= op.
1889+
* If the offset is < 8 then the offset is spread to at least 8 bytes.
1890+
*
1891+
* Precondition: *ip <= *op
1892+
* Postcondition: *op - *ip >= 8
1893+
*/
1894+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1895+
private static void ZSTD_overlapCopy8(ref byte* op, ref byte* ip, nuint offset)
1896+
{
1897+
assert(ip <= op);
1898+
if (offset < 8)
1899+
{
1900+
int sub2 = dec64table[offset];
1901+
op[0] = ip[0];
1902+
op[1] = ip[1];
1903+
op[2] = ip[2];
1904+
op[3] = ip[3];
1905+
ip += dec32table[offset];
1906+
ZSTD_copy4(op + 4, ip);
1907+
ip -= sub2;
1908+
}
1909+
else
1910+
{
1911+
ZSTD_copy8(op, ip);
1912+
}
1913+
1914+
ip += 8;
1915+
op += 8;
1916+
assert(op - ip >= 8);
1917+
}
18001918
}
18011919
}

0 commit comments

Comments
 (0)