diff options
author | Nick Terrell <terrelln@fb.com> | 2022-10-17 13:32:37 -0700 |
---|---|---|
committer | Nick Terrell <terrelln@fb.com> | 2022-10-24 12:12:32 -0700 |
commit | 2aa14b1ab2c41a4fe41efae80d58bb77da91f19f (patch) | |
tree | 17f83bdf97a2a93f8d0aa4d5daf6a92caa7bde79 /lib/zstd/compress | |
parent | 4782c725c1538aa9ef894ae4a3938db40be7f02c (diff) | |
download | linux-2aa14b1ab2c41a4fe41efae80d58bb77da91f19f.tar.bz2 |
zstd: import usptream v1.5.2
Updates the kernel's zstd library to v1.5.2, the latest zstd release.
The upstream tag it is updated to is `v1.5.2-kernel`, which contains
several cherry-picked commits on top of the v1.5.2 release which are
required for the kernel update. I will create this tag once the PR is
ready to merge, until then reference the temporary upstream branch
`v1.5.2-kernel-cherrypicks`.
I plan to submit this patch as part of the v6.2 merge window.
I've done basic build testing & testing on x86-64, i386, and aarch64.
I'm merging these patches into my `zstd-next` branch, which is pulled
into `linux-next` for further testing.
I've benchmarked BtrFS with zstd compression on a x86-64 machine, and
saw these results. Decompression speed is a small win across the board.
The lower compression levels 1-4 see both compression speed and
compression ratio wins. The higher compression levels see a small
compression speed loss and about neutral ratio. I expect the lower
compression levels to be used much more heavily than the high
compression levels, so this should be a net win.
Level CTime DTime Ratio
1 -2.95% -1.1% -0.7%
3 -3.5% -1.2% -0.5%
5 +3.7% -1.0% +0.0%
7 +3.2% -0.9% +0.0%
9 -4.3% -0.8% +0.1%
Signed-off-by: Nick Terrell <terrelln@fb.com>
Diffstat (limited to 'lib/zstd/compress')
-rw-r--r-- | lib/zstd/compress/clevels.h | 132 | ||||
-rw-r--r-- | lib/zstd/compress/fse_compress.c | 83 | ||||
-rw-r--r-- | lib/zstd/compress/huf_compress.c | 644 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_compress.c | 1998 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_compress_internal.h | 375 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_compress_literals.c | 9 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_compress_literals.h | 4 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_compress_sequences.c | 31 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_compress_superblock.c | 295 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_cwksp.h | 225 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_double_fast.c | 413 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_fast.c | 441 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_lazy.c | 1352 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_lazy.h | 38 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_ldm.c | 76 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_ldm.h | 1 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_ldm_geartab.h | 5 | ||||
-rw-r--r-- | lib/zstd/compress/zstd_opt.c | 402 |
18 files changed, 4705 insertions, 1819 deletions
diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h new file mode 100644 index 000000000000..d9a76112ec3a --- /dev/null +++ b/lib/zstd/compress/clevels.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_CLEVELS_H +#define ZSTD_CLEVELS_H + +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressionParameters */ +#include <linux/zstd.h> + +/*-===== Pre-defined compression levels =====-*/ + +#define ZSTD_MAX_CLEVEL 22 + +__attribute__((__unused__)) + +static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { +{ /* "default" - for any srcSize > 256 KB */ + /* W, C, H, S, L, TL, strat */ + { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ + { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ + { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ + { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ + { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ + { 21, 18, 19, 3, 5, 2, ZSTD_greedy }, /* level 5 */ + { 21, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6 */ + { 21, 19, 20, 4, 5, 8, ZSTD_lazy }, /* level 7 */ + { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 8 */ + { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ + { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 10 */ + { 22, 21, 22, 6, 5, 16, ZSTD_lazy2 }, /* level 11 */ + { 22, 22, 23, 6, 5, 32, ZSTD_lazy2 }, /* level 12 */ + { 22, 22, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */ + { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ + { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ + { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ + { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ + { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ + { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ + { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ + { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ + { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ +}, +{ /* for srcSize <= 256 KB */ + /* W, C, H, S, L, T, strat */ + { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ + { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ + { 18, 16, 17, 3, 5, 2, ZSTD_greedy }, /* level 4.*/ + { 18, 17, 18, 5, 5, 2, ZSTD_greedy }, /* level 5.*/ + { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ + { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ + { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ + { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ + { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ + { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <= 128 KB */ + /* W, C, H, S, L, T, strat */ + { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ + { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ + { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ + { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ + { 17, 16, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ + { 17, 16, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 17, 16, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 17, 16, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 17, 16, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ + { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ + { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ + { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ + { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ + { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <= 16 KB */ + /* W, C, H, S, L, T, strat */ + { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ + { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ + { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ + { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ + { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ + { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ + { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ + { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ + { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ + { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ + { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ + { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ + { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ + { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ + { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ + { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +}; + + + +#endif /* ZSTD_CLEVELS_H */ diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c index 436985b620e5..ec5b1ca6d71a 100644 --- a/lib/zstd/compress/fse_compress.c +++ b/lib/zstd/compress/fse_compress.c @@ -75,13 +75,14 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ; FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); U32 const step = FSE_TABLESTEP(tableSize); + U32 const maxSV1 = maxSymbolValue+1; - U32* cumul = (U32*)workSpace; - FSE_FUNCTION_TYPE* tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSymbolValue + 2)); + U16* cumul = (U16*)workSpace; /* size = maxSV1 */ + FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1)); /* size = tableSize */ U32 highThreshold = tableSize-1; - if ((size_t)workSpace & 3) return ERROR(GENERIC); /* Must be 4 byte aligned */ + assert(((size_t)workSpace & 1) == 0); /* Must be 2 bytes-aligned */ if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge); /* CTable header */ tableU16[-2] = (U16) tableLog; @@ -98,20 +99,61 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, /* symbol start positions */ { U32 u; cumul[0] = 0; - for (u=1; u <= maxSymbolValue+1; u++) { + for (u=1; u <= maxSV1; u++) { if (normalizedCounter[u-1]==-1) { /* Low proba symbol */ cumul[u] = cumul[u-1] + 1; tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1); } else { - cumul[u] = cumul[u-1] + normalizedCounter[u-1]; + assert(normalizedCounter[u-1] >= 0); + cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1]; + assert(cumul[u] >= cumul[u-1]); /* no overflow */ } } - cumul[maxSymbolValue+1] = tableSize+1; + cumul[maxSV1] = (U16)(tableSize+1); } /* Spread symbols */ - { U32 position = 0; + if (highThreshold == tableSize - 1) { + /* Case for no low prob count symbols. Lay down 8 bytes at a time + * to reduce branch misses since we are operating on a small block + */ + BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */ + { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; s<maxSV1; ++s, sv += add) { + int i; + int const n = normalizedCounter[s]; + MEM_write64(spread + pos, sv); + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } + assert(n>=0); + pos += (size_t)n; + } + } + /* Spread symbols across the table. Lack of lowprob symbols means that + * we don't need variable sized inner loop, so we can unroll the loop and + * reduce branch misses. + */ + { size_t position = 0; + size_t s; + size_t const unroll = 2; /* Experimentally determined optimal unroll */ + assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ + for (s = 0; s < (size_t)tableSize; s += unroll) { + size_t u; + for (u = 0; u < unroll; ++u) { + size_t const uPosition = (position + (u * step)) & tableMask; + tableSymbol[uPosition] = spread[s + u]; + } + position = (position + (unroll * step)) & tableMask; + } + assert(position == 0); /* Must have initialized all positions */ + } + } else { + U32 position = 0; U32 symbol; - for (symbol=0; symbol<=maxSymbolValue; symbol++) { + for (symbol=0; symbol<maxSV1; symbol++) { int nbOccurrences; int const freq = normalizedCounter[symbol]; for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) { @@ -120,7 +162,6 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, while (position > highThreshold) position = (position + step) & tableMask; /* Low proba area */ } } - assert(position==0); /* Must have initialized all positions */ } @@ -144,16 +185,17 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, case -1: case 1: symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog); - symbolTT[s].deltaFindState = total - 1; + assert(total <= INT_MAX); + symbolTT[s].deltaFindState = (int)(total - 1); total ++; break; default : - { - U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1); - U32 const minStatePlus = normalizedCounter[s] << maxBitsOut; + assert(normalizedCounter[s] > 1); + { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; - symbolTT[s].deltaFindState = total - normalizedCounter[s]; - total += normalizedCounter[s]; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); + total += (unsigned)normalizedCounter[s]; } } } } #if 0 /* debug : symbol costs */ @@ -164,8 +206,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, symbol, normalizedCounter[symbol], FSE_getMaxNbBits(symbolTT, symbol), (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256); - } - } + } } #endif return 0; @@ -173,16 +214,18 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, - #ifndef FSE_COMMONDEFS_ONLY - /*-************************************************************** * FSE NCount encoding ****************************************************************/ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) { - size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3; + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) + + 1 /* round up to whole nb bytes */ + + 2 /* additional two bytes for bitstream flush */; return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ } diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c index f76a526bfa54..74ef0db47621 100644 --- a/lib/zstd/compress/huf_compress.c +++ b/lib/zstd/compress/huf_compress.c @@ -50,6 +50,28 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS /* ******************************************************* * HUF : Huffman block compression *********************************************************/ +#define HUF_WORKSPACE_MAX_ALIGNMENT 8 + +static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align) +{ + size_t const mask = align - 1; + size_t const rem = (size_t)workspace & mask; + size_t const add = (align - rem) & mask; + BYTE* const aligned = (BYTE*)workspace + add; + assert((align & (align - 1)) == 0); /* pow 2 */ + assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT); + if (*workspaceSizePtr >= add) { + assert(add < align); + assert(((size_t)aligned & mask) == 0); + *workspaceSizePtr -= add; + return aligned; + } else { + *workspaceSizePtr = 0; + return NULL; + } +} + + /* HUF_compressWeights() : * Same as FSE_compress(), but dedicated to huff0's weights compression. * The use case needs much less stack memory. @@ -72,7 +94,7 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT unsigned maxSymbolValue = HUF_TABLELOG_MAX; U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; - HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)workspace; + HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC); @@ -103,6 +125,40 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT return (size_t)(op-ostart); } +static size_t HUF_getNbBits(HUF_CElt elt) +{ + return elt & 0xFF; +} + +static size_t HUF_getNbBitsFast(HUF_CElt elt) +{ + return elt; +} + +static size_t HUF_getValue(HUF_CElt elt) +{ + return elt & ~0xFF; +} + +static size_t HUF_getValueFast(HUF_CElt elt) +{ + return elt; +} + +static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits) +{ + assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX); + *elt = nbBits; +} + +static void HUF_setValue(HUF_CElt* elt, size_t value) +{ + size_t const nbBits = HUF_getNbBits(*elt); + if (nbBits > 0) { + assert((value >> nbBits) == 0); + *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits); + } +} typedef struct { HUF_CompressWeightsWksp wksp; @@ -114,9 +170,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize) { + HUF_CElt const* const ct = CTable + 1; BYTE* op = (BYTE*)dst; U32 n; - HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)workspace; + HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); /* check conditions */ if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); @@ -127,9 +184,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, for (n=1; n<huffLog+1; n++) wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n); for (n=0; n<maxSymbolValue; n++) - wksp->huffWeight[n] = wksp->bitsToWeight[CTable[n].nbBits]; + wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])]; /* attempt weights compression by FSE */ + if (maxDstSize < 1) return ERROR(dstSize_tooSmall); { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) ); if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */ op[0] = (BYTE)hSize; @@ -163,6 +221,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */ U32 tableLog = 0; U32 nbSymbols = 0; + HUF_CElt* const ct = CTable + 1; /* get symbol weights */ CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize)); @@ -172,6 +231,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + CTable[0] = tableLog; + /* Prepare base value per rank */ { U32 n, nextRankStart = 0; for (n=1; n<=tableLog; n++) { @@ -183,13 +244,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void /* fill nbBits */ { U32 n; for (n=0; n<nbSymbols; n++) { const U32 w = huffWeight[n]; - CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0); + HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0)); } } /* fill val */ { U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */ U16 valPerRank[HUF_TABLELOG_MAX+2] = {0}; - { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; } + { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; } /* determine stating value per rank */ valPerRank[tableLog+1] = 0; /* for w==0 */ { U16 min = 0; @@ -199,18 +260,18 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void min >>= 1; } } /* assign value within rank, symbol order */ - { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; } + { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); } } *maxSymbolValuePtr = nbSymbols - 1; return readSize; } -U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue) +U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue) { - const HUF_CElt* table = (const HUF_CElt*)symbolTable; + const HUF_CElt* ct = CTable + 1; assert(symbolValue <= HUF_SYMBOLVALUE_MAX); - return table[symbolValue].nbBits; + return (U32)HUF_getNbBits(ct[symbolValue]); } @@ -364,22 +425,118 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) } typedef struct { - U32 base; - U32 curr; + U16 base; + U16 curr; } rankPos; typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; -#define RANK_POSITION_TABLE_SIZE 32 +/* Number of buckets available for HUF_sort() */ +#define RANK_POSITION_TABLE_SIZE 192 typedef struct { huffNodeTable huffNodeTbl; rankPos rankPosition[RANK_POSITION_TABLE_SIZE]; } HUF_buildCTable_wksp_tables; +/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing. + * Strategy is to use as many buckets as possible for representing distinct + * counts while using the remainder to represent all "large" counts. + * + * To satisfy this requirement for 192 buckets, we can do the following: + * Let buckets 0-166 represent distinct counts of [0, 166] + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ +#define RANK_POSITION_MAX_COUNT_LOG 32 +#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ + +/* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. + */ +static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count + : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; +} + +/* Helper swap function for HUF_quickSortPartition() */ +static void HUF_swapNodes(nodeElt* a, nodeElt* b) { + nodeElt tmp = *a; + *a = *b; + *b = tmp; +} + +/* Returns 0 if the huffNode array is not sorted by descending count */ +MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) { + U32 i; + for (i = 1; i < maxSymbolValue1; ++i) { + if (huffNode[i].count > huffNode[i-1].count) { + return 0; + } + } + return 1; +} + +/* Insertion sort by descending order */ +HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) { + int i; + int const size = high-low+1; + huffNode += low; + for (i = 1; i < size; ++i) { + nodeElt const key = huffNode[i]; + int j = i - 1; + while (j >= 0 && huffNode[j].count < key.count) { + huffNode[j + 1] = huffNode[j]; + j--; + } + huffNode[j + 1] = key; + } +} + +/* Pivot helper function for quicksort. */ +static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) { + /* Simply select rightmost element as pivot. "Better" selectors like + * median-of-three don't experimentally appear to have any benefit. + */ + U32 const pivot = arr[high].count; + int i = low - 1; + int j = low; + for ( ; j < high; j++) { + if (arr[j].count > pivot) { + i++; + HUF_swapNodes(&arr[i], &arr[j]); + } + } + HUF_swapNodes(&arr[i + 1], &arr[high]); + return i + 1; +} + +/* Classic quicksort by descending with partially iterative calls + * to reduce worst case callstack size. + */ +static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) { + int const kInsertionSortThreshold = 8; + if (high - low < kInsertionSortThreshold) { + HUF_insertionSort(arr, low, high); + return; + } + while (low < high) { + int const idx = HUF_quickSortPartition(arr, low, high); + if (idx - low < high - idx) { + HUF_simpleQuickSort(arr, low, idx - 1); + low = idx + 1; + } else { + HUF_simpleQuickSort(arr, idx + 1, high); + high = idx - 1; + } + } +} + /* * HUF_sort(): * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order. + * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket. * * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled. * Must have (maxSymbolValue + 1) entries. @@ -387,44 +544,52 @@ typedef struct { * @param[in] maxSymbolValue Maximum symbol value. * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries. */ -static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition) -{ - int n; - int const maxSymbolValue1 = (int)maxSymbolValue + 1; +static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) { + U32 n; + U32 const maxSymbolValue1 = maxSymbolValue+1; /* Compute base and set curr to base. - * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1. - * Then 2^lowerRank <= count[n]+1 <= 2^rank. + * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1. + * See HUF_getIndex to see bucketing strategy. * We attribute each symbol to lowerRank's base value, because we want to know where * each rank begins in the output, so for rank R we want to count ranks R+1 and above. */ ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE); for (n = 0; n < maxSymbolValue1; ++n) { - U32 lowerRank = BIT_highbit32(count[n] + 1); + U32 lowerRank = HUF_getIndex(count[n]); + assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1); rankPosition[lowerRank].base++; } + assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0); + /* Set up the rankPosition table */ for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) { rankPosition[n-1].base += rankPosition[n].base; rankPosition[n-1].curr = rankPosition[n-1].base; } - /* Sort */ + + /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */ for (n = 0; n < maxSymbolValue1; ++n) { U32 const c = count[n]; - U32 const r = BIT_highbit32(c+1) + 1; - U32 pos = rankPosition[r].curr++; - /* Insert into the correct position in the rank. - * We have at most 256 symbols, so this insertion should be fine. - */ - while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) { - huffNode[pos] = huffNode[pos-1]; - pos--; - } + U32 const r = HUF_getIndex(c) + 1; + U32 const pos = rankPosition[r].curr++; + assert(pos < maxSymbolValue1); huffNode[pos].count = c; huffNode[pos].byte = (BYTE)n; } -} + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { + U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); + HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1); + } + } + + assert(HUF_isSorted(huffNode, maxSymbolValue1)); +} /* HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. @@ -487,6 +652,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) */ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits) { + HUF_CElt* const ct = CTable + 1; /* fill result into ctable (val, nbBits) */ int n; U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0}; @@ -502,20 +668,20 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i min >>= 1; } } for (n=0; n<alphabetSize; n++) - CTable[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */ + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ for (n=0; n<alphabetSize; n++) - CTable[n].val = valPerRank[CTable[n].nbBits]++; /* assign value within rank, symbol order */ + HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */ + CTable[0] = maxNbBits; } -size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) +size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) { - HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace; + HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); nodeElt* const huffNode0 = wksp_tables->huffNodeTbl; nodeElt* const huffNode = huffNode0+1; int nonNullRank; /* safety checks */ - if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) return ERROR(workSpace_tooSmall); if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; @@ -533,99 +699,334 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbo maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ - HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits); + HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits); return maxNbBits; } size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { + HUF_CElt const* ct = CTable + 1; size_t nbBits = 0; int s; for (s = 0; s <= (int)maxSymbolValue; ++s) { - nbBits += CTable[s].nbBits * count[s]; + nbBits += HUF_getNbBits(ct[s]) * count[s]; } return nbBits >> 3; } int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { + HUF_CElt const* ct = CTable + 1; int bad = 0; int s; for (s = 0; s <= (int)maxSymbolValue; ++s) { - bad |= (count[s] != 0) & (CTable[s].nbBits == 0); + bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); } return !bad; } size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +/* HUF_CStream_t: + * Huffman uses its own BIT_CStream_t implementation. + * There are three major differences from BIT_CStream_t: + * 1. HUF_addBits() takes a HUF_CElt (size_t) which is + * the pair (nbBits, value) in the format: + * format: + * - Bits [0, 4) = nbBits + * - Bits [4, 64 - nbBits) = 0 + * - Bits [64 - nbBits, 64) = value + * 2. The bitContainer is built from the upper bits and + * right shifted. E.g. to add a new value of N bits + * you right shift the bitContainer by N, then or in + * the new value into the N upper bits. + * 3. The bitstream has two bit containers. You can add + * bits to the second container and merge them into + * the first container. + */ + +#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8) + +typedef struct { + size_t bitContainer[2]; + size_t bitPos[2]; + + BYTE* startPtr; + BYTE* ptr; + BYTE* endPtr; +} HUF_CStream_t; + +/*! HUF_initCStream(): + * Initializes the bitstream. + * @returns 0 or an error code. + */ +static size_t HUF_initCStream(HUF_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + ZSTD_memset(bitC, 0, sizeof(*bitC)); + bitC->startPtr = (BYTE*)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]); + if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall); + return 0; +} + +/*! HUF_addBits(): + * Adds the symbol stored in HUF_CElt elt to the bitstream. + * + * @param elt The element we're adding. This is a (nbBits, value) pair. + * See the HUF_CStream_t docs for the format. + * @param idx Insert into the bitstream at this idx. + * @param kFast This is a template parameter. If the bitstream is guaranteed + * to have at least 4 unused bits after this call it may be 1, + * otherwise it must be 0. HUF_addBits() is faster when fast is set. + */ +FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast) +{ + assert(idx <= 1); + assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX); + /* This is efficient on x86-64 with BMI2 because shrx + * only reads the low 6 bits of the register. The compiler + * knows this and elides the mask. When fast is set, + * every operation can use the same value loaded from elt. + */ + bitC->bitContainer[idx] >>= HUF_getNbBits(elt); + bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt); + /* We only read the low 8 bits of bitC->bitPos[idx] so it + * doesn't matter that the high bits have noise from the value. + */ + bitC->bitPos[idx] += HUF_getNbBitsFast(elt); + assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + /* The last 4-bits of elt are dirty if fast is set, + * so we must not be overwriting bits that have already been + * inserted into the bit container. + */ +#if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); + size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); + /* We didn't overwrite any bits in the bit container. */ + assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + (void)dirtyBits; + } +#endif +} + +FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) +{ + bitC->bitContainer[1] = 0; + bitC->bitPos[1] = 0; +} + +/*! HUF_mergeIndex1() : + * Merges the bit container @ index 1 into the bit container @ index 0 + * and zeros the bit container @ index 1. + */ +FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) +{ + assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER); + bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF); + bitC->bitContainer[0] |= bitC->bitContainer[1]; + bitC->bitPos[0] += bitC->bitPos[1]; + assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER); +} + +/*! HUF_flushBits() : +* Flushes the bits in the bit container @ index 0. +* +* @post bitPos will be < 8. +* @param kFast If kFast is set then we must know a-priori that +* the bit container will not overflow. +*/ +FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) +{ + /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */ + size_t const nbBits = bitC->bitPos[0] & 0xFF; + size_t const nbBytes = nbBits >> 3; + /* The top nbBits bits of bitContainer are the ones we need. */ + size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits); + /* Mask bitPos to account for the bytes we consumed. */ + bitC->bitPos[0] &= 7; + assert(nbBits > 0); + assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitContainer); + bitC->ptr += nbBytes; + assert(!kFast || bitC->ptr <= bitC->endPtr); + if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + /* bitContainer doesn't need to be modified because the leftover + * bits are already the top bitPos bits. And we don't care about + * noise in the lower values. + */ +} + +/*! HUF_endMark() + * @returns The Huffman stream end mark: A 1-bit value = 1. + */ +static HUF_CElt HUF_endMark(void) +{ + HUF_CElt endMark; + HUF_setNbBits(&endMark, 1); + HUF_setValue(&endMark, 1); + return endMark; +} + +/*! HUF_closeCStream() : + * @return Size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +static size_t HUF_closeCStream(HUF_CStream_t* bitC) +{ + HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0); + HUF_flushBits(bitC, /* kFast */ 0); + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ + return (bitC->ptr - bitC->startPtr) + (nbBits > 0); + } +} + FORCE_INLINE_TEMPLATE void -HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) +HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast) { - BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); + HUF_addBits(bitCPtr, CTable[symbol], idx, fast); } -#define HUF_FLUSHBITS(s) BIT_flushBits(s) +FORCE_INLINE_TEMPLATE void +HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC, + const BYTE* ip, size_t srcSize, + const HUF_CElt* ct, + int kUnroll, int kFastFlush, int kLastFast) +{ + /* Join to kUnroll */ + int n = (int)srcSize; + int rem = n % kUnroll; + if (rem > 0) { + for (; rem > 0; --rem) { + HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0); + } + HUF_flushBits(bitC, kFastFlush); + } + assert(n % kUnroll == 0); + + /* Join to 2 * kUnroll */ + if (n % (2 * kUnroll)) { + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast); + HUF_flushBits(bitC, kFastFlush); + n -= kUnroll; + } + assert(n % (2 * kUnroll) == 0); + + for (; n>0; n-= 2 * kUnroll) { + /* Encode kUnroll symbols into the bitstream @ index 0. */ + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast); + HUF_flushBits(bitC, kFastFlush); + /* Encode kUnroll symbols into the bitstream @ index 1. + * This allows us to start filling the bit container + * without any data dependencies. + */ + HUF_zeroIndex1(bitC); + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast); + /* Merge bitstream @ index 1 into the bitstream @ index 0 */ + HUF_mergeIndex1(bitC); + HUF_flushBits(bitC, kFastFlush); + } + assert(n == 0); + +} -#define HUF_FLUSHBITS_1(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) +/* + * Returns a tight upper bound on the output space needed by Huffman + * with 8 bytes buffer to handle over-writes. If the output is at least + * this large we don't need to do bounds checks during Huffman encoding. + */ +static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) +{ + return ((srcSize * tableLog) >> 3) + 8; +} -#define HUF_FLUSHBITS_2(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) FORCE_INLINE_TEMPLATE size_t HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { + U32 const tableLog = (U32)CTable[0]; + HUF_CElt const* ct = CTable + 1; const BYTE* ip = (const BYTE*) src; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; BYTE* op = ostart; - size_t n; - BIT_CStream_t bitC; + HUF_CStream_t bitC; /* init */ if (dstSize < 8) return 0; /* not enough space to compress */ - { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op)); + { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); if (HUF_isError(initErr)) return 0; } - n = srcSize & ~3; /* join to mod 4 */ - switch (srcSize & 3) - { - case 3: - HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); - HUF_FLUSHBITS_2(&bitC); - ZSTD_FALLTHROUGH; - case 2: - HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); - HUF_FLUSHBITS_1(&bitC); - ZSTD_FALLTHROUGH; - case 1: - HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); - HUF_FLUSHBITS(&bitC); - ZSTD_FALLTHROUGH; - case 0: ZSTD_FALLTHROUGH; - default: break; - } - - for (; n>0; n-=4) { /* note : n&3==0 at this stage */ - HUF_encodeSymbol(&bitC, ip[n- 1], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 2], CTable); - HUF_FLUSHBITS_2(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 3], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 4], CTable); - HUF_FLUSHBITS(&bitC); - } - - return BIT_closeCStream(&bitC); + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0); + else { + if (MEM_32bits()) { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: ZSTD_FALLTHROUGH; + case 9: ZSTD_FALLTHROUGH; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 7: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } else { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 9: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 7: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 6: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } + } + assert(bitC.ptr <= bitC.endPtr); + + return HUF_closeCStream(&bitC); } #if DYNAMIC_BMI2 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) @@ -667,9 +1068,13 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { - return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); + return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); } +size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); +} static size_t HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, @@ -689,8 +1094,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, assert(op <= oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart, (U16)cSize); op += cSize; } @@ -698,8 +1102,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, ip += segmentSize; assert(op <= oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+2, (U16)cSize); op += cSize; } @@ -707,8 +1110,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, ip += segmentSize; assert(op <= oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+4, (U16)cSize); op += cSize; } @@ -717,7 +1119,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, assert(op <= oend); assert(ip <= iend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); - if (cSize==0) return 0; + if (cSize == 0 || cSize > 65535) return 0; op += cSize; } @@ -726,7 +1128,12 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { - return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); + return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} + +size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); } typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; @@ -750,35 +1157,38 @@ static size_t HUF_compressCTable_internal( typedef struct { unsigned count[HUF_SYMBOLVALUE_MAX + 1]; - HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)]; union { HUF_buildCTable_wksp_tables buildCTable_wksp; HUF_WriteCTableWksp writeCTable_wksp; + U32 hist_wksp[HIST_WKSP_SIZE_U32]; } wksps; } HUF_compress_tables_t; +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + /* HUF_compress_internal() : * `workSpace_align4` must be aligned on 4-bytes boundaries, - * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */ + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ static size_t HUF_compress_internal (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, HUF_nbStreams_e nbStreams, - void* workSpace_align4, size_t wkspSize, + void* workSpace, size_t wkspSize, HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, - const int bmi2) + const int bmi2, unsigned suspectUncompressible) { - HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4; + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; BYTE* op = ostart; - HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE); - assert(((size_t)workSpace_align4 & 3) == 0); /* must be aligned on 4-bytes boundaries */ + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); /* checks & inits */ - if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall); + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); if (!srcSize) return 0; /* Uncompressed */ if (!dstSize) return 0; /* cannot fit anything within dst budget */ if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ @@ -794,8 +1204,23 @@ HUF_compress_internal (void* dst, size_t dstSize, nbStreams, oldHufTable, bmi2); } + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); + if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; + } + { unsigned maxSymbolValueEnd = maxSymbolValue; + CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestEnd; + } + if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } + /* Scan input and build symbol stats */ - { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) ); + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) ); if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ } @@ -820,9 +1245,12 @@ HUF_compress_internal (void* dst, size_t dstSize, &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); CHECK_F(maxBits); huffLog = (U32)maxBits; - /* Zero unused symbols in CTable, so we can check it for validity */ - ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0, - sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt))); + } + /* Zero unused symbols in CTable, so we can check it for validity */ + { + size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); + size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); + ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); } /* Write table description header */ @@ -859,19 +1287,20 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize, return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + NULL, NULL, 0, 0 /*bmi2*/, 0); } size_t HUF_compress1X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, + int bmi2, unsigned suspectUncompressible) { return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, hufTable, - repeat, preferRepeat, bmi2); + repeat, preferRepeat, bmi2, suspectUncompressible); } /* HUF_compress4X_repeat(): @@ -885,21 +1314,22 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize, return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + NULL, NULL, 0, 0 /*bmi2*/, 0); } /* HUF_compress4X_repeat(): * compress input using 4 streams. + * consider skipping quickly * re-use an existing huffman compression table */ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) { return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - hufTable, repeat, preferRepeat, bmi2); + hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); } diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c index 73fff4c60149..f620cafca633 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -12,7 +12,6 @@ * Dependencies ***************************************/ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ -#include "../common/cpu.h" #include "../common/mem.h" #include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ @@ -39,6 +38,18 @@ * Note that functions with explicit context such as ZSTD_compressCCtx() are unaffected. */ +/*! + * ZSTD_HASHLOG3_MAX : + * Maximum size of the hash table dedicated to find 3-bytes matches, + * in log format, aka 17 => 1 << 17 == 128Ki positions. + * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known here. + * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ +#ifndef ZSTD_HASHLOG3_MAX +# define ZSTD_HASHLOG3_MAX 17 +#endif /*-************************************* * Helper functions @@ -69,6 +80,10 @@ struct ZSTD_CDict_s { ZSTD_customMem customMem; U32 dictID; int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ + ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use + * row-based matchfinder. Unless the cdict is reloaded, we will use + * the same greedy/lazy matchfinder at compression time. + */ }; /* typedef'd to ZSTD_CDict within "zstd.h" */ ZSTD_CCtx* ZSTD_createCCtx(void) @@ -81,7 +96,7 @@ static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager) assert(cctx != NULL); ZSTD_memset(cctx, 0, sizeof(*cctx)); cctx->customMem = memManager; - cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + cctx->bmi2 = ZSTD_cpuSupportsBmi2(); { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); assert(!ZSTD_isError(err)); (void)err; @@ -192,12 +207,64 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) /* private API call, for dictBuilder only */ const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } +/* Returns true if the strategy supports using a row based matchfinder */ +static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2); +} + +/* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. + */ +static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) { + assert(mode != ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); +} + +/* Returns row matchfinder usage given an initial mode and cParams */ +static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) + int const kHasSIMD128 = 1; +#else + int const kHasSIMD128 = 0; +#endif + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; + if (kHasSIMD128) { + if (cParams->windowLog > 14) mode = ZSTD_ps_enable; + } else { + if (cParams->windowLog > 17) mode = ZSTD_ps_enable; + } + return mode; +} + +/* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable; +} + +/* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ +static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + const ZSTD_paramSwitch_e useRowMatchFinder, + const U32 forDDSDict) { + assert(useRowMatchFinder != ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. + * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder. + */ + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); +} + /* Returns 1 if compression parameters are such that we should * enable long distance matching (wlog >= 27, strategy >= btopt). * Returns 0 otherwise. */ -static U32 ZSTD_CParams_shouldEnableLdm(const ZSTD_compressionParameters* const cParams) { - return cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27; +static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; } static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( @@ -208,15 +275,15 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT); cctxParams.cParams = cParams; - if (ZSTD_CParams_shouldEnableLdm(&cParams)) { - DEBUGLOG(4, "ZSTD_makeCCtxParamsFromCParams(): Including LDM into cctx params"); - cctxParams.ldmParams.enableLdm = 1; - /* LDM is enabled by default for optimal parser and window size >= 128MB */ + /* Adjust advanced params according to cParams */ + cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams); + if (cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) { ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams); assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); assert(cctxParams.ldmParams.hashRateLog < 32); } - + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); assert(!ZSTD_checkCParams(cParams)); return cctxParams; } @@ -275,6 +342,11 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par * But, set it for tracing anyway. */ cctxParams->compressionLevel = compressionLevel; + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); } size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) @@ -431,9 +503,9 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) return bounds; case ZSTD_c_literalCompressionMode: - ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_huffman < ZSTD_lcm_uncompressed); - bounds.lowerBound = ZSTD_lcm_auto; - bounds.upperBound = ZSTD_lcm_uncompressed; + ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable); + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; return bounds; case ZSTD_c_targetCBlockSize: @@ -462,6 +534,21 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = 1; return bounds; + case ZSTD_c_useBlockSplitter: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_deterministicRefPrefix: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + default: bounds.error = ERROR(parameter_unsupported); return bounds; @@ -523,6 +610,9 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_stableOutBuffer: case ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: default: return 0; } @@ -575,6 +665,9 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_stableOutBuffer: case ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); @@ -672,7 +765,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, } case ZSTD_c_literalCompressionMode : { - const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value; + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); CCtxParams->literalCompressionMode = lcm; return CCtxParams->literalCompressionMode; @@ -699,7 +792,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, return CCtxParams->enableDedicatedDictSearch; case ZSTD_c_enableLongDistanceMatching : - CCtxParams->ldmParams.enableLdm = (value!=0); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; return CCtxParams->ldmParams.enableLdm; case ZSTD_c_ldmHashLog : @@ -758,6 +851,21 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, CCtxParams->validateSequences = value; return CCtxParams->validateSequences; + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); + CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value; + return CCtxParams->useBlockSplitter; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); + CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; + return CCtxParams->deterministicRefPrefix; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } @@ -863,6 +971,15 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_validateSequences : *value = (int)CCtxParams->validateSequences; break; + case ZSTD_c_useBlockSplitter : + *value = (int)CCtxParams->useBlockSplitter; + break; + case ZSTD_c_useRowMatchFinder : + *value = (int)CCtxParams->useRowMatchFinder; + break; + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -889,7 +1006,7 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( return 0; } -ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) +size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, @@ -969,14 +1086,14 @@ size_t ZSTD_CCtx_loadDictionary_advanced( return 0; } -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference( +size_t ZSTD_CCtx_loadDictionary_byReference( ZSTD_CCtx* cctx, const void* dict, size_t dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); } -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); @@ -1146,7 +1263,7 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, break; case ZSTD_cpm_createCDict: /* Assume a small source size when creating a dictionary - * with an unkown source size. + * with an unknown source size. */ if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) srcSize = minSrcSize; @@ -1220,7 +1337,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( srcSizeHint = CCtxParams->srcSizeHint; } cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); - if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); assert(!ZSTD_checkCParams(cParams)); /* srcSizeHint == 0 means 0 */ @@ -1229,9 +1346,14 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( static size_t ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const ZSTD_paramSwitch_e useRowMatchFinder, + const U32 enableDedicatedDictSearch, const U32 forCCtx) { - size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + /* chain table size should be 0 for fast or row-hash strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize = ((size_t)1) << cParams->hashLog; U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; @@ -1241,43 +1363,53 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + hSize * sizeof(U32) + h3Size * sizeof(U32); size_t const optPotentialSpace = - ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((1<<Litbits) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) - + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + ZSTD_cwksp_aligned_alloc_size((MaxML+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) + ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) + : 0; size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) ? optPotentialSpace : 0; + size_t const slackSpace = ZSTD_cwksp_slack_space_required(); + + /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */ + ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4); + assert(useRowMatchFinder != ZSTD_ps_auto); + DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", (U32)chainSize, (U32)hSize, (U32)h3Size); - return tableSpace + optSpace; + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, const int isStatic, + const ZSTD_paramSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, const U64 pledgedSrcSize) { - size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << cParams->windowLog), pledgedSrcSize)); + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); U32 const divider = (cParams->minMatch==3) ? 3 : 4; size_t const maxNbSeq = blockSize / divider; size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) - + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); - size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, /* forCCtx */ 1); + size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams); size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize); - size_t const ldmSeqSpace = ldmParams->enableLdm ? - ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; + size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ? + ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) @@ -1303,19 +1435,32 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) { ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); + ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, + &cParams); RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); /* estimateCCtxSize is for one-shot compression. So no buffers should * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. */ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) { - ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); - return ZSTD_estimateCCtxSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder = ZSTD_ps_disable; + noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + initialParams.useRowMatchFinder = ZSTD_ps_enable; + rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + } } static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) @@ -1355,17 +1500,29 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; + ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ¶ms->cParams); return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, inBuffSize, outBuffSize, + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, ZSTD_CONTENTSIZE_UNKNOWN); } } size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) { - ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); - return ZSTD_estimateCStreamSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder = ZSTD_ps_disable; + noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + initialParams.useRowMatchFinder = ZSTD_ps_enable; + rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + } } static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) @@ -1480,20 +1637,27 @@ typedef enum { ZSTD_resetTarget_CCtx } ZSTD_resetTarget_e; + static size_t ZSTD_reset_matchState(ZSTD_matchState_t* ms, ZSTD_cwksp* ws, const ZSTD_compressionParameters* cParams, + const ZSTD_paramSwitch_e useRowMatchFinder, const ZSTD_compResetPolicy_e crp, const ZSTD_indexResetPolicy_e forceResetIndex, const ZSTD_resetTarget_e forWho) { - size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + /* disable chain table allocation for fast or row-based strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, + ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict)) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize = ((size_t)1) << cParams->hashLog; U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset); + assert(useRowMatchFinder != ZSTD_ps_auto); if (forceResetIndex == ZSTDirp_reset) { ZSTD_window_init(&ms->window); ZSTD_cwksp_mark_tables_dirty(ws); @@ -1532,11 +1696,23 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); } + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { + { /* Row match finder needs an additional table of hashes ("tags") */ + size_t const tagTableSize = hSize*sizeof(U16); + ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); + if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); + assert(cParams->hashLog >= rowLog); + ms->rowHashLog = cParams->hashLog - rowLog; + } + } + ms->cParams = *cParams; RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, "failed a workspace allocation in ZSTD_reset_matchState"); - return 0; } @@ -1553,61 +1729,87 @@ static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN); } +/* ZSTD_dictTooBig(): + * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they can't be loaded in + * one go generically. So we ensure that in that case we reset the tables to zero, + * so that we can load as much of the dictionary as possible. + */ +static int ZSTD_dictTooBig(size_t const loadedDictSize) +{ + return loadedDictSize > ZSTD_CHUNKSIZE_MAX; +} + /*! ZSTD_resetCCtx_internal() : - note : `params` are assumed fully validated at this stage */ + * @param loadedDictSize The size of the dictionary to be loaded + * into the context, if any. If no dictionary is used, or the + * dictionary is being attached / copied, then pass 0. + * note : `params` are assumed fully validated at this stage. + */ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - ZSTD_CCtx_params params, + ZSTD_CCtx_params const* params, U64 const pledgedSrcSize, + size_t const loadedDictSize, ZSTD_compResetPolicy_e const crp, ZSTD_buffered_policy_e const zbuff) { ZSTD_cwksp* const ws = &zc->workspace; - DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u", - (U32)pledgedSrcSize, params.cParams.windowLog); - assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d", + (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); zc->isFirstBlock = 1; - if (params.ldmParams.enableLdm) { + /* Set applied params early so we can modify them for LDM, + * and point params at the applied params. + */ + zc->appliedParams = *params; + params = &zc->appliedParams; + + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* Adjust long distance matching parameters */ - ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); - assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); - assert(params.ldmParams.hashRateLog < 32); + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); + assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog); + assert(params->ldmParams.hashRateLog < 32); } - { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize)); + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider = (params.cParams.minMatch==3) ? 3 : 4; + U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; size_t const maxNbSeq = blockSize / divider; - size_t const buffOutSize = (zbuff == ZSTDb_buffered && params.outBufferMode == ZSTD_bm_buffered) + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; - size_t const buffInSize = (zbuff == ZSTDb_buffered && params.inBufferMode == ZSTD_bm_buffered) + size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered) ? windowSize + blockSize : 0; - size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize); int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window); + int const dictTooBig = ZSTD_dictTooBig(loadedDictSize); ZSTD_indexResetPolicy_e needsIndexReset = - (!indexTooClose && zc->initialized) ? ZSTDirp_continue : ZSTDirp_reset; + (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue; size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( - ¶ms.cParams, ¶ms.ldmParams, zc->staticSize != 0, + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, buffInSize, buffOutSize, pledgedSrcSize); + int resizeWorkspace; + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); - /* Check if workspace is large enough, alloc a new one if needed */ - { + { /* Check if workspace is large enough, alloc a new one if needed */ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); - + resizeWorkspace = workspaceTooSmall || workspaceWasteful; DEBUGLOG(4, "Need %zu B workspace", neededSpace); DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); - if (workspaceTooSmall || workspaceWasteful) { + if (resizeWorkspace) { DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", ZSTD_cwksp_sizeof(ws) >> 10, neededSpace >> 10); @@ -1629,14 +1831,13 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE); - RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); + RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); } } ZSTD_cwksp_clear(ws); /* init params */ - zc->appliedParams = params; - zc->blockState.matchState.cParams = params.cParams; + zc->blockState.matchState.cParams = params->cParams; zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; zc->consumedSrcSize = 0; zc->producedCSize = 0; @@ -1667,11 +1868,11 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); /* ldm bucketOffsets table */ - if (params.ldmParams.enableLdm) { + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* TODO: avoid memset? */ size_t const numBuckets = - ((size_t)1) << (params.ldmParams.hashLog - - params.ldmParams.bucketSizeLog); + ((size_t)1) << (params->ldmParams.hashLog - + params->ldmParams.bucketSizeLog); zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); } @@ -1687,32 +1888,28 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, FORWARD_IF_ERROR(ZSTD_reset_matchState( &zc->blockState.matchState, ws, - ¶ms.cParams, + ¶ms->cParams, + params->useRowMatchFinder, crp, needsIndexReset, ZSTD_resetTarget_CCtx), ""); /* ldm hash table */ - if (params.ldmParams.enableLdm) { + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* TODO: avoid memset? */ - size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog; + size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); zc->maxNbLdmSequences = maxNbLdmSeq; ZSTD_window_init(&zc->ldmState.window); - ZSTD_window_clear(&zc->ldmState.window); zc->ldmState.loadedDictEnd = 0; } - /* Due to alignment, when reusing a workspace, we can actually consume - * up to 3 extra bytes for alignment. See the comments in zstd_cwksp.h - */ - assert(ZSTD_cwksp_used(ws) >= neededSpace && - ZSTD_cwksp_used(ws) <= neededSpace + 3); - DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); + zc->initialized = 1; return 0; @@ -1768,6 +1965,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { + DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu", + (unsigned long long)pledgedSrcSize); { ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams; unsigned const windowLog = params.cParams.windowLog; @@ -1783,7 +1982,9 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, cdict->dictContentSize, ZSTD_cpm_attachDict); params.cParams.windowLog = windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, ZSTDcrp_makeClean, zbuff), ""); assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy); } @@ -1827,15 +2028,17 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; assert(!cdict->matchState.dedicatedDictSearch); - - DEBUGLOG(4, "copying dictionary into context"); + DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu", + (unsigned long long)pledgedSrcSize); { unsigned const windowLog = params.cParams.windowLog; assert(windowLog != 0); /* Copy only compression parameters related to tables. */ params.cParams = *cdict_cParams; params.cParams.windowLog = windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + params.useRowMatchFinder = cdict->useRowMatchFinder; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff), ""); assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog); @@ -1843,17 +2046,30 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, } ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); + assert(params.useRowMatchFinder != ZSTD_ps_auto); /* copy tables */ - { size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog); + { size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */) + ? ((size_t)1 << cdict_cParams->chainLog) + : 0; size_t const hSize = (size_t)1 << cdict_cParams->hashLog; ZSTD_memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, hSize * sizeof(U32)); - ZSTD_memcpy(cctx->blockState.matchState.chainTable, + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { + ZSTD_memcpy(cctx->blockState.matchState.chainTable, cdict->matchState.chainTable, chainSize * sizeof(U32)); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { + size_t const tagTableSize = hSize*sizeof(U16); + ZSTD_memcpy(cctx->blockState.matchState.tagTable, + cdict->matchState.tagTable, + tagTableSize); + } } /* Zero the hashTable3, since the cdict never fills it */ @@ -1917,16 +2133,22 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { - DEBUGLOG(5, "ZSTD_copyCCtx_internal"); RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong, "Can't copy a ctx that's not in init stage."); - + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); { ZSTD_CCtx_params params = dstCCtx->requestedParams; /* Copy only compression parameters related to tables. */ params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; params.fParams = fParams; - ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); @@ -1938,7 +2160,11 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); /* copy tables */ - { size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog); + { size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy, + srcCCtx->appliedParams.useRowMatchFinder, + 0 /* forDDSDict */) + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; int const h3log = srcCCtx->blockState.matchState.hashLog3; size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; @@ -2005,6 +2231,8 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa int const nbRows = (int)size / ZSTD_ROWSIZE; int cellNb = 0; int rowNb; + /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ assert(size < (1U<<31)); /* can be casted to int */ @@ -2012,12 +2240,17 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa for (rowNb=0 ; rowNb < nbRows ; rowNb++) { int column; for (column=0; column<ZSTD_ROWSIZE; column++) { - if (preserveMark) { - U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0; - table[cellNb] += adder; + U32 newVal; + if (preserveMark && table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) { + /* This write is pointless, but is required(?) for the compiler + * to auto-vectorize the loop. */ + newVal = ZSTD_DUBT_UNSORTED_MARK; + } else if (table[cellNb] < reducerThreshold) { + newVal = 0; + } else { + newVal = table[cellNb] - reducerValue; } - if (table[cellNb] < reducerValue) table[cellNb] = 0; - else table[cellNb] -= reducerValue; + table[cellNb] = newVal; cellNb++; } } } @@ -2040,7 +2273,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); } - if (params->cParams.strategy != ZSTD_fast) { + if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) { U32 const chainSize = (U32)1 << params->cParams.chainLog; if (params->cParams.strategy == ZSTD_btlazy2) ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); @@ -2072,14 +2305,14 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) assert(nbSeq <= seqStorePtr->maxNbSeq); for (u=0; u<nbSeq; u++) { U32 const llv = sequences[u].litLength; - U32 const mlv = sequences[u].matchLength; + U32 const mlv = sequences[u].mlBase; llCodeTable[u] = (BYTE)ZSTD_LLcode(llv); - ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset); + ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase); mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv); } - if (seqStorePtr->longLengthID==1) + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) llCodeTable[seqStorePtr->longLengthPos] = MaxLL; - if (seqStorePtr->longLengthID==2) + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) mlCodeTable[seqStorePtr->longLengthPos] = MaxML; } @@ -2093,10 +2326,161 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) return (cctxParams->targetCBlockSize != 0); } -/* ZSTD_entropyCompressSequences_internal(): - * actually compresses both literals and sequences */ +/* ZSTD_blockSplitterEnabled(): + * Returns if block splitting param is being used + * If used, compression will do best effort to split a block in order to improve compression ratio. + * At the time this function is called, the parameter must be finalized. + * Returns 1 if true, 0 otherwise. */ +static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) +{ + DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter); + assert(cctxParams->useBlockSplitter != ZSTD_ps_auto); + return (cctxParams->useBlockSplitter == ZSTD_ps_enable); +} + +/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types + * and size of the sequences statistics + */ +typedef struct { + U32 LLtype; + U32 Offtype; + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +} ZSTD_symbolEncodingTypeStats_t; + +/* ZSTD_buildSequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field. + * Modifies `nextEntropy` to have the appropriate values as a side effect. + * nbSeq must be greater than 0. + * + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, + BYTE* dst, const BYTE* const dstEnd, + ZSTD_strategy strategy, unsigned* countWorkspace, + void* entropyWorkspace, size_t entropyWkspSize) { + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; + FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + ZSTD_symbolEncodingTypeStats_t stats; + + stats.lastCountSize = 0; + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ + { unsigned max = MaxLL; + size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; + stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + LLFSELog, prevEntropy->litlengthCTable, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, + sizeof(prevEntropy->litlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed"); + stats.size = countSize; + return stats; + } + if (stats.LLtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + /* build CTable for Offsets */ + { unsigned max = MaxOff; + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + OffFSELog, prevEntropy->offcodeCTable, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, + sizeof(prevEntropy->offcodeCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed"); + stats.size = countSize; + return stats; + } + if (stats.Offtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + /* build CTable for MatchLengths */ + { unsigned max = MaxML; + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; + stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + MLFSELog, prevEntropy->matchlengthCTable, + ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, + sizeof(prevEntropy->matchlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed"); + stats.size = countSize; + return stats; + } + if (stats.MLtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + stats.size = (size_t)(op-ostart); + return stats; +} + +/* ZSTD_entropyCompressSeqStore_internal(): + * compresses both literals and sequences + * Returns compressed size of block, or a zstd error. + */ +#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 MEM_STATIC size_t -ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, +ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, @@ -2110,36 +2494,38 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; - U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ const seqDef* const sequences = seqStorePtr->sequencesStart; + const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; - size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); - BYTE* seqHead; - BYTE* lastNCount = NULL; + size_t lastCountSize; entropyWorkspace = count + (MaxSeq + 1); entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); - DEBUGLOG(4, "ZSTD_entropyCompressSequences_internal (nbSeq=%zu)", nbSeq); + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); /* Compress literals */ { const BYTE* const literals = seqStorePtr->litStart; + size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; + size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); size_t const litSize = (size_t)(seqStorePtr->lit - literals); size_t const cSize = ZSTD_compressLiterals( &prevEntropy->huf, &nextEntropy->huf, cctxParams->cParams.strategy, - ZSTD_disableLiteralsCompression(cctxParams), + ZSTD_literalsCompressionIsDisabled(cctxParams), op, dstCapacity, literals, litSize, entropyWorkspace, entropyWkspSize, - bmi2); + bmi2, suspectUncompressible); FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); assert(cSize <= dstCapacity); op += cSize; @@ -2165,95 +2551,20 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); return (size_t)(op - ostart); } - - /* seqHead : flags for FSE encoding type */ - seqHead = op++; - assert(op <= oend); - - /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); - /* build CTable for Literal Lengths */ - { unsigned max = MaxLL; - size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ - DEBUGLOG(5, "Building LL table"); - nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode; - LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode, - count, max, mostFrequent, nbSeq, - LLFSELog, prevEntropy->fse.litlengthCTable, - LL_defaultNorm, LL_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(set_basic < set_compressed && set_rle < set_compressed); - assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - count, max, llCodeTable, nbSeq, - LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->fse.litlengthCTable, - sizeof(prevEntropy->fse.litlengthCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); - if (LLtype == set_compressed) - lastNCount = op; - op += countSize; - assert(op <= oend); - } } - /* build CTable for Offsets */ - { unsigned max = MaxOff; - size_t const mostFrequent = HIST_countFast_wksp( - count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ - /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ - ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; - DEBUGLOG(5, "Building OF table"); - nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode; - Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode, - count, max, mostFrequent, nbSeq, - OffFSELog, prevEntropy->fse.offcodeCTable, - OF_defaultNorm, OF_defaultNormLog, - defaultPolicy, strategy); - assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, - count, max, ofCodeTable, nbSeq, - OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->fse.offcodeCTable, - sizeof(prevEntropy->fse.offcodeCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); - if (Offtype == set_compressed) - lastNCount = op; - op += countSize; - assert(op <= oend); - } } - /* build CTable for MatchLengths */ - { unsigned max = MaxML; - size_t const mostFrequent = HIST_countFast_wksp( - count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); - nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode; - MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode, - count, max, mostFrequent, nbSeq, - MLFSELog, prevEntropy->fse.matchlengthCTable, - ML_defaultNorm, ML_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - count, max, mlCodeTable, nbSeq, - ML_defaultNorm, ML_defaultNormLog, MaxML, - prevEntropy->fse.matchlengthCTable, - sizeof(prevEntropy->fse.matchlengthCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); - if (MLtype == set_compressed) - lastNCount = op; - op += countSize; - assert(op <= oend); - } } - - *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + { + ZSTD_symbolEncodingTypeStats_t stats; + BYTE* seqHead = op++; + /* build stats for sequences */ + stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; + } { size_t const bitstreamSize = ZSTD_encodeSequences( op, (size_t)(oend - op), @@ -2273,9 +2584,9 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, * In this exceedingly rare case, we will simply emit an uncompressed * block, since it isn't worth optimizing. */ - if (lastNCount && (op - lastNCount) < 4) { - /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ - assert(op - lastNCount == 3); + if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { + /* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(lastCountSize + bitstreamSize == 3); DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " "emitting an uncompressed block."); return 0; @@ -2287,7 +2598,7 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, } MEM_STATIC size_t -ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, +ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, @@ -2296,7 +2607,7 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, void* entropyWorkspace, size_t entropyWkspSize, int bmi2) { - size_t const cSize = ZSTD_entropyCompressSequences_internal( + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( seqStorePtr, prevEntropy, nextEntropy, cctxParams, dst, dstCapacity, entropyWorkspace, entropyWkspSize, bmi2); @@ -2306,20 +2617,20 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, */ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) return 0; /* block not compressed */ - FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSequences_internal failed"); + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); /* Check compressibility */ { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); if (cSize >= maxCSize) return 0; /* block not compressed */ } - DEBUGLOG(4, "ZSTD_entropyCompressSequences() cSize: %zu\n", cSize); + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); return cSize; } /* ZSTD_selectBlockCompressor() : * Not static, but internal use only (used by long distance matcher) * assumption : strat is a valid strategy */ -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode) +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) { static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { { ZSTD_compressBlock_fast /* default for 0 */, @@ -2367,7 +2678,28 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMo ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); - selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { + { ZSTD_compressBlock_greedy_row, + ZSTD_compressBlock_lazy_row, + ZSTD_compressBlock_lazy2_row }, + { ZSTD_compressBlock_greedy_extDict_row, + ZSTD_compressBlock_lazy_extDict_row, + ZSTD_compressBlock_lazy2_extDict_row }, + { ZSTD_compressBlock_greedy_dictMatchState_row, + ZSTD_compressBlock_lazy_dictMatchState_row, + ZSTD_compressBlock_lazy2_dictMatchState_row }, + { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { + selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; + } assert(selectedCompressor != NULL); return selectedCompressor; } @@ -2383,7 +2715,7 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) { ssPtr->lit = ssPtr->litStart; ssPtr->sequences = ssPtr->sequencesStart; - ssPtr->longLengthID = 0; + ssPtr->longLengthType = ZSTD_llt_none; } typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; @@ -2430,15 +2762,16 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i]; } if (zc->externSeqStore.pos < zc->externSeqStore.size) { - assert(!zc->appliedParams.ldmParams.enableLdm); + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&zc->externSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(zc->externSeqStore.pos <= zc->externSeqStore.size); - } else if (zc->appliedParams.ldmParams.enableLdm) { + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { rawSeqStore_t ldmSeqStore = kNullRawSeqStore; ldmSeqStore.seq = zc->ldmSequences; @@ -2452,10 +2785,13 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) ZSTD_ldm_blockCompress(&ldmSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); } else { /* not long range mode */ - ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode); + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); ms->ldmSeqStore = NULL; lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } @@ -2483,22 +2819,22 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); for (i = 0; i < seqStoreSeqSize; ++i) { - U32 rawOffset = seqStoreSeqs[i].offset - ZSTD_REP_NUM; + U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; outSeqs[i].litLength = seqStoreSeqs[i].litLength; - outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH; + outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; outSeqs[i].rep = 0; if (i == seqStore->longLengthPos) { - if (seqStore->longLengthID == 1) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { outSeqs[i].litLength += 0x10000; - } else if (seqStore->longLengthID == 2) { + } else if (seqStore->longLengthType == ZSTD_llt_matchLength) { outSeqs[i].matchLength += 0x10000; } } - if (seqStoreSeqs[i].offset <= ZSTD_REP_NUM) { + if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { /* Derive the correct offset corresponding to a repcode */ - outSeqs[i].rep = seqStoreSeqs[i].offset; + outSeqs[i].rep = seqStoreSeqs[i].offBase; if (outSeqs[i].litLength != 0) { rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; } else { @@ -2512,9 +2848,9 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) outSeqs[i].offset = rawOffset; /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode so we provide seqStoreSeqs[i].offset - 1 */ - updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, - seqStoreSeqs[i].offset - 1, - seqStoreSeqs[i].litLength == 0); + ZSTD_updateRep(updatedRepcodes.rep, + seqStoreSeqs[i].offBase - 1, + seqStoreSeqs[i].litLength == 0); literalsRead += outSeqs[i].litLength; } /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. @@ -2602,16 +2938,740 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) return nbSeqs < 4 && nbLits < 10; } -static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc) +static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) +{ + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; + bs->nextCBlock = tmp; +} + +/* Writes the block header */ +static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); +} + +/* ZSTD_buildBlockEntropyStats_literals() : + * Builds entropy for the literals. + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace + * @return : size of huffman description table or error code */ +static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_hufCTablesMetadata_t* hufMetadata, + const int literalsCompressionIsDisabled, + void* workspace, size_t wkspSize) +{ + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; + BYTE* const countWkspStart = wkspStart; + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; + const size_t nodeWkspSize = wkspEnd-nodeWksp; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + unsigned huffLog = HUF_TABLELOG_DEFAULT; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (literalsCompressionIsDisabled) { + DEBUGLOG(5, "set_basic - disabled"); + hufMetadata->hType = set_basic; + return 0; + } + + /* small ? don't even attempt compression (speed opt) */ +#ifndef COMPRESS_LITERALS_SIZE_MIN +#define COMPRESS_LITERALS_SIZE_MIN 63 +#endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Scan input and build symbol stats */ + { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Validate the previous Huffman table */ + if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; + { /* Build and write the CTable */ + size_t const newCSize = HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize = HUF_writeCTable_wksp( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + /* Check against repeating the previous CTable */ + if (repeat != HUF_repeat_none) { + size_t const oldCSize = HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); + if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_repeat; + return 0; + } + } + if (newCSize + hSize >= srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_basic; + return 0; + } + DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); + hufMetadata->hType = set_compressed; + nextHuf->repeatMode = HUF_repeat_check; + return hSize; + } + } +} + + +/* ZSTD_buildDummySequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set_basic, + * and updates nextEntropy to the appropriate repeatMode. + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; + return stats; +} + +/* ZSTD_buildBlockEntropyStats_sequences() : + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. + * @return : size of fse tables or error code */ +static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + const ZSTD_fseCTables_t* prevEntropy, + ZSTD_fseCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize) +{ + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; + unsigned* countWorkspace = (unsigned*)workspace; + unsigned* entropyWorkspace = countWorkspace + (MaxSeq + 1); + size_t entropyWorkspaceSize = wkspSize - (MaxSeq + 1) * sizeof(*countWorkspace); + ZSTD_symbolEncodingTypeStats_t stats; + + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq); + stats = nbSeq != 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + prevEntropy, nextEntropy, op, oend, + strategy, countWorkspace, + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; + fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; + fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; +} + + +/* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * Requires workspace size ENTROPY_WORKSPACE_SIZE + * + * @return : 0 on success or error code + */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize) +{ + size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, + &prevEntropy->fse, &nextEntropy->fse, + cctxParams, + &entropyMetadata->fseMetadata, + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed"); + return 0; +} + +/* Returns the size estimate for the literals section (header + content) of a block */ +static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + size_t literalSectionHeaderSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB); + U32 singleStream = litSize < 256; + + if (hufMetadata->hType == set_basic) return litSize; + else if (hufMetadata->hType == set_rle) return 1; + else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; + if (!singleStream) cLitSizeEstimate += 6; /* multi-stream huffman uses 6-byte jump table */ + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + const BYTE* codeTable, size_t nbSeq, unsigned maxCode, + const FSE_CTable* fseCTable, + const U8* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; + const BYTE* const ctStart = ctp; + const BYTE* const ctEnd = ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits = 0; + unsigned max = maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ + if (type == set_basic) { + /* We selected this encoding type, so it must be valid. */ + assert(max <= defaultMax); + (void)defaultMax; + cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); + } else if (type == set_rle) { + cSymbolTypeSizeEstimateInBits = 0; + } else if (type == set_compressed || type == set_repeat) { + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { + return nbSeq * 10; + } + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits >> 3; +} + +/* Returns the size estimate for the sequences section (header + content) of a block */ +static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, + fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, + fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, + fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, MaxML, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +/* Returns the size estimate for a given stream of literals, of, ll, ml */ +static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) { + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; +} + +/* Builds entropy statistics and uses them for blocksize estimation. + * + * Returns the estimated compressed size of the seqStore, or a zstd error. + */ +static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { + ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); + return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), + &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); +} + +/* Returns literals bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { + size_t literalsBytes = 0; + size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; + } + } + return literalsBytes; +} + +/* Returns match bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + size_t matchBytes = 0; + size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; + } + } + return matchBytes; +} + +/* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. + */ +static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, + size_t startIdx, size_t endIdx) { + BYTE* const litEnd = originalSeqStore->lit; + size_t literalsBytes; + size_t literalsBytesPreceding = 0; + + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; + literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct position if necessary */ + if (originalSeqStore->longLengthType != ZSTD_llt_none) { + if (originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) { + resultSeqStore->longLengthType = ZSTD_llt_none; + } else { + resultSeqStore->longLengthPos -= (U32)startIdx; + } + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; + literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ + resultSeqStore->lit = litEnd; + } else { + resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; + resultSeqStore->ofCode += startIdx; +} + +/* + * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. + * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ +static U32 +ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +{ + U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ + assert(STORED_IS_REPCODE(offCode)); + if (adjustedOffCode == ZSTD_REP_NUM) { + /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ + assert(rep[0] > 0); + return rep[0] - 1; + } + return rep[adjustedOffCode]; +} + +/* + * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise + * due to emission of RLE/raw blocks that disturb the offset history, + * and replaces any repcodes within the seqStore that may be invalid. + * + * dRepcodes are updated as would be on the decompression side. + * cRepcodes are updated exactly in accordance with the seqStore. + * + * Note : this function assumes seq->offBase respects the following numbering scheme : + * 0 : invalid + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, + seqStore_t* const seqStore, U32 const nbSeq) { + U32 idx = 0; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; + U32 const ll0 = (seq->litLength == 0); + U32 const offCode = OFFBASE_TO_STORED(seq->offBase); + assert(seq->offBase > 0); + if (STORED_IS_REPCODE(offCode)) { + U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); + U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { + seq->offBase = cRawOffset + ZSTD_REP_NUM; + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ + ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); + ZSTD_updateRep(cRepcodes->rep, offCode, ll0); + } +} + +/* ZSTD_compressSeqStore_singleBlock(): + * Compresses a seqStore into a block with a block header, into the buffer dst. + * + * Returns the total size of that block (including header) or a ZSTD error code. + */ +static size_t +ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) { - ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; - zc->blockState.prevCBlock = zc->blockState.nextCBlock; - zc->blockState.nextCBlock = tmp; + const U32 rleMaxLength = 25; + BYTE* op = (BYTE*)dst; + const BYTE* ip = (const BYTE*)src; + size_t cSize; + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ + repcodes_t const dRepOriginal = *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit"); + cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + + if (!zc->isFirstBlock && + cSeqsSize < rleMaxLength && + ZSTD_isRLE((BYTE const*)src, srcSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + cSeqsSize = 1; + } + + if (zc->seqCollector.collectSequences) { + ZSTD_copyBlockSequences(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); + DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); + DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; + DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +/* Struct to keep track of where we are in our recursive calls. */ +typedef struct { + U32* splitLocations; /* Array of split indices */ + size_t idx; /* The current index within splitLocations being worked on */ +} seqStoreSplits; + +#define MIN_SEQUENCES_BLOCK_SPLITTING 300 + +/* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. + * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then + * we do not recurse. + * + * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * + * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ +static void +ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) +{ + seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; + seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; + seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { + DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); + return; + } + DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); + DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; + ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore); + } +} + +/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. + * + * Returns the number of splits made (which equals the size of the partition table - 1). + */ +static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { + seqStoreSplits splits = {partitions, 0}; + if (nbSeq <= 4) { + DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } + ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore); + splits.splitLocations[splits.idx] = nbSeq; + DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx+1); + return splits.idx; +} + +/* ZSTD_compressBlock_splitBlock(): + * Attempts to split a given block into multiple blocks to improve compression ratio. + * + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ +static size_t +ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, + const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) +{ + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; + U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ + seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; + seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; + size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two + * separate repcode histories that simulate repcode history on compression and decompression side, + * and use the histories to determine whether we must replace a particular repcode with its raw offset. + * + * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed + * or RLE. This allows us to retrieve the offset value that an invalid repcode references within + * a nocompress/RLE block. + * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use + * the replacement offset value rather than the original repcode to update the repcode history. + * dRep also will be the final repcode history sent to the next block. + * + * See ZSTD_seqStore_resolveOffCodes() for more details. + */ + repcodes_t dRep; + repcodes_t cRep; + ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { + size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, + &dRep, &cRep, + op, dstCapacity, + ip, blockSize, + lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); + assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { + size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + + srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ + srcBytes += blockSize - srcBytesTotal; + lastBlockEntireSrc = lastBlock; + } else { + ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]); + } + + cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore, + &dRep, &cRep, + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); + DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; + op += cSizeChunk; + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; + assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + } + /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes + * for the next block. + */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +} + +static size_t +ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) +{ + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); + assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); + } + + cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq); + FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); + return cSize; } -static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, U32 frame) +static size_t +ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) { /* This the upper bound for the length of an rle block. * This isn't the actual upper bound. Finding the real threshold @@ -2632,12 +3692,12 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, if (zc->seqCollector.collectSequences) { ZSTD_copyBlockSequences(zc); - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } /* encode sequences and literals */ - cSize = ZSTD_entropyCompressSequences(&zc->seqStore, + cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, dst, dstCapacity, @@ -2645,12 +3705,6 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, zc->bmi2); - if (zc->seqCollector.collectSequences) { - ZSTD_copyBlockSequences(zc); - return 0; - } - - if (frame && /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." @@ -2666,7 +3720,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, out: if (!ZSTD_isError(cSize) && cSize > 1) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); } /* We check that dictionaries have offset codes available for the first * block. After the first block, the offcode table might not have large @@ -2719,7 +3773,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return cSize; } } @@ -2759,9 +3813,9 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, void const* ip, void const* iend) { - if (ZSTD_window_needOverflowCorrection(ms->window, iend)) { - U32 const maxDist = (U32)1 << params->cParams.windowLog; - U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); + U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); + U32 const maxDist = (U32)1 << params->cParams.windowLog; + if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) { U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); @@ -2784,7 +3838,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, * Frame is supposed already started (header already produced) * @return : compressed size, or an error code */ -static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, +static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastFrameChunk) @@ -2814,6 +3868,7 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, ZSTD_overflowCorrectIfNeeded( ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); + ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); /* Ensure hash/chain table insertion resumes no sooner than lowlimit */ if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; @@ -2824,6 +3879,10 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed"); assert(cSize > 0); assert(cSize <= blockSize + ZSTD_blockHeaderSize); + } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed"); + assert(cSize > 0 || cctx->seqCollector.collectSequences == 1); } else { cSize = ZSTD_compressBlock_internal(cctx, op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, @@ -2946,7 +4005,7 @@ size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSe { RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, "wrong cctx stage"); - RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm, + RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, parameter_unsupported, "incompatible with ldm"); cctx->externSeqStore.seq = seq; @@ -2983,11 +4042,12 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, if (!srcSize) return fhSize; /* do not generate an empty block if no input */ - if (!ZSTD_window_update(&ms->window, src, srcSize)) { + if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) { + ms->forceNonContiguous = 0; ms->nextToUpdate = ms->window.dictLimit; } - if (cctx->appliedParams.ldmParams.enableLdm) { - ZSTD_window_update(&cctx->ldmState.window, src, srcSize); + if (cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0); } if (!frame) { @@ -3055,63 +4115,86 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, { const BYTE* ip = (const BYTE*) src; const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; - ZSTD_window_update(&ms->window, src, srcSize); + /* Assert that we the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ + U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; + /* We must have cleared our windows when our source is this large. */ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) + assert(ZSTD_window_isEmpty(ls->window)); + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; + } + } + + DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); + ms->forceNonContiguous = params->deterministicRefPrefix; - if (params->ldmParams.enableLdm && ls != NULL) { - ZSTD_window_update(&ls->window, src, srcSize); + if (loadLdmDict) { + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); } - /* Assert that we the ms params match the params we're being given */ - ZSTD_assertEqualCParams(params->cParams, ms->cParams); - if (srcSize <= HASH_READ_SIZE) return 0; - while (iend - ip > HASH_READ_SIZE) { - size_t const remaining = (size_t)(iend - ip); - size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX); - const BYTE* const ichunk = ip + chunk; - - ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk); + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); - if (params->ldmParams.enableLdm && ls != NULL) - ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src + srcSize, ¶ms->ldmParams); + if (loadLdmDict) + ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); - switch(params->cParams.strategy) - { - case ZSTD_fast: - ZSTD_fillHashTable(ms, ichunk, dtlm); - break; - case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, ichunk, dtlm); - break; + switch(params->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, dtlm); + break; + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, iend, dtlm); + break; - case ZSTD_greedy: - case ZSTD_lazy: - case ZSTD_lazy2: - if (chunk >= HASH_READ_SIZE && ms->dedicatedDictSearch) { - assert(chunk == remaining); /* must load everything in one go */ - ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HASH_READ_SIZE); - } else if (chunk >= HASH_READ_SIZE) { - ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE); + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); + ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE); + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { + size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); + } else { + ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); } - break; - - case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ - case ZSTD_btopt: - case ZSTD_btultra: - case ZSTD_btultra2: - if (chunk >= HASH_READ_SIZE) - ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk); - break; - - default: - assert(0); /* not possible : not a valid strategy id */ } + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + assert(srcSize >= HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); + break; - ip = ichunk; + default: + assert(0); /* not possible : not a valid strategy id */ } ms->nextToUpdate = (U32)(iend - ms->window.base); @@ -3250,7 +4333,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, const BYTE* const dictEnd = dictPtr + dictSize; size_t dictID; size_t eSize; - ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); assert(dictSize >= 8); assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); @@ -3321,6 +4403,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { + size_t const dictContentSize = cdict ? cdict->dictContentSize : dictSize; DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog); /* params are supposed to be fully validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); @@ -3335,7 +4418,8 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); } - FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSize, + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + dictContentSize, ZSTDcrp_makeClean, zbuff) , ""); { size_t const dictID = cdict ? ZSTD_compress_insertDictionary( @@ -3350,7 +4434,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= UINT_MAX); cctx->dictID = (U32)dictID; - cctx->dictContentSize = cdict ? cdict->dictContentSize : dictSize; + cctx->dictContentSize = dictContentSize; } return 0; } @@ -3485,15 +4569,14 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, const void* dict,size_t dictSize, ZSTD_parameters params) { - ZSTD_CCtx_params cctxParams; DEBUGLOG(4, "ZSTD_compress_advanced"); FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); - ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, ZSTD_NO_CLEVEL); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, ZSTD_NO_CLEVEL); return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, - &cctxParams); + &cctx->simpleApiParams); } /* Internal */ @@ -3517,14 +4600,13 @@ size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_CCtx_params cctxParams; { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict); assert(params.fParams.contentSizeFlag == 1); - ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); } DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); - return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams); } size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, @@ -3561,7 +4643,10 @@ size_t ZSTD_estimateCDictSize_advanced( DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) - + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small + * in case we are using DDS with row-hash. */ + + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), + /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); } @@ -3592,9 +4677,6 @@ static size_t ZSTD_initCDict_internal( assert(!ZSTD_checkCParams(params.cParams)); cdict->matchState.cParams = params.cParams; cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch; - if (cdict->matchState.dedicatedDictSearch && dictSize > ZSTD_CHUNKSIZE_MAX) { - cdict->matchState.dedicatedDictSearch = 0; - } if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { cdict->dictContent = dictBuffer; } else { @@ -3615,6 +4697,7 @@ static size_t ZSTD_initCDict_internal( &cdict->matchState, &cdict->workspace, ¶ms.cParams, + params.useRowMatchFinder, ZSTDcrp_makeClean, ZSTDirp_reset, ZSTD_resetTarget_CDict), ""); @@ -3638,14 +4721,17 @@ static size_t ZSTD_initCDict_internal( static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, - ZSTD_compressionParameters cParams, ZSTD_customMem customMem) + ZSTD_compressionParameters cParams, + ZSTD_paramSwitch_e useRowMatchFinder, + U32 enableDedicatedDictSearch, + ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; { size_t const workspaceSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + - ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); @@ -3664,7 +4750,7 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_cwksp_move(&cdict->workspace, &ws); cdict->customMem = customMem; cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */ - + cdict->useRowMatchFinder = useRowMatchFinder; return cdict; } } @@ -3686,7 +4772,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, &cctxParams, customMem); } -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( +ZSTD_CDict* ZSTD_createCDict_advanced2( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, @@ -3716,10 +4802,13 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); } + DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch); cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); cdict = ZSTD_createCDict_advanced_internal(dictSize, dictLoadMethod, cctxParams.cParams, + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, customMem); if (ZSTD_isError( ZSTD_initCDict_internal(cdict, @@ -3788,7 +4877,9 @@ const ZSTD_CDict* ZSTD_initStaticCDict( ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams) { - size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0); + ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) @@ -3813,6 +4904,8 @@ const ZSTD_CDict* ZSTD_initStaticCDict( ZSTD_CCtxParams_init(¶ms, 0); params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, @@ -3839,15 +4932,15 @@ unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict) return cdict->dictID; } - -/* ZSTD_compressBegin_usingCDict_advanced() : - * cdict must be != NULL */ -size_t ZSTD_compressBegin_usingCDict_advanced( +/* ZSTD_compressBegin_usingCDict_internal() : + * Implementation of various ZSTD_compressBegin_usingCDict* functions. + */ +static size_t ZSTD_compressBegin_usingCDict_internal( ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) { ZSTD_CCtx_params cctxParams; - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal"); RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!"); /* Initialize the cctxParams from the cdict */ { @@ -3879,25 +4972,48 @@ size_t ZSTD_compressBegin_usingCDict_advanced( ZSTDb_not_buffered); } + +/* ZSTD_compressBegin_usingCDict_advanced() : + * This function is DEPRECATED. + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize); +} + /* ZSTD_compressBegin_usingCDict() : - * pledgedSrcSize=0 means "unknown" - * if pledgedSrcSize>0, it will enable contentSizeFlag */ + * cdict must be != NULL */ size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag); - return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); } -size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, +/*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) { - FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); } +/*! ZSTD_compress_usingCDict_advanced(): + * This function is DEPRECATED. + */ +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +} + /*! ZSTD_compress_usingCDict() : * Compression using a digested Dictionary. * Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. @@ -3909,7 +5025,7 @@ size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; - return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); } @@ -4313,8 +5429,13 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ - if (cctx->cdict) - params.compressionLevel = cctx->cdict->compressionLevel; /* let cdict take priority in terms of compression level */ + if (cctx->cdict && !cctx->localDict.cdict) { + /* Let the cdict's compression level take priority over the requested params. + * But do not take the cdict's compression level if the "cdict" is actually a localDict + * generated from ZSTD_initLocalDict(). + */ + params.compressionLevel = cctx->cdict->compressionLevel; + } DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ { @@ -4327,11 +5448,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, dictSize, mode); } - if (ZSTD_CParams_shouldEnableLdm(¶ms.cParams)) { - /* Enable LDM by default for optimal parser and window size >= 128MB */ - DEBUGLOG(4, "LDM enabled by default (window size >= 128MB, strategy >= btopt)"); - params.ldmParams.enableLdm = 1; - } + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); @@ -4436,39 +5555,39 @@ typedef struct { size_t posInSrc; /* Number of bytes given by sequences provided so far */ } ZSTD_sequencePosition; -/* Returns a ZSTD error code if sequence is not valid */ -static size_t ZSTD_validateSequence(U32 offCode, U32 matchLength, - size_t posInSrc, U32 windowLog, size_t dictSize, U32 minMatch) { - size_t offsetBound; - U32 windowSize = 1 << windowLog; +/* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ +static size_t +ZSTD_validateSequence(U32 offCode, U32 matchLength, + size_t posInSrc, U32 windowLog, size_t dictSize) +{ + U32 const windowSize = 1 << windowLog; /* posInSrc represents the amount of data the decoder would decode up to this point. * As long as the amount of data decoded is less than or equal to window size, offsets may be * larger than the total length of output decoded in order to reference the dict, even larger than * window size. After output surpasses windowSize, we're limited to windowSize offsets again. */ - offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; - RETURN_ERROR_IF(offCode > offsetBound + ZSTD_REP_MOVE, corruption_detected, "Offset too large!"); - RETURN_ERROR_IF(matchLength < minMatch, corruption_detected, "Matchlength too small"); + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; + RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); + RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); return 0; } /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ -static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) { - U32 offCode = rawOffset + ZSTD_REP_MOVE; - U32 repCode = 0; +static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) +{ + U32 offCode = STORE_OFFSET(rawOffset); if (!ll0 && rawOffset == rep[0]) { - repCode = 1; + offCode = STORE_REPCODE_1; } else if (rawOffset == rep[1]) { - repCode = 2 - ll0; + offCode = STORE_REPCODE(2 - ll0); } else if (rawOffset == rep[2]) { - repCode = 3 - ll0; + offCode = STORE_REPCODE(3 - ll0); } else if (ll0 && rawOffset == rep[0] - 1) { - repCode = 3; - } - if (repCode) { - /* ZSTD_storeSeq expects a number in the range [0, 2] to represent a repcode */ - offCode = repCode - 1; + offCode = STORE_REPCODE_3; } return offCode; } @@ -4476,18 +5595,17 @@ static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 /* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. */ -static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, - const void* src, size_t blockSize) { +static size_t +ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize) +{ U32 idx = seqPos->idx; BYTE const* ip = (BYTE const*)(src); const BYTE* const iend = ip + blockSize; repcodes_t updatedRepcodes; U32 dictSize; - U32 litLength; - U32 matchLength; - U32 ll0; - U32 offCode; if (cctx->cdict) { dictSize = (U32)cctx->cdict->dictContentSize; @@ -4498,23 +5616,22 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZS } ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { - litLength = inSeqs[idx].litLength; - matchLength = inSeqs[idx].matchLength; - ll0 = litLength == 0; - offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); - updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); + U32 const litLength = inSeqs[idx].litLength; + U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; + U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); if (cctx->appliedParams.validateSequences) { seqPos->posInSrc += litLength + matchLength; FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, - cctx->appliedParams.cParams.windowLog, dictSize, - cctx->appliedParams.cParams.minMatch), + cctx->appliedParams.cParams.windowLog, dictSize), "Sequence validation failed"); } RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength - MINMATCH); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ip += matchLength + litLength; } ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); @@ -4541,9 +5658,11 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZS * avoid splitting a match, or to avoid splitting a match such that it would produce a match * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. */ -static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, - const void* src, size_t blockSize) { +static size_t +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize) +{ U32 idx = seqPos->idx; U32 startPosInSequence = seqPos->posInSequence; U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; @@ -4553,10 +5672,6 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq repcodes_t updatedRepcodes; U32 bytesAdjustment = 0; U32 finalMatchSplit = 0; - U32 litLength; - U32 matchLength; - U32 rawOffset; - U32 offCode; if (cctx->cdict) { dictSize = cctx->cdict->dictContentSize; @@ -4570,9 +5685,10 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { const ZSTD_Sequence currSeq = inSeqs[idx]; - litLength = currSeq.litLength; - matchLength = currSeq.matchLength; - rawOffset = currSeq.offset; + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; + U32 offCode; /* Modify the sequence depending on where endPosInSequence lies */ if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { @@ -4625,22 +5741,21 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq } } /* Check if this offset can be represented with a repcode */ - { U32 ll0 = (litLength == 0); + { U32 const ll0 = (litLength == 0); offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); - updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); } if (cctx->appliedParams.validateSequences) { seqPos->posInSrc += litLength + matchLength; FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, - cctx->appliedParams.cParams.windowLog, dictSize, - cctx->appliedParams.cParams.minMatch), + cctx->appliedParams.cParams.windowLog, dictSize), "Sequence validation failed"); } DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength - MINMATCH); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ip += matchLength + litLength; } DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); @@ -4665,7 +5780,8 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, const void* src, size_t blockSize); -static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) { +static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) +{ ZSTD_sequenceCopier sequenceCopier = NULL; assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); if (mode == ZSTD_sf_explicitBlockDelimiters) { @@ -4679,12 +5795,15 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) /* Compress, block-by-block, all of the sequences given. * - * Returns the cumulative size of all compressed blocks (including their headers), otherwise a ZSTD error. + * Returns the cumulative size of all compressed blocks (including their headers), + * otherwise a ZSTD error. */ -static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const ZSTD_Sequence* inSeqs, size_t inSeqsSize, - const void* src, size_t srcSize) { +static size_t +ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) +{ size_t cSize = 0; U32 lastBlock; size_t blockSize; @@ -4694,7 +5813,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, BYTE const* ip = (BYTE const*)src; BYTE* op = (BYTE*)dst; - ZSTD_sequenceCopier sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); + ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); /* Special case: empty frame */ @@ -4732,7 +5851,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, continue; } - compressedSeqsSize = ZSTD_entropyCompressSequences(&cctx->seqStore, + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, &cctx->appliedParams, op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, @@ -4764,7 +5883,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, } else { U32 cBlockHeader; /* Error checking and repcodes update */ - ZSTD_confirmRepcodesAndEntropyTables(cctx); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; @@ -4794,7 +5913,8 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, const ZSTD_Sequence* inSeqs, size_t inSeqsSize, - const void* src, size_t srcSize) { + const void* src, size_t srcSize) +{ BYTE* op = (BYTE*)dst; size_t cSize = 0; size_t compressedBlocksSize = 0; @@ -4861,117 +5981,11 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) /*-===== Pre-defined compression levels =====-*/ +#include "clevels.h" -#define ZSTD_MAX_CLEVEL 22 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } - -static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { -{ /* "default" - for any srcSize > 256 KB */ - /* W, C, H, S, L, TL, strat */ - { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ - { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ - { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ - { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ - { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ - { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */ - { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */ - { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */ - { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */ - { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ - { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ - { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ - { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */ - { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */ - { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ - { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ - { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ - { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ - { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ - { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ - { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ - { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ - { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ -}, -{ /* for srcSize <= 256 KB */ - /* W, C, H, S, L, T, strat */ - { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ - { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ - { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ - { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ - { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/ - { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/ - { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ - { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ - { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ - { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ - { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ - { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ - { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ - { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ - { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ - { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ - { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ - { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -{ /* for srcSize <= 128 KB */ - /* W, C, H, S, L, T, strat */ - { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ - { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ - { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ - { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ - { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ - { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ - { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ - { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ - { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ - { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ - { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ - { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ - { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ - { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ - { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ - { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ - { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ - { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -{ /* for srcSize <= 16 KB */ - /* W, C, H, S, L, T, strat */ - { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ - { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ - { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ - { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ - { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ - { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ - { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ - { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ - { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ - { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ - { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ - { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ - { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ - { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ - { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ - { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ - { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ - { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ - { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ - { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -}; +int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; } static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize) { @@ -4999,7 +6013,7 @@ static int ZSTD_dedicatedDictSearch_isSupported( { return (cParams->strategy >= ZSTD_greedy) && (cParams->strategy <= ZSTD_lazy2) - && (cParams->hashLog >= cParams->chainLog) + && (cParams->hashLog > cParams->chainLog) && (cParams->chainLog <= 24); } @@ -5018,6 +6032,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( case ZSTD_lazy: case ZSTD_lazy2: cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG; + if (cParams->hashLog < ZSTD_HASHLOG_MIN) { + cParams->hashLog = ZSTD_HASHLOG_MIN; + } break; case ZSTD_btlazy2: case ZSTD_btopt: @@ -5066,6 +6083,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, else row = compressionLevel; { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; + DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy); /* acceleration factor */ if (compressionLevel < 0) { int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel); diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h index 685d2f996cc2..71697a11ae30 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -57,7 +57,7 @@ typedef struct { } ZSTD_localDict; typedef struct { - HUF_CElt CTable[HUF_CTABLE_SIZE_U32(255)]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)]; HUF_repeat repeatMode; } ZSTD_hufCTables_t; @@ -75,8 +75,55 @@ typedef struct { ZSTD_fseCTables_t fse; } ZSTD_entropyCTables_t; +/* ********************************************* +* Entropy buffer statistics structs and funcs * +***********************************************/ +/* ZSTD_hufCTablesMetadata_t : + * Stores Literals Block Type for a super-block in hType, and + * huffman tree description in hufDesBuffer. + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ typedef struct { - U32 off; /* Offset code (offset + ZSTD_REP_MOVE) for the match */ + symbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; +} ZSTD_hufCTablesMetadata_t; + +/* ZSTD_fseCTablesMetadata_t : + * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and + * fse tables in fseTablesBuffer. + * fseTablesSize refers to the size of fse tables in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ +typedef struct { + symbolEncodingType_e llType; + symbolEncodingType_e ofType; + symbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +} ZSTD_fseCTablesMetadata_t; + +typedef struct { + ZSTD_hufCTablesMetadata_t hufMetadata; + ZSTD_fseCTablesMetadata_t fseMetadata; +} ZSTD_entropyCTablesMetadata_t; + +/* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize); + +/* ******************************* +* Compression internals structs * +*********************************/ + +typedef struct { + U32 off; /* Offset sumtype code for the match, using ZSTD_storeSeq() format */ U32 len; /* Raw length of match */ } ZSTD_match_t; @@ -126,7 +173,7 @@ typedef struct { U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ - ZSTD_literalCompressionMode_e literalCompressionMode; + ZSTD_paramSwitch_e literalCompressionMode; } optState_t; typedef struct { @@ -135,14 +182,23 @@ typedef struct { } ZSTD_compressedBlockState_t; typedef struct { - BYTE const* nextSrc; /* next block here to continue on current prefix */ - BYTE const* base; /* All regular indexes relative to this position */ - BYTE const* dictBase; /* extDict indexes relative to this position */ - U32 dictLimit; /* below that point, need extDict */ - U32 lowLimit; /* below that point, no more valid data */ + BYTE const* nextSrc; /* next block here to continue on current prefix */ + BYTE const* base; /* All regular indexes relative to this position */ + BYTE const* dictBase; /* extDict indexes relative to this position */ + U32 dictLimit; /* below that point, need extDict */ + U32 lowLimit; /* below that point, no more valid data */ + U32 nbOverflowCorrections; /* Number of times overflow correction has run since + * ZSTD_window_init(). Useful for debugging coredumps + * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY. + */ } ZSTD_window_t; +#define ZSTD_WINDOW_START_INDEX 2 + typedef struct ZSTD_matchState_t ZSTD_matchState_t; + +#define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ + struct ZSTD_matchState_t { ZSTD_window_t window; /* State for window round buffer management */ U32 loadedDictEnd; /* index of end of dictionary, within context's referential. @@ -154,9 +210,17 @@ struct ZSTD_matchState_t { */ U32 nextToUpdate; /* index from which to continue table update */ U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ + U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ + U32* hashTable; U32* hashTable3; U32* chainTable; + + U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ + int dedicatedDictSearch; /* Indicates whether this matchState is using the * dedicated dictionary search structure. */ @@ -196,7 +260,7 @@ typedef struct { } ldmState_t; typedef struct { - U32 enableLdm; /* 1 if enable long distance matching */ + ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */ U32 hashLog; /* Log size of hashTable */ U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ U32 minMatchLength; /* Minimum match length */ @@ -227,7 +291,7 @@ struct ZSTD_CCtx_params_s { * There is no guarantee that hint is close to actual source size */ ZSTD_dictAttachPref_e attachDictPref; - ZSTD_literalCompressionMode_e literalCompressionMode; + ZSTD_paramSwitch_e literalCompressionMode; /* Multithreading: used to pass parameters to mtctx */ int nbWorkers; @@ -249,6 +313,15 @@ struct ZSTD_CCtx_params_s { ZSTD_sequenceFormat_e blockDelimiters; int validateSequences; + /* Block splitting */ + ZSTD_paramSwitch_e useBlockSplitter; + + /* Param for deciding whether to use row-based matchfinder */ + ZSTD_paramSwitch_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ + int deterministicRefPrefix; + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ ZSTD_customMem customMem; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ @@ -266,12 +339,29 @@ typedef enum { ZSTDb_buffered } ZSTD_buffered_policy_e; +/* + * Struct that contains all elements of block splitter that should be allocated + * in a wksp. + */ +#define ZSTD_MAX_NB_BLOCK_SPLITS 196 +typedef struct { + seqStore_t fullSeqStoreChunk; + seqStore_t firstHalfSeqStore; + seqStore_t secondHalfSeqStore; + seqStore_t currSeqStore; + seqStore_t nextSeqStore; + + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; + ZSTD_entropyCTablesMetadata_t entropyMetadata; +} ZSTD_blockSplitCtx; + struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ ZSTD_CCtx_params requestedParams; ZSTD_CCtx_params appliedParams; + ZSTD_CCtx_params simpleApiParams; /* Param storage used by the simple API - not sticky. Must only be used in top-level simple API functions for storage. */ U32 dictID; size_t dictContentSize; @@ -296,7 +386,7 @@ struct ZSTD_CCtx_s { ZSTD_blockState_t blockState; U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */ - /* Wether we are streaming or not */ + /* Whether we are streaming or not */ ZSTD_buffered_policy_e bufferedPolicy; /* streaming */ @@ -324,6 +414,9 @@ struct ZSTD_CCtx_s { /* Multi-threading */ /* Tracing */ + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; }; typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; @@ -358,7 +451,7 @@ typedef enum { typedef size_t (*ZSTD_blockCompressor) ( ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode); +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); MEM_STATIC U32 ZSTD_LLcode(U32 litLength) @@ -392,31 +485,6 @@ MEM_STATIC U32 ZSTD_MLcode(U32 mlBase) return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase]; } -typedef struct repcodes_s { - U32 rep[3]; -} repcodes_t; - -MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0) -{ - repcodes_t newReps; - if (offset >= ZSTD_REP_NUM) { /* full offset */ - newReps.rep[2] = rep[1]; - newReps.rep[1] = rep[0]; - newReps.rep[0] = offset - ZSTD_REP_MOVE; - } else { /* repcode */ - U32 const repCode = offset + ll0; - if (repCode > 0) { /* note : if repCode==0, no change */ - U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; - newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2]; - newReps.rep[1] = rep[0]; - newReps.rep[0] = currentOffset; - } else { /* repCode == 0 */ - ZSTD_memcpy(&newReps, rep, sizeof(newReps)); - } - } - return newReps; -} - /* ZSTD_cParam_withinBounds: * @return 1 if value is within cParam bounds, * 0 otherwise */ @@ -465,17 +533,17 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) return (srcSize >> minlog) + 2; } -MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams) +MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* cctxParams) { switch (cctxParams->literalCompressionMode) { - case ZSTD_lcm_huffman: + case ZSTD_ps_enable: return 0; - case ZSTD_lcm_uncompressed: + case ZSTD_ps_disable: return 1; default: assert(0 /* impossible: pre-validated */); ZSTD_FALLTHROUGH; - case ZSTD_lcm_auto: + case ZSTD_ps_auto: return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0); } } @@ -485,7 +553,9 @@ MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParam * Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single * large copies. */ -static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) { +static void +ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) +{ assert(iend > ilimit_w); if (ip <= ilimit_w) { ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); @@ -495,14 +565,30 @@ static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const ie while (ip < iend) *op++ = *ip++; } +#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +#define STORE_REPCODE_1 STORE_REPCODE(1) +#define STORE_REPCODE_2 STORE_REPCODE(2) +#define STORE_REPCODE_3 STORE_REPCODE(3) +#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +#define STORED_TO_OFFBASE(o) ((o)+1) +#define OFFBASE_TO_STORED(o) ((o)-1) + /*! ZSTD_storeSeq() : - * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t. - * `offCode` : distance to match + ZSTD_REP_MOVE (values <= ZSTD_REP_MOVE are repCodes). - * `mlBase` : matchLength - MINMATCH + * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. + * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). + * @matchLength : must be >= MINMATCH * Allowed to overread literals up to litLimit. */ -HINT_INLINE UNUSED_ATTR -void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, U32 offCode, size_t mlBase) +HINT_INLINE UNUSED_ATTR void +ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, + U32 offBase_minus1, + size_t matchLength) { BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; BYTE const* const litEnd = literals + litLength; @@ -511,7 +597,7 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ { U32 const pos = (U32)((const BYTE*)literals - g_start); DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", - pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode); + pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); } #endif assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); @@ -535,26 +621,66 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera /* literal Length */ if (litLength>0xFFFF) { - assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ - seqStorePtr->longLengthID = 1; + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ + seqStorePtr->longLengthType = ZSTD_llt_literalLength; seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); } seqStorePtr->sequences[0].litLength = (U16)litLength; /* match offset */ - seqStorePtr->sequences[0].offset = offCode + 1; + seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); /* match Length */ - if (mlBase>0xFFFF) { - assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ - seqStorePtr->longLengthID = 2; - seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + assert(matchLength >= MINMATCH); + { size_t const mlBase = matchLength - MINMATCH; + if (mlBase>0xFFFF) { + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ + seqStorePtr->longLengthType = ZSTD_llt_matchLength; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].mlBase = (U16)mlBase; } - seqStorePtr->sequences[0].matchLength = (U16)mlBase; seqStorePtr->sequences++; } +/* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) + * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() + */ +MEM_STATIC void +ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) +{ + if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = STORED_OFFSET(offBase_minus1); + } else { /* repcode */ + U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? rep[1] : rep[2]; + rep[1] = rep[0]; + rep[0] = currentOffset; + } else { /* repCode == 0 */ + /* nothing to do */ + } + } +} + +typedef struct repcodes_s { + U32 rep[3]; +} repcodes_t; + +MEM_STATIC repcodes_t +ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) +{ + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); + ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); + return newReps; +} + /*-************************************* * Match length counter @@ -778,6 +904,13 @@ MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window) window->dictLimit = end; } +MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window) +{ + return window.dictLimit == ZSTD_WINDOW_START_INDEX && + window.lowLimit == ZSTD_WINDOW_START_INDEX && + (window.nextSrc - window.base) == ZSTD_WINDOW_START_INDEX; +} + /* * ZSTD_window_hasExtDict(): * Returns non-zero if the window has a non-empty extDict. @@ -801,15 +934,71 @@ MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) ZSTD_noDict; } +/* Defining this macro to non-zero tells zstd to run the overflow correction + * code much more frequently. This is very inefficient, and should only be + * used for tests and fuzzers. + */ +#ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY +# ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1 +# else +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0 +# endif +#endif + +/* + * ZSTD_window_canOverflowCorrect(): + * Returns non-zero if the indices are large enough for overflow correction + * to work correctly without impacting compression ratio. + */ +MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src) +{ + U32 const cycleSize = 1u << cycleLog; + U32 const curr = (U32)((BYTE const*)src - window.base); + U32 const minIndexToOverflowCorrect = cycleSize + + MAX(maxDist, cycleSize) + + ZSTD_WINDOW_START_INDEX; + + /* Adjust the min index to backoff the overflow correction frequency, + * so we don't waste too much CPU in overflow correction. If this + * computation overflows we don't really care, we just need to make + * sure it is at least minIndexToOverflowCorrect. + */ + U32 const adjustment = window.nbOverflowCorrections + 1; + U32 const adjustedIndex = MAX(minIndexToOverflowCorrect * adjustment, + minIndexToOverflowCorrect); + U32 const indexLargeEnough = curr > adjustedIndex; + + /* Only overflow correct early if the dictionary is invalidated already, + * so we don't hurt compression ratio. + */ + U32 const dictionaryInvalidated = curr > maxDist + loadedDictEnd; + + return indexLargeEnough && dictionaryInvalidated; +} + /* * ZSTD_window_needOverflowCorrection(): * Returns non-zero if the indices are getting too large and need overflow * protection. */ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src, void const* srcEnd) { U32 const curr = (U32)((BYTE const*)srcEnd - window.base); + if (ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + if (ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, loadedDictEnd, src)) { + return 1; + } + } return curr > ZSTD_CURRENT_MAX; } @@ -821,7 +1010,6 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, * * The least significant cycleLog bits of the indices must remain the same, * which may be 0. Every index up to maxDist in the past must be valid. - * NOTE: (maxDist & cycleMask) must be zero. */ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, U32 maxDist, void const* src) @@ -845,32 +1033,52 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32: * windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32. */ - U32 const cycleMask = (1U << cycleLog) - 1; + U32 const cycleSize = 1u << cycleLog; + U32 const cycleMask = cycleSize - 1; U32 const curr = (U32)((BYTE const*)src - window->base); - U32 const currentCycle0 = curr & cycleMask; - /* Exclude zero so that newCurrent - maxDist >= 1. */ - U32 const currentCycle1 = currentCycle0 == 0 ? (1U << cycleLog) : currentCycle0; - U32 const newCurrent = currentCycle1 + maxDist; + U32 const currentCycle = curr & cycleMask; + /* Ensure newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX. */ + U32 const currentCycleCorrection = currentCycle < ZSTD_WINDOW_START_INDEX + ? MAX(cycleSize, ZSTD_WINDOW_START_INDEX) + : 0; + U32 const newCurrent = currentCycle + + currentCycleCorrection + + MAX(maxDist, cycleSize); U32 const correction = curr - newCurrent; - assert((maxDist & cycleMask) == 0); + /* maxDist must be a power of two so that: + * (newCurrent & cycleMask) == (curr & cycleMask) + * This is required to not corrupt the chains / binary tree. + */ + assert((maxDist & (maxDist - 1)) == 0); + assert((curr & cycleMask) == (newCurrent & cycleMask)); assert(curr > newCurrent); - /* Loose bound, should be around 1<<29 (see above) */ - assert(correction > 1<<28); + if (!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + /* Loose bound, should be around 1<<29 (see above) */ + assert(correction > 1<<28); + } window->base += correction; window->dictBase += correction; - if (window->lowLimit <= correction) window->lowLimit = 1; - else window->lowLimit -= correction; - if (window->dictLimit <= correction) window->dictLimit = 1; - else window->dictLimit -= correction; + if (window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->lowLimit = ZSTD_WINDOW_START_INDEX; + } else { + window->lowLimit -= correction; + } + if (window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->dictLimit = ZSTD_WINDOW_START_INDEX; + } else { + window->dictLimit -= correction; + } /* Ensure we can still reference the full window. */ assert(newCurrent >= maxDist); - assert(newCurrent - maxDist >= 1); + assert(newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX); /* Ensure that lowLimit and dictLimit didn't underflow. */ assert(window->lowLimit <= newCurrent); assert(window->dictLimit <= newCurrent); + ++window->nbOverflowCorrections; + DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction, window->lowLimit); return correction; @@ -975,11 +1183,13 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { ZSTD_memset(window, 0, sizeof(*window)); - window->base = (BYTE const*)""; - window->dictBase = (BYTE const*)""; - window->dictLimit = 1; /* start from 1, so that 1st position is valid */ - window->lowLimit = 1; /* it ensures first and later CCtx usages compress the same */ - window->nextSrc = window->base + 1; /* see issue #1241 */ + window->base = (BYTE const*)" "; + window->dictBase = (BYTE const*)" "; + ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX); /* Start above ZSTD_DUBT_UNSORTED_MARK */ + window->dictLimit = ZSTD_WINDOW_START_INDEX; /* start from >0, so that 1st position is valid */ + window->lowLimit = ZSTD_WINDOW_START_INDEX; /* it ensures first and later CCtx usages compress the same */ + window->nextSrc = window->base + ZSTD_WINDOW_START_INDEX; /* see issue #1241 */ + window->nbOverflowCorrections = 0; } /* @@ -990,7 +1200,8 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { * Returns non-zero if the segment is contiguous. */ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, - void const* src, size_t srcSize) + void const* src, size_t srcSize, + int forceNonContiguous) { BYTE const* const ip = (BYTE const*)src; U32 contiguous = 1; @@ -1000,7 +1211,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, assert(window->base != NULL); assert(window->dictBase != NULL); /* Check if blocks follow each other */ - if (src != window->nextSrc) { + if (src != window->nextSrc || forceNonContiguous) { /* not contiguous */ size_t const distanceFromBase = (size_t)(window->nextSrc - window->base); DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit); @@ -1030,15 +1241,15 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, */ MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) { - U32 const maxDistance = 1U << windowLog; - U32 const lowestValid = ms->window.lowLimit; - U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; - U32 const isDictionary = (ms->loadedDictEnd != 0); + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.lowLimit; + U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + U32 const isDictionary = (ms->loadedDictEnd != 0); /* When using a dictionary the entire dictionary is valid if a single byte of the dictionary * is within the window. We invalidate the dictionary (and set loadedDictEnd to 0) when it isn't * valid for the entire block. So this check is sufficient to find the lowest valid match index. */ - U32 const matchLowest = isDictionary ? lowestValid : withinWindow; + U32 const matchLowest = isDictionary ? lowestValid : withinWindow; return matchLowest; } diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c index 655bcda4d1f1..52b0a8059aba 100644 --- a/lib/zstd/compress/zstd_compress_literals.c +++ b/lib/zstd/compress/zstd_compress_literals.c @@ -73,7 +73,8 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorkspaceSize, - const int bmi2) + const int bmi2, + unsigned suspectUncompressible) { size_t const minGain = ZSTD_minGain(srcSize, strategy); size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); @@ -105,11 +106,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, HUF_compress1X_repeat( ostart+lhSize, dstCapacity-lhSize, src, srcSize, HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : HUF_compress4X_repeat( ostart+lhSize, dstCapacity-lhSize, src, srcSize, HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); if (repeat != HUF_repeat_none) { /* reused the existing table */ DEBUGLOG(5, "Reusing previous huffman table"); @@ -117,7 +118,7 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, } } - if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) { + if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h index 9904c0cd30a0..9775fb97cb70 100644 --- a/lib/zstd/compress/zstd_compress_literals.h +++ b/lib/zstd/compress/zstd_compress_literals.h @@ -18,12 +18,14 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); +/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, ZSTD_hufCTables_t* nextHuf, ZSTD_strategy strategy, int disableLiteralCompression, void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorkspaceSize, - const int bmi2); + const int bmi2, + unsigned suspectUncompressible); #endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c index dcfcdc9cc5e8..21ddc1b37acf 100644 --- a/lib/zstd/compress/zstd_compress_sequences.c +++ b/lib/zstd/compress/zstd_compress_sequences.c @@ -85,6 +85,8 @@ static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t { unsigned cost = 0; unsigned s; + + assert(total > 0); for (s = 0; s <= max; ++s) { unsigned norm = (unsigned)((256 * count[s]) / total); if (count[s] != 0 && norm == 0) @@ -273,10 +275,11 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity, assert(nbSeq_1 > 1); assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp)); (void)entropyWorkspaceSize; - FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), ""); - { size_t const NCountSize = FSE_writeNCount(op, oend - op, wksp->norm, max, tableLog); /* overflow protected */ + FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed"); + assert(oend >= op); + { size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog); /* overflow protected */ FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); - FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), ""); + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed"); return NCountSize; } } @@ -310,19 +313,19 @@ ZSTD_encodeSequences_body( FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); + BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); if (longOffsets) { U32 const ofBits = ofCodeTable[nbSeq-1]; unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); if (extraBits) { - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits); + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBits); BIT_flushBits(&blockStream); } - BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits, ofBits - extraBits); } else { - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]); } BIT_flushBits(&blockStream); @@ -336,8 +339,8 @@ ZSTD_encodeSequences_body( U32 const mlBits = ML_bits[mlCode]; DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u", (unsigned)sequences[n].litLength, - (unsigned)sequences[n].matchLength + MINMATCH, - (unsigned)sequences[n].offset); + (unsigned)sequences[n].mlBase + MINMATCH, + (unsigned)sequences[n].offBase); /* 32b*/ /* 64b*/ /* (7)*/ /* (7)*/ FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */ @@ -348,18 +351,18 @@ ZSTD_encodeSequences_body( BIT_flushBits(&blockStream); /* (7)*/ BIT_addBits(&blockStream, sequences[n].litLength, llBits); if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); + BIT_addBits(&blockStream, sequences[n].mlBase, mlBits); if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream); if (longOffsets) { unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); if (extraBits) { - BIT_addBits(&blockStream, sequences[n].offset, extraBits); + BIT_addBits(&blockStream, sequences[n].offBase, extraBits); BIT_flushBits(&blockStream); /* (7)*/ } - BIT_addBits(&blockStream, sequences[n].offset >> extraBits, + BIT_addBits(&blockStream, sequences[n].offBase >> extraBits, ofBits - extraBits); /* 31 */ } else { - BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ + BIT_addBits(&blockStream, sequences[n].offBase, ofBits); /* 31 */ } BIT_flushBits(&blockStream); /* (7)*/ DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr)); @@ -396,7 +399,7 @@ ZSTD_encodeSequences_default( #if DYNAMIC_BMI2 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t ZSTD_encodeSequences_bmi2( void* dst, size_t dstCapacity, FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c index b0610b255653..17d836cc84e8 100644 --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -15,289 +15,10 @@ #include "../common/zstd_internal.h" /* ZSTD_getSequenceLength */ #include "hist.h" /* HIST_countFast_wksp */ -#include "zstd_compress_internal.h" +#include "zstd_compress_internal.h" /* ZSTD_[huf|fse|entropy]CTablesMetadata_t */ #include "zstd_compress_sequences.h" #include "zstd_compress_literals.h" -/*-************************************* -* Superblock entropy buffer structs -***************************************/ -/* ZSTD_hufCTablesMetadata_t : - * Stores Literals Block Type for a super-block in hType, and - * huffman tree description in hufDesBuffer. - * hufDesSize refers to the size of huffman tree description in bytes. - * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */ -typedef struct { - symbolEncodingType_e hType; - BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; - size_t hufDesSize; -} ZSTD_hufCTablesMetadata_t; - -/* ZSTD_fseCTablesMetadata_t : - * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and - * fse tables in fseTablesBuffer. - * fseTablesSize refers to the size of fse tables in bytes. - * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */ -typedef struct { - symbolEncodingType_e llType; - symbolEncodingType_e ofType; - symbolEncodingType_e mlType; - BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; - size_t fseTablesSize; - size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences() */ -} ZSTD_fseCTablesMetadata_t; - -typedef struct { - ZSTD_hufCTablesMetadata_t hufMetadata; - ZSTD_fseCTablesMetadata_t fseMetadata; -} ZSTD_entropyCTablesMetadata_t; - - -/* ZSTD_buildSuperBlockEntropy_literal() : - * Builds entropy for the super-block literals. - * Stores literals block type (raw, rle, compressed, repeat) and - * huffman description table to hufMetadata. - * @return : size of huffman description table or error code */ -static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize, - const ZSTD_hufCTables_t* prevHuf, - ZSTD_hufCTables_t* nextHuf, - ZSTD_hufCTablesMetadata_t* hufMetadata, - const int disableLiteralsCompression, - void* workspace, size_t wkspSize) -{ - BYTE* const wkspStart = (BYTE*)workspace; - BYTE* const wkspEnd = wkspStart + wkspSize; - BYTE* const countWkspStart = wkspStart; - unsigned* const countWksp = (unsigned*)workspace; - const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); - BYTE* const nodeWksp = countWkspStart + countWkspSize; - const size_t nodeWkspSize = wkspEnd-nodeWksp; - unsigned maxSymbolValue = 255; - unsigned huffLog = HUF_TABLELOG_DEFAULT; - HUF_repeat repeat = prevHuf->repeatMode; - - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize); - - /* Prepare nextEntropy assuming reusing the existing table */ - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - - if (disableLiteralsCompression) { - DEBUGLOG(5, "set_basic - disabled"); - hufMetadata->hType = set_basic; - return 0; - } - - /* small ? don't even attempt compression (speed opt) */ -# define COMPRESS_LITERALS_SIZE_MIN 63 - { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; - if (srcSize <= minLitSize) { - DEBUGLOG(5, "set_basic - too small"); - hufMetadata->hType = set_basic; - return 0; - } - } - - /* Scan input and build symbol stats */ - { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); - FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); - if (largest == srcSize) { - DEBUGLOG(5, "set_rle"); - hufMetadata->hType = set_rle; - return 0; - } - if (largest <= (srcSize >> 7)+4) { - DEBUGLOG(5, "set_basic - no gain"); - hufMetadata->hType = set_basic; - return 0; - } - } - - /* Validate the previous Huffman table */ - if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { - repeat = HUF_repeat_none; - } - - /* Build Huffman Tree */ - ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); - huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); - { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, - maxSymbolValue, huffLog, - nodeWksp, nodeWkspSize); - FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); - huffLog = (U32)maxBits; - { /* Build and write the CTable */ - size_t const newCSize = HUF_estimateCompressedSize( - (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); - size_t const hSize = HUF_writeCTable_wksp( - hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), - (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, - nodeWksp, nodeWkspSize); - /* Check against repeating the previous CTable */ - if (repeat != HUF_repeat_none) { - size_t const oldCSize = HUF_estimateCompressedSize( - (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); - if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { - DEBUGLOG(5, "set_repeat - smaller"); - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType = set_repeat; - return 0; - } - } - if (newCSize + hSize >= srcSize) { - DEBUGLOG(5, "set_basic - no gains"); - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType = set_basic; - return 0; - } - DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); - hufMetadata->hType = set_compressed; - nextHuf->repeatMode = HUF_repeat_check; - return hSize; - } - } -} - -/* ZSTD_buildSuperBlockEntropy_sequences() : - * Builds entropy for the super-block sequences. - * Stores symbol compression modes and fse table to fseMetadata. - * @return : size of fse tables or error code */ -static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePtr, - const ZSTD_fseCTables_t* prevEntropy, - ZSTD_fseCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - ZSTD_fseCTablesMetadata_t* fseMetadata, - void* workspace, size_t wkspSize) -{ - BYTE* const wkspStart = (BYTE*)workspace; - BYTE* const wkspEnd = wkspStart + wkspSize; - BYTE* const countWkspStart = wkspStart; - unsigned* const countWksp = (unsigned*)workspace; - const size_t countWkspSize = (MaxSeq + 1) * sizeof(unsigned); - BYTE* const cTableWksp = countWkspStart + countWkspSize; - const size_t cTableWkspSize = wkspEnd-cTableWksp; - ZSTD_strategy const strategy = cctxParams->cParams.strategy; - FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; - FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; - FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; - const BYTE* const ofCodeTable = seqStorePtr->ofCode; - const BYTE* const llCodeTable = seqStorePtr->llCode; - const BYTE* const mlCodeTable = seqStorePtr->mlCode; - size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; - BYTE* const ostart = fseMetadata->fseTablesBuffer; - BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); - BYTE* op = ostart; - - assert(cTableWkspSize >= (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE)); - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq); - ZSTD_memset(workspace, 0, wkspSize); - - fseMetadata->lastCountSize = 0; - /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); - /* build CTable for Literal Lengths */ - { U32 LLtype; - unsigned max = MaxLL; - size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, llCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - DEBUGLOG(5, "Building LL table"); - nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; - LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, - countWksp, max, mostFrequent, nbSeq, - LLFSELog, prevEntropy->litlengthCTable, - LL_defaultNorm, LL_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(set_basic < set_compressed && set_rle < set_compressed); - assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - countWksp, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable), - cTableWksp, cTableWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); - if (LLtype == set_compressed) - fseMetadata->lastCountSize = countSize; - op += countSize; - fseMetadata->llType = (symbolEncodingType_e) LLtype; - } } - /* build CTable for Offsets */ - { U32 Offtype; - unsigned max = MaxOff; - size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, ofCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ - ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; - DEBUGLOG(5, "Building OF table"); - nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; - Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, - countWksp, max, mostFrequent, nbSeq, - OffFSELog, prevEntropy->offcodeCTable, - OF_defaultNorm, OF_defaultNormLog, - defaultPolicy, strategy); - assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, - countWksp, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable), - cTableWksp, cTableWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); - if (Offtype == set_compressed) - fseMetadata->lastCountSize = countSize; - op += countSize; - fseMetadata->ofType = (symbolEncodingType_e) Offtype; - } } - /* build CTable for MatchLengths */ - { U32 MLtype; - unsigned max = MaxML; - size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, mlCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); - nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; - MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, - countWksp, max, mostFrequent, nbSeq, - MLFSELog, prevEntropy->matchlengthCTable, - ML_defaultNorm, ML_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(!(MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - countWksp, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, - prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable), - cTableWksp, cTableWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); - if (MLtype == set_compressed) - fseMetadata->lastCountSize = countSize; - op += countSize; - fseMetadata->mlType = (symbolEncodingType_e) MLtype; - } } - assert((size_t) (op-ostart) <= sizeof(fseMetadata->fseTablesBuffer)); - return op-ostart; -} - - -/* ZSTD_buildSuperBlockEntropy() : - * Builds entropy for the super-block. - * @return : 0 on success or error code */ -static size_t -ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - ZSTD_entropyCTablesMetadata_t* entropyMetadata, - void* workspace, size_t wkspSize) -{ - size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy"); - entropyMetadata->hufMetadata.hufDesSize = - ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize, - &prevEntropy->huf, &nextEntropy->huf, - &entropyMetadata->hufMetadata, - ZSTD_disableLiteralsCompression(cctxParams), - workspace, wkspSize); - FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildSuperBlockEntropy_literal failed"); - entropyMetadata->fseMetadata.fseTablesSize = - ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr, - &prevEntropy->fse, &nextEntropy->fse, - cctxParams, - &entropyMetadata->fseMetadata, - workspace, wkspSize); - FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildSuperBlockEntropy_sequences failed"); - return 0; -} - /* ZSTD_compressSubBlock_literal() : * Compresses literals section for a sub-block. * When we have to write the Huffman table we will sometimes choose a header @@ -411,8 +132,7 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* const seqDef* sp = sstart; size_t matchLengthSum = 0; size_t litLengthSum = 0; - /* Only used by assert(), suppress unused variable warnings in production. */ - (void)litLengthSum; + (void)(litLengthSum); /* suppress unused variable warning on some environments */ while (send-sp > 0) { ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); litLengthSum += seqLen.litLength; @@ -605,7 +325,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t lit static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, const BYTE* codeTable, unsigned maxCode, size_t nbSeq, const FSE_CTable* fseCTable, - const U32* additionalBits, + const U8* additionalBits, short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, void* workspace, size_t wkspSize) { @@ -646,8 +366,9 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, void* workspace, size_t wkspSize, int writeEntropy) { - size_t sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + size_t const sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ size_t cSeqSizeEstimate = 0; + if (nbSeq == 0) return sequencesSectionHeaderSize; cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff, nbSeq, fseTables->offcodeCTable, NULL, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, @@ -754,7 +475,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, /* I think there is an optimization opportunity here. * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful * since it recalculates estimate from scratch. - * For example, it would recount literal distribution and symbol codes everytime. + * For example, it would recount literal distribution and symbol codes every time. */ cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, &nextCBlock->entropy, entropyMetadata, @@ -818,7 +539,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, repcodes_t rep; ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); for (seq = sstart; seq < sp; ++seq) { - rep = ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); } ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); } @@ -833,7 +554,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, unsigned lastBlock) { ZSTD_entropyCTablesMetadata_t entropyMetadata; - FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore, + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h index 98e359adf5d4..349fc923c355 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -32,6 +32,10 @@ #define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128 #endif + +/* Set our tables and aligneds to align by 64 bytes */ +#define ZSTD_CWKSP_ALIGNMENT_BYTES 64 + /*-************************************* * Structures ***************************************/ @@ -114,10 +118,11 @@ typedef enum { * - Tables: these are any of several different datastructures (hash tables, * chain tables, binary trees) that all respect a common format: they are * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). - * Their sizes depend on the cparams. + * Their sizes depend on the cparams. These tables are 64-byte aligned. * * - Aligned: these buffers are used for various purposes that require 4 byte - * alignment, but don't require any initialization before they're used. + * alignment, but don't require any initialization before they're used. These + * buffers are each aligned to 64 bytes. * * - Buffers: these buffers are used for various purposes that don't require * any alignment or initialization before they're used. This means they can @@ -130,8 +135,7 @@ typedef enum { * * 1. Objects * 2. Buffers - * 3. Aligned - * 4. Tables + * 3. Aligned/Tables * * Attempts to reserve objects of different types out of order will fail. */ @@ -184,6 +188,8 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { * Since tables aren't currently redzoned, you don't need to call through this * to figure out how much space you need for the matchState tables. Everything * else is though. + * + * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size(). */ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { if (size == 0) @@ -191,66 +197,139 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { return size; } -MEM_STATIC void ZSTD_cwksp_internal_advance_phase( - ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) { +/* + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); +} + +/* + * Returns the amount of additional space the cwksp must allocate + * for internal purposes (currently only alignment). + */ +MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes + * to align the beginning of tables section, as well as another n_2=[0, 63] bytes + * to align the beginning of the aligned section. + * + * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and + * aligneds being sized in multiples of 64 bytes. + */ + size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; + return slackSpace; +} + + +/* + * Return the number of additional bytes required to align a pointer to the given number of bytes. + * alignBytes must be a power of two. + */ +MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); + assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); + return bytes; +} + +/* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, + * which counts from the end of the wksp (as opposed to the object/table segment). + * + * Returns a pointer to the beginning of that space. + */ +MEM_STATIC void* +ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) +{ + void* const alloc = (BYTE*)ws->allocStart - bytes; + void* const bottom = ws->tableEnd; + DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >= bottom); + if (alloc < bottom) { + DEBUGLOG(4, "cwksp: alloc failed!"); + ws->allocFailed = 1; + return NULL; + } + /* the area is reserved from the end of wksp. + * If it overlaps with tableValidEnd, it voids guarantees on values' range */ + if (alloc < ws->tableValidEnd) { + ws->tableValidEnd = alloc; + } + ws->allocStart = alloc; + return alloc; +} + +/* + * Moves the cwksp to the next phase, and does any necessary allocations. + * cwksp initialization must necessarily go through each phase in order. + * Returns a 0 on success, or zstd error + */ +MEM_STATIC size_t +ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) +{ assert(phase >= ws->phase); if (phase > ws->phase) { + /* Going from allocating objects to allocating buffers */ if (ws->phase < ZSTD_cwksp_alloc_buffers && phase >= ZSTD_cwksp_alloc_buffers) { ws->tableValidEnd = ws->objectEnd; } + + /* Going from allocating buffers to allocating aligneds/tables */ if (ws->phase < ZSTD_cwksp_alloc_aligned && phase >= ZSTD_cwksp_alloc_aligned) { - /* If unaligned allocations down from a too-large top have left us - * unaligned, we need to realign our alloc ptr. Technically, this - * can consume space that is unaccounted for in the neededSpace - * calculation. However, I believe this can only happen when the - * workspace is too large, and specifically when it is too large - * by a larger margin than the space that will be consumed. */ - /* TODO: cleaner, compiler warning friendly way to do this??? */ - ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1)); - if (ws->allocStart < ws->tableValidEnd) { - ws->tableValidEnd = ws->allocStart; + { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ + size_t const bytesToAlign = + ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); + DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); + ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ + RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), + memory_allocation, "aligned phase - alignment initial allocation failed!"); } - } + { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ + void* const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); + void* const objectEnd = (BYTE*)alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); + ws->objectEnd = objectEnd; + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; + } } } ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); } + return 0; } /* * Returns whether this object/buffer/etc was allocated in this workspace. */ -MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { +MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) +{ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); } /* * Internal function. Do not use directly. */ -MEM_STATIC void* ZSTD_cwksp_reserve_internal( - ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) { +MEM_STATIC void* +ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) +{ void* alloc; - void* bottom = ws->tableEnd; - ZSTD_cwksp_internal_advance_phase(ws, phase); - alloc = (BYTE *)ws->allocStart - bytes; - - if (bytes == 0) + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) { return NULL; + } - DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", - alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); - ZSTD_cwksp_assert_internal_consistency(ws); - assert(alloc >= bottom); - if (alloc < bottom) { - DEBUGLOG(4, "cwksp: alloc failed!"); - ws->allocFailed = 1; - return NULL; - } - if (alloc < ws->tableValidEnd) { - ws->tableValidEnd = alloc; - } - ws->allocStart = alloc; + alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes); return alloc; @@ -259,33 +338,44 @@ MEM_STATIC void* ZSTD_cwksp_reserve_internal( /* * Reserves and returns unaligned memory. */ -MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) { +MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) +{ return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); } /* - * Reserves and returns memory sized on and aligned on sizeof(unsigned). + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). */ -MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) { - assert((bytes & (sizeof(U32)-1)) == 0); - return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned); +MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) +{ + void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), + ZSTD_cwksp_alloc_aligned); + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); + return ptr; } /* - * Aligned on sizeof(unsigned). These buffers have the special property that + * Aligned on 64 bytes. These buffers have the special property that * their values remain constrained, allowing us to re-use them without * memset()-ing them. */ -MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { +MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) +{ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; - void* alloc = ws->tableEnd; - void* end = (BYTE *)alloc + bytes; - void* top = ws->allocStart; + void* alloc; + void* end; + void* top; + + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { + return NULL; + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; + top = ws->allocStart; DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining", alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); assert((bytes & (sizeof(U32)-1)) == 0); - ZSTD_cwksp_internal_advance_phase(ws, phase); ZSTD_cwksp_assert_internal_consistency(ws); assert(end <= top); if (end > top) { @@ -296,27 +386,31 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { ws->tableEnd = end; + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); return alloc; } /* * Aligned on sizeof(void*). + * Note : should happen only once, at workspace first initialization */ -MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { - size_t roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*)); +MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) +{ + size_t const roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*)); void* alloc = ws->objectEnd; void* end = (BYTE*)alloc + roundedBytes; - DEBUGLOG(5, + DEBUGLOG(4, "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining", alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes); - assert(((size_t)alloc & (sizeof(void*)-1)) == 0); - assert((bytes & (sizeof(void*)-1)) == 0); + assert((size_t)alloc % ZSTD_ALIGNOF(void*) == 0); + assert(bytes % ZSTD_ALIGNOF(void*) == 0); ZSTD_cwksp_assert_internal_consistency(ws); /* we must be in the first phase, no advance is possible */ if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) { - DEBUGLOG(4, "cwksp: object alloc failed!"); + DEBUGLOG(3, "cwksp: object alloc failed!"); ws->allocFailed = 1; return NULL; } @@ -328,7 +422,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { return alloc; } -MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) { +MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) +{ DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty"); @@ -451,6 +546,24 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { * Functions Checking Free Space ***************************************/ +/* ZSTD_alignmentSpaceWithinBounds() : + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, + size_t const estimatedSpace, int resizedWorkspace) { + if (resizedWorkspace) { + /* Resized/newly allocated wksp should have exact bounds */ + return ZSTD_cwksp_used(ws) == estimatedSpace; + } else { + /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes + * than estimatedSpace. See the comments in zstd_cwksp.h for details. + */ + return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); + } +} + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) { return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd); } diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c index b0424d23ac57..76933dea2624 100644 --- a/lib/zstd/compress/zstd_double_fast.c +++ b/lib/zstd/compress/zstd_double_fast.c @@ -48,10 +48,216 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, FORCE_INLINE_TEMPLATE -size_t ZSTD_compressBlock_doubleFast_generic( +size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + const U32 hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + const U32 hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mode */ + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixLowest = base + prefixLowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved = 0; + + size_t mLength; + U32 offset; + U32 curr; + + /* how many positions to search before increasing step size */ + const size_t kStepIncr = 1 << kSearchStrength; + /* the position at which to increment the step size if no match is found */ + const BYTE* nextStep; + size_t step; /* the current step size */ + + size_t hl0; /* the long hash at ip */ + size_t hl1; /* the long hash at ip1 */ + + U32 idxl0; /* the long match index for ip */ + U32 idxl1; /* the long match index for ip1 */ + + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ + + const BYTE* ip = istart; /* the current position */ + const BYTE* ip1; /* the next position */ + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + + /* init */ + ip += ((ip - prefixLowest) == 0); + { + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; + if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ + while (1) { + step = 1; + nextStep = ip + kStepIncr; + ip1 = ip + step; + + if (ip1 > ilimit) { + goto _cleanup; + } + + hl0 = ZSTD_hashPtr(ip, hBitsL, 8); + idxl0 = hashLong[hl0]; + matchl0 = base + idxl0; + + /* Inner Loop: one iteration per search / position */ + do { + const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls); + const U32 idxs0 = hashSmall[hs0]; + curr = (U32)(ip-base); + matchs0 = base + idxs0; + + hashLong[hl0] = hashSmall[hs0] = curr; /* update hash tables */ + + /* check noDict repcode */ + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); + goto _match_stored; + } + + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + + if (idxl0 > prefixLowestIndex) { + /* check prefix long match */ + if (MEM_read64(matchl0) == MEM_read64(ip)) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; + } + } + + idxl1 = hashLong[hl1]; + matchl1 = base + idxl1; + + if (idxs0 > prefixLowestIndex) { + /* check prefix short match */ + if (MEM_read32(matchs0) == MEM_read32(ip)) { + goto _search_next_long; + } + } + + if (ip1 >= nextStep) { + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + step++; + nextStep += kStepIncr; + } + ip = ip1; + ip1 += step; + + hl0 = hl1; + idxl0 = idxl1; + matchl0 = matchl1; + #if defined(__aarch64__) + PREFETCH_L1(ip+256); + #endif + } while (ip1 <= ilimit); + +_cleanup: + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved; + rep[1] = offset_2 ? offset_2 : offsetSaved; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + +_search_next_long: + + /* check prefix long +1 match */ + if (idxl1 > prefixLowestIndex) { + if (MEM_read64(matchl1) == MEM_read64(ip1)) { + ip = ip1; + mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8; + offset = (U32)(ip-matchl1); + while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ + goto _match_found; + } + } + + /* if no long +1 match, explore the short match we found */ + mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; + offset = (U32)(ip - matchs0); + while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ + + /* fall-through */ + +_match_found: /* requires ip, offset, mLength */ + offset_2 = offset_1; + offset_1 = offset; + + if (step < 4) { + /* It is unsafe to write this value back to the hashtable when ip1 is + * greater than or equal to the new ip we will have after we're done + * processing this match. Rather than perform that test directly + * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler + * more predictable test. The minmatch even if we take a short match is + * 4 bytes, so as long as step, the distance between ip and ip1 + * (initially) is less than 4, we know ip1 < new ip. */ + hashLong[hl1] = (U32)(ip1 - base); + } + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); + +_match_stored: + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } + } + } +} + + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls /* template */, ZSTD_dictMode_e const dictMode) + U32 const mls /* template */) { ZSTD_compressionParameters const* cParams = &ms->cParams; U32* const hashLong = ms->hashTable; @@ -72,54 +278,30 @@ size_t ZSTD_compressBlock_doubleFast_generic( U32 offsetSaved = 0; const ZSTD_matchState_t* const dms = ms->dictMatchState; - const ZSTD_compressionParameters* const dictCParams = - dictMode == ZSTD_dictMatchState ? - &dms->cParams : NULL; - const U32* const dictHashLong = dictMode == ZSTD_dictMatchState ? - dms->hashTable : NULL; - const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ? - dms->chainTable : NULL; - const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ? - dms->window.dictLimit : 0; - const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? - dms->window.base : NULL; - const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ? - dictBase + dictStartIndex : NULL; - const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? - dms->window.nextSrc : NULL; - const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? - prefixLowestIndex - (U32)(dictEnd - dictBase) : - 0; - const U32 dictHBitsL = dictMode == ZSTD_dictMatchState ? - dictCParams->hashLog : hBitsL; - const U32 dictHBitsS = dictMode == ZSTD_dictMatchState ? - dictCParams->chainLog : hBitsS; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; + const U32* const dictHashLong = dms->hashTable; + const U32* const dictHashSmall = dms->chainTable; + const U32 dictStartIndex = dms->window.dictLimit; + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); + const U32 dictHBitsL = dictCParams->hashLog; + const U32 dictHBitsS = dictCParams->chainLog; const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); - DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic"); - - assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState); + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); /* if a dictionary is attached, it must be within window range */ - if (dictMode == ZSTD_dictMatchState) { - assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); - } + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); /* init */ ip += (dictAndPrefixLength == 0); - if (dictMode == ZSTD_noDict) { - U32 const curr = (U32)(ip - base); - U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); - U32 const maxRep = curr - windowLow; - if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; - if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; - } - if (dictMode == ZSTD_dictMatchState) { - /* dictMatchState repCode checks don't currently handle repCode == 0 - * disabling. */ - assert(offset_1 <= dictAndPrefixLength); - assert(offset_2 <= dictAndPrefixLength); - } + + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); /* Main Search Loop */ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ @@ -135,29 +317,18 @@ size_t ZSTD_compressBlock_doubleFast_generic( const BYTE* matchLong = base + matchIndexL; const BYTE* match = base + matchIndexS; const U32 repIndex = curr + 1 - offset_1; - const BYTE* repMatch = (dictMode == ZSTD_dictMatchState - && repIndex < prefixLowestIndex) ? + const BYTE* repMatch = (repIndex < prefixLowestIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ - /* check dictMatchState repcode */ - if (dictMode == ZSTD_dictMatchState - && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + /* check repcode */ + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); - goto _match_stored; - } - - /* check noDict repcode */ - if ( dictMode == ZSTD_noDict - && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { - mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); goto _match_stored; } @@ -169,7 +340,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ goto _match_found; } - } else if (dictMode == ZSTD_dictMatchState) { + } else { /* check dictMatchState long match */ U32 const dictMatchIndexL = dictHashLong[dictHL]; const BYTE* dictMatchL = dictBase + dictMatchIndexL; @@ -187,7 +358,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( if (MEM_read32(match) == MEM_read32(ip)) { goto _search_next_long; } - } else if (dictMode == ZSTD_dictMatchState) { + } else { /* check dictMatchState short match */ U32 const dictMatchIndexS = dictHashSmall[dictHS]; match = dictBase + dictMatchIndexS; @@ -220,7 +391,7 @@ _search_next_long: while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ goto _match_found; } - } else if (dictMode == ZSTD_dictMatchState) { + } else { /* check dict long +1 match */ U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; @@ -234,7 +405,7 @@ _search_next_long: } } } /* if no long +1 match, explore the short match we found */ - if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) { + if (matchIndexS < prefixLowestIndex) { mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4; offset = (U32)(curr - matchIndexS); while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ @@ -248,7 +419,7 @@ _match_found: offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); _match_stored: /* match found */ @@ -266,43 +437,27 @@ _match_stored: } /* check immediate repcode */ - if (dictMode == ZSTD_dictMatchState) { - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); - U32 const repIndex2 = current2 - offset_2; - const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState - && repIndex2 < prefixLowestIndex ? - dictBase + repIndex2 - dictIndexDelta : - base + repIndex2; - if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { - const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; - ip += repLength2; - anchor = ip; - continue; - } - break; - } } - - if (dictMode == ZSTD_noDict) { - while ( (ip <= ilimit) - && ( (offset_2>0) - & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { - /* store sequence */ - size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MINMATCH); - ip += rLength; + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; anchor = ip; - continue; /* faster when present ... (?) */ - } } } + continue; + } + break; + } + } } /* while (ip < ilimit) */ /* save reps for next block */ @@ -313,6 +468,24 @@ _match_stored: return (size_t)(iend - anchor); } +#define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ + } + +ZSTD_GEN_DFAST_FN(noDict, 4) +ZSTD_GEN_DFAST_FN(noDict, 5) +ZSTD_GEN_DFAST_FN(noDict, 6) +ZSTD_GEN_DFAST_FN(noDict, 7) + +ZSTD_GEN_DFAST_FN(dictMatchState, 4) +ZSTD_GEN_DFAST_FN(dictMatchState, 5) +ZSTD_GEN_DFAST_FN(dictMatchState, 6) +ZSTD_GEN_DFAST_FN(dictMatchState, 7) + size_t ZSTD_compressBlock_doubleFast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -323,13 +496,13 @@ size_t ZSTD_compressBlock_doubleFast( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize); } } @@ -343,13 +516,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize); } } @@ -385,7 +558,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( /* if extDict is invalidated due to maxDistance, switch to "regular" variant */ if (prefixStartIndex == dictStartIndex) - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, mls, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize); /* Search Loop */ while (ip < ilimit) { /* < instead of <=, because (ip+1) */ @@ -407,12 +580,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */ if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ - & (repIndex > dictStartIndex)) + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); } else { if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; @@ -423,7 +596,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); @@ -448,7 +621,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( } offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); } else { ip += ((ip-anchor) >> kSearchStrength) + 1; @@ -475,12 +648,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( U32 const repIndex2 = current2 - offset_2; const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ - & (repIndex2 > dictStartIndex)) + & (offset_2 <= current2 - dictStartIndex)) && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; ip += repLength2; @@ -498,6 +671,10 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( return (size_t)(iend - anchor); } +ZSTD_GEN_DFAST_FN(extDict, 4) +ZSTD_GEN_DFAST_FN(extDict, 5) +ZSTD_GEN_DFAST_FN(extDict, 6) +ZSTD_GEN_DFAST_FN(extDict, 7) size_t ZSTD_compressBlock_doubleFast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -508,12 +685,12 @@ size_t ZSTD_compressBlock_doubleFast_extDict( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); } } diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c index 96b7d48e2868..a752e6beab52 100644 --- a/lib/zstd/compress/zstd_fast.c +++ b/lib/zstd/compress/zstd_fast.c @@ -43,145 +43,294 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } +/* + * If you squint hard enough (and ignore repcodes), the search operation at any + * given position is broken into 4 stages: + * + * 1. Hash (map position to hash value via input read) + * 2. Lookup (map hash val to index via hashtable read) + * 3. Load (map index to value at that position via input read) + * 4. Compare + * + * Each of these steps involves a memory read at an address which is computed + * from the previous step. This means these steps must be sequenced and their + * latencies are cumulative. + * + * Rather than do 1->2->3->4 sequentially for a single position before moving + * onto the next, this implementation interleaves these operations across the + * next few positions: + * + * R = Repcode Read & Compare + * H = Hash + * T = Table Lookup + * M = Match Read & Compare + * + * Pos | Time --> + * ----+------------------- + * N | ... M + * N+1 | ... TM + * N+2 | R H T M + * N+3 | H TM + * N+4 | R H T M + * N+5 | H ... + * N+6 | R ... + * + * This is very much analogous to the pipelining of execution in a CPU. And just + * like a CPU, we have to dump the pipeline when we find a match (i.e., take a + * branch). + * + * When this happens, we throw away our current state, and do the following prep + * to re-enter the loop: + * + * Pos | Time --> + * ----+------------------- + * N | H T + * N+1 | H + * + * This is also the work we do at the beginning to enter the loop initially. + */ FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_fast_generic( +ZSTD_compressBlock_fast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls) + U32 const mls, U32 const hasStep) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hlog = cParams->hashLog; /* support stepSize of 0 */ - size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2; const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; - /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */ - const BYTE* ip0 = istart; - const BYTE* ip1; - const BYTE* anchor = istart; const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; - U32 offset_1=rep[0], offset_2=rep[1]; + + const BYTE* anchor = istart; + const BYTE* ip0 = istart; + const BYTE* ip1; + const BYTE* ip2; + const BYTE* ip3; + U32 current0; + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; U32 offsetSaved = 0; - /* init */ + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ + U32 idx; /* match idx for ip0 */ + U32 mval; /* src value at match idx */ + + U32 offcode; + const BYTE* match0; + size_t mLength; + + /* ip0 and ip1 are always adjacent. The targetLength skipping and + * uncompressibility acceleration is applied to every other position, + * matching the behavior of #1562. step therefore represents the gap + * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */ + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 += (ip0 == prefixStart); - ip1 = ip0 + 1; { U32 const curr = (U32)(ip0 - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); U32 const maxRep = curr - windowLow; - if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; - if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; + if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; } - /* Main Search Loop */ -#ifdef __INTEL_COMPILER - /* From intel 'The vector pragma indicates that the loop should be - * vectorized if it is legal to do so'. Can be used together with - * #pragma ivdep (but have opted to exclude that because intel - * warns against using it).*/ - #pragma vector always -#endif - while (ip1 < ilimit) { /* < instead of <=, because check at ip0+2 */ - size_t mLength; - BYTE const* ip2 = ip0 + 2; - size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls); - U32 const val0 = MEM_read32(ip0); - size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls); - U32 const val1 = MEM_read32(ip1); - U32 const current0 = (U32)(ip0-base); - U32 const current1 = (U32)(ip1-base); - U32 const matchIndex0 = hashTable[h0]; - U32 const matchIndex1 = hashTable[h1]; - BYTE const* repMatch = ip2 - offset_1; - const BYTE* match0 = base + matchIndex0; - const BYTE* match1 = base + matchIndex1; - U32 offcode; - -#if defined(__aarch64__) - PREFETCH_L1(ip0+256); -#endif - - hashTable[h0] = current0; /* update hash table */ - hashTable[h1] = current1; /* update hash table */ - - assert(ip0 + 1 == ip1); - - if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) { - mLength = (ip2[-1] == repMatch[-1]) ? 1 : 0; - ip0 = ip2 - mLength; - match0 = repMatch - mLength; + /* start each op */ +_start: /* Requires: ip0 */ + + step = stepSize; + nextStep = ip0 + kStepIncr; + + /* calculate positions, ip0 - anchor == 0, so we skip step calc */ + ip1 = ip0 + 1; + ip2 = ip0 + step; + ip3 = ip2 + 1; + + if (ip3 >= ilimit) { + goto _cleanup; + } + + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + + idx = hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ + const U32 rval = MEM_read32(ip2 - rep_offset1); + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + /* check repcode at ip[2] */ + if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) { + ip0 = ip2; + match0 = ip0 - rep_offset1; + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; + offcode = STORE_REPCODE_1; mLength += 4; - offcode = 0; goto _match; } - if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) { - /* found a regular match */ - goto _offset; + + /* load match for ip[0] */ + if (idx >= prefixStartIndex) { + mval = MEM_read32(base + idx); + } else { + mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ } - if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) { - /* found a regular match after one literal */ - ip0 = ip1; - match0 = match1; + + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ goto _offset; } - { size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize; - assert(step >= 2); - ip0 += step; - ip1 += step; - continue; + + /* lookup ip[1] */ + idx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip3; + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + /* load match for ip[0] */ + if (idx >= prefixStartIndex) { + mval = MEM_read32(base + idx); + } else { + mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ } -_offset: /* Requires: ip0, match0 */ - /* Compute the offset code */ - offset_2 = offset_1; - offset_1 = (U32)(ip0-match0); - offcode = offset_1 + ZSTD_REP_MOVE; - mLength = 4; - /* Count the backwards match length */ - while (((ip0>anchor) & (match0>prefixStart)) - && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */ -_match: /* Requires: ip0, match0, offcode */ - /* Count the forward length */ - mLength += ZSTD_count(ip0+mLength, match0+mLength, iend); - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH); - /* match found */ - ip0 += mLength; - anchor = ip0; + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ + goto _offset; + } - if (ip0 <= ilimit) { - /* Fill Table */ - assert(base+current0+2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); - - if (offset_2 > 0) { /* offset_2==0 means offset_2 is invalidated */ - while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) { - /* store sequence */ - size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4; - { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ - hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); - ip0 += rLength; - ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH); - anchor = ip0; - continue; /* faster when present (confirmed on gcc-8) ... (?) */ - } } } - ip1 = ip0 + 1; - } + /* lookup ip[1] */ + idx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip0 + step; + ip3 = ip1 + step; + + /* calculate step */ + if (ip2 >= nextStep) { + step++; + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + nextStep += kStepIncr; + } + } while (ip3 < ilimit); + +_cleanup: + /* Note that there are probably still a couple positions we could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ /* save reps for next block */ - rep[0] = offset_1 ? offset_1 : offsetSaved; - rep[1] = offset_2 ? offset_2 : offsetSaved; + rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; + rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; /* Return the last literals size */ return (size_t)(iend - anchor); + +_offset: /* Requires: ip0, idx */ + + /* Compute the offset code. */ + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); + offcode = STORE_OFFSET(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ + while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) { + ip0--; + match0--; + mLength++; + } + +_match: /* Requires: ip0, match0, offcode */ + + /* Count the forward length. */ + mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend); + + ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); + + ip0 += mLength; + anchor = ip0; + + /* write next hash table entry */ + if (ip1 < ip0) { + hashTable[hash1] = (U32)(ip1 - base); + } + + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */ + while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4; + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; + ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } + + goto _start; } +#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ + } + +ZSTD_GEN_FAST_FN(noDict, 4, 1) +ZSTD_GEN_FAST_FN(noDict, 5, 1) +ZSTD_GEN_FAST_FN(noDict, 6, 1) +ZSTD_GEN_FAST_FN(noDict, 7, 1) + +ZSTD_GEN_FAST_FN(noDict, 4, 0) +ZSTD_GEN_FAST_FN(noDict, 5, 0) +ZSTD_GEN_FAST_FN(noDict, 6, 0) +ZSTD_GEN_FAST_FN(noDict, 7, 0) size_t ZSTD_compressBlock_fast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -189,24 +338,40 @@ size_t ZSTD_compressBlock_fast( { U32 const mls = ms->cParams.minMatch; assert(ms->dictMatchState == NULL); - switch(mls) - { - default: /* includes case 3 */ - case 4 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4); - case 5 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5); - case 6 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6); - case 7 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7); + if (ms->cParams.targetLength > 1) { + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); + } + } else { + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); + } + } } FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_fast_dictMatchState_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls) + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; @@ -242,6 +407,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( assert(endIndex - prefixStartIndex <= maxDistance); (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + (void)hasStep; /* not currently specialized on whether it's accelerated */ + /* ensure there will be no underflow * when translating a dict index into a local index */ assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); @@ -272,7 +439,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); } else if ( (matchIndex <= prefixStartIndex) ) { size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); U32 const dictMatchIndex = dictHashTable[dictHash]; @@ -292,7 +459,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); } } else if (MEM_read32(match) != MEM_read32(ip)) { /* it's not a match, and we're not going to check the dictionary */ @@ -307,7 +474,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); } /* match found */ @@ -332,7 +499,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; ip += repLength2; anchor = ip; @@ -351,6 +518,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( return (size_t)(iend - anchor); } + +ZSTD_GEN_FAST_FN(dictMatchState, 4, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 5, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + size_t ZSTD_compressBlock_fast_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) @@ -361,20 +534,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize); } } static size_t ZSTD_compressBlock_fast_extDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls) + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; @@ -398,11 +571,13 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const ilimit = iend - 8; U32 offset_1=rep[0], offset_2=rep[1]; + (void)hasStep; /* not currently specialized on whether it's accelerated */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1); /* switch to "regular" variant if extDict is invalidated due to maxDistance */ if (prefixStartIndex == dictStartIndex) - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls); + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); /* Search Loop */ while (ip < ilimit) { /* < instead of <=, because (ip+1) */ @@ -416,14 +591,14 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const repMatch = repBase + repIndex; hashTable[h] = curr; /* update hash table */ DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); - assert(offset_1 <= curr +1); /* check repIndex */ - if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex)) + if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ + & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, rLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); ip += rLength; anchor = ip; } else { @@ -439,7 +614,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; /* update offset history */ - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ip += mLength; anchor = ip; } } @@ -453,12 +628,12 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( U32 const current2 = (U32)(ip-base); U32 const repIndex2 = current2 - offset_2; const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */ + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 /*offcode*/, repLength2-MINMATCH); + ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; ip += repLength2; anchor = ip; @@ -475,6 +650,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( return (size_t)(iend - anchor); } +ZSTD_GEN_FAST_FN(extDict, 4, 0) +ZSTD_GEN_FAST_FN(extDict, 5, 0) +ZSTD_GEN_FAST_FN(extDict, 6, 0) +ZSTD_GEN_FAST_FN(extDict, 7, 0) size_t ZSTD_compressBlock_fast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -485,12 +664,12 @@ size_t ZSTD_compressBlock_fast_extDict( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize); } } diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c index fb54d4e28a2b..0298a01a7504 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -61,7 +61,7 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, * assumption : curr >= btlow == (curr - btmask) * doesn't fail */ static void -ZSTD_insertDUBT1(ZSTD_matchState_t* ms, +ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, U32 curr, const BYTE* inputEnd, U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode) @@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms, static size_t ZSTD_DUBT_findBetterDictMatch ( - ZSTD_matchState_t* ms, + const ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, size_t* offsetPtr, size_t bestLength, @@ -197,8 +197,8 @@ ZSTD_DUBT_findBetterDictMatch ( U32 matchIndex = dictMatchIndex + dictIndexDelta; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", - curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex); - bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex; + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); } if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ break; /* drop, to guarantee consistency (miss a little bit of compression) */ @@ -218,7 +218,7 @@ ZSTD_DUBT_findBetterDictMatch ( } if (bestLength >= MINMATCH) { - U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -328,7 +328,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) - bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex; + bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ if (dictMode == ZSTD_dictMatchState) { nbCompares = 0; /* in addition to avoiding checking any @@ -368,7 +368,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ if (bestLength >= MINMATCH) { - U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -391,91 +391,9 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); } - -static size_t -ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); - } -} - - -static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); - } -} - - -static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); - } -} - - - /* ********************************* -* Hash Chain +* Dedicated dict search ***********************************/ -#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] - -/* Update chains up to ip (excluded) - Assumption : always within prefix (i.e. not within extDict) */ -FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( - ZSTD_matchState_t* ms, - const ZSTD_compressionParameters* const cParams, - const BYTE* ip, U32 const mls) -{ - U32* const hashTable = ms->hashTable; - const U32 hashLog = cParams->hashLog; - U32* const chainTable = ms->chainTable; - const U32 chainMask = (1 << cParams->chainLog) - 1; - const BYTE* const base = ms->window.base; - const U32 target = (U32)(ip - base); - U32 idx = ms->nextToUpdate; - - while(idx < target) { /* catch up */ - size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); - NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; - hashTable[h] = idx; - idx++; - } - - ms->nextToUpdate = target; - return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; -} - -U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { - const ZSTD_compressionParameters* const cParams = &ms->cParams; - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); -} void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) { @@ -485,7 +403,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B U32* const chainTable = ms->chainTable; U32 const chainSize = 1 << ms->cParams.chainLog; U32 idx = ms->nextToUpdate; - U32 const minChain = chainSize < target ? target - chainSize : idx; + U32 const minChain = chainSize < target - idx ? target - chainSize : idx; U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; U32 const cacheSize = bucketSize - 1; U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize; @@ -499,13 +417,12 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; U32* const tmpHashTable = hashTable; U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog); - U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; + U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx; - U32 hashIdx; assert(ms->cParams.chainLog <= 24); - assert(ms->cParams.hashLog >= ms->cParams.chainLog); + assert(ms->cParams.hashLog > ms->cParams.chainLog); assert(idx != 0); assert(tmpMinChain <= minChain); @@ -536,7 +453,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B if (count == cacheSize) { for (count = 0; count < chainLimit;) { if (i < minChain) { - if (!i || countBeyondMinChain++ > cacheSize) { + if (!i || ++countBeyondMinChain > cacheSize) { /* only allow pulling `cacheSize` number of entries * into the cache or chainTable beyond `minChain`, * to replace the entries pulled out of the @@ -592,10 +509,143 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B ms->nextToUpdate = target; } +/* Returns the longest match length found in the dedicated dict search structure. + * If none are longer than the argument ml, then ml will be returned. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, + const ZSTD_matchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { + const U32 ddsLowestIndex = dms->window.dictLimit; + const BYTE* const ddsBase = dms->window.base; + const BYTE* const ddsEnd = dms->window.nextSrc; + const U32 ddsSize = (U32)(ddsEnd - ddsBase); + const U32 ddsIndexDelta = dictLimit - ddsSize; + const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); + const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1; + U32 ddsAttempt; + U32 matchIndex; + + for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { + PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); + } + + { + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 const chainIndex = chainPackedPointer >> 8; + + PREFETCH_L1(&dms->chainTable[chainIndex]); + } + + for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; + match = ddsBase + matchIndex; + + if (!matchIndex) { + return ml; + } + + /* guaranteed by table construction */ + (void)ddsLowestIndex; + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; + } + } + } + + { + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 chainIndex = chainPackedPointer >> 8; + U32 const chainLength = chainPackedPointer & 0xFF; + U32 const chainAttempts = nbAttempts - ddsAttempt; + U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; + U32 chainAttempt; + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { + PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); + } + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->chainTable[chainIndex]; + match = ddsBase + matchIndex; + + /* guaranteed by table construction */ + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } + } + return ml; +} + + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, + const BYTE* ip, U32 const mls) +{ + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; + U32* const chainTable = ms->chainTable; + const U32 chainMask = (1 << cParams->chainLog) - 1; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + while(idx < target) { /* catch up */ + size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); +} /* inlining is important to hardwire a hot branch (template emulation) */ FORCE_INLINE_TEMPLATE -size_t ZSTD_HcFindBestMatch_generic ( +size_t ZSTD_HcFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, size_t* offsetPtr, @@ -653,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE; + *offsetPtr = STORE_OFFSET(curr - matchIndex); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } @@ -663,90 +713,8 @@ size_t ZSTD_HcFindBestMatch_generic ( assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ if (dictMode == ZSTD_dedicatedDictSearch) { - const U32 ddsLowestIndex = dms->window.dictLimit; - const BYTE* const ddsBase = dms->window.base; - const BYTE* const ddsEnd = dms->window.nextSrc; - const U32 ddsSize = (U32)(ddsEnd - ddsBase); - const U32 ddsIndexDelta = dictLimit - ddsSize; - const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); - const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1; - U32 ddsAttempt; - - for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { - PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); - } - - { - U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; - U32 const chainIndex = chainPackedPointer >> 8; - - PREFETCH_L1(&dms->chainTable[chainIndex]); - } - - for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { - size_t currentMl=0; - const BYTE* match; - matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; - match = ddsBase + matchIndex; - - if (!matchIndex) { - return ml; - } - - /* guaranteed by table construction */ - (void)ddsLowestIndex; - assert(matchIndex >= ddsLowestIndex); - assert(match+4 <= ddsEnd); - if (MEM_read32(match) == MEM_read32(ip)) { - /* assumption : matchIndex <= dictLimit-4 (by table construction) */ - currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; - } - - /* save best solution */ - if (currentMl > ml) { - ml = currentMl; - *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; - if (ip+currentMl == iLimit) { - /* best possible, avoids read overflow on next attempt */ - return ml; - } - } - } - - { - U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; - U32 chainIndex = chainPackedPointer >> 8; - U32 const chainLength = chainPackedPointer & 0xFF; - U32 const chainAttempts = nbAttempts - ddsAttempt; - U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; - U32 chainAttempt; - - for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { - PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); - } - - for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { - size_t currentMl=0; - const BYTE* match; - matchIndex = dms->chainTable[chainIndex]; - match = ddsBase + matchIndex; - - /* guaranteed by table construction */ - assert(matchIndex >= ddsLowestIndex); - assert(match+4 <= ddsEnd); - if (MEM_read32(match) == MEM_read32(ip)) { - /* assumption : matchIndex <= dictLimit-4 (by table construction) */ - currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; - } - - /* save best solution */ - if (currentMl > ml) { - ml = currentMl; - *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; - if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ - } - } - } + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); } else if (dictMode == ZSTD_dictMatchState) { const U32* const dmsChainTable = dms->chainTable; const U32 dmsChainSize = (1 << dms->cParams.chainLog); @@ -770,7 +738,8 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } @@ -783,75 +752,725 @@ size_t ZSTD_HcFindBestMatch_generic ( return ml; } +/* ********************************* +* (SIMD) Row-based matchfinder +***********************************/ +/* Constants for row-based hash */ +#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ +#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) +#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */ + +/* ZSTD_VecMask_next(): + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + assert(val != 0); +# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) + if (sizeof(size_t) == 4) { + U32 mostSignificantWord = (U32)(val >> 32); + U32 leastSignificantWord = (U32)val; + if (leastSignificantWord == 0) { + return 32 + (U32)__builtin_ctz(mostSignificantWord); + } else { + return (U32)__builtin_ctz(leastSignificantWord); + } + } else { + return (U32)__builtin_ctzll(val); + } +# else + /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count + * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer + */ + val = ~val & (val - 1ULL); /* Lowest set bit mask */ + val = val - ((val >> 1) & 0x5555555555555555); + val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); + return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +# endif +} + +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. + * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts + */ +FORCE_INLINE_TEMPLATE +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { + assert(count < 64); + count &= 0x3F; /* for fickle pattern recognition */ + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { + assert(count < 32); + count &= 0x1F; /* for fickle pattern recognition */ + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +FORCE_INLINE_TEMPLATE +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { + assert(count < 16); + count &= 0x0F; /* for fickle pattern recognition */ + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +} + +/* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" + * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { + U32 const next = (*tagRow - 1) & rowMask; + *tagRow = (BYTE)next; + return next; +} + +/* ZSTD_isAligned(): + * Checks that a pointer is aligned to "align" bytes which must be a power of 2. + */ +MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + assert((align & (align - 1)) == 0); + return (((size_t)ptr) & (align - 1)) == 0; +} + +/* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); + /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */ + } + PREFETCH_L1(tagTable + relRow); + if (rowLog == 6) { + PREFETCH_L1(tagTable + relRow + 32); + } + assert(rowLog == 4 || rowLog == 5 || rowLog == 6); + assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */ + assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */ +} + +/* ZSTD_row_fillHashCache(): + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); + U32 const* const hashTable = ms->hashTable; + U16 const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { + U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; } + + DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1], + ms->hashCache[2], ms->hashCache[3], ms->hashCache[4], + ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]); } +/* ZSTD_row_nextCachedHash(): + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, + U16 const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, + U32 const rowLog, U32 const mls) +{ + U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; + cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash; + return hash; + } +} -static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +/* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + U32 updateStartIdx, U32 const updateEndIdx, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); + U32* const hashTable = ms->hashTable; + U16* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { + U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) + : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. + Explicit cast allows us to get exact desired position within each row */ + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + + assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); + ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; } } +/* ZSTD_row_update_internal(): + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) +{ + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + const U32 kSkipThreshold = 384; + const U32 kMaxMatchStartPositionsToUpdate = 96; + const U32 kMaxMatchEndPositionsToUpdate = 32; + + if (useCache) { + /* Only skip positions when using hash cache, i.e. + * if we are loading a dict, don't skip anything. + * If we decide to skip, then we only update a set number + * of positions at the beginning and end of the match. + */ + if (UNLIKELY(target - idx > kSkipThreshold)) { + U32 const bound = idx + kMaxMatchStartPositionsToUpdate; + ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache); + idx = target - kMaxMatchEndPositionsToUpdate; + ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1); + } + } + assert(target >= idx); + ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache); + ms->nextToUpdate = target; +} -static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +/* ZSTD_row_update(): + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary + * processing. + */ +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + const U32 rowMask = (1u << rowLog) - 1; + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); +} + +#if defined(ZSTD_ARCH_X86_SSE2) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch); + const __m128i comparisonMask = _mm_set1_epi8((char)tag); + int matches[4] = {0}; + int i; + assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4); + for (i=0; i<nbChunks; i++) { + const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i)); + const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask); + matches[i] = _mm_movemask_epi8(equalMask); } + if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head); + if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head); + assert(nbChunks == 4); + return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head); } +#endif +/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches + * the hash at the nth position in a row of the tagTable. + * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield + * to match up with the actual layout of the entries within the hashTable */ +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) +{ + const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + +#if defined(ZSTD_ARCH_X86_SSE2) + + return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); + +#else /* SW or NEON-LE */ + +# if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { + if (rowEntries == 16) { + const uint8x16_t chunk = vld1q_u8(src); + const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); + const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); + const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); + const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); + const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); + const U16 hi = (U16)vgetq_lane_u8(t3, 8); + const U16 lo = (U16)vgetq_lane_u8(t3, 0); + return ZSTD_rotateRight_U16((hi << 8) | lo, head); + } else if (rowEntries == 32) { + const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); + const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); + const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); + const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); + const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); + const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); + const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); + const uint8x8_t t0 = vreinterpret_u8_s8(pack0); + const uint8x8_t t1 = vreinterpret_u8_s8(pack1); + const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); + const uint8x8x2_t t3 = vuzp_u8(t2, t0); + const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); + const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); + return ZSTD_rotateRight_U32(matches, head); + } else { /* rowEntries == 64 */ + const uint8x16x4_t chunk = vld4q_u8(src); + const uint8x16_t dup = vdupq_n_u8(tag); + const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); + const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); + const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); + const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); + + const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); + const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); + const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); + const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); + const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); + return ZSTD_rotateRight_U64(matches, head); + } + } +# endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ + { const size_t chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; + const size_t x80 = x01 << 7; + const size_t splatChar = tag * x01; + ZSTD_VecMask matches = 0; + int i = rowEntries - chunkSize; + assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8)); + if (MEM_isLittleEndian()) { /* runtime check so have two loops */ + const size_t extractMagic = (xFF / 0x7F) >> chunkSize; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= (chunk * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } else { /* big endian: reverse bits during extraction */ + const size_t msb = xFF ^ (xFF >> 1); + const size_t extractMagic = (msb / 0x1FF) | msb; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } + matches = ~matches; + if (rowEntries == 16) { + return ZSTD_rotateRight_U16((U16)matches, head); + } else if (rowEntries == 32) { + return ZSTD_rotateRight_U32((U32)matches, head); + } else { + return ZSTD_rotateRight_U64((U64)matches, head); + } + } +#endif +} -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( +/* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: + * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" + * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines + * which row to insert into. + * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can + * be considered as a circular buffer with a "head" index that resides in the tagTable. + * - Also insert the "tag" into the equivalent row and position in the tagTable. + * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. + * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, + * for alignment/performance reasons, leaving some bytes unused. + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_RowFindBestMatch( ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); + U32* const hashTable = ms->hashTable; + U16* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const U32 curr = (U32)(ip-base); + const U32 maxDistance = 1U << cParams->windowLog; + const U32 lowestValid = ms->window.lowLimit; + const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + const U32 isDictionary = (ms->loadedDictEnd != 0); + const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; + U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */ + U32 dmsTag = 0; + U32* dmsRow = NULL; + BYTE* dmsTagRow = NULL; + + if (dictMode == ZSTD_dedicatedDictSearch) { + const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; + { /* Prefetch DDS hashtable entry */ + ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG; + PREFETCH_L1(&dms->hashTable[ddsIdx]); + } + ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0; + } + + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; + U16* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; + dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow); + dmsRow = dmsHashTable + dmsRelRow; + ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); + } + + /* Update the hashTable and tagTable up to (but not including) ip */ + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); + { /* Get the hash for ip, compute the appropriate row */ + U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); + U32 const head = *tagRow & rowMask; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; + ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); + + /* Cycle through the matches and prefetch */ + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { + U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; + U32 const matchIndex = row[matchPos]; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + PREFETCH_L1(base + matchIndex); + } else { + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex = matchBuffer[currMatch]; + size_t currentMl=0; + assert(matchIndex < curr); + assert(matchIndex >= lowLimit); + + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ + if (match[ml] == ip[ml]) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; + assert(match+4 <= dictEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; + } + + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = STORE_OFFSET(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } + } + + assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ + if (dictMode == ZSTD_dedicatedDictSearch) { + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); + } else if (dictMode == ZSTD_dictMatchState) { + /* TODO: Measure and potentially add prefetching to DMS */ + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + + { U32 const head = *dmsTagRow & rowMask; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); + + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { + U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex = matchBuffer[currMatch]; + size_t currentMl=0; + assert(matchIndex >= dmsLowestIndex); + assert(matchIndex < curr); + + { const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + } + + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } + } } + return ml; } +/* + * Generate search functions templated on (dictMode, mls, rowLog). + * These functions are outlined for code size & compilation time. + * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating a + * bunch of constants from the ZSTD_matchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. + * + * TODO: Move the match re-winding into searchMax. This improves compression + * ratio, and unlocks further simplifications with the next TODO. + * + * TODO: Try moving the repcode search into searchMax. After the re-winding + * and repcode search are in searchMax, there is no more logic in the match + * finder loop that requires knowledge about the dictMode. So we should be + * able to avoid force inlining it, and we can join the extDict loop with + * the single segment loop. It should go in searchMax instead of its own + * function to avoid having multiple virtual function calls per search. + */ + +#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls +#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls +#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog + +#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE + +#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ + } \ + +#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ + } \ + +#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ + return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ + } \ + +#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ + X(dictMode, mls, 4) \ + X(dictMode, mls, 5) \ + X(dictMode, mls, 6) + +#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) + +#define ZSTD_FOR_EACH_MLS(X, dictMode) \ + X(dictMode, 4) \ + X(dictMode, 5) \ + X(dictMode, 6) + +#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \ + X(__VA_ARGS__, noDict) \ + X(__VA_ARGS__, extDict) \ + X(__VA_ARGS__, dictMatchState) \ + X(__VA_ARGS__, dedicatedDictSearch) + +/* Generate row search fns for each combination of (dictMode, mls, rowLog) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) +/* Generate binary Tree search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) +/* Generate hash chain search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) + +typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; + +#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + case rowLog: \ + return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); + +#define ZSTD_SWITCH_MLS(X, dictMode) \ + switch (mls) { \ + ZSTD_FOR_EACH_MLS(X, dictMode) \ + } + +#define ZSTD_SWITCH_ROWLOG(dictMode, mls) \ + case mls: \ + switch (rowLog) { \ + ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ + } \ + ZSTD_UNREACHABLE; \ + break; + +#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ + switch (searchMethod) { \ + case search_hashChain: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ + break; \ + case search_binaryTree: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ + break; \ + case search_rowHash: \ + ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ + break; \ + } \ + ZSTD_UNREACHABLE; + +/* + * Searches for the longest match at @p ip. + * Dispatches to the correct implementation function based on the + * (searchMethod, dictMode, mls, rowLog). We use switch statements + * here instead of using an indirect function call through a function + * pointer because after Spectre and Meltdown mitigations, indirect + * function calls can be very costly, especially in the kernel. + * + * NOTE: dictMode and searchMethod should be templated, so those switch + * statements should be optimized out. Only the mls & rowLog switches + * should be left. + * + * @param ms The match state. + * @param ip The position to search at. + * @param iend The end of the input data. + * @param[out] offsetPtr Stores the match offset into this pointer. + * @param mls The minimum search length, in the range [4, 6]. + * @param rowLog The row log (if applicable), in the range [4, 6]. + * @param searchMethod The search method to use (templated). + * @param dictMode The dictMode (templated). + * + * @returns The length of the longest match found, or < mls if no match is found. + * If a match is found its offset is stored in @p offsetPtr. + */ +FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + ZSTD_matchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, + U32 const mls, + U32 const rowLog, + searchMethod_e const searchMethod, + ZSTD_dictMode_e const dictMode) +{ + if (dictMode == ZSTD_noDict) { + ZSTD_SWITCH_SEARCH_METHOD(noDict) + } else if (dictMode == ZSTD_extDict) { + ZSTD_SWITCH_SEARCH_METHOD(extDict) + } else if (dictMode == ZSTD_dictMatchState) { + ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) + } else if (dictMode == ZSTD_dedicatedDictSearch) { + ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) + } + ZSTD_UNREACHABLE; + return 0; +} + /* ******************************* * Common parser - lazy strategy *********************************/ -typedef enum { search_hashChain, search_binaryTree } searchMethod_e; FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_generic( @@ -865,41 +1484,13 @@ ZSTD_compressBlock_lazy_generic( const BYTE* ip = istart; const BYTE* anchor = istart; const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; + const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base = ms->window.base; const U32 prefixLowestIndex = ms->window.dictLimit; const BYTE* const prefixLowest = base + prefixLowestIndex; + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - - /* - * This table is indexed first by the four ZSTD_dictMode_e values, and then - * by the two searchMethod_e values. NULLs are placed for configurations - * that should never occur (extDict modes go to the other implementation - * below and there is no DDSS for binary tree search yet). - */ - const searchMax_f searchFuncs[4][2] = { - { - ZSTD_HcFindBestMatch_selectMLS, - ZSTD_BtFindBestMatch_selectMLS - }, - { - NULL, - NULL - }, - { - ZSTD_HcFindBestMatch_dictMatchState_selectMLS, - ZSTD_BtFindBestMatch_dictMatchState_selectMLS - }, - { - ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS, - NULL - } - }; - - searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree]; U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; const int isDMS = dictMode == ZSTD_dictMatchState; @@ -915,11 +1506,7 @@ ZSTD_compressBlock_lazy_generic( 0; const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); - assert(searchMax != NULL); - - DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode); - - /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod); ip += (dictAndPrefixLength == 0); if (dictMode == ZSTD_noDict) { U32 const curr = (U32)(ip - base); @@ -935,6 +1522,12 @@ ZSTD_compressBlock_lazy_generic( assert(offset_2 <= dictAndPrefixLength); } + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, + MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), + ms->nextToUpdate, ilimit); + } + /* Match Loop */ #if defined(__x86_64__) /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the @@ -944,8 +1537,9 @@ ZSTD_compressBlock_lazy_generic( #endif while (ip < ilimit) { size_t matchLength=0; - size_t offset=0; + size_t offcode=STORE_REPCODE_1; const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); /* check repCode */ if (isDxS) { @@ -969,9 +1563,9 @@ ZSTD_compressBlock_lazy_generic( /* first search (depth 0) */ { size_t offsetFound = 999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); if (ml2 > matchLength) - matchLength = ml2, start = ip, offset=offsetFound; + matchLength = ml2, start = ip, offcode=offsetFound; } if (matchLength < 4) { @@ -982,14 +1576,15 @@ ZSTD_compressBlock_lazy_generic( /* let's try to find a better solution */ if (depth>=1) while (ip<ilimit) { + DEBUGLOG(7, "search depth 1"); ip ++; if ( (dictMode == ZSTD_noDict) - && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1001,30 +1596,31 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; } } { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offcode = offset2, start = ip; continue; /* search a better one */ } } /* let's find an even better one */ if ((depth==2) && (ip<ilimit)) { + DEBUGLOG(7, "search depth 2"); ip ++; if ( (dictMode == ZSTD_noDict) - && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1036,46 +1632,45 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; } } { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offcode = offset2, start = ip; continue; } } } break; /* nothing found : store previous solution */ } /* NOTE: - * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. - * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which - * overflows the pointer, which is undefined behavior. + * Pay attention that `start[-value]` can lead to strange undefined behavior + * notably if `value` is unsigned, resulting in a large positive `-value`. */ /* catch up */ - if (offset) { + if (STORED_IS_OFFSET(offcode)) { if (dictMode == ZSTD_noDict) { - while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest)) - && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */ + while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) + && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ { start--; matchLength++; } } if (isDxS) { - U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ } - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); } /* store sequence */ _storeSequence: - { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + { size_t const litLength = (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); anchor = ip = start + matchLength; } @@ -1091,8 +1686,8 @@ _storeSequence: && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ip += matchLength; anchor = ip; continue; @@ -1106,8 +1701,8 @@ _storeSequence: && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ @@ -1200,6 +1795,70 @@ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); } +/* Row-based matchfinder */ +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); +} + + +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); +} FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_extDict_generic( @@ -1212,7 +1871,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* ip = istart; const BYTE* anchor = istart; const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; + const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base = ms->window.base; const U32 dictLimit = ms->window.dictLimit; const BYTE* const prefixStart = base + dictLimit; @@ -1220,18 +1879,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const dictEnd = dictBase + dictLimit; const BYTE* const dictStart = dictBase + ms->window.lowLimit; const U32 windowLog = ms->cParams.windowLog; - - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); U32 offset_1 = rep[0], offset_2 = rep[1]; - DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic"); + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); /* init */ ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, + MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), + ms->nextToUpdate, ilimit); + } /* Match Loop */ #if defined(__x86_64__) @@ -1242,7 +1903,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( #endif while (ip < ilimit) { size_t matchLength=0; - size_t offset=0; + size_t offcode=STORE_REPCODE_1; const BYTE* start=ip+1; U32 curr = (U32)(ip-base); @@ -1251,7 +1912,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(curr+1 - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ if (MEM_read32(ip+1) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1261,9 +1923,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( /* first search (depth 0) */ { size_t offsetFound = 999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); if (ml2 > matchLength) - matchLength = ml2, start = ip, offset=offsetFound; + matchLength = ml2, start = ip, offcode=offsetFound; } if (matchLength < 4) { @@ -1277,29 +1939,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offset) { + if (offcode) { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offset = 0, start = ip; + matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; } } /* search match, depth 1 */ { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offcode = offset2, start = ip; continue; /* search a better one */ } } @@ -1308,47 +1971,48 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offset) { + if (offcode) { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offset = 0, start = ip; + matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; } } /* search match, depth 2 */ { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offcode = offset2, start = ip; continue; } } } break; /* nothing found : store previous solution */ } /* catch up */ - if (offset) { - U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + if (STORED_IS_OFFSET(offcode)) { + U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); } /* store sequence */ _storeSequence: - { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + { size_t const litLength = (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); anchor = ip = start + matchLength; } @@ -1359,13 +2023,14 @@ _storeSequence: const U32 repIndex = repCurrent - offset_2; const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ @@ -1412,3 +2077,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict( { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); } + +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); +} + +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); +} + +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); +} diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h index 2fc5a6182134..e5bdf4df8dde 100644 --- a/lib/zstd/compress/zstd_lazy.h +++ b/lib/zstd/compress/zstd_lazy.h @@ -23,6 +23,7 @@ #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); @@ -40,6 +41,15 @@ size_t ZSTD_compressBlock_lazy( size_t ZSTD_compressBlock_greedy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btlazy2_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -53,6 +63,15 @@ size_t ZSTD_compressBlock_lazy_dictMatchState( size_t ZSTD_compressBlock_greedy_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -63,6 +82,15 @@ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_greedy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -73,9 +101,19 @@ size_t ZSTD_compressBlock_lazy_extDict( size_t ZSTD_compressBlock_lazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btlazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); + #endif /* ZSTD_LAZY_H */ diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c index 8ef7e88a5add..dd86fc83e7dd 100644 --- a/lib/zstd/compress/zstd_ldm.c +++ b/lib/zstd/compress/zstd_ldm.c @@ -57,6 +57,33 @@ static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const* } } +/* ZSTD_ldm_gear_reset() + * Feeds [data, data + minMatchLength) into the hash without registering any + * splits. This effectively resets the hash state. This is used when skipping + * over data, either at the beginning of a block, or skipping sections. + */ +static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, + BYTE const* data, size_t minMatchLength) +{ + U64 hash = state->rolling; + size_t n = 0; + +#define GEAR_ITER_ONCE() do { \ + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n += 1; \ + } while (0) + while (n + 3 < minMatchLength) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < minMatchLength) { + GEAR_ITER_ONCE(); + } +#undef GEAR_ITER_ONCE +} + /* ZSTD_ldm_gear_feed(): * * Registers in the splits array all the split points found in the first @@ -132,12 +159,12 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params) size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); - return params.enableLdm ? totalSize : 0; + return params.enableLdm == ZSTD_ps_enable ? totalSize : 0; } size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) { - return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0; + return params.enableLdm == ZSTD_ps_enable ? (maxChunkSize / params.minMatchLength) : 0; } /* ZSTD_ldm_getBucket() : @@ -255,7 +282,7 @@ void ZSTD_ldm_fillHashTable( while (ip < iend) { size_t hashed; unsigned n; - + numSplits = 0; hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); @@ -327,16 +354,8 @@ static size_t ZSTD_ldm_generateSequences_internal( /* Initialize the rolling hash state with the first minMatchLength bytes */ ZSTD_ldm_gear_init(&hashState, params); - { - size_t n = 0; - - while (n < minMatchLength) { - numSplits = 0; - n += ZSTD_ldm_gear_feed(&hashState, ip + n, minMatchLength - n, - splits, &numSplits); - } - ip += minMatchLength; - } + ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); + ip += minMatchLength; while (ip < ilimit) { size_t hashed; @@ -361,6 +380,7 @@ static size_t ZSTD_ldm_generateSequences_internal( for (n = 0; n < numSplits; n++) { size_t forwardMatchLength = 0, backwardMatchLength = 0, bestMatchLength = 0, mLength; + U32 offset; BYTE const* const split = candidates[n].split; U32 const checksum = candidates[n].checksum; U32 const hash = candidates[n].hash; @@ -428,9 +448,9 @@ static size_t ZSTD_ldm_generateSequences_internal( } /* Match found */ + offset = (U32)(split - base) - bestEntry->offset; mLength = forwardMatchLength + backwardMatchLength; { - U32 const offset = (U32)(split - base) - bestEntry->offset; rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; /* Out of sequence storage */ @@ -447,6 +467,21 @@ static size_t ZSTD_ldm_generateSequences_internal( ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); anchor = split + forwardMatchLength; + + /* If we find a match that ends after the data that we've hashed + * then we have a repeating, overlapping, pattern. E.g. all zeros. + * If one repetition of the pattern matches our `stopMask` then all + * repetitions will. We don't need to insert them all into out table, + * only the first one. So skip over overlapping matches. + * This is a major speed boost (20x) for compressing a single byte + * repeated, when that byte ends up in the table. + */ + if (anchor > ip + hashed) { + ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength); + /* Continue the outer loop at anchor (ip + hashed == anchor). */ + ip = anchor - hashed; + break; + } } ip += hashed; @@ -500,7 +535,7 @@ size_t ZSTD_ldm_generateSequences( assert(chunkStart < iend); /* 1. Perform overflow correction if necessary. */ - if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) { + if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) { U32 const ldmHSize = 1U << params->hashLog; U32 const correction = ZSTD_window_correctOverflow( &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); @@ -544,7 +579,9 @@ size_t ZSTD_ldm_generateSequences( return 0; } -void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) { +void +ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) +{ while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; if (srcSize <= seq->litLength) { @@ -622,12 +659,13 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_paramSwitch_e useRowMatchFinder, void const* src, size_t srcSize) { const ZSTD_compressionParameters* const cParams = &ms->cParams; unsigned const minMatch = cParams->minMatch; ZSTD_blockCompressor const blockCompressor = - ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms)); + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); /* Input bounds */ BYTE const* const istart = (BYTE const*)src; BYTE const* const iend = istart + srcSize; @@ -673,8 +711,8 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, rep[0] = sequence.offset; /* Store the sequence */ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, - sequence.offset + ZSTD_REP_MOVE, - sequence.matchLength - MINMATCH); + STORE_OFFSET(sequence.offset), + sequence.matchLength); ip += sequence.matchLength; } } diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h index 25b25270b72e..fbc6a5e88fd7 100644 --- a/lib/zstd/compress/zstd_ldm.h +++ b/lib/zstd/compress/zstd_ldm.h @@ -63,6 +63,7 @@ size_t ZSTD_ldm_generateSequences( */ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_paramSwitch_e useRowMatchFinder, void const* src, size_t srcSize); /* diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h index e5c24d856b0a..647f865be290 100644 --- a/lib/zstd/compress/zstd_ldm_geartab.h +++ b/lib/zstd/compress/zstd_ldm_geartab.h @@ -11,7 +11,10 @@ #ifndef ZSTD_LDM_GEARTAB_H #define ZSTD_LDM_GEARTAB_H -static U64 ZSTD_ldm_gearTab[256] = { +#include "../common/compiler.h" /* UNUSED_ATTR */ +#include "../common/mem.h" /* U64 */ + +static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = { 0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc, 0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05, 0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e, diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c index dfc55e3e8119..fd82acfda62f 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -8,25 +8,12 @@ * You may select, at your option, one of the above-listed licenses. */ -/* - * Disable inlining for the optimal parser for the kernel build. - * It is unlikely to be used in the kernel, and where it is used - * latency shouldn't matter because it is very slow to begin with. - * We prefer a ~180KB binary size win over faster optimal parsing. - * - * TODO(https://github.com/facebook/zstd/issues/2862): - * Improve the code size of the optimal parser in general, so we - * don't need this hack for the kernel build. - */ -#define ZSTD_NO_INLINE 1 - #include "zstd_compress_internal.h" #include "hist.h" #include "zstd_opt.h" #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ -#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */ #define ZSTD_MAX_PRICE (1<<30) #define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ @@ -36,11 +23,11 @@ * Price functions for optimal parser ***************************************/ -#if 0 /* approximation at bit level */ +#if 0 /* approximation at bit level (for tests) */ # define BITCOST_ACCURACY 0 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -# define WEIGHT(stat) ((void)opt, ZSTD_bitWeight(stat)) -#elif 0 /* fractional bit accuracy */ +# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) +#elif 0 /* fractional bit accuracy (for tests) */ # define BITCOST_ACCURACY 8 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) # define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) @@ -78,7 +65,7 @@ MEM_STATIC double ZSTD_fCost(U32 price) static int ZSTD_compressedLiterals(optState_t const* const optPtr) { - return optPtr->literalCompressionMode != ZSTD_lcm_uncompressed; + return optPtr->literalCompressionMode != ZSTD_ps_disable; } static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) @@ -91,25 +78,46 @@ static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) } -/* ZSTD_downscaleStat() : - * reduce all elements in table by a factor 2^(ZSTD_FREQ_DIV+malus) - * return the resulting sum of elements */ -static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus) +static U32 sum_u32(const unsigned table[], size_t nbElts) +{ + size_t n; + U32 total = 0; + for (n=0; n<nbElts; n++) { + total += table[n]; + } + return total; +} + +static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) { U32 s, sum=0; - DEBUGLOG(5, "ZSTD_downscaleStat (nbElts=%u)", (unsigned)lastEltIndex+1); - assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31); + DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); + assert(shift < 30); for (s=0; s<lastEltIndex+1; s++) { - table[s] = 1 + (table[s] >> (ZSTD_FREQ_DIV+malus)); + table[s] = 1 + (table[s] >> shift); sum += table[s]; } return sum; } +/* ZSTD_scaleStats() : + * reduce all elements in table is sum too large + * return the resulting sum of elements */ +static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) +{ + U32 const prevsum = sum_u32(table, lastEltIndex+1); + U32 const factor = prevsum >> logTarget; + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; + return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); +} + /* ZSTD_rescaleFreqs() : * if first block (detected by optPtr->litLengthSum == 0) : init statistics * take hints from dictionary if there is one - * or init from zero, using src for literals stats, or flat 1 for match symbols + * and init from zero if there is none, + * using src for literals stats, and baseline stats for sequence symbols * otherwise downscale existing stats, to be used as seed for next block. */ static void @@ -138,7 +146,7 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, optPtr->litSum = 0; for (lit=0; lit<=MaxLit; lit++) { U32 const scaleLog = 11; /* scale to 2K */ - U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit); + U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit); assert(bitCost <= scaleLog); optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; optPtr->litSum += optPtr->litFreq[lit]; @@ -186,14 +194,19 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, if (compressedLiterals) { unsigned lit = MaxLit; HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ - optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); + optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); } - { unsigned ll; - for (ll=0; ll<=MaxLL; ll++) - optPtr->litLengthFreq[ll] = 1; + { unsigned const baseLLfreqs[MaxLL+1] = { + 4, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs)); + optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL+1); } - optPtr->litLengthSum = MaxLL+1; { unsigned ml; for (ml=0; ml<=MaxML; ml++) @@ -201,21 +214,26 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, } optPtr->matchLengthSum = MaxML+1; - { unsigned of; - for (of=0; of<=MaxOff; of++) - optPtr->offCodeFreq[of] = 1; + { unsigned const baseOFCfreqs[MaxOff+1] = { + 6, 2, 1, 1, 2, 3, 4, 4, + 4, 3, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs)); + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); } - optPtr->offCodeSum = MaxOff+1; + } } else { /* new block : re-use previous statistics, scaled down */ if (compressedLiterals) - optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); - optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0); - optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0); - optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0); + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); + optPtr->litLengthSum = ZSTD_scaleStats(optPtr->litLengthFreq, MaxLL, 11); + optPtr->matchLengthSum = ZSTD_scaleStats(optPtr->matchLengthFreq, MaxML, 11); + optPtr->offCodeSum = ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff, 11); } ZSTD_setBasePrices(optPtr, optLevel); @@ -251,7 +269,16 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, * cost of literalLength symbol */ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel) { - if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel); + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); + /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX + * because it isn't representable in the zstd format. So instead just + * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block + * would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); /* dynamic statistics */ { U32 const llCode = ZSTD_LLcode(litLength); @@ -264,15 +291,17 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP /* ZSTD_getMatchPrice() : * Provides the cost of the match part (offset + matchLength) of a sequence * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. - * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ + * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ FORCE_INLINE_TEMPLATE U32 -ZSTD_getMatchPrice(U32 const offset, +ZSTD_getMatchPrice(U32 const offcode, U32 const matchLength, const optState_t* const optPtr, int const optLevel) { U32 price; - U32 const offCode = ZSTD_highbit32(offset+1); + U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); U32 const mlBase = matchLength - MINMATCH; assert(matchLength >= MINMATCH); @@ -315,8 +344,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, optPtr->litLengthSum++; } - /* match offset code (0-2=>repCode; 3+=>offset+2) */ - { U32 const offCode = ZSTD_highbit32(offsetCode+1); + /* offset code : expected to follow storeSeq() numeric representation */ + { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); assert(offCode <= MaxOff); optPtr->offCodeFreq[offCode]++; optPtr->offCodeSum++; @@ -350,7 +379,7 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) /* Update hashTable3 up to ip (excluded) Assumption : always within prefix (i.e. not within extDict) */ -static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, +static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, U32* nextToUpdate3, const BYTE* const ip) { @@ -376,11 +405,13 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, * Binary Tree search ***************************************/ /* ZSTD_insertBt1() : add one or multiple positions to tree. - * ip : assumed <= iend-8 . + * @param ip assumed <= iend-8 . + * @param target The target of ZSTD_updateTree_internal() - we are filling to this position * @return : nb of positions added */ static U32 ZSTD_insertBt1( - ZSTD_matchState_t* ms, + const ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, + U32 const target, U32 const mls, const int extDict) { const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -403,7 +434,10 @@ static U32 ZSTD_insertBt1( U32* smallerPtr = bt + 2*(curr&btMask); U32* largerPtr = smallerPtr + 1; U32 dummy32; /* to be nullified at the end */ - U32 const windowLow = ms->window.lowLimit; + /* windowLow is based on target because + * we only need positions that will be in the window at the end of the tree update. + */ + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog); U32 matchEndIdx = curr+8+1; size_t bestLength = 8; U32 nbCompares = 1U << cParams->searchLog; @@ -416,6 +450,7 @@ static U32 ZSTD_insertBt1( DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr); + assert(curr <= target); assert(ip <= iend-8); /* required for h calculation */ hashTable[h] = curr; /* Update Hash Table */ @@ -504,7 +539,7 @@ void ZSTD_updateTree_internal( idx, target, dictMode); while(idx < target) { - U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict); + U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, target, mls, dictMode == ZSTD_extDict); assert(idx < (U32)(idx + forward)); idx += forward; } @@ -609,7 +644,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", repCode, ll0, repOffset, repLen); bestLength = repLen; - matches[mnum].off = repCode - ll0; + matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ matches[mnum].len = (U32)repLen; mnum++; if ( (repLen > sufficient_len) @@ -638,7 +673,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( bestLength = mlen; assert(curr > matchIndex3); assert(mnum==0); /* no prior solution */ - matches[0].off = (curr - matchIndex3) + ZSTD_REP_MOVE; + matches[0].off = STORE_OFFSET(curr - matchIndex3); matches[0].len = (U32)mlen; mnum = 1; if ( (mlen > sufficient_len) | @@ -647,7 +682,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( return 1; } } } /* no dictMatchState lookup: dicts don't have a populated HC3 table */ - } + } /* if (mls == 3) */ hashTable[h] = curr; /* Update Hash Table */ @@ -672,20 +707,19 @@ U32 ZSTD_insertBtAndGetAllMatches ( if (matchLength > bestLength) { DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", - (U32)matchLength, curr - matchIndex, curr - matchIndex + ZSTD_REP_MOVE); + (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); assert(matchEndIdx > matchIndex); if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; bestLength = matchLength; - matches[mnum].off = (curr - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].off = STORE_OFFSET(curr - matchIndex); matches[mnum].len = (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */ break; /* drop, to preserve bt consistency (miss a little bit of compression) */ - } - } + } } if (match[matchLength] < ip[matchLength]) { /* match smaller than current */ @@ -721,18 +755,17 @@ U32 ZSTD_insertBtAndGetAllMatches ( if (matchLength > bestLength) { matchIndex = dictMatchIndex + dmsIndexDelta; DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", - (U32)matchLength, curr - matchIndex, curr - matchIndex + ZSTD_REP_MOVE); + (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; bestLength = matchLength; - matches[mnum].off = (curr - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].off = STORE_OFFSET(curr - matchIndex); matches[mnum].len = (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { break; /* drop, to guarantee consistency (miss a little bit of compression) */ - } - } + } } if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */ if (match[matchLength] < ip[matchLength]) { @@ -742,39 +775,91 @@ U32 ZSTD_insertBtAndGetAllMatches ( /* match is larger than current */ commonLengthLarger = matchLength; dictMatchIndex = nextPtr[0]; - } - } - } + } } } /* if (dictMode == ZSTD_dictMatchState) */ assert(matchEndIdx > curr+8); ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ return mnum; } - -FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( - ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */ - ZSTD_matchState_t* ms, - U32* nextToUpdate3, - const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode, - const U32 rep[ZSTD_REP_NUM], - U32 const ll0, - U32 const lengthToBeat) +typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, + ZSTD_matchState_t*, + U32*, + const BYTE*, + const BYTE*, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat); + +FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat, + const ZSTD_dictMode_e dictMode, + const U32 mls) { - const ZSTD_compressionParameters* const cParams = &ms->cParams; - U32 const matchLengthSearch = cParams->minMatch; - DEBUGLOG(8, "ZSTD_BtGetAllMatches"); - if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode); - switch(matchLengthSearch) - { - case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3); - default : - case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4); - case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5); - case 7 : - case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6); + assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls); + DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls); + if (ip < ms->window.base + ms->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode); + return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls); +} + +#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls + +#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ + ZSTD_matchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ + const U32 rep[ZSTD_REP_NUM], \ + U32 const ll0, \ + U32 const lengthToBeat) \ + { \ + return ZSTD_btGetAllMatches_internal( \ + matches, ms, nextToUpdate3, ip, iHighLimit, \ + rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \ + } + +#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6) + +GEN_ZSTD_BT_GET_ALL_MATCHES(noDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(extDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + +#define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode) \ + { \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6) \ } + +static ZSTD_getAllMatchesFn +ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode) +{ + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState) + }; + U32 const mls = BOUNDED(3, ms->cParams.minMatch, 6); + assert((U32)dictMode < 3); + assert(mls - 3 < 4); + return getAllMatchesFns[(int)dictMode][mls - 3]; } /* *********************** @@ -783,16 +868,18 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( /* Struct containing info needed to make decision about ldm inclusion */ typedef struct { - rawSeqStore_t seqStore; /* External match candidates store for this block */ - U32 startPosInBlock; /* Start position of the current match candidate */ - U32 endPosInBlock; /* End position of the current match candidate */ - U32 offset; /* Offset of the match candidate */ + rawSeqStore_t seqStore; /* External match candidates store for this block */ + U32 startPosInBlock; /* Start position of the current match candidate */ + U32 endPosInBlock; /* End position of the current match candidate */ + U32 offset; /* Offset of the match candidate */ } ZSTD_optLdm_t; /* ZSTD_optLdm_skipRawSeqStoreBytes(): - * Moves forward in rawSeqStore by nbBytes, which will update the fields 'pos' and 'posInSequence'. + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. */ -static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { +static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) +{ U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); while (currPos && rawSeqStore->pos < rawSeqStore->size) { rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; @@ -813,8 +900,10 @@ static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t * Calculates the beginning and end of the next match in the current block. * Updates 'pos' and 'posInSequence' of the ldmSeqStore. */ -static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock, - U32 blockBytesRemaining) { +static void +ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock, + U32 blockBytesRemaining) +{ rawSeq currSeq; U32 currBlockEndPos; U32 literalsBytesRemaining; @@ -826,8 +915,8 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 cu optLdm->endPosInBlock = UINT_MAX; return; } - /* Calculate appropriate bytes left in matchLength and litLength after adjusting - based on ldmSeqStore->posInSequence */ + /* Calculate appropriate bytes left in matchLength and litLength + * after adjusting based on ldmSeqStore->posInSequence */ currSeq = optLdm->seqStore.seq[optLdm->seqStore.pos]; assert(optLdm->seqStore.posInSequence <= currSeq.litLength + currSeq.matchLength); currBlockEndPos = currPosInBlock + blockBytesRemaining; @@ -863,15 +952,16 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 cu } /* ZSTD_optLdm_maybeAddMatch(): - * Adds a match if it's long enough, based on it's 'matchStartPosInBlock' - * and 'matchEndPosInBlock', into 'matches'. Maintains the correct ordering of 'matches' + * Adds a match if it's long enough, + * based on it's 'matchStartPosInBlock' and 'matchEndPosInBlock', + * into 'matches'. Maintains the correct ordering of 'matches'. */ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, - ZSTD_optLdm_t* optLdm, U32 currPosInBlock) { - U32 posDiff = currPosInBlock - optLdm->startPosInBlock; + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) +{ + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ - U32 candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; - U32 candidateOffCode = optLdm->offset + ZSTD_REP_MOVE; + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; /* Ensure that current block position is not outside of the match */ if (currPosInBlock < optLdm->startPosInBlock @@ -881,6 +971,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, } if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { + U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", candidateOffCode, candidateMatchLength, currPosInBlock); matches[*nbMatches].len = candidateMatchLength; @@ -892,8 +983,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, /* ZSTD_optLdm_processMatchCandidate(): * Wrapper function to update ldm seq store and call ldm functions as necessary. */ -static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_match_t* matches, U32* nbMatches, - U32 currPosInBlock, U32 remainingBytes) { +static void +ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, + U32 currPosInBlock, U32 remainingBytes) +{ if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { return; } @@ -904,19 +998,19 @@ static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_match_ * at the end of a match from the ldm seq store, and will often be some bytes * over beyond matchEndPosInBlock. As such, we need to correct for these "overshoots" */ - U32 posOvershoot = currPosInBlock - optLdm->endPosInBlock; + U32 const posOvershoot = currPosInBlock - optLdm->endPosInBlock; ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOvershoot); - } + } ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); } ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); } + /*-******************************* * Optimal parser *********************************/ - static U32 ZSTD_totalLen(ZSTD_optimal_t sol) { return sol.litlen + sol.mlen; @@ -957,6 +1051,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, const BYTE* const prefixStart = base + ms->window.dictLimit; const ZSTD_compressionParameters* const cParams = &ms->cParams; + ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode); + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4; U32 nextToUpdate3 = ms->nextToUpdate; @@ -984,7 +1080,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* find first match */ { U32 const litlen = (U32)(ip - anchor); U32 const ll0 = !litlen; - U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, (U32)(ip-istart), (U32)(iend - ip)); if (!nbMatches) { ip++; continue; } @@ -998,18 +1094,18 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, * in every price. We include the literal length to avoid negative * prices when we subtract the previous literal length. */ - opt[0].price = ZSTD_litLengthPrice(litlen, optStatePtr, optLevel); + opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel); /* large match -> immediate encoding */ { U32 const maxML = matches[nbMatches-1].len; - U32 const maxOffset = matches[nbMatches-1].off; + U32 const maxOffcode = matches[nbMatches-1].off; DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", - nbMatches, maxML, maxOffset, (U32)(ip-prefixStart)); + nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); if (maxML > sufficient_len) { lastSequence.litlen = litlen; lastSequence.mlen = maxML; - lastSequence.off = maxOffset; + lastSequence.off = maxOffcode; DEBUGLOG(6, "large match (%u>%u), immediate encoding", maxML, sufficient_len); cur = 0; @@ -1018,24 +1114,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, } } /* set prices for first matches starting position == 0 */ - { U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); + assert(opt[0].price >= 0); + { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); U32 pos; U32 matchNb; for (pos = 1; pos < minMatch; pos++) { opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ } for (matchNb = 0; matchNb < nbMatches; matchNb++) { - U32 const offset = matches[matchNb].off; + U32 const offcode = matches[matchNb].off; U32 const end = matches[matchNb].len; for ( ; pos <= end ; pos++ ) { - U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel); + U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); U32 const sequencePrice = literalsPrice + matchPrice; DEBUGLOG(7, "rPos:%u => set initial price : %.2f", pos, ZSTD_fCost(sequencePrice)); opt[pos].mlen = pos; - opt[pos].off = offset; + opt[pos].off = offcode; opt[pos].litlen = litlen; - opt[pos].price = sequencePrice; + opt[pos].price = (int)sequencePrice; } } last_pos = pos-1; } @@ -1050,9 +1147,9 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* Fix current position with one literal if cheaper */ { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1; int const price = opt[cur-1].price - + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) - + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) - - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); + + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) + + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) + - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); assert(price < 1000000000); /* overflow check */ if (price <= opt[cur].price) { DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", @@ -1078,7 +1175,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, assert(cur >= opt[cur].mlen); if (opt[cur].mlen != 0) { U32 const prev = cur - opt[cur].mlen; - repcodes_t newReps = ZSTD_updateRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); + repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); } else { ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); @@ -1095,11 +1192,12 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ } + assert(opt[cur].price >= 0); { U32 const ll0 = (opt[cur].mlen != 0); U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; - U32 const previousPrice = opt[cur].price; + U32 const previousPrice = (U32)opt[cur].price; U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); - U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); U32 matchNb; ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, @@ -1137,7 +1235,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ U32 const pos = cur + mlen; - int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); if ((pos > last_pos) || (price < opt[pos].price)) { DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", @@ -1167,7 +1265,7 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ * update them while traversing the sequences. */ if (lastSequence.mlen != 0) { - repcodes_t reps = ZSTD_updateRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); + repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); ZSTD_memcpy(rep, &reps, sizeof(reps)); } else { ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); @@ -1211,7 +1309,7 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ assert(anchor + llen <= iend); ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); - ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen-MINMATCH); + ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); anchor += advance; ip = anchor; } } @@ -1223,38 +1321,30 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ return (size_t)(iend - anchor); } +static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); +} + +static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); +} size_t ZSTD_compressBlock_btopt( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock_btopt"); - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); } -/* used in 2-pass strategy */ -static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus) -{ - U32 s, sum=0; - assert(ZSTD_FREQ_DIV+bonus >= 0); - for (s=0; s<lastEltIndex+1; s++) { - table[s] <<= ZSTD_FREQ_DIV+bonus; - table[s]--; - sum += table[s]; - } - return sum; -} -/* used in 2-pass strategy */ -MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr) -{ - if (ZSTD_compressedLiterals(optPtr)) - optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0); - optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 0); - optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 0); - optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0); -} /* ZSTD_initStats_ultra(): * make a first compression pass, just to seed stats with more accurate starting values. @@ -1276,7 +1366,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */ assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */ - ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/ + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ /* invalidate first scan from history */ ZSTD_resetSeqStore(seqStore); @@ -1285,8 +1375,6 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ms->window.lowLimit = ms->window.dictLimit; ms->nextToUpdate = ms->window.dictLimit; - /* re-inforce weight of collected statistics */ - ZSTD_upscaleStats(&ms->opt); } size_t ZSTD_compressBlock_btultra( @@ -1294,7 +1382,7 @@ size_t ZSTD_compressBlock_btultra( const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); } size_t ZSTD_compressBlock_btultra2( @@ -1322,35 +1410,35 @@ size_t ZSTD_compressBlock_btultra2( ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); } - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); } size_t ZSTD_compressBlock_btopt_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); } size_t ZSTD_compressBlock_btultra_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); } size_t ZSTD_compressBlock_btopt_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); } size_t ZSTD_compressBlock_btultra_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); } /* note : no btultra2 variant for extDict nor dictMatchState, |