Skip to content

Commit

Permalink
improve compression ratio of small alphabets
Browse files Browse the repository at this point in the history
fix #3328

In situations where the alphabet size is very small,
the evaluation of literal costs from the Optimal Parser is initially incorrect.
It takes some time to converge, during which compression is less efficient.
This is especially important for small files,
because there will not be enough data to converge,
so most of the parsing is selected based on incorrect metrics.

After this patch, the scenario ##3328 gets fixed,
delivering the expected 29 bytes compressed size (smallest known compressed size).
  • Loading branch information
Cyan4973 committed Dec 21, 2022
1 parent 0694f14 commit 1db5136
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 40 deletions.
11 changes: 10 additions & 1 deletion lib/compress/huf_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -1253,7 +1253,14 @@ unsigned HUF_minTableLog(unsigned symbolCardinality)
return minBitsSymbols;
}

unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, size_t wkspSize, HUF_CElt* table, const unsigned* count, HUF_depth_mode depthMode)
unsigned HUF_optimalTableLog(
unsigned maxTableLog,
size_t srcSize,
unsigned maxSymbolValue,
void* workSpace, size_t wkspSize,
HUF_CElt* table,
const unsigned* count,
HUF_depth_mode depthMode)
{
unsigned optLog = FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
assert(srcSize > 1); /* Not supported, RLE should be used instead */
Expand All @@ -1266,9 +1273,11 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS
size_t maxBits, hSize, newSize;
const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);

DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize);
if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) return optLog;

for (huffLog = HUF_minTableLog(symbolCardinality); huffLog <= maxTableLog; huffLog++) {
DEBUGLOG(7, "checking for huffLog=%u", huffLog);
maxBits = HUF_buildCTable_wksp(table, count,
maxSymbolValue, huffLog,
workSpace, wkspSize);
Expand Down
11 changes: 5 additions & 6 deletions lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -2661,15 +2661,14 @@ ZSTD_entropyCompressSeqStore_internal(
unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
size_t const litSize = (size_t)(seqStorePtr->lit - literals);

HUF_depth_mode depthMode = cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_depth_optimal : HUF_depth_fast;
size_t const cSize = ZSTD_compressLiterals(
&prevEntropy->huf, &nextEntropy->huf,
cctxParams->cParams.strategy,
ZSTD_literalsCompressionIsDisabled(cctxParams),
op, dstCapacity,
literals, litSize,
entropyWorkspace, entropyWkspSize,
bmi2, suspectUncompressible, depthMode);
&prevEntropy->huf, &nextEntropy->huf,
cctxParams->cParams.strategy,
ZSTD_literalsCompressionIsDisabled(cctxParams),
suspectUncompressible, bmi2);
FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
assert(cSize <= dstCapacity);
op += cSize;
Expand Down Expand Up @@ -3688,12 +3687,12 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end
size_t estimatedSecondHalfSize;
size_t midIdx = (startIdx + endIdx)/2;

DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
assert(endIdx >= startIdx);
if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx);
return;
}
DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);
Expand Down
57 changes: 39 additions & 18 deletions lib/compress/zstd_compress_literals.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,37 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void*
return flSize+1;
}

size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
ZSTD_hufCTables_t* nextHuf,
ZSTD_strategy strategy, int disableLiteralCompression,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
const int bmi2,
unsigned suspectUncompressible, HUF_depth_mode depthMode)
/* ZSTD_minLiteralsToCompress() :
* returns minimal amount of literals
* for literal compression to even be attempted.
* Minimum is made tighter as compression strategy increases.
*/
static size_t
ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
{
assert((int)strategy >= 0);
assert((int)strategy <= 9);
/* btultra2 : min 8 bytes;
* then 2x larger for each successive compression strategy
* max threshold 64 bytes */
{ int const shift = MIN(9-strategy, 3);
size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : 8 << shift;
DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
return mintc;
}
}

size_t ZSTD_compressLiterals (
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
const ZSTD_hufCTables_t* prevHuf,
ZSTD_hufCTables_t* nextHuf,
ZSTD_strategy strategy,
int disableLiteralCompression,
int suspectUncompressible,
int bmi2)
{
size_t const minGain = ZSTD_minGain(srcSize, strategy);
size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
BYTE* const ostart = (BYTE*)dst;
U32 singleStream = srcSize < 256;
Expand All @@ -119,15 +140,14 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
if (disableLiteralCompression)
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);

/* small ? don't even attempt compression (speed opt) */
# define COMPRESS_LITERALS_SIZE_MIN 63
{ size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
}
/* if too small, don't even attempt compression (speed opt) */
if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);

RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
{ HUF_repeat repeat = prevHuf->repeatMode;
int const preferRepeat = (strategy < ZSTD_lazy) ? srcSize <= 1024 : 0;
HUF_depth_mode const depthMode = (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD) ? HUF_depth_optimal : HUF_depth_fast;
typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int, int, unsigned, HUF_depth_mode);
huf_compress_f huf_compress;
if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
Expand All @@ -146,10 +166,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
}
}

if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
}
{ size_t const minGain = ZSTD_minGain(srcSize, strategy);
if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
} }
if (cLitSize==1) {
ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
Expand Down
18 changes: 11 additions & 7 deletions lib/compress/zstd_compress_literals.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,18 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,

size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);

/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
ZSTD_hufCTables_t* nextHuf,
ZSTD_strategy strategy, int disableLiteralCompression,
void* dst, size_t dstCapacity,
/* ZSTD_compressLiterals():
* @entropyWorkspace: must be aligned on 4-bytes boundaries
* @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE
* @suspectUncompressible: sampling checks, to potentially skip huffman coding
*/
size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
const int bmi2,
unsigned suspectUncompressible, HUF_depth_mode depthMode);
const ZSTD_hufCTables_t* prevHuf,
ZSTD_hufCTables_t* nextHuf,
ZSTD_strategy strategy, int disableLiteralCompression,
int suspectUncompressible,
int bmi2);

#endif /* ZSTD_COMPRESS_LITERALS_H */
19 changes: 11 additions & 8 deletions lib/compress/zstd_opt.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
#define ZSTD_MAX_PRICE (1<<30)

#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */


/*-*************************************
Expand Down Expand Up @@ -88,20 +88,24 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
return total;
}

static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;

static U32
ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
{
U32 s, sum=0;
DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
assert(shift < 30);
for (s=0; s<lastEltIndex+1; s++) {
table[s] = 1 + (table[s] >> shift);
unsigned const base = base1 ? 1 : (table[s]>0);
table[s] = base + (table[s] >> shift);
sum += table[s];
}
return sum;
}

/* ZSTD_scaleStats() :
* reduce all elements in table is sum too large
* reduce all elements in table if sum too large
* return the resulting sum of elements */
static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
{
Expand All @@ -110,7 +114,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
assert(logTarget < 30);
if (factor <= 1) return prevsum;
return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);
}

/* ZSTD_rescaleFreqs() :
Expand Down Expand Up @@ -188,13 +192,13 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
optPtr->offCodeSum += optPtr->offCodeFreq[of];
} }

} else { /* not a dictionary */
} else { /* first block, no dictionary */

assert(optPtr->litFreq != NULL);
if (compressedLiterals) {
unsigned lit = MaxLit;
HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */
optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);
}

{ unsigned const baseLLfreqs[MaxLL+1] = {
Expand Down Expand Up @@ -224,7 +228,6 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
}


}

} else { /* new block : re-use previous statistics, scaled down */
Expand Down

0 comments on commit 1db5136

Please sign in to comment.