Provide better comment for chunking decisions - dedup - deduplicating backup program
 (HTM) git clone git://bitreich.org/dedup/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/dedup/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 39f907c919176a8fed990e10e95a7f7371aedf40
 (DIR) parent 90f2ff9fcf896060da4be75c70cd148f18be28e9
 (HTM) Author: z3bra <contactatz3bradotorg>
       Date:   Sun, 17 Feb 2019 13:04:03 +0100
       
       Provide better comment for chunking decisions
       
       Diffstat:
         M dedup.c                             |      21 ++++++++++++++-------
       
       1 file changed, 14 insertions(+), 7 deletions(-)
       ---
 (DIR) diff --git a/dedup.c b/dedup.c
       @@ -74,6 +74,11 @@ char *argv0;
        /*
         * Static table for use in buzhash algorithm.
         * 256 * 32 bits randomly generated unique integers
       + *
       + * To get better pseudo-random results, there is exactly the same number
       + * of 0 and 1 spread amongst these integers. It means that there is
       + * exactly 50% chance that a XOR operation would flip all the bits in
       + * the hash.
         */
        uint32_t buz[] = {
                0xbc9fa594,0x30a8f827,0xced627a7,0xdb46a745,0xcfa4a9e8,0x77cccb59,0xddb66276,0x3adc532f,
       @@ -115,9 +120,9 @@ uint32_t
        buzh_init(uint8_t *buf, size_t size)
        {
                size_t i;
       -        uint32_t fp = 0;
       +        uint32_t fp;
        
       -        for (i = size - 1; i > 0; i--, buf++)
       +        for (i = size - 1, fp = 0; i > 0; i--, buf++)
                        fp ^= ROTL(buz[*buf], i % 32);
        
                return fp ^ buz[*buf];
       @@ -136,11 +141,13 @@ chunk_blk(uint8_t *buf, size_t size)
                uint32_t fp;
        
                /*
       -         * Chunking blocks is decided using a rolling hash + binary pattern.
       -         * The buzhash algorithm is used to "fingerprint" a fixed size window.
       -         * Once the lower bits of this fingerprint are all zeros,
       -         * the block is chunked.
       -         * If the pattern can't be matched, then we return the buffer size.
       +         * To achieve better deduplication, we chunk blocks based on a
       +         * recurring pattern occuring on the data stream. A fixed window
       +         * of WINSIZ bytes is slid over the data, and a rolling hash is
       +         * computed for this window.
       +         * When the rolling hash matches a given pattern (see HASHMSK),
       +         * the block is chunked at the end of that window, thus making
       +         * WINSIZ the smallest possible block size.
                 */
                fp = buzh_init(buf, WINSIZ);
                for (i = 1; i < size - WINSIZ; i++) {