Increase dedup throughput by a factor of 2 - dedup - deduplicating backup program
 (HTM) git clone git://bitreich.org/dedup/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/dedup/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 7f8b5b3e7b72b0d64437a87c6b412743f2ab6187
 (DIR) parent ba61c65bb274657b7ea643de789db2a24ea836f8
 (HTM) Author: sin <sin@2f30.org>
       Date:   Sun, 10 Mar 2019 09:36:05 +0000
       
       Increase dedup throughput by a factor of 2
       
       Calculating the hash of the entire snapshot inside the loop slows the
       process down by 2x.  This is because we hash the block twice.  We hash
       first the raw uncompressed stream (which will become the snapshot
       hash) and then we hash the compressed block which is stored in the
       block descriptor.
       
       Change the calculation so we only hash the compressed block inside
       dedup_chunk().  The hash of the snapshot is the hash of its block
       hashes.
       
       Diffstat:
         M dedup.c                             |      24 ++++++++++++++++++------
       
       1 file changed, 18 insertions(+), 6 deletions(-)
       ---
 (DIR) diff --git a/dedup.c b/dedup.c
       @@ -229,27 +229,39 @@ dedup(int fd, char *msg)
        {
                struct snapshot *snap;
                struct chunker *chunker;
       -        SHA256_CTX ctx;
       -        ssize_t n;
        
                snap = alloc_snap();
                chunker = alloc_chunker(fd, BLKSIZE_MIN, BLKSIZE_MAX,
                                        HASHMASK_BITS, WINSIZE);
        
       -        SHA256_Init(&ctx);
       -        while ((n = fill_chunker(chunker)) > 0) {
       +        while (fill_chunker(chunker) > 0) {
                        uint8_t *chunkp;
                        size_t chunk_size;
        
                        chunkp = get_chunk(chunker, &chunk_size);
       -                SHA256_Update(&ctx, chunkp, chunk_size);
                        snap = grow_snap(snap, snap->nr_blk_descs + 1);
                        dedup_chunk(snap, chunkp, chunk_size);
                        drain_chunker(chunker);
                }
       -        SHA256_Final(snap->md, &ctx);
        
                if (snap->nr_blk_descs > 0) {
       +                SHA256_CTX ctx;
       +                uint64_t i;
       +
       +                /*
       +                 * The snapshot hash is calculated over the
       +                 * hash of its block descriptors.
       +                 */
       +                SHA256_Init(&ctx);
       +                for (i = 0; i < snap->nr_blk_descs; i++) {
       +                        struct blk_desc *blk_desc;
       +
       +                        blk_desc = &snap->blk_desc[i];
       +                        SHA256_Update(&ctx, blk_desc->md,
       +                                      sizeof(blk_desc->md));
       +                }
       +                SHA256_Final(snap->md, &ctx);
       +
                        if (msg != NULL) {
                                size_t size;