Add primitive chunk compression support - dedup - deduplicating backup program
 (HTM) git clone git://bitreich.org/dedup/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/dedup/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 3ce9db0f144f9d5a39a9180ed343d9498d064778
 (DIR) parent 4ff76106cb1e2c6f99dfd163744224fc57d5fa65
 (HTM) Author: sin <sin@2f30.org>
       Date:   Sun, 17 Feb 2019 21:28:00 +0000
       
       Add primitive chunk compression support
       
       Diffstat:
         M Makefile                            |       2 +-
         M dedup.c                             |     103 +++++++++++++++++++------------
       
       2 files changed, 66 insertions(+), 39 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       @@ -8,7 +8,7 @@ DISTFILES = $(SRC) LICENSE Makefile README arg.h dedup.1 tree.h
        
        CFLAGS = -g -Wall
        CPPFLAGS = -I/usr/local/include -D_FILE_OFFSET_BITS=64
       -LDLIBS = -lcrypto
       +LDLIBS = -lcrypto -llz4
        
        all: $(BIN)
        
 (DIR) diff --git a/dedup.c b/dedup.c
       @@ -7,6 +7,7 @@
        #include <string.h>
        #include <unistd.h>
        
       +#include <lz4.h>
        #include <openssl/sha.h>
        
        #include "arg.h"
       @@ -16,11 +17,11 @@
        #define STOREF ".store"
        #define CACHEF ".cache"
        
       -#define BLKSIZ (1024 * 512)
       -#define WINSIZ 1024
       +#define BLKSIZE (1 * 1024 * 1024)
       +#define WINSIZE 4095
        #define HASHMSK ((1ul << 21) - 1)
       -#define MSGSIZ 256
       -#define MDSIZ SHA256_DIGEST_LENGTH
       +#define MSGSIZE 256
       +#define MDSIZE SHA256_DIGEST_LENGTH
        
        #define ROTL(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
        
       @@ -38,7 +39,7 @@ struct enthdr {
        
        /* block descriptor */
        struct bdescr {
       -        uint8_t md[MDSIZ];
       +        uint8_t md[MDSIZE];
                uint64_t offset;
                uint64_t size;
        };
       @@ -46,8 +47,8 @@ struct bdescr {
        /* index file entry */
        struct ent {
                uint64_t size;
       -        uint8_t msg[MSGSIZ];
       -        uint8_t md[MDSIZ];        /* hash of file */
       +        uint8_t msg[MSGSIZE];
       +        uint8_t md[MDSIZE];        /* hash of file */
                uint64_t nblks;
                struct bdescr bdescr[];
        };
       @@ -140,29 +141,47 @@ chunk_blk(uint8_t *buf, size_t size)
                size_t i;
                uint32_t fp;
        
       -        /* buzhash should be at least WINSIZ */
       -        if (size < WINSIZ)
       +        /* buzhash should be at least WINSIZE */
       +        if (size < WINSIZE)
                        return size;
        
                /*
                 * To achieve better deduplication, we chunk blocks based on a
                 * recurring pattern occuring on the data stream. A fixed window
       -         * of WINSIZ bytes is slid over the data, and a rolling hash is
       +         * of WINSIZE bytes is slid over the data, and a rolling hash is
                 * computed for this window.
                 * When the rolling hash matches a given pattern (see HASHMSK),
                 * the block is chunked at the end of that window, thus making
       -         * WINSIZ the smallest possible block size.
       +         * WINSIZE the smallest possible block size.
                 */
       -        fp = buzh_init(buf, WINSIZ);
       -        for (i = 0; i < size - WINSIZ; i++) {
       +        fp = buzh_init(buf, WINSIZE);
       +        for (i = 0; i < size - WINSIZE; i++) {
                        if (i > 0)
       -                        fp = buzh_update(fp, buf[i - 1], buf[WINSIZ + i - 1], WINSIZ);
       +                        fp = buzh_update(fp, buf[i - 1], buf[WINSIZE + i - 1], WINSIZE);
                        if ((fp & HASHMSK) == 0)
       -                        return i + WINSIZ;
       +                        return i + WINSIZE;
                }
                return size;
        }
        
       +size_t
       +comp_size(size_t size)
       +{
       +        return LZ4_compressBound(size);
       +}
       +
       +size_t
       +comp(uint8_t *in, uint8_t *out, size_t insize, size_t outsize)
       +{
       +        return LZ4_compress_default((char *)in, (char *)out, insize, outsize);
       +}
       +
       +size_t
       +decomp(uint8_t *in, uint8_t *out, size_t insize, size_t outsize)
       +{
       +        return LZ4_decompress_safe((char *)in, (char *)out, insize, outsize);
       +}
       +
        void
        print_md(const uint8_t *md, size_t size)
        {
       @@ -375,33 +394,35 @@ lookup_blk(uint8_t *md)
        void
        dedup(int fd, char *msg)
        {
       -        uint8_t *buf;
       +        uint8_t *buf, *cbuf;
                struct ent *ent;
                SHA256_CTX ctx;
                ssize_t n;
        
       -        buf = alloc_buf(BLKSIZ);
       +        buf = alloc_buf(BLKSIZE);
       +        cbuf = alloc_buf(comp_size(BLKSIZE));
                ent = alloc_ent();
        
                SHA256_Init(&ctx);
       -        while ((n = xread(fd, buf, BLKSIZ)) > 0) {
       +        while ((n = xread(fd, buf, BLKSIZE)) > 0) {
                        uint8_t *bp = buf;
        
                        while (n > 0) {
       -                        uint8_t md[MDSIZ];
       +                        uint8_t md[MDSIZE];
                                struct bdescr bdescr;
       -                        size_t blksiz;
       +                        size_t blksize, csize;
        
       -                        blksiz = chunk_blk(bp, n);
       +                        blksize = chunk_blk(bp, n);
       +                        csize = comp(bp, cbuf, blksize, comp_size(BLKSIZE));
        
                                memcpy(bdescr.md, md, sizeof(bdescr));
                                bdescr.offset = enthdr.store_size;
       -                        bdescr.size = blksiz;
       +                        bdescr.size = csize;
        
       -                        hash_blk(bp, bdescr.size, bdescr.md);
       +                        hash_blk(cbuf, bdescr.size, bdescr.md);
        
                                /* Calculate file hash one block at a time */
       -                        SHA256_Update(&ctx, bp, bdescr.size);
       +                        SHA256_Update(&ctx, cbuf, bdescr.size);
        
                                ent = grow_ent(ent, ent->nblks + 1);
        
       @@ -412,7 +433,7 @@ dedup(int fd, char *msg)
                                        ent->bdescr[ent->nblks++] = bdescr;
        
                                        /* Store block */
       -                                append_blk(bp, &bdescr);
       +                                append_blk(cbuf, &bdescr);
        
                                        /* Create a cache entry for this block */
                                        cent = alloc_cent();
       @@ -422,8 +443,8 @@ dedup(int fd, char *msg)
                                        ent->bdescr[ent->nblks++] = bdescr;
                                }
        
       -                        bp += blksiz;
       -                        n -= blksiz;
       +                        bp += blksize;
       +                        n -= blksize;
                        }
                }
        
       @@ -445,24 +466,30 @@ dedup(int fd, char *msg)
                }
        
                free(ent);
       +        free(cbuf);
                free(buf);
        }
        
        int
        extract(struct ent *ent, void *arg)
        {
       -        uint8_t *buf;
       +        uint8_t *buf, *cbuf;
                struct extract_args *args = arg;
                uint64_t i;
        
                if (memcmp(ent->md, args->md, sizeof(ent->md)) != 0)
                        return WALK_CONTINUE;
        
       -        buf = alloc_buf(BLKSIZ);
       +        buf = alloc_buf(BLKSIZE);
       +        cbuf = alloc_buf(comp_size(BLKSIZE));
                for (i = 0; i < ent->nblks; i++) {
       -                read_blk(buf, &ent->bdescr[i]);
       -                xwrite(args->fd, buf, ent->bdescr[i].size);
       +                size_t blksize;
       +
       +                read_blk(cbuf, &ent->bdescr[i]);
       +                blksize = decomp(cbuf, buf, ent->bdescr[i].size, BLKSIZE);
       +                xwrite(args->fd, buf, blksize);
                }
       +        free(cbuf);
                free(buf);
                return WALK_STOP;
        }
       @@ -470,12 +497,12 @@ extract(struct ent *ent, void *arg)
        int
        check(struct ent *ent, void *arg)
        {
       -        uint8_t md[MDSIZ];
       +        uint8_t md[MDSIZE];
                uint8_t *buf;
                SHA256_CTX ctx;
                uint64_t i;
        
       -        buf = alloc_buf(BLKSIZ);
       +        buf = alloc_buf(BLKSIZE);
                /*
                 * Calculate hash for each block and compare
                 * with index entry block descriptor
       @@ -521,12 +548,12 @@ list(struct ent *ent, void *arg)
        int
        rebuild_cache(struct ent *ent, void *arg)
        {
       -        uint8_t md[MDSIZ];
       +        uint8_t md[MDSIZE];
                uint8_t *buf;
                SHA256_CTX ctx;
                uint64_t i;
        
       -        buf = alloc_buf(BLKSIZ);
       +        buf = alloc_buf(BLKSIZE);
                for (i = 0; i < ent->nblks; i++) {
                        struct cent *cent;
        
       @@ -575,7 +602,7 @@ init_cache(void)
                uint64_t nents, i;
                uint64_t min, max, avg;
        
       -        min = BLKSIZ;
       +        min = BLKSIZE;
                max = 0;
                avg = 0;
        
       @@ -598,7 +625,7 @@ init_cache(void)
                avg /= nents;
        
                if (verbose) {
       -                fprintf(stderr, "min/avg/max blksize: %llu/%llu/%llu\n",
       +                fprintf(stderr, "min/avg/max block size: %llu/%llu/%llu\n",
                                (unsigned long long)min,
                                (unsigned long long)avg,
                                (unsigned long long)max);
       @@ -661,7 +688,7 @@ usage(void)
        int
        main(int argc, char *argv[])
        {
       -        uint8_t md[MDSIZ];
       +        uint8_t md[MDSIZE];
                char *id = NULL, *root = NULL, *msg = NULL;
                int fd = -1, lflag = 0, cflag = 0;