tCheckpoint. - plan9port - [fork] Plan 9 from user space
 (HTM) git clone git://src.adamsgaard.dk/plan9port
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 9ffbb5adcaeec878d3b6db0f8b1f654e839b4689
 (DIR) parent 7c5190d2c854128fb607289e9d83379e522c7090
 (HTM) Author: rsc <devnull@localhost>
       Date:   Fri, 12 Mar 2004 18:28:14 +0000
       
       Checkpoint.
       
       Add disk caching code and first draft of fractional index.
       
       Diffstat:
         M src/cmd/venti/buildbuck.c           |       2 +-
         M src/cmd/venti/buildindex.c          |      12 +++---------
         M src/cmd/venti/checkindex.c          |       9 ++-------
         M src/cmd/venti/clump.c               |       2 +-
         M src/cmd/venti/conv.c                |       4 ++--
         M src/cmd/venti/dat.h                 |      15 ++++++++++++---
         M src/cmd/venti/dcache.c              |      34 ++++++++++++-------------------
         M src/cmd/venti/fns.h                 |       2 +-
         M src/cmd/venti/httpd.c               |      17 +++++------------
         M src/cmd/venti/icache.c              |       1 -
         M src/cmd/venti/index.c               |     700 ++++++++++++++++++-------------
         M src/cmd/venti/lump.c                |       1 -
         M src/cmd/venti/mkfile                |       4 ++--
         M src/cmd/venti/part.c                |       6 ++++--
         M src/cmd/venti/venti.c               |       4 ++--
       
       15 files changed, 450 insertions(+), 363 deletions(-)
       ---
 (DIR) diff --git a/src/cmd/venti/buildbuck.c b/src/cmd/venti/buildbuck.c
       t@@ -80,7 +80,7 @@ buildbucket(Index *ix, IEStream *ies, IBucket *ib)
        
                buck = TWID32;
                ib->n = 0;
       -        ib->next = 0;
       +        ib->depth = 0;
                while(ies->n){
                        b = peekientry(ies);
                        if(b == nil)
 (DIR) diff --git a/src/cmd/venti/buildindex.c b/src/cmd/venti/buildindex.c
       t@@ -7,15 +7,9 @@ writebucket(Index *ix, u32int buck, IBucket *ib, ZBlock *b)
        {
                ISect *is;
        
       -        is = findisect(ix, buck);
       -        if(is == nil){
       -                seterr(EAdmin, "bad math in writebucket");
       +        is = findibucket(ix, buck, &buck);
       +        if(is == nil)
                        return -1;
       -        }
       -        if(buck < is->start || buck >= is->stop)
       -                seterr(EAdmin, "index write out of bounds: %d not in [%d,%d)\n",
       -                                buck, is->start, is->stop);
       -        buck -= is->start;
                qlock(&stats.lock);
                stats.indexwrites++;
                qunlock(&stats.lock);
       t@@ -47,7 +41,7 @@ buildindex(Index *ix, Part *part, u64int off, u64int clumps, int zero)
                ib.data = b->data + IBucketSize;
                zib.data = z->data + IBucketSize;
                zib.n = 0;
       -        zib.next = 0;
       +        zib.depth = 0;
                for(;;){
                        buck = buildbucket(ix, ies, &ib);
                        found += ib.n;
 (DIR) diff --git a/src/cmd/venti/checkindex.c b/src/cmd/venti/checkindex.c
       t@@ -11,12 +11,7 @@ checkbucket(Index *ix, u32int buck, IBucket *ib)
                IEntry ie, eie;
                int i, ei, ok, c;
        
       -        is = findisect(ix, buck);
       -        if(is == nil){
       -                seterr(EAdmin, "bad math in checkbuckets");
       -                return -1;
       -        }
       -        buck -= is->start;
       +        is = findibucket(ix, buck, &buck);
                eb = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
                if(eb == nil)
                        return -1;
       t@@ -87,7 +82,7 @@ u64int found = 0;
                ib.data = b->data;
                zib.data = z->data;
                zib.n = 0;
       -        zib.next = 0;
       +        zib.depth = 0;
                for(;;){
                        buck = buildbucket(ix, ies, &ib);
                        found += ib.n;
 (DIR) diff --git a/src/cmd/venti/clump.c b/src/cmd/venti/clump.c
       t@@ -60,7 +60,7 @@ if(0)print("whackedblock %08x %p\n", mainindex->arenas[0], &cl);
                a = writeiclump(ix, &cl, cb->data);
        
                freezblock(cb);
       -        if(a == 0)
       +        if(a == TWID64)
                        return -1;
        
                qlock(&stats.lock);
 (DIR) diff --git a/src/cmd/venti/conv.c b/src/cmd/venti/conv.c
       t@@ -488,7 +488,7 @@ void
        unpackibucket(IBucket *b, u8int *buf)
        {
                b->n = U16GET(buf);
       -        b->next = U32GET(&buf[U16Size]);
       +        b->depth = U32GET(&buf[U16Size]);
                b->data = buf + IBucketSize;
        }
        
       t@@ -496,5 +496,5 @@ void
        packibucket(IBucket *b, u8int *buf)
        {
                U16PUT(buf, b->n);
       -        U32PUT(&buf[U16Size], b->next);
       +        U32PUT(&buf[U16Size], b->depth);
        }
 (DIR) diff --git a/src/cmd/venti/dat.h b/src/cmd/venti/dat.h
       t@@ -79,7 +79,8 @@ enum
        
                ArenaPartVersion        = 3,
                ArenaVersion                = 4,
       -        IndexVersion                = 1,
       +        IndexVersion1                = 1,
       +        IndexVersion2                = 2,
                ISectVersion                = 1,
        
                /*
       t@@ -116,11 +117,14 @@ enum
                MaxClumpBlocks                =  (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
        
                /*
       -         * dirty flags 
       +         * dirty flags - order controls disk write order
                 */
                DirtyArena                = 1,
       +        DirtyIndexSplit,
                DirtyIndex,
       +        DirtyIndexBitmap,
                DirtyArenaCib,
       +        DirtyMax,
        
                VentiZZZZZZZZ
        };
       t@@ -367,6 +371,11 @@ struct Index
                u32int                buckets;                /* last bucket used in disk hash table */
                u32int                blocksize;
                u32int                tabsize;                /* max. bytes in index config */
       +        u32int                bitblocks;
       +        u32int                maxdepth;
       +        u32int                bitkeylog;
       +        u32int                bitkeymask;
       +
                int                mapalloc;                /* first arena to check when adding a lump */
                Arena                **arenas;                /* arenas in the mapping */
                ISect                **sects;                /* sections which hold the buckets */
       t@@ -440,7 +449,7 @@ struct IEntry
        struct IBucket
        {
                u16int                n;                        /* number of active indices */
       -        u32int                next;                        /* overflow bucket */
       +        u32int                depth;                /* depth in version 2 (was overflow in v1) */
                u8int                *data;
        };
        
 (DIR) diff --git a/src/cmd/venti/dcache.c b/src/cmd/venti/dcache.c
       t@@ -230,7 +230,6 @@ dirtydblock(DBlock *b, int dirty)
                int odirty;
                Part *p;
        
       -fprint(2, "dirty %p\n", b);
                rlock(&dcache.dirtylock);
                assert(b->ref != 0);
                assert(b->dirtying == 0);
       t@@ -242,8 +241,16 @@ fprint(2, "dirty %p\n", b);
                stats.dirtydblocks++;
                qunlock(&stats.lock);
        
       +        /*
       +         * In general, shouldn't mark any block as more than one
       +         * type, except that split index blocks are a subcase of index
       +         * blocks.  Only clean blocks ever get marked DirtyIndexSplit,
       +         * though, so we don't need the opposite conjunction here.
       +         */
                if(b->dirty)
       -                assert(b->dirty == dirty);
       +                assert(b->dirty == dirty
       +                        || (b->dirty==DirtyIndexSplit && dirty==DirtyIndex));
       +
                odirty = b->dirty;
                b->dirty = dirty;
                p = b->part;
       t@@ -533,7 +540,7 @@ flushtimerproc(void *v)
        static void
        flushproc(void *v)
        {
       -        int i, n;
       +        int i, j, n;
                DBlock *b, **write;
        
                USED(v);
       t@@ -575,25 +582,10 @@ flushproc(void *v)
        
                        qsort(write, n, sizeof(write[0]), writeblockcmp);
        
       -                /*
       -                 * At the beginning of the array are the arena blocks.
       -                 */
       -                fprint(2, "flushproc: write arena blocks\n");
       +                /* Write each stage of blocks out. */
                        i = 0;
       -                i += parallelwrites(write+i, write+n, DirtyArena);
       -
       -                /*
       -                 * Next are the index blocks.
       -                 */
       -                fprint(2, "flushproc: write index blocks\n");
       -                i += parallelwrites(write+i, write+n, DirtyIndex);
       -
       -                /*
       -                 * Finally, the arena clump info blocks.
       -                 */
       -                fprint(2, "flushproc: write cib blocks\n");
       -                i += parallelwrites(write+i, write+n, DirtyArenaCib);
       -
       +                for(j=1; j<DirtyMax; j++)
       +                        i += parallelwrites(write+i, write+n, j);
                        assert(i == n);
        
                        fprint(2, "flushproc: update dirty bits\n");
 (DIR) diff --git a/src/cmd/venti/fns.h b/src/cmd/venti/fns.h
       t@@ -20,7 +20,6 @@ void                *erealloc(void *, ulong);
        char                *estrdup(char*);
        void                *ezmalloc(ulong);
        Arena                *findarena(char *name);
       -ISect                *findisect(Index *ix, u32int buck);
        int                flushciblocks(Arena *arena);
        void                flushdcache(void);
        void                flushqueue(void);
       t@@ -57,6 +56,7 @@ int                initventi(char *config);
        void                insertlump(Lump *lump, Packet *p);
        int                insertscore(u8int *score, IAddr *ia, int write);
        ZBlock                *loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify);
       +DBlock        *loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib);
        int                loadientry(Index *index, u8int *score, int type, IEntry *ie);
        void                logerr(int severity, char *fmt, ...);
        Lump                *lookuplump(u8int *score, int type);
 (DIR) diff --git a/src/cmd/venti/httpd.c b/src/cmd/venti/httpd.c
       t@@ -137,14 +137,12 @@ httpproc(void *v)
                c = v;
        
                for(t = 15*60*1000; ; t = 15*1000){
       -fprint(2, "httpd: get headers\n");
                        if(hparsereq(c, t) < 0)
                                break;
        
                        ok = -1;
                        for(i = 0; i < MaxObjs && objs[i].name[0]; i++){
                                if(strcmp(c->req.uri, objs[i].name) == 0){
       -fprint(2, "httpd: call function %p\n", objs[i].f);
                                        ok = (*objs[i].f)(c);
                                        break;
                                }
       t@@ -158,9 +156,7 @@ fprint(2, "httpd: call function %p\n", objs[i].f);
                        if(ok < 0)
                                break;
                }
       -print("httpd cleanup %d\n", c->hin.fd);
                hreqcleanup(c);
       -print("close %d\n", c->hin.fd);
                close(c->hin.fd);
                free(c);
        }
       t@@ -239,12 +235,9 @@ estats(HConnect *c)
        
                r = preqtext(c);
                if(r < 0)
       -{
       -fprint(2, "preqtext failed\n");
                        return r;
       -}
        
       -fprint(2, "write stats\n");
       +
                hout = &c->hout;
                hprint(hout, "lump writes=%,ld\n", stats.lumpwrites);
                hprint(hout, "lump reads=%,ld\n", stats.lumpreads);
       t@@ -277,21 +270,21 @@ fprint(2, "write stats\n");
                hprint(hout, "disk cache misses=%,ld\n", stats.pcmiss);
                hprint(hout, "disk cache reads=%,ld\n", stats.pcreads);
                hprint(hout, "disk cache bytes read=%,lld\n", stats.pcbreads);
       -fprint(2, "write new stats\n");
       +
                hprint(hout, "disk cache writes=%,ld\n", stats.dirtydblocks);
                hprint(hout, "disk cache writes absorbed=%,ld %d%%\n", stats.absorbedwrites,
                        percent(stats.absorbedwrites, stats.dirtydblocks));
        
       -fprint(2, "back to old stats\n");
       +        hprint(hout, "disk cache flushes=%,ld\n", stats.dcacheflushes);
       +        hprint(hout, "disk cache flush writes=%,ld (%,ld per flush)\n", 
       +                stats.dcacheflushwrites, stats.dcacheflushwrites/stats.dcacheflushes);
        
                hprint(hout, "disk writes=%,ld\n", stats.diskwrites);
                hprint(hout, "disk bytes written=%,lld\n", stats.diskbwrites);
                hprint(hout, "disk reads=%,ld\n", stats.diskreads);
                hprint(hout, "disk bytes read=%,lld\n", stats.diskbreads);
        
       -fprint(2, "hflush stats\n");
                hflush(hout);
       -fprint(2, "done with stats\n");
                return 0;
        }
        
 (DIR) diff --git a/src/cmd/venti/icache.c b/src/cmd/venti/icache.c
       t@@ -58,7 +58,6 @@ lookupscore(u8int *score, int type, IAddr *ia, int *rac)
                IEntry d, *ie, *last;
                u32int h;
        
       -fprint(2, "lookupscore %V %d\n", score, type);
                qlock(&stats.lock);
                stats.iclookups++;
                qunlock(&stats.lock);
 (DIR) diff --git a/src/cmd/venti/index.c b/src/cmd/venti/index.c
       t@@ -1,101 +1,110 @@
        /*
       - * Index, mapping scores to log positions.  The log data mentioned in
       - * the index _always_ goes out to disk before the index blocks themselves.
       - * A counter in the arena tail records which logged blocks have been
       - * successfully indexed.  The ordering of dirtydcache calls along with
       - * the flags passed to dirtydcache ensure the proper write ordering.
       + * Index, mapping scores to log positions. 
       + *
       + * The index is made up of some number of index sections, each of
       + * which is typically stored on a different disk.  The blocks in all the 
       + * index sections are logically numbered, with each index section 
       + * responsible for a range of blocks.  Blocks are typically 8kB.
       + *
       + * Index Version 1:
         * 
       - * For historical reasons, there are two indexing schemes. In both,
       - * the index is broken up into some number of fixed-size (say, 8kB)
       - * buckets holding index entries.  An index entry is about 40 bytes.
       - * The index can be spread across many disks, although in small
       - * configurations it is not uncommon for the index and arenas to be
       - * on the same disk.
       - *  *
       - * In the first scheme, the many buckets are treated as a giant on-disk
       - * hash table.  If there are N buckets, then the top 32 bits of the
       - * score are used as an index into the hash table, with each bucket
       - * holding 2^32 / N of the index space.  The index must be sized so
       - * that a bucket can't ever overflow.  Assuming that a typical compressed
       - * data block is about 4000 bytes, the index size is expected to be
       - * about 1% of the total data size.  Since scores are essentially
       - * random, they will be distributed evenly among the buckets, so all
       - * buckets should be about the same fullness.  A factor of 5 gives us
       - * a wide comfort boundary, so the index storage is suggested to be
       - * 5% of the total data storage.
       + * The N index blocks are treated as a giant hash table.  The top 32 bits
       + * of score are used as the key for a lookup.  Each index block holds
       + * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
         * 
       - * Unfortunately, this very sparse index does not make good use of the
       - * disk -- most of it is empty, and disk reads, which are costly because
       - * of the random seek to get to an arbitrary bucket, tend to bring in
       - * only a few entries, making them hardly cost effective.  The second
       - * scheme is a variation on the first scheme that tries to lay out the
       - * index in a denser format on the disk.  In this scheme, the index
       - * buckets are organized in a binary tree, with all data at the leaf
       - * nodes.  Bucket numbers are easiest to treat in binary.  In the
       - * beginning, there is a single bucket with 0-bit number "".  When a
       - * bucket with number x fills, it splits into buckets 0x and 1x.  Since
       - * x and 0x are the same number, this means that half the bucket space
       - * is assigned to a new bucket, 1x.  So "" splits into 0 and 1, 1
       - * splits into 01 and 11, and so on.  The bucket number determines the
       - * placement on disk, and the bucket header includes the number of
       - * bits represented by the bucket.  To find the right bucket for a
       - * given score with top 32-bits x, read bucket "" off disk and check
       - * its depth.  If it is zero, we're done.  If x doesn't match the
       - * number of bits in 0's header, we know that the block has split, so
       - * we use the last 1 bit of x to load a new block (perhaps the same
       - * one) and repeat, using successively more bits of x until we find
       - * the block responsible for x.  Note that we're using bits from the
       - * _right_ not the left.  This gives the "split into 0x and 1x" property
       - * needed by the tree and is easier than using the reversal of the
       - * bits on the left.
       - *  *
       - * At the moment, this second scheme sounds worse than the first --
       - * there are log n disk reads to find a block instead of just 1.  But
       - * we can keep the tree structure in memory, using 1 bit per block to
       - * keep track of whether that block has been allocated.  Want to know
       - * whether block x has been split?  Check whether 1x is allocated.  1
       - * bit per 8kB gives us an in-use bitmap 1/65536 the size of the index.
       - * The index data is 1/100 the size of the arena data, explained above.
       - * In this scheme, after the first block split, the index is always
       - * at least half full (proof by induction), so it is at most 2x the
       - * size of the index data.  This gives a bitmap size of 2/6,553,600
       - * of the data size.  Let's call that one millionth.  So each terabyte
       - * of storage requires one megabyte of free bitmap.  The bitmap is
       - * going to be accessed so much that it will be effectively pinned in
       - * the cache.  So it still only takes one disk read to find the block
       - * -- the tree walking can be done by consulting the in-core bitmap
       - * describing the tree structure.
       - *  *
       - * Now we have to worry about write ordering, though.  What if the
       - * bitmap ends up out of sync with the index blocks?  When block x
       - * splits into 0x and 1x, causing an update to bitmap block b, the
       - * dcache flushing code makes sure that the writes happen in this
       - * order: first 1x, then 0x, then the bitmap.  Writing 1x before 0x
       - * makes sure we write the split-off entries to disk before we discard
       - * them from 0x.  Writing the bitmap after both simplifies the following
       - * case analysis.
       + * The index is sized so that a particular bucket is extraordinarily 
       + * unlikely to overflow: assuming compressed data blocks are 4kB 
       + * on disk, and assuming each block has a 40 byte index entry,
       + * the index data will be 1% of the total data.  Since scores are essentially
       + * random, all buckets should be about the same fullness.
       + * A factor of 5 gives us a wide comfort boundary to account for 
       + * random variation.  So the index disk space should be 5% of the arena disk space.
       + *
       + * Problems with Index Version 1:
         * 
       - * If Venti is interrupted while flushing blocks to disk, the state
       - * of the disk upon next startup can be one of the following:
       - *  *
       -
       - * (a) none of 0x, 1x, and b are written
       - *        Looks like nothing happened - use as is.
       + * Because the index size is chosen to handle the worst case load,
       + * the index is very sparse, especially when the Venti server is mostly empty.
       + * This has a few bad properties.
       + * 
       + * Loading an index block (which typically requires a random disk seek)
       + * is a very expensive operation, yet it yields only a couple index entries.
       + * We're not making efficient use of the disk arm.
       + *
       + * Writing a block requires first checking to see if the block already
       + * exists on the server, which in turn requires an index lookup.  When
       + * writing fresh data, these lookups will fail.  The index entry cache 
       + * cannot serve these, so they must go to disk, which is expensive.
       + * 
       + * Thus both the reading and the writing of blocks are held back by
       + * the expense of accessing the index.
       + * 
       + * Index Version 2:
       + * 
       + * The index is sized to be exactly 2^M blocks.  The index blocks are 
       + * logically arranged in a (not exactly balanced) binary tree of depth at
       + * most M.  The nodes are named by bit strings describing the path from
       + * the root to the node.  The root is . (dot).  The left child of the root is .0,
       + * the right child .1.  The node you get to by starting at the root and going
       + * left right right left is .0110.  At the beginning, there is only the root block.
       + * When a block with name .xxx fills, it splits into .xxx0 and .xxx1.
       + * All the index data is kept in the leaves of the tree.
       + *
       + * Index leaf blocks are laid out on disk by interpreting the bitstring as a 
       + * binary fraction and multiplying by 2^M -- .0 is the first block, .1 is
       + * the block halfway into the index, .0110 is at position 6/16, and
       + * .xxx and .xxx0 map to the same block (but only one can be a leaf
       + * node at any given time, so this is okay!).  A cheap implementation of
       + * this is to append zeros to the bit string to make it M bits long.  That's
       + * the integer index block number.
       + *
       + * To find the leaf block that should hold a score, use the bits of the 
       + * score one at a time to walk down the tree to a leaf node.  If the tree
       + * has leaf nodes .0, .10, and .11, then score 0110101... ends up in bucket
       + * .0 while score 101110101... ends up in bucket .10.  There is no leaf node
       + * .1 because it has split into .10 and .11.
       + *
       + * If we know which disk blocks are in use, we can reconstruct the interior
       + * of the tree: if .xxx1 is in use, then .xxx has been split.  We keep an in-use
       + * bitmap of all index disk blocks to aid in reconstructing the interior of the
       + * tree.  At one bit per index block, the bitmap is small enough to be kept
       + * in memory even on the largest of Venti servers.
       + *
       + * After the root block splits, the index blocks being used will always be
       + * at least half full (averaged across the entire index).  So unlike Version 1,
       + * Index Version 2 is quite dense, alleviating the two problems above.
       + * Index block reads now return many index entries.  More importantly,
       + * small servers can keep most of the index in the disk cache, making them
       + * more likely to handle negative lookups without going to disk.
       + *
       + * As the index becomes more full, Index Version 2's performance
       + * degrades gracefully into Index Version 1.  V2 is really an optimization
       + * for little servers.
       + *
       + * Write Ordering for Index Version 2:
       + * 
       + * Unlike Index Version 1, Version 2 must worry about write ordering
       + * within the index.  What happens if the in-use bitmap is out of sync
       + * with the actual leaf nodes?  What happens if .xxx splits into .xxx0 and
       + * .xxx1 but only one of the new blocks gets written to disk?
       + *
       + * We arrange that when .xxx splits, the .xxx1 block is written first,
       + * then the .xxx0 block, and finally the in-use bitmap entry for .xxx1.
       + * The split is committed by the writing of .xxx0.  This ordering leaves
       + * two possible partial disk writes:
         *
       - * (b) 1x is written
       - *        Since 0x hasn't been rewritten and the bitmap doesn't record 1x
       - *         as being in use, it's like this never happened.  See (a).
       - *        This does mean that the bitmap trumps actual disk contents:
       - *        no need to zero the index disks anymore.
       + * (a) If .xxx1 is written but .xxx0 and the bitmap are not, then it's as if
       + * the split never happened -- we won't think .xxx1 is in use, and we
       + * won't go looking for it.
         *
       - * (c) 0x and 1x are written, but not the bitmap
       - *        Writing 0x commits the change.  When we next encounter
       - *        0x or 1x on a lookup (we can't get to 1x except through x==0x),
       - *        the bitmap will direct us to x, we'll load the block and find out
       - *         that its now 0x, so we update the bitmap.
       + * (b) If .xxx1 and .xxx0 are written but the bitmap is not, then the first
       + * time we try to load .xxx, we'll get .xxx0 instead, realize the bitmap is
       + * out of date, and update the bitmap.
         *
       - * (d) 0x, 1x, and b are written.
       - *        Great - just use as is.
       + * Backwards Compatibility
       + *
       + * Because there are large Venti servers in use with Index V1, this code
       + * will read either index version, but when asked to create a new index,
       + * will only create V2.
         */
        
        #include "stdinc.h"
       t@@ -105,6 +114,7 @@
        static int        bucklook(u8int *score, int type, u8int *data, int n);
        static int        writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b);
        static int        okibucket(IBucket *ib, ISect *is);
       +static int        initindex1(Index*);
        static ISect        *initisect1(ISect *is);
        
        //static QLock        indexlock;        //ZZZ
       t@@ -118,7 +128,7 @@ initindex(char *name, ISect **sects, int n)
                Index *ix;
                ISect *is;
                u32int last, blocksize, tabsize;
       -        int i, nbits;
       +        int i;
        
                if(n <= 0){
                        seterr(EOk, "no index sections to initialize index");
       t@@ -171,21 +181,7 @@ initindex(char *name, ISect **sects, int n)
                ix->tabsize = tabsize;
                ix->buckets = last;
        
       -        /* compute number of buckets used for in-use map */
       -        nbits = blocksize*8;
       -        ix->bitbuckets = (ix->buckets+nbits-1)/nbits;
       -
       -        last -= ix->bitbuckets;
       -        /* 
       -         * compute log of max. power of two not greater than 
       -         * number of remaining buckets.
       -         */
       -        for(nbits=0; last>>=1; nbits++)
       -                ;
       -        ix->maxdepth = nbits;
       -
       -        if((1UL<<ix->maxdepth) > ix->buckets-ix->bitbuckets){
       -                seterr(ECorrupt, "inconsistent math for buckets in %s", ix->name);
       +        if(initindex1(ix) < 0){
                        freeindex(ix);
                        return nil;
                }
       t@@ -195,9 +191,37 @@ initindex(char *name, ISect **sects, int n)
                        freeindex(ix);
                        return nil;
                }
       +
                return ix;
        }
        
       +static int
       +initindex1(Index *ix)
       +{
       +        u32int buckets;
       +
       +        switch(ix->version){
       +        case IndexVersion1:
       +                ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
       +                buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
       +                if(buckets != ix->buckets){
       +                        seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
       +                        return -1;
       +                }
       +                break;
       +
       +        case IndexVersion2:
       +                buckets = ix->buckets - ix->bitblocks;
       +                if(ix->buckets < ix->bitblocks || (buckets&(buckets-1)))
       +                        seterr(ECorrupt, "bucket count not a power of two in %s", ix->name);
       +                ix->maxdepth = u64log2(buckets);
       +                ix->bitkeylog = u64log2(ix->blocksize*8);
       +                ix->bitkeymask = (1<<ix->bitkeylog)-1;
       +                break;
       +        }
       +        return 0;
       +}
       +
        int
        wbindex(Index *ix)
        {
       t@@ -237,7 +261,7 @@ wbindex(Index *ix)
        }
        
        /*
       - * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' sections arenas
       + * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
         * version, blocksize: u32int
         * name: max. ANameSize string
         * sections, arenas: AMap
       t@@ -246,6 +270,7 @@ int
        outputindex(Fmt *f, Index *ix)
        {
                if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
       +        || (ix->version==IndexVersion2 && fmtprint(f, "%ud\n", ix->bitblocks) < 0)
                || outputamap(f, ix->smap, ix->nsects) < 0
                || outputamap(f, ix->amap, ix->narenas) < 0)
                        return -1;
       t@@ -276,7 +301,7 @@ parseindex(IFile *f, Index *ix)
                        return -1;
                }
                ix->version = v;
       -        if(ix->version != IndexVersion){
       +        if(ix->version != IndexVersion1 && ix->version != IndexVersion2){
                        seterr(ECorrupt, "bad version number in %s", f->name);
                        return -1;
                }
       t@@ -293,11 +318,22 @@ parseindex(IFile *f, Index *ix)
                 * block size
                 */
                if(ifileu32int(f, &v) < 0){
       -                seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
       +                seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
                        return -1;
                }
                ix->blocksize = v;
        
       +        if(ix->version == IndexVersion2){
       +                /*
       +                 * free bitmap size
       +                 */
       +                if(ifileu32int(f, &v) < 0){
       +                        seterr(ECorrupt, "syntax error: bad bitmap size in %s", f->name);
       +                        return -1;
       +                }
       +                ix->bitblocks = v;
       +        }
       +
                if(parseamap(f, &amn) < 0)
                        return -1;
                ix->nsects = amn.n;
       t@@ -320,8 +356,10 @@ newindex(char *name, ISect **sects, int n)
                Index *ix;
                AMap *smap;
                u64int nb;
       -        u32int div, ub, xb, start, stop, blocksize, tabsize;
       -        int i, j;
       +        u32int div, ub, xb, fb, start, stop, blocksize, tabsize;
       +        int i, j, version;
       +
       +        version = IndexVersion2;
        
                if(n < 1){
                        seterr(EOk, "creating index with no index sections");
       t@@ -368,16 +406,27 @@ newindex(char *name, ISect **sects, int n)
                        seterr(EBug, "index too large");
                        return nil;
                }
       -        div = (((u64int)1 << 32) + nb - 1) / nb;
       -        ub = (((u64int)1 << 32) - 1) / div + 1;
       -        if(div < 100){
       -                seterr(EBug, "index divisor too coarse");
       -                return nil;
       +
       +        div = 0;
       +        fb = 0;
       +        if(version == IndexVersion1){
       +                div = (((u64int)1 << 32) + nb - 1) / nb;
       +                ub = (((u64int)1 << 32) - 1) / div + 1;
       +                if(div < 100){
       +                        seterr(EBug, "index divisor too coarse");
       +                        return nil;
       +                }
       +        }else{
       +                fb = (nb+blocksize*8-1)/(blocksize*8);
       +                for(ub=1; ub<=((nb-fb)>>1); ub<<=1)
       +                        ;
       +                ub += fb;
                }
                if(ub > nb){
                        seterr(EBug, "index initialization math wrong");
                        return nil;
                }
       +        xb = nb - ub;
        
                /*
                 * initialize each of the index sections
       t@@ -388,7 +437,6 @@ newindex(char *name, ISect **sects, int n)
                        seterr(EOk, "can't create new index: out of memory");
                        return nil;
                }
       -        xb = nb - ub;
                start = 0;
                for(i = 0; i < n; i++){
                        stop = start + sects[i]->blocks - xb / n;
       t@@ -413,15 +461,22 @@ newindex(char *name, ISect **sects, int n)
                        free(smap);
                        return nil;
                }
       -        ix->version = IndexVersion;
       +        ix->version = version;
                namecp(ix->name, name);
                ix->sects = sects;
                ix->smap = smap;
                ix->nsects = n;
                ix->blocksize = blocksize;
       -        ix->div = div;
                ix->buckets = ub;
                ix->tabsize = tabsize;
       +        ix->div = div;
       +        ix->bitblocks = fb;
       +
       +        if(initindex1(ix) < 0){
       +                free(smap);
       +                return nil;
       +        }
       +
                return ix;
        }
        
       t@@ -489,7 +544,7 @@ newisect(Part *part, char *name, u32int blocksize, u32int tabsize)
        }
        
        /*
       - * initialize the computed paramaters for an index
       + * initialize the computed parameters for an index
         */
        static ISect*
        initisect1(ISect *is)
       t@@ -606,7 +661,7 @@ writeiclump(Index *ix, Clump *c, u8int *clbuf)
        }
        
        /*
       - * convert an arena index to an relative address address
       + * convert an arena index to an relative arena address
         */
        Arena*
        amapitoa(Index *ix, u64int a, u64int *aa)
       t@@ -665,20 +720,15 @@ loadientry(Index *ix, u8int *score, int type, IEntry *ie)
                u32int buck;
                int h, ok;
        
       -        buck = hashbits(score, 32) / ix->div;
                ok = -1;
        
                qlock(&stats.lock);
                stats.indexreads++;
                qunlock(&stats.lock);
       -        is = findibucket(ix, buck, &buck);
       -        if(is == nil)
       -                return -1;
       -        b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
       +        b = loadibucket(ix, score, &is, &buck, &ib);
                if(b == nil)
                        return -1;
        
       -        unpackibucket(&ib, b->data);
                if(okibucket(&ib, is) < 0)
                        goto out;
        
       t@@ -707,19 +757,16 @@ storeientry(Index *ix, IEntry *ie)
                u32int buck;
                int h, ok;
        
       -        buck = hashbits(ie->score, 32) / ix->div;
                ok = 0;
        
                qlock(&stats.lock);
                stats.indexwreads++;
                qunlock(&stats.lock);
        
       -        is = findibucket(ix, buck, &buck);
       -        b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
       +        b = loadibucket(ix, ie->score, &is, &buck, &ib);
                if(b == nil)
                        return -1;
        
       -        unpackibucket(&ib, b->data);
                if(okibucket(&ib, is) < 0)
                        goto out;
        
       t@@ -765,177 +812,14 @@ writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b)
                return 0;
        }
        
       -/*
       - * find the number of the index section holding score
       - */
       -int
       -indexsect(Index *ix, u8int *score)
       -{
       -        u32int buck;
       -        int r, l, m;
       -
       -        buck = hashbits(score, 32) / ix->div;
       -        l = 1;
       -        r = ix->nsects - 1;
       -        while(l <= r){
       -                m = (r + l) >> 1;
       -                if(ix->sects[m]->start <= buck)
       -                        l = m + 1;
       -                else
       -                        r = m - 1;
       -        }
       -        return l - 1;
       -}
       -
       -/*
       - * find the index section which holds bucket #buck.
       - */
       -static ISect*
       -findisect(Index *ix, u32int buck, u32int *ibuck)
       -{
       -        ISect *is;
       -        int r, l, m;
       -
       -        l = 1;
       -        r = ix->nsects - 1;
       -        while(l <= r){
       -                m = (r + l) >> 1;
       -                if(ix->sects[m]->start <= buck)
       -                        l = m + 1;
       -                else
       -                        r = m - 1;
       -        }
       -        is = ix->sects[l - 1];
       -        if(is->start <= buck && is->stop > buck){
       -                *ibuck = buck - is->start;
       -                return is;
       -        }
       -        seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
       -        return nil;
       -}
       -
       -static DBlock*
       -loadisectblock(Index *ix, u32int buck, int read)
       -{
       -        ISect *is;
       -
       -        if((is = findisect(ix, buck, &buck)) == nil)
       -                return nil;
       -        return getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), read);
       -}
       -
       -/*
       - * find the index section which holds the logical bucket #buck
       - */
       -static DBlock*
       -loadibucket(Index *ix, u32int buck, IBucket *ib)
       -{
       -        int d, i, times;
       -        u32int ino;
       -        DBlock *b;
       -        u32int bbuck;
       -        IBucket eib;
       -
       -        times = 0;
       -
       -top:
       -        if(times++ > 2*ix->maxdepth){
       -                seterr(EAdmin, "bucket bitmap tree never converges with buckets");
       -                return nil;
       -        }
       -
       -        bbuck = -1;
       -        b = nil;
       -
       -        /*
       -         * consider the bits of buck, one at a time, to make the bucket number.
       -         */
       -
       -        /*
       -         * walk down the bucket tree using the bitmap, which is used so
       -         * often it's almost certain to be in cache.
       -         */
       -        ino = 0;
       -        for(d=0; d<ix->maxdepth; d++){
       -                /* fetch the bitmap that says whether ino has been split */
       -                if(bbuck != (ino>>ix->bitlog)){
       -                        if(b)
       -                                putdblock(b);
       -                        bbuck = (ino>>ix->bitlog);
       -                        if((b = loadisectblock(ix, bbuck, 1)) == nil)
       -                                return nil;
       -                }
       -                /* has it been split yet? */
       -                if((((u32int*)b->data)[(ino&(ix->bitmask))>>5] & (1<<(ino&31))) == 0){
       -                        /* no.  we're done */
       -                        break;
       -                }
       -        }
       -        putdblock(b);
       -
       -        /*
       -         * continue walking down (or up!) the bucket tree, which may not
       -         * be completely in sync with the bitmap.  we could continue the loop
       -         * here, but it's easiest just to start over once we correct the bitmap.
       -         * corrections should only happen when things get out of sync because
       -         * a crash keeps some updates from making it to disk, so it's not too
       -         * frequent.  we should converge after 2x the max depth, at the very worst
       -         * (up and back down the tree).
       -         */
       -        if((b = loadisectblock(ix, ix->bitbuckets+bucketno(buck, d), 1)) == nil)
       -                return nil;
       -        unpackibucket(&eib, b->data);
       -        if(eib.depth > d){
       -                /* the bitmap thought this block hadn't split */
       -                putdblock(b);
       -                if(markblocksplit(buck, d) < 0)
       -                        return nil;
       -                goto top;
       -        }
       -        if(eib.depth < d){
       -                /* the bitmap thought this block had split */
       -                putdblock(b);
       -                if(markblockunsplit(ix, buck, d) < 0)
       -                        return nil;
       -                goto top;
       -        }
       -        *ib = eib;
       -        return b;
       -}
       -
       -static int
       -markblocksplit(Index *ix, u32int buck, int d)
       -{
       -        u32int ino;
       -
       -        ino = bucketno(buck, d);
       -        if((b = loadisectblock(ix, ino>>ix->bitlog, 1)) == nil)
       -                return -1;
       -        dirtydblock(b, DirtyIndex);
       -        (((u32int*)b->data)[(ino&(ix->bitmask))>>5] |= (1<<(ino&31));
       -        putdblock(b);
       -        return 0;
       -}
       -
       -static int
       -markblockunsplit(Index *ix, u32int buck, int d)
       -{
       -        /*
       -         * Let's 
       -        u32int ino;
       -
       -        ino = bucketno(buck, d);
       -        
       -}
       -
        static int
        okibucket(IBucket *ib, ISect *is)
        {
       -        if(ib->n <= is->buckmax && (ib->next == 0 || ib->next >= is->start && ib->next < is->stop))
       +        if(ib->n <= is->buckmax)
                        return 0;
        
       -        seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, next=%lud range=[%lud,%lud)",
       -                ib->n, is->buckmax, ib->next, is->start, is->stop);
       +        seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, depth=%lud range=[%lud,%lud)",
       +                ib->n, is->buckmax, ib->depth, is->start, is->stop);
                return -1;
        }
        
       t@@ -1010,3 +894,223 @@ ientrycmp(const void *vie1, const void *vie2)
                }
                return -1;
        }
       +
       +/*
       + * find the number of the index section holding bucket #buck
       + */
       +static int
       +indexsect0(Index *ix, u32int buck)
       +{
       +        int r, l, m;
       +
       +        l = 1;
       +        r = ix->nsects - 1;
       +        while(l <= r){
       +                m = (r + l) >> 1;
       +                if(ix->sects[m]->start <= buck)
       +                        l = m + 1;
       +                else
       +                        r = m - 1;
       +        }
       +        return l - 1;
       +}
       +
       +/*
       + * load the index block at bucket #buck
       + */
       +static DBlock*
       +loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib)
       +{
       +        ISect *is;
       +        DBlock *b;
       +
       +        is = ix->sects[indexsect0(ix, buck)];
       +        if(buck < is->start || is->stop <= buck){
       +                seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
       +                return nil;
       +        }
       +
       +        buck -= is->start;
       +        if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1)) == nil)
       +                return nil;
       +
       +        *pis = is;
       +        *pbuck = buck;
       +        unpackibucket(ib, b->data);
       +        return b;
       +}
       +
       +/*
       + * find the number of the index section holding score
       + */
       +static int
       +indexsect1(Index *ix, u8int *score)
       +{
       +        return indexsect0(ix, hashbits(score, 32) / ix->div);
       +}
       +
       +/*
       + * load the index block responsible for score.
       + */
       +static DBlock*
       +loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
       +{
       +        return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib);
       +}
       +
       +static u32int
       +keytobuck(Index *ix, u32int key, int d)
       +{
       +        /* clear all but top d bits */
       +        if(d != 32)
       +                key &= ~((1<<(32-d))-1);
       +
       +        /* truncate to maxdepth bits */
       +        if(ix->maxdepth != 32)
       +                key >>= 32 - ix->maxdepth;
       +
       +        return ix->bitblocks + key;
       +}
       +
       +/*
       + * to check whether .xxx has split, check whether .xxx1 is in use.
       + * it might be worth caching the block for future lookups, but for now
       + * let's assume the dcache is good enough.
       + */
       +static int
       +bitmapop(Index *ix, u32int key, int d, int set)
       +{
       +        DBlock *b;
       +        ISect *is;
       +        IBucket ib;
       +        u32int buck;
       +        int inuse;
       +
       +        if(d >= ix->maxdepth)
       +                return 0;
       +
       +        /* construct .xxx1 in bucket number format */
       +        key = keytobuck(ix, key, d) | (1<<(ix->maxdepth-d-1));
       +
       +        /* check whether key (now the bucket number for .xxx1) is in use */
       +
       +        if((b = loadibucket0(ix, key >> ix->bitkeylog, &is, &buck, &ib)) == nil){
       +                seterr(ECorrupt, "cannot load in-use bitmap block");
       +                return -1;
       +        }
       +        inuse = ((u32int*)b->data)[(key & ix->bitkeymask)>>5] & (1<<(key&31));
       +        if(set && !inuse){
       +                dirtydblock(b, DirtyIndexBitmap);
       +                ((u32int*)b->data)[(key & ix->bitkeymask)>>5] |= (1<<(key&31));
       +        }
       +        putdblock(b);
       +        return inuse;
       +}
       +
       +static int
       +issplit(Index *ix, u32int key, int d)
       +{
       +        return bitmapop(ix, key, d, 0);
       +}
       +
       +static int
       +marksplit(Index *ix, u32int key, int d)
       +{
       +        return bitmapop(ix, key, d, 1);
       +}        
       +
       +/* 
       + * find the number of the index section holding score.
       + * it's not terrible to be wrong once in a while, so we just
       + * do what the bitmap tells us and don't worry about the 
       + * bitmap being out of date.
       + */
       +static int
       +indexsect2(Index *ix, u8int *score)
       +{
       +        u32int key;
       +        int d, is;
       +
       +        key = hashbits(score, 32);
       +        for(d=0; d<=ix->maxdepth; d++){
       +                is = issplit(ix, key, d);
       +                if(is == -1)
       +                        return 0;        /* must return something valid! */
       +                if(!is)
       +                        break;
       +        }
       +
       +        if(d > ix->maxdepth){
       +                seterr(EBug, "index bitmap inconsistent with maxdepth");
       +                return 0;        /* must return something valid! */
       +        }
       +
       +        return indexsect0(ix, keytobuck(ix, key, d));
       +}
       +
       +/*
       + * load the index block responsible for score. 
       + * (unlike indexsect2, must be correct always.)
       + */
       +static DBlock*
       +loadibucket2(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
       +{
       +        u32int key;
       +        int d, try, is;
       +        DBlock *b;
       +
       +        for(try=0; try<32; try++){
       +                key = hashbits(score, 32);
       +                for(d=0; d<=ix->maxdepth; d++){
       +                        is = issplit(ix, key, d);
       +                        if(is == -1)
       +                                return nil;
       +                        if(!is)
       +                                break;
       +                }
       +                if(d > ix->maxdepth){
       +                        seterr(EBug, "index bitmap inconsistent with maxdepth");
       +                        return nil;
       +                }
       +
       +                if((b = loadibucket0(ix, keytobuck(ix, key, d), pis, pbuck, ib)) == nil)
       +                        return nil;
       +
       +                if(ib->depth == d)
       +                        return b;
       +
       +                if(ib->depth < d){
       +                        seterr(EBug, "index block has smaller depth than expected -- cannot happen");
       +                        putdblock(b);
       +                        return nil;
       +                }
       +
       +                /*
       +                 * ib->depth > d, meaning the bitmap was out of date.
       +                 * fix the bitmap and try again.
       +                 */
       +                putdblock(b);
       +                if(marksplit(ix, key, d) < 0)
       +                        return nil;
       +        }
       +        seterr(EBug, "loadibucket2 failed to sync bitmap with disk!");
       +        return nil;
       +}
       +
       +int
       +indexsect(Index *ix, u8int *score)
       +{
       +        if(ix->version == IndexVersion1)
       +                return indexsect1(ix, score);
       +        return indexsect2(ix, score);
       +}
       +
       +DBlock*
       +loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
       +{
       +        if(ix->version == IndexVersion1)
       +                return loadibucket1(ix, score, pis, pbuck, ib);
       +        return loadibucket2(ix, score, pis, pbuck, ib);
       +}
       +
       +
 (DIR) diff --git a/src/cmd/venti/lump.c b/src/cmd/venti/lump.c
       t@@ -81,7 +81,6 @@ writelump(Packet *p, u8int *score, int type, u32int creator)
                        return ok;
                }
        
       -print("writelump %08x\n", mainindex->arenas[0]);
                if(queuewrites)
                        return queuewrite(u, p, creator);
        
 (DIR) diff --git a/src/cmd/venti/mkfile b/src/cmd/venti/mkfile
       t@@ -44,9 +44,9 @@ TARG=\
                fmtarenas\
                fmtisect\
                fmtindex\
       -        buildindex\
       +#        buildindex\
                checkarenas\
       -        checkindex\
       +#        checkindex\
                clumpstats\
                findscore\
                rdarena\
 (DIR) diff --git a/src/cmd/venti/part.c b/src/cmd/venti/part.c
       t@@ -2,6 +2,8 @@
        #include "dat.h"
        #include "fns.h"
        
       +#define trace 0
       +
        u32int        maxblocksize;
        int        readonly;
        
       t@@ -75,7 +77,7 @@ writepart(Part *part, u64int addr, u8int *buf, u32int n)
                        seterr(ECorrupt, "out of bounds write to partition='%s'", part->name);
                        return -1;
                }
       -        print("write %s %lud at %llud\n", part->name, n, addr);
       +if(trace) print("write %s %lud at %llud\n", part->name, n, addr);
                for(nn = 0; nn < n; nn += m){
                        mm = n - nn;
                        if(mm > MaxIo)
       t@@ -107,7 +109,7 @@ readpart(Part *part, u64int addr, u8int *buf, u32int n)
                        seterr(ECorrupt, "out of bounds read from partition='%s': addr=%lld n=%d size=%lld", part->name, addr, n, part->size);
                        return -1;
                }
       -        print("read %s %lud at %llud\n", part->name, n, addr);
       +if(trace) print("read %s %lud at %llud\n", part->name, n, addr);
                for(nn = 0; nn < n; nn += m){
                        mm = n - nn;
                        if(mm > MaxIo)
 (DIR) diff --git a/src/cmd/venti/venti.c b/src/cmd/venti/venti.c
       t@@ -147,8 +147,8 @@ ventiserver(char *addr)
        
                while((r = vtgetreq(s)) != nil){
                        r->rx.type = r->tx.type+1;
       -                print("req (arenas[0]=%p sects[0]=%p) %F\n",
       -                        mainindex->arenas[0], mainindex->sects[0], &r->tx);
       +        //        print("req (arenas[0]=%p sects[0]=%p) %F\n",
       +        //                mainindex->arenas[0], mainindex->sects[0], &r->tx);
                        switch(r->tx.type){
                        default:
                                vtrerror(r, "unknown request");