commit - 7c5190d2c854128fb607289e9d83379e522c7090
commit + 9ffbb5adcaeec878d3b6db0f8b1f654e839b4689
blob - 4232bb47a26472ccad56afac0ff31a9bc2f35e6f
blob + e5aed260f741ed0df517b109d359b47f3445e714
--- src/cmd/venti/buildbuck.c
+++ src/cmd/venti/buildbuck.c
buck = TWID32;
ib->n = 0;
- ib->next = 0;
+ ib->depth = 0;
while(ies->n){
b = peekientry(ies);
if(b == nil)
blob - 952e75dde2fe20030ca2ebf52e88472588400ca5
blob + 8058ba098653cef50a84f99c22b5148590cd2116
--- src/cmd/venti/buildindex.c
+++ src/cmd/venti/buildindex.c
{
ISect *is;
- is = findisect(ix, buck);
- if(is == nil){
- seterr(EAdmin, "bad math in writebucket");
+ is = findibucket(ix, buck, &buck);
+ if(is == nil)
return -1;
- }
- if(buck < is->start || buck >= is->stop)
- seterr(EAdmin, "index write out of bounds: %d not in [%d,%d)\n",
- buck, is->start, is->stop);
- buck -= is->start;
qlock(&stats.lock);
stats.indexwrites++;
qunlock(&stats.lock);
ib.data = b->data + IBucketSize;
zib.data = z->data + IBucketSize;
zib.n = 0;
- zib.next = 0;
+ zib.depth = 0;
for(;;){
buck = buildbucket(ix, ies, &ib);
found += ib.n;
blob - fa6f5efc318d84add042f0dc0d8ccc9742cc2f05
blob + 34edb3708475fc66bab98e394aac46fa84e49c73
--- src/cmd/venti/checkindex.c
+++ src/cmd/venti/checkindex.c
IEntry ie, eie;
int i, ei, ok, c;
- is = findisect(ix, buck);
- if(is == nil){
- seterr(EAdmin, "bad math in checkbuckets");
- return -1;
- }
- buck -= is->start;
+ is = findibucket(ix, buck, &buck);
eb = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
if(eb == nil)
return -1;
ib.data = b->data;
zib.data = z->data;
zib.n = 0;
- zib.next = 0;
+ zib.depth = 0;
for(;;){
buck = buildbucket(ix, ies, &ib);
found += ib.n;
blob - 272d7aec7481ad0cec52fe014784fcfee8b3df8a
blob + 33f5950aa75a29e2aaf39d94ca07427e22ba53c2
--- src/cmd/venti/clump.c
+++ src/cmd/venti/clump.c
a = writeiclump(ix, &cl, cb->data);
freezblock(cb);
- if(a == 0)
+ if(a == TWID64)
return -1;
qlock(&stats.lock);
blob - ae89baa78359d7752de7b6f52b1a190d5b6a0f07
blob + 4688b076424cf03e28c99eaed0ca605b322b80c7
--- src/cmd/venti/conv.c
+++ src/cmd/venti/conv.c
unpackibucket(IBucket *b, u8int *buf)
{
b->n = U16GET(buf);
- b->next = U32GET(&buf[U16Size]);
+ b->depth = U32GET(&buf[U16Size]);
b->data = buf + IBucketSize;
}
packibucket(IBucket *b, u8int *buf)
{
U16PUT(buf, b->n);
- U32PUT(&buf[U16Size], b->next);
+ U32PUT(&buf[U16Size], b->depth);
}
blob - d6d7d1b0bf10ffb0df653535714e17df2151cab1
blob + 49c8b38beb047e2e788b670b275705e187ed3529
--- src/cmd/venti/dat.h
+++ src/cmd/venti/dat.h
ArenaPartVersion = 3,
ArenaVersion = 4,
- IndexVersion = 1,
+ IndexVersion1 = 1,
+ IndexVersion2 = 2,
ISectVersion = 1,
/*
MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
/*
- * dirty flags
+ * dirty flags - order controls disk write order
*/
DirtyArena = 1,
+ DirtyIndexSplit,
DirtyIndex,
+ DirtyIndexBitmap,
DirtyArenaCib,
+ DirtyMax,
VentiZZZZZZZZ
};
u32int buckets; /* last bucket used in disk hash table */
u32int blocksize;
u32int tabsize; /* max. bytes in index config */
+ u32int bitblocks;
+ u32int maxdepth;
+ u32int bitkeylog;
+ u32int bitkeymask;
+
int mapalloc; /* first arena to check when adding a lump */
Arena **arenas; /* arenas in the mapping */
ISect **sects; /* sections which hold the buckets */
struct IBucket
{
u16int n; /* number of active indices */
- u32int next; /* overflow bucket */
+ u32int depth; /* depth in version 2 (was overflow in v1) */
u8int *data;
};
blob - c99e79b3fe8bec7c909ec9a2a6cbf7e49e786aa2
blob + dcb47bcf62c756f40ac18b5c5c0c18bcaea0ab9a
--- src/cmd/venti/dcache.c
+++ src/cmd/venti/dcache.c
int odirty;
Part *p;
-fprint(2, "dirty %p\n", b);
rlock(&dcache.dirtylock);
assert(b->ref != 0);
assert(b->dirtying == 0);
stats.dirtydblocks++;
qunlock(&stats.lock);
+ /*
+ * In general, shouldn't mark any block as more than one
+ * type, except that split index blocks are a subcase of index
+ * blocks. Only clean blocks ever get marked DirtyIndexSplit,
+ * though, so we don't need the opposite conjunction here.
+ */
if(b->dirty)
- assert(b->dirty == dirty);
+ assert(b->dirty == dirty
+ || (b->dirty==DirtyIndexSplit && dirty==DirtyIndex));
+
odirty = b->dirty;
b->dirty = dirty;
p = b->part;
static void
flushproc(void *v)
{
- int i, n;
+ int i, j, n;
DBlock *b, **write;
USED(v);
qsort(write, n, sizeof(write[0]), writeblockcmp);
- /*
- * At the beginning of the array are the arena blocks.
- */
- fprint(2, "flushproc: write arena blocks\n");
+ /* Write each stage of blocks out. */
i = 0;
- i += parallelwrites(write+i, write+n, DirtyArena);
-
- /*
- * Next are the index blocks.
- */
- fprint(2, "flushproc: write index blocks\n");
- i += parallelwrites(write+i, write+n, DirtyIndex);
-
- /*
- * Finally, the arena clump info blocks.
- */
- fprint(2, "flushproc: write cib blocks\n");
- i += parallelwrites(write+i, write+n, DirtyArenaCib);
-
+ for(j=1; j<DirtyMax; j++)
+ i += parallelwrites(write+i, write+n, j);
assert(i == n);
fprint(2, "flushproc: update dirty bits\n");
blob - b3d158f65c42c424a64b24b74e15b4fb0f46121f
blob + f8274397704e6ba802cee749f1aff508684c3660
--- src/cmd/venti/fns.h
+++ src/cmd/venti/fns.h
char *estrdup(char*);
void *ezmalloc(ulong);
Arena *findarena(char *name);
-ISect *findisect(Index *ix, u32int buck);
int flushciblocks(Arena *arena);
void flushdcache(void);
void flushqueue(void);
void insertlump(Lump *lump, Packet *p);
int insertscore(u8int *score, IAddr *ia, int write);
ZBlock *loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify);
+DBlock *loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib);
int loadientry(Index *index, u8int *score, int type, IEntry *ie);
void logerr(int severity, char *fmt, ...);
Lump *lookuplump(u8int *score, int type);
blob - 8215ccb6f81f72f5e002dfb210e3b70b76298b15
blob + 859c106e564fd70e7521aa4d1cf7f122b7b15ab7
--- src/cmd/venti/httpd.c
+++ src/cmd/venti/httpd.c
c = v;
for(t = 15*60*1000; ; t = 15*1000){
-fprint(2, "httpd: get headers\n");
if(hparsereq(c, t) < 0)
break;
ok = -1;
for(i = 0; i < MaxObjs && objs[i].name[0]; i++){
if(strcmp(c->req.uri, objs[i].name) == 0){
-fprint(2, "httpd: call function %p\n", objs[i].f);
ok = (*objs[i].f)(c);
break;
}
if(ok < 0)
break;
}
-print("httpd cleanup %d\n", c->hin.fd);
hreqcleanup(c);
-print("close %d\n", c->hin.fd);
close(c->hin.fd);
free(c);
}
r = preqtext(c);
if(r < 0)
-{
-fprint(2, "preqtext failed\n");
return r;
-}
-fprint(2, "write stats\n");
+
hout = &c->hout;
hprint(hout, "lump writes=%,ld\n", stats.lumpwrites);
hprint(hout, "lump reads=%,ld\n", stats.lumpreads);
hprint(hout, "disk cache misses=%,ld\n", stats.pcmiss);
hprint(hout, "disk cache reads=%,ld\n", stats.pcreads);
hprint(hout, "disk cache bytes read=%,lld\n", stats.pcbreads);
-fprint(2, "write new stats\n");
+
hprint(hout, "disk cache writes=%,ld\n", stats.dirtydblocks);
hprint(hout, "disk cache writes absorbed=%,ld %d%%\n", stats.absorbedwrites,
percent(stats.absorbedwrites, stats.dirtydblocks));
-fprint(2, "back to old stats\n");
+ hprint(hout, "disk cache flushes=%,ld\n", stats.dcacheflushes);
+ hprint(hout, "disk cache flush writes=%,ld (%,ld per flush)\n",
+ stats.dcacheflushwrites, stats.dcacheflushwrites/stats.dcacheflushes);
hprint(hout, "disk writes=%,ld\n", stats.diskwrites);
hprint(hout, "disk bytes written=%,lld\n", stats.diskbwrites);
hprint(hout, "disk reads=%,ld\n", stats.diskreads);
hprint(hout, "disk bytes read=%,lld\n", stats.diskbreads);
-fprint(2, "hflush stats\n");
hflush(hout);
-fprint(2, "done with stats\n");
return 0;
}
blob - 04f1134e72f814fb02bb66abfeb85e76812ed157
blob + fc1d32e7cd7616f23ff47607343f70e988b368ac
--- src/cmd/venti/icache.c
+++ src/cmd/venti/icache.c
IEntry d, *ie, *last;
u32int h;
-fprint(2, "lookupscore %V %d\n", score, type);
qlock(&stats.lock);
stats.iclookups++;
qunlock(&stats.lock);
blob - 7fc15fd28da83f84cdce46d5b65bd5b1269f3a8f
blob + 168b13f6b146d41ed017d5785241838ec1d0870f
--- src/cmd/venti/index.c
+++ src/cmd/venti/index.c
/*
- * Index, mapping scores to log positions. The log data mentioned in
- * the index _always_ goes out to disk before the index blocks themselves.
- * A counter in the arena tail records which logged blocks have been
- * successfully indexed. The ordering of dirtydcache calls along with
- * the flags passed to dirtydcache ensure the proper write ordering.
+ * Index, mapping scores to log positions.
+ *
+ * The index is made up of some number of index sections, each of
+ * which is typically stored on a different disk. The blocks in all the
+ * index sections are logically numbered, with each index section
+ * responsible for a range of blocks. Blocks are typically 8kB.
+ *
+ * Index Version 1:
*
- * For historical reasons, there are two indexing schemes. In both,
- * the index is broken up into some number of fixed-size (say, 8kB)
- * buckets holding index entries. An index entry is about 40 bytes.
- * The index can be spread across many disks, although in small
- * configurations it is not uncommon for the index and arenas to be
- * on the same disk.
- * *
- * In the first scheme, the many buckets are treated as a giant on-disk
- * hash table. If there are N buckets, then the top 32 bits of the
- * score are used as an index into the hash table, with each bucket
- * holding 2^32 / N of the index space. The index must be sized so
- * that a bucket can't ever overflow. Assuming that a typical compressed
- * data block is about 4000 bytes, the index size is expected to be
- * about 1% of the total data size. Since scores are essentially
- * random, they will be distributed evenly among the buckets, so all
- * buckets should be about the same fullness. A factor of 5 gives us
- * a wide comfort boundary, so the index storage is suggested to be
- * 5% of the total data storage.
+ * The N index blocks are treated as a giant hash table. The top 32 bits
+ * of score are used as the key for a lookup. Each index block holds
+ * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
*
- * Unfortunately, this very sparse index does not make good use of the
- * disk -- most of it is empty, and disk reads, which are costly because
- * of the random seek to get to an arbitrary bucket, tend to bring in
- * only a few entries, making them hardly cost effective. The second
- * scheme is a variation on the first scheme that tries to lay out the
- * index in a denser format on the disk. In this scheme, the index
- * buckets are organized in a binary tree, with all data at the leaf
- * nodes. Bucket numbers are easiest to treat in binary. In the
- * beginning, there is a single bucket with 0-bit number "". When a
- * bucket with number x fills, it splits into buckets 0x and 1x. Since
- * x and 0x are the same number, this means that half the bucket space
- * is assigned to a new bucket, 1x. So "" splits into 0 and 1, 1
- * splits into 01 and 11, and so on. The bucket number determines the
- * placement on disk, and the bucket header includes the number of
- * bits represented by the bucket. To find the right bucket for a
- * given score with top 32-bits x, read bucket "" off disk and check
- * its depth. If it is zero, we're done. If x doesn't match the
- * number of bits in 0's header, we know that the block has split, so
- * we use the last 1 bit of x to load a new block (perhaps the same
- * one) and repeat, using successively more bits of x until we find
- * the block responsible for x. Note that we're using bits from the
- * _right_ not the left. This gives the "split into 0x and 1x" property
- * needed by the tree and is easier than using the reversal of the
- * bits on the left.
- * *
- * At the moment, this second scheme sounds worse than the first --
- * there are log n disk reads to find a block instead of just 1. But
- * we can keep the tree structure in memory, using 1 bit per block to
- * keep track of whether that block has been allocated. Want to know
- * whether block x has been split? Check whether 1x is allocated. 1
- * bit per 8kB gives us an in-use bitmap 1/65536 the size of the index.
- * The index data is 1/100 the size of the arena data, explained above.
- * In this scheme, after the first block split, the index is always
- * at least half full (proof by induction), so it is at most 2x the
- * size of the index data. This gives a bitmap size of 2/6,553,600
- * of the data size. Let's call that one millionth. So each terabyte
- * of storage requires one megabyte of free bitmap. The bitmap is
- * going to be accessed so much that it will be effectively pinned in
- * the cache. So it still only takes one disk read to find the block
- * -- the tree walking can be done by consulting the in-core bitmap
- * describing the tree structure.
- * *
- * Now we have to worry about write ordering, though. What if the
- * bitmap ends up out of sync with the index blocks? When block x
- * splits into 0x and 1x, causing an update to bitmap block b, the
- * dcache flushing code makes sure that the writes happen in this
- * order: first 1x, then 0x, then the bitmap. Writing 1x before 0x
- * makes sure we write the split-off entries to disk before we discard
- * them from 0x. Writing the bitmap after both simplifies the following
- * case analysis.
+ * The index is sized so that a particular bucket is extraordinarily
+ * unlikely to overflow: assuming compressed data blocks are 4kB
+ * on disk, and assuming each block has a 40 byte index entry,
+ * the index data will be 1% of the total data. Since scores are essentially
+ * random, all buckets should be about the same fullness.
+ * A factor of 5 gives us a wide comfort boundary to account for
+ * random variation. So the index disk space should be 5% of the arena disk space.
+ *
+ * Problems with Index Version 1:
*
- * If Venti is interrupted while flushing blocks to disk, the state
- * of the disk upon next startup can be one of the following:
- * *
-
- * (a) none of 0x, 1x, and b are written
- * Looks like nothing happened - use as is.
+ * Because the index size is chosen to handle the worst case load,
+ * the index is very sparse, especially when the Venti server is mostly empty.
+ * This has a few bad properties.
+ *
+ * Loading an index block (which typically requires a random disk seek)
+ * is a very expensive operation, yet it yields only a couple index entries.
+ * We're not making efficient use of the disk arm.
*
- * (b) 1x is written
- * Since 0x hasn't been rewritten and the bitmap doesn't record 1x
- * as being in use, it's like this never happened. See (a).
- * This does mean that the bitmap trumps actual disk contents:
- * no need to zero the index disks anymore.
+ * Writing a block requires first checking to see if the block already
+ * exists on the server, which in turn requires an index lookup. When
+ * writing fresh data, these lookups will fail. The index entry cache
+ * cannot serve these, so they must go to disk, which is expensive.
+ *
+ * Thus both the reading and the writing of blocks are held back by
+ * the expense of accessing the index.
+ *
+ * Index Version 2:
+ *
+ * The index is sized to be exactly 2^M blocks. The index blocks are
+ * logically arranged in a (not exactly balanced) binary tree of depth at
+ * most M. The nodes are named by bit strings describing the path from
+ * the root to the node. The root is . (dot). The left child of the root is .0,
+ * the right child .1. The node you get to by starting at the root and going
+ * left right right left is .0110. At the beginning, there is only the root block.
+ * When a block with name .xxx fills, it splits into .xxx0 and .xxx1.
+ * All the index data is kept in the leaves of the tree.
*
- * (c) 0x and 1x are written, but not the bitmap
- * Writing 0x commits the change. When we next encounter
- * 0x or 1x on a lookup (we can't get to 1x except through x==0x),
- * the bitmap will direct us to x, we'll load the block and find out
- * that its now 0x, so we update the bitmap.
+ * Index leaf blocks are laid out on disk by interpreting the bitstring as a
+ * binary fraction and multiplying by 2^M -- .0 is the first block, .1 is
+ * the block halfway into the index, .0110 is at position 6/16, and
+ * .xxx and .xxx0 map to the same block (but only one can be a leaf
+ * node at any given time, so this is okay!). A cheap implementation of
+ * this is to append zeros to the bit string to make it M bits long. That's
+ * the integer index block number.
*
- * (d) 0x, 1x, and b are written.
- * Great - just use as is.
+ * To find the leaf block that should hold a score, use the bits of the
+ * score one at a time to walk down the tree to a leaf node. If the tree
+ * has leaf nodes .0, .10, and .11, then score 0110101... ends up in bucket
+ * .0 while score 101110101... ends up in bucket .10. There is no leaf node
+ * .1 because it has split into .10 and .11.
+ *
+ * If we know which disk blocks are in use, we can reconstruct the interior
+ * of the tree: if .xxx1 is in use, then .xxx has been split. We keep an in-use
+ * bitmap of all index disk blocks to aid in reconstructing the interior of the
+ * tree. At one bit per index block, the bitmap is small enough to be kept
+ * in memory even on the largest of Venti servers.
+ *
+ * After the root block splits, the index blocks being used will always be
+ * at least half full (averaged across the entire index). So unlike Version 1,
+ * Index Version 2 is quite dense, alleviating the two problems above.
+ * Index block reads now return many index entries. More importantly,
+ * small servers can keep most of the index in the disk cache, making them
+ * more likely to handle negative lookups without going to disk.
+ *
+ * As the index becomes more full, Index Version 2's performance
+ * degrades gracefully into Index Version 1. V2 is really an optimization
+ * for little servers.
+ *
+ * Write Ordering for Index Version 2:
+ *
+ * Unlike Index Version 1, Version 2 must worry about write ordering
+ * within the index. What happens if the in-use bitmap is out of sync
+ * with the actual leaf nodes? What happens if .xxx splits into .xxx0 and
+ * .xxx1 but only one of the new blocks gets written to disk?
+ *
+ * We arrange that when .xxx splits, the .xxx1 block is written first,
+ * then the .xxx0 block, and finally the in-use bitmap entry for .xxx1.
+ * The split is committed by the writing of .xxx0. This ordering leaves
+ * two possible partial disk writes:
+ *
+ * (a) If .xxx1 is written but .xxx0 and the bitmap are not, then it's as if
+ * the split never happened -- we won't think .xxx1 is in use, and we
+ * won't go looking for it.
+ *
+ * (b) If .xxx1 and .xxx0 are written but the bitmap is not, then the first
+ * time we try to load .xxx, we'll get .xxx0 instead, realize the bitmap is
+ * out of date, and update the bitmap.
+ *
+ * Backwards Compatibility
+ *
+ * Because there are large Venti servers in use with Index V1, this code
+ * will read either index version, but when asked to create a new index,
+ * will only create V2.
*/
#include "stdinc.h"
static int bucklook(u8int *score, int type, u8int *data, int n);
static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b);
static int okibucket(IBucket *ib, ISect *is);
+static int initindex1(Index*);
static ISect *initisect1(ISect *is);
//static QLock indexlock; //ZZZ
Index *ix;
ISect *is;
u32int last, blocksize, tabsize;
- int i, nbits;
+ int i;
if(n <= 0){
seterr(EOk, "no index sections to initialize index");
ix->tabsize = tabsize;
ix->buckets = last;
- /* compute number of buckets used for in-use map */
- nbits = blocksize*8;
- ix->bitbuckets = (ix->buckets+nbits-1)/nbits;
-
- last -= ix->bitbuckets;
- /*
- * compute log of max. power of two not greater than
- * number of remaining buckets.
- */
- for(nbits=0; last>>=1; nbits++)
- ;
- ix->maxdepth = nbits;
-
- if((1UL<<ix->maxdepth) > ix->buckets-ix->bitbuckets){
- seterr(ECorrupt, "inconsistent math for buckets in %s", ix->name);
+ if(initindex1(ix) < 0){
freeindex(ix);
return nil;
}
freeindex(ix);
return nil;
}
+
return ix;
}
+static int
+initindex1(Index *ix)
+{
+ u32int buckets;
+
+ switch(ix->version){
+ case IndexVersion1:
+ ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
+ buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
+ if(buckets != ix->buckets){
+ seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
+ return -1;
+ }
+ break;
+
+ case IndexVersion2:
+ buckets = ix->buckets - ix->bitblocks;
+ if(ix->buckets < ix->bitblocks || (buckets&(buckets-1)))
+ seterr(ECorrupt, "bucket count not a power of two in %s", ix->name);
+ ix->maxdepth = u64log2(buckets);
+ ix->bitkeylog = u64log2(ix->blocksize*8);
+ ix->bitkeymask = (1<<ix->bitkeylog)-1;
+ break;
+ }
+ return 0;
+}
+
int
wbindex(Index *ix)
{
}
/*
- * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' sections arenas
+ * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
* version, blocksize: u32int
* name: max. ANameSize string
* sections, arenas: AMap
outputindex(Fmt *f, Index *ix)
{
if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
+ || (ix->version==IndexVersion2 && fmtprint(f, "%ud\n", ix->bitblocks) < 0)
|| outputamap(f, ix->smap, ix->nsects) < 0
|| outputamap(f, ix->amap, ix->narenas) < 0)
return -1;
return -1;
}
ix->version = v;
- if(ix->version != IndexVersion){
+ if(ix->version != IndexVersion1 && ix->version != IndexVersion2){
seterr(ECorrupt, "bad version number in %s", f->name);
return -1;
}
* block size
*/
if(ifileu32int(f, &v) < 0){
- seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
+ seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
return -1;
}
ix->blocksize = v;
+ if(ix->version == IndexVersion2){
+ /*
+ * free bitmap size
+ */
+ if(ifileu32int(f, &v) < 0){
+ seterr(ECorrupt, "syntax error: bad bitmap size in %s", f->name);
+ return -1;
+ }
+ ix->bitblocks = v;
+ }
+
if(parseamap(f, &amn) < 0)
return -1;
ix->nsects = amn.n;
Index *ix;
AMap *smap;
u64int nb;
- u32int div, ub, xb, start, stop, blocksize, tabsize;
- int i, j;
+ u32int div, ub, xb, fb, start, stop, blocksize, tabsize;
+ int i, j, version;
+ version = IndexVersion2;
+
if(n < 1){
seterr(EOk, "creating index with no index sections");
return nil;
seterr(EBug, "index too large");
return nil;
}
- div = (((u64int)1 << 32) + nb - 1) / nb;
- ub = (((u64int)1 << 32) - 1) / div + 1;
- if(div < 100){
- seterr(EBug, "index divisor too coarse");
- return nil;
+
+ div = 0;
+ fb = 0;
+ if(version == IndexVersion1){
+ div = (((u64int)1 << 32) + nb - 1) / nb;
+ ub = (((u64int)1 << 32) - 1) / div + 1;
+ if(div < 100){
+ seterr(EBug, "index divisor too coarse");
+ return nil;
+ }
+ }else{
+ fb = (nb+blocksize*8-1)/(blocksize*8);
+ for(ub=1; ub<=((nb-fb)>>1); ub<<=1)
+ ;
+ ub += fb;
}
if(ub > nb){
seterr(EBug, "index initialization math wrong");
return nil;
}
+ xb = nb - ub;
/*
* initialize each of the index sections
seterr(EOk, "can't create new index: out of memory");
return nil;
}
- xb = nb - ub;
start = 0;
for(i = 0; i < n; i++){
stop = start + sects[i]->blocks - xb / n;
free(smap);
return nil;
}
- ix->version = IndexVersion;
+ ix->version = version;
namecp(ix->name, name);
ix->sects = sects;
ix->smap = smap;
ix->nsects = n;
ix->blocksize = blocksize;
- ix->div = div;
ix->buckets = ub;
ix->tabsize = tabsize;
+ ix->div = div;
+ ix->bitblocks = fb;
+
+ if(initindex1(ix) < 0){
+ free(smap);
+ return nil;
+ }
+
return ix;
}
}
/*
- * initialize the computed paramaters for an index
+ * initialize the computed parameters for an index
*/
static ISect*
initisect1(ISect *is)
}
/*
- * convert an arena index to an relative address address
+ * convert an arena index to an relative arena address
*/
Arena*
amapitoa(Index *ix, u64int a, u64int *aa)
u32int buck;
int h, ok;
- buck = hashbits(score, 32) / ix->div;
ok = -1;
qlock(&stats.lock);
stats.indexreads++;
qunlock(&stats.lock);
- is = findibucket(ix, buck, &buck);
- if(is == nil)
- return -1;
- b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
+ b = loadibucket(ix, score, &is, &buck, &ib);
if(b == nil)
return -1;
- unpackibucket(&ib, b->data);
if(okibucket(&ib, is) < 0)
goto out;
u32int buck;
int h, ok;
- buck = hashbits(ie->score, 32) / ix->div;
ok = 0;
qlock(&stats.lock);
stats.indexwreads++;
qunlock(&stats.lock);
- is = findibucket(ix, buck, &buck);
- b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
+ b = loadibucket(ix, ie->score, &is, &buck, &ib);
if(b == nil)
return -1;
- unpackibucket(&ib, b->data);
if(okibucket(&ib, is) < 0)
goto out;
qunlock(&stats.lock);
packibucket(ib, b->data);
// return writepart(is->part, is->blockbase + ((u64int)buck << is->blocklog), b->data, is->blocksize);
- return 0;
-}
-
-/*
- * find the number of the index section holding score
- */
-int
-indexsect(Index *ix, u8int *score)
-{
- u32int buck;
- int r, l, m;
-
- buck = hashbits(score, 32) / ix->div;
- l = 1;
- r = ix->nsects - 1;
- while(l <= r){
- m = (r + l) >> 1;
- if(ix->sects[m]->start <= buck)
- l = m + 1;
- else
- r = m - 1;
- }
- return l - 1;
-}
-
-/*
- * find the index section which holds bucket #buck.
- */
-static ISect*
-findisect(Index *ix, u32int buck, u32int *ibuck)
-{
- ISect *is;
- int r, l, m;
-
- l = 1;
- r = ix->nsects - 1;
- while(l <= r){
- m = (r + l) >> 1;
- if(ix->sects[m]->start <= buck)
- l = m + 1;
- else
- r = m - 1;
- }
- is = ix->sects[l - 1];
- if(is->start <= buck && is->stop > buck){
- *ibuck = buck - is->start;
- return is;
- }
- seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
- return nil;
-}
-
-static DBlock*
-loadisectblock(Index *ix, u32int buck, int read)
-{
- ISect *is;
-
- if((is = findisect(ix, buck, &buck)) == nil)
- return nil;
- return getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), read);
-}
-
-/*
- * find the index section which holds the logical bucket #buck
- */
-static DBlock*
-loadibucket(Index *ix, u32int buck, IBucket *ib)
-{
- int d, i, times;
- u32int ino;
- DBlock *b;
- u32int bbuck;
- IBucket eib;
-
- times = 0;
-
-top:
- if(times++ > 2*ix->maxdepth){
- seterr(EAdmin, "bucket bitmap tree never converges with buckets");
- return nil;
- }
-
- bbuck = -1;
- b = nil;
-
- /*
- * consider the bits of buck, one at a time, to make the bucket number.
- */
-
- /*
- * walk down the bucket tree using the bitmap, which is used so
- * often it's almost certain to be in cache.
- */
- ino = 0;
- for(d=0; d<ix->maxdepth; d++){
- /* fetch the bitmap that says whether ino has been split */
- if(bbuck != (ino>>ix->bitlog)){
- if(b)
- putdblock(b);
- bbuck = (ino>>ix->bitlog);
- if((b = loadisectblock(ix, bbuck, 1)) == nil)
- return nil;
- }
- /* has it been split yet? */
- if((((u32int*)b->data)[(ino&(ix->bitmask))>>5] & (1<<(ino&31))) == 0){
- /* no. we're done */
- break;
- }
- }
- putdblock(b);
-
- /*
- * continue walking down (or up!) the bucket tree, which may not
- * be completely in sync with the bitmap. we could continue the loop
- * here, but it's easiest just to start over once we correct the bitmap.
- * corrections should only happen when things get out of sync because
- * a crash keeps some updates from making it to disk, so it's not too
- * frequent. we should converge after 2x the max depth, at the very worst
- * (up and back down the tree).
- */
- if((b = loadisectblock(ix, ix->bitbuckets+bucketno(buck, d), 1)) == nil)
- return nil;
- unpackibucket(&eib, b->data);
- if(eib.depth > d){
- /* the bitmap thought this block hadn't split */
- putdblock(b);
- if(markblocksplit(buck, d) < 0)
- return nil;
- goto top;
- }
- if(eib.depth < d){
- /* the bitmap thought this block had split */
- putdblock(b);
- if(markblockunsplit(ix, buck, d) < 0)
- return nil;
- goto top;
- }
- *ib = eib;
- return b;
-}
-
-static int
-markblocksplit(Index *ix, u32int buck, int d)
-{
- u32int ino;
-
- ino = bucketno(buck, d);
- if((b = loadisectblock(ix, ino>>ix->bitlog, 1)) == nil)
- return -1;
- dirtydblock(b, DirtyIndex);
- (((u32int*)b->data)[(ino&(ix->bitmask))>>5] |= (1<<(ino&31));
- putdblock(b);
return 0;
}
static int
-markblockunsplit(Index *ix, u32int buck, int d)
-{
- /*
- * Let's
- u32int ino;
-
- ino = bucketno(buck, d);
-
-}
-
-static int
okibucket(IBucket *ib, ISect *is)
{
- if(ib->n <= is->buckmax && (ib->next == 0 || ib->next >= is->start && ib->next < is->stop))
+ if(ib->n <= is->buckmax)
return 0;
- seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, next=%lud range=[%lud,%lud)",
- ib->n, is->buckmax, ib->next, is->start, is->stop);
+ seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, depth=%lud range=[%lud,%lud)",
+ ib->n, is->buckmax, ib->depth, is->start, is->stop);
return -1;
}
}
return -1;
}
+
+/*
+ * find the number of the index section holding bucket #buck
+ */
+static int
+indexsect0(Index *ix, u32int buck)
+{
+ int r, l, m;
+
+ l = 1;
+ r = ix->nsects - 1;
+ while(l <= r){
+ m = (r + l) >> 1;
+ if(ix->sects[m]->start <= buck)
+ l = m + 1;
+ else
+ r = m - 1;
+ }
+ return l - 1;
+}
+
+/*
+ * load the index block at bucket #buck
+ */
+static DBlock*
+loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ ISect *is;
+ DBlock *b;
+
+ is = ix->sects[indexsect0(ix, buck)];
+ if(buck < is->start || is->stop <= buck){
+ seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
+ return nil;
+ }
+
+ buck -= is->start;
+ if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1)) == nil)
+ return nil;
+
+ *pis = is;
+ *pbuck = buck;
+ unpackibucket(ib, b->data);
+ return b;
+}
+
+/*
+ * find the number of the index section holding score
+ */
+static int
+indexsect1(Index *ix, u8int *score)
+{
+ return indexsect0(ix, hashbits(score, 32) / ix->div);
+}
+
+/*
+ * load the index block responsible for score.
+ */
+static DBlock*
+loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib);
+}
+
+static u32int
+keytobuck(Index *ix, u32int key, int d)
+{
+ /* clear all but top d bits */
+ if(d != 32)
+ key &= ~((1<<(32-d))-1);
+
+ /* truncate to maxdepth bits */
+ if(ix->maxdepth != 32)
+ key >>= 32 - ix->maxdepth;
+
+ return ix->bitblocks + key;
+}
+
+/*
+ * to check whether .xxx has split, check whether .xxx1 is in use.
+ * it might be worth caching the block for future lookups, but for now
+ * let's assume the dcache is good enough.
+ */
+static int
+bitmapop(Index *ix, u32int key, int d, int set)
+{
+ DBlock *b;
+ ISect *is;
+ IBucket ib;
+ u32int buck;
+ int inuse;
+
+ if(d >= ix->maxdepth)
+ return 0;
+
+ /* construct .xxx1 in bucket number format */
+ key = keytobuck(ix, key, d) | (1<<(ix->maxdepth-d-1));
+
+ /* check whether key (now the bucket number for .xxx1) is in use */
+
+ if((b = loadibucket0(ix, key >> ix->bitkeylog, &is, &buck, &ib)) == nil){
+ seterr(ECorrupt, "cannot load in-use bitmap block");
+ return -1;
+ }
+ inuse = ((u32int*)b->data)[(key & ix->bitkeymask)>>5] & (1<<(key&31));
+ if(set && !inuse){
+ dirtydblock(b, DirtyIndexBitmap);
+ ((u32int*)b->data)[(key & ix->bitkeymask)>>5] |= (1<<(key&31));
+ }
+ putdblock(b);
+ return inuse;
+}
+
+static int
+issplit(Index *ix, u32int key, int d)
+{
+ return bitmapop(ix, key, d, 0);
+}
+
+static int
+marksplit(Index *ix, u32int key, int d)
+{
+ return bitmapop(ix, key, d, 1);
+}
+
+/*
+ * find the number of the index section holding score.
+ * it's not terrible to be wrong once in a while, so we just
+ * do what the bitmap tells us and don't worry about the
+ * bitmap being out of date.
+ */
+static int
+indexsect2(Index *ix, u8int *score)
+{
+ u32int key;
+ int d, is;
+
+ key = hashbits(score, 32);
+ for(d=0; d<=ix->maxdepth; d++){
+ is = issplit(ix, key, d);
+ if(is == -1)
+ return 0; /* must return something valid! */
+ if(!is)
+ break;
+ }
+
+ if(d > ix->maxdepth){
+ seterr(EBug, "index bitmap inconsistent with maxdepth");
+ return 0; /* must return something valid! */
+ }
+
+ return indexsect0(ix, keytobuck(ix, key, d));
+}
+
+/*
+ * load the index block responsible for score.
+ * (unlike indexsect2, must be correct always.)
+ */
+static DBlock*
+loadibucket2(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ u32int key;
+ int d, try, is;
+ DBlock *b;
+
+ for(try=0; try<32; try++){
+ key = hashbits(score, 32);
+ for(d=0; d<=ix->maxdepth; d++){
+ is = issplit(ix, key, d);
+ if(is == -1)
+ return nil;
+ if(!is)
+ break;
+ }
+ if(d > ix->maxdepth){
+ seterr(EBug, "index bitmap inconsistent with maxdepth");
+ return nil;
+ }
+
+ if((b = loadibucket0(ix, keytobuck(ix, key, d), pis, pbuck, ib)) == nil)
+ return nil;
+
+ if(ib->depth == d)
+ return b;
+
+ if(ib->depth < d){
+ seterr(EBug, "index block has smaller depth than expected -- cannot happen");
+ putdblock(b);
+ return nil;
+ }
+
+ /*
+ * ib->depth > d, meaning the bitmap was out of date.
+ * fix the bitmap and try again.
+ */
+ putdblock(b);
+ if(marksplit(ix, key, d) < 0)
+ return nil;
+ }
+ seterr(EBug, "loadibucket2 failed to sync bitmap with disk!");
+ return nil;
+}
+
+int
+indexsect(Index *ix, u8int *score)
+{
+ if(ix->version == IndexVersion1)
+ return indexsect1(ix, score);
+ return indexsect2(ix, score);
+}
+
+DBlock*
+loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ if(ix->version == IndexVersion1)
+ return loadibucket1(ix, score, pis, pbuck, ib);
+ return loadibucket2(ix, score, pis, pbuck, ib);
+}
+
+
blob - c449f022e692dcce83e0d5eb40114b91cd381093
blob + 746a89cab3216029d45c05f0647c975d3e157de4
--- src/cmd/venti/lump.c
+++ src/cmd/venti/lump.c
return ok;
}
-print("writelump %08x\n", mainindex->arenas[0]);
if(queuewrites)
return queuewrite(u, p, creator);
blob - 2ef1ed21028af321c58f63635ba8a3a44ac14094
blob + 99d8c98f494d38bd7ba12c2dd3a13c38178f24cd
--- src/cmd/venti/mkfile
+++ src/cmd/venti/mkfile
fmtarenas\
fmtisect\
fmtindex\
- buildindex\
+# buildindex\
checkarenas\
- checkindex\
+# checkindex\
clumpstats\
findscore\
rdarena\
blob - 55cac6bbe32c7e7400c331894d2ac23b8c6f18b3
blob + 8a162958233602283af5460d0d4aeeb5f263f2e5
--- src/cmd/venti/part.c
+++ src/cmd/venti/part.c
#include "dat.h"
#include "fns.h"
+#define trace 0
+
u32int maxblocksize;
int readonly;
seterr(ECorrupt, "out of bounds write to partition='%s'", part->name);
return -1;
}
- print("write %s %lud at %llud\n", part->name, n, addr);
+if(trace) print("write %s %lud at %llud\n", part->name, n, addr);
for(nn = 0; nn < n; nn += m){
mm = n - nn;
if(mm > MaxIo)
seterr(ECorrupt, "out of bounds read from partition='%s': addr=%lld n=%d size=%lld", part->name, addr, n, part->size);
return -1;
}
- print("read %s %lud at %llud\n", part->name, n, addr);
+if(trace) print("read %s %lud at %llud\n", part->name, n, addr);
for(nn = 0; nn < n; nn += m){
mm = n - nn;
if(mm > MaxIo)
blob - fc0a75b9225ff2307651588b02590fb3357666e4
blob + 4c533cc626c782d719e4792aff43b28e218eee7d
--- src/cmd/venti/venti.c
+++ src/cmd/venti/venti.c
while((r = vtgetreq(s)) != nil){
r->rx.type = r->tx.type+1;
- print("req (arenas[0]=%p sects[0]=%p) %F\n",
- mainindex->arenas[0], mainindex->sects[0], &r->tx);
+ // print("req (arenas[0]=%p sects[0]=%p) %F\n",
+ // mainindex->arenas[0], mainindex->sects[0], &r->tx);
switch(r->tx.type){
default:
vtrerror(r, "unknown request");