Commit Diff


commit - 686bd37d9d8db5e3b969a3aa2d5b455e0976b262
commit + 28b49df3542a635cca788f3de213385f3fcb6334
blob - 8cfe3e5d886798f6a924dc5a286f35a63306be63
blob + 0121d4512d7bca8a694f226c10db52baca86068e
--- src/cmd/venti/srv/arena.c
+++ src/cmd/venti/srv/arena.c
@@ -20,6 +20,7 @@ static void	sumproc(void *);
 static QLock	sumlock;
 static Rendez	sumwait;
 static ASum	*sumq;
+static ASum	*sumqtail;
 static uchar zero[8192];
 
 int	arenasumsleeptime;
@@ -257,7 +258,6 @@ writearena(Arena *arena, u64int aa, u8int *clbuf, u32i
 		if(m > n - nn)
 			m = n - nn;
 		memmove(&b->data[off], &clbuf[nn], m);
-		/* ok = writepart(arena->part, a, b->data, blocksize); */
 		ok = 0;
 		putdblock(b);
 		if(ok < 0){
@@ -329,7 +329,6 @@ writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64i
 		if(m > n - nn)
 			m = n - nn;
 		memmove(&b->data[off], &clbuf[nn], m);
-	/*	ok = writepart(arena->part, a, b->data, blocksize); */
 		ok = 0;
 		putdblock(b);
 		if(ok < 0){
@@ -356,6 +355,7 @@ writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64i
 		arena->ctime = arena->wtime;
 
 	writeclumpinfo(arena, clump, &c->info);
+	wbarena(arena);
 
 	/* set up for call to setdcachestate */
 	as.arena = arena;
@@ -410,6 +410,9 @@ setatailstate(AState *as)
 
 	trace(0, "setatailstate %s 0x%llux clumps %d", as->arena->name, as->aa, as->stats.clumps);
 
+	/*
+	 * Look up as->arena to find index.
+	 */
 	ix = mainindex;
 	for(i=0; i<ix->narenas; i++)
 		if(ix->arenas[i] == as->arena)
@@ -419,6 +422,9 @@ setatailstate(AState *as)
 		return;
 	}
 
+	/*
+ 	 * Walk backward until we find the last time these were in sync.
+	 */
 	for(j=i; --j>=0; ){
 		a = ix->arenas[j];
 		if(atailcmp(&a->diskstats, &a->memstats) == 0)
@@ -464,8 +470,12 @@ backsumarena(Arena *arena)
 		return;
 	qlock(&sumlock);
 	as->arena = arena;
-	as->next = sumq;
-	sumq = as;
+	as->next = nil;
+	if(sumq)
+		sumqtail->next = as;
+	else
+		sumq = as;
+	sumqtail = as;
 	rwakeup(&sumwait);
 	qunlock(&sumlock);
 }
@@ -499,6 +509,7 @@ sumarena(Arena *arena)
 	DigestState s;
 	u64int a, e;
 	u32int bs;
+	int t;
 	u8int score[VtScoreSize];
 
 	bs = MaxIoSize;
@@ -512,7 +523,12 @@ sumarena(Arena *arena)
 	b = alloczblock(bs, 0, arena->part->blocksize);
 	e = arena->base + arena->size;
 	for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){
-		sleep(arenasumsleeptime);
+		disksched();
+		while((t=arenasumsleeptime) == SleepForever){
+			sleep(1000);
+			disksched();
+		}
+		sleep(t);
 		if(a + bs > e)
 			bs = arena->blocksize;
 		if(readpart(arena->part, a, b->data, bs) < 0)
@@ -595,7 +611,7 @@ wbarenahead(Arena *arena)
 	b = alloczblock(arena->blocksize, 1, arena->part->blocksize);
 	if(b == nil){
 		logerr(EAdmin, "can't write arena header: %r");
-/*/ZZZ add error message? */
+/* ZZZ add error message? */
 		return -1;
 	}
 	/*
@@ -681,18 +697,22 @@ okarena(Arena *arena)
 	ok = 0;
 	dsize = arenadirsize(arena, arena->diskstats.clumps);
 	if(arena->diskstats.used + dsize > arena->size){
-		seterr(ECorrupt, "arena used > size");
+		seterr(ECorrupt, "arena %s used > size", arena->name);
 		ok = -1;
 	}
 
 	if(arena->diskstats.cclumps > arena->diskstats.clumps)
-		logerr(ECorrupt, "arena has more compressed clumps than total clumps");
+		logerr(ECorrupt, "arena %s has more compressed clumps than total clumps", arena->name);
 
+	/*
+	 * This need not be true if some of the disk is corrupted.
+	 *
 	if(arena->diskstats.uncsize + arena->diskstats.clumps * ClumpSize + arena->blocksize < arena->diskstats.used)
-		logerr(ECorrupt, "arena uncompressed size inconsistent with used space %lld %d %lld", arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used);
+		logerr(ECorrupt, "arena %s uncompressed size inconsistent with used space %lld %d %lld", arena->name, arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used);
+	 */
 
 	if(arena->ctime > arena->wtime)
-		logerr(ECorrupt, "arena creation time after last write time");
+		logerr(ECorrupt, "arena %s creation time after last write time", arena->name);
 
 	return ok;
 }
blob - 0bffd3cfcba7fcc89901c0ab407e20dc9a2b191b
blob + 05cb396b7736f34c911e120e25a8609b167dcb17
--- src/cmd/venti/srv/arenas.c
+++ src/cmd/venti/srv/arenas.c
@@ -214,7 +214,7 @@ wbarenapart(ArenaPart *ap)
 		return -1;
 	b = alloczblock(HeadSize, 1, 0);
 	if(b == nil)
-/*ZZZ set error message? */
+/* ZZZ set error message? */
 		return -1;
 
 	if(packarenapart(ap, b->data) < 0){
@@ -337,8 +337,8 @@ wbarenamap(AMap *am, int n, Part *part, u64int base, u
 /*
  * amap: n '\n' amapelem * n
  * n: u32int
- * amapelem: name '\t' astart '\t' asize '\n'
- * astart, asize: u64int
+ * amapelem: name '\t' astart '\t' astop '\n'
+ * astart, astop: u64int
  */
 int
 parseamap(IFile *f, AMapN *amn)
blob - e54e3885f34362a53b2e55af5daa0b5c70738f3b
blob + 7ea5f640e011c8dbd951610c2ce2325d247967d3
--- src/cmd/venti/srv/bloom.c
+++ src/cmd/venti/srv/bloom.c
@@ -7,6 +7,8 @@
 #include "dat.h"
 #include "fns.h"
 
+int ignorebloom;
+
 int
 bloominit(Bloom *b, vlong vsize, u8int *data)
 {
@@ -24,6 +26,7 @@ bloominit(Bloom *b, vlong vsize, u8int *data)
 		if(unpackbloomhead(b, data) < 0)
 			return -1;
 	
+fprint(2, "bloom size %lud nhash %d\n", b->size, b->nhash);
 	b->mask = b->size-1;
 	b->data = data;
 	return 0;
@@ -38,11 +41,7 @@ wbbloomhead(Bloom *b)
 Bloom*
 readbloom(Part *p)
 {
-	int i, n;
-	uint ones;
 	uchar buf[512];
-	uchar *data;
-	u32int *a;
 	Bloom *b;
 	
 	b = vtmallocz(sizeof *b);
@@ -52,14 +51,40 @@ readbloom(Part *p)
 		vtfree(b);
 		return nil;
 	}
+	b->part = p;
+	return b;
+}
+
+int
+resetbloom(Bloom *b)
+{
+	uchar *data;
+	
 	data = vtmallocz(b->size);
-	if(readpart(p, 0, data, b->size) < 0){
+fprint(2, "bloom data %lud\n", b->size);
+	b->data = data;
+	if(b->size == MaxBloomSize)	/* 2^32 overflows ulong */
+		addstat(StatBloomBits, b->size*8-1);
+	else
+		addstat(StatBloomBits, b->size*8);
+	return 0;
+}
+
+int
+loadbloom(Bloom *b)
+{
+	int i, n;
+	uint ones;
+	uchar *data;
+	u32int *a;
+	
+	data = vtmallocz(b->size);
+	if(readpart(b->part, 0, data, b->size) < 0){
 		vtfree(b);
 		vtfree(data);
-		return nil;
+		return -1;
 	}
 	b->data = data;
-	b->part = p;
 
 	a = (u32int*)b->data;
 	n = b->size/4;
@@ -73,7 +98,7 @@ readbloom(Part *p)
 	else
 		addstat(StatBloomBits, b->size*8);
 		
-	return b;
+	return 0;
 }
 
 int
@@ -101,6 +126,8 @@ gethashes(u8int *score, ulong *h)
 		a ^= *(u32int*)(score+i);
 		b ^= *(u32int*)(score+i+4);
 	}
+	if(i+4 <= VtScoreSize)	/* 20 is not 4-aligned */
+		a ^= *(u32int*)(score+i);
 	for(i=0; i<BloomMaxHash; i++, a+=b)
 		h[i] = a < BloomHeadSize*8 ? BloomHeadSize*8 : a;
 }
@@ -154,14 +181,17 @@ inbloomfilter(Bloom *b, u8int *score)
 	int r;
 	uint ms;
 
-	if(b == nil)
+	if(b == nil || b->data == nil)
 		return 1;
 
+	if(ignorebloom)
+		return 1;
+	
 	ms = msec();
 	rlock(&b->lk);
 	r = _inbloomfilter(b, score);
 	runlock(&b->lk);
-	ms = msec() - ms;
+	ms = msec() - ms;	/* elapsed lookup time: now minus the start stamp */
 	addstat2(StatBloomLookup, 1, StatBloomLookupTime, ms);
 	if(r)
 		addstat(StatBloomMiss, 1);
@@ -173,7 +203,7 @@ inbloomfilter(Bloom *b, u8int *score)
 void
 markbloomfilter(Bloom *b, u8int *score)
 {
-	if(b == nil)
+	if(b == nil || b->data == nil)
 		return;
 
 	rlock(&b->lk);
@@ -186,14 +216,18 @@ markbloomfilter(Bloom *b, u8int *score)
 static void
 bloomwriteproc(void *v)
 {
+	int ret;
 	Bloom *b;
-	
+
+	threadsetname("bloomwriteproc");	
 	b = v;
 	for(;;){
 		recv(b->writechan, 0);
-		if(writebloom(b) < 0)
+		if((ret=writebloom(b)) < 0)
 			fprint(2, "oops! writing bloom: %r\n");
-		send(b->writedonechan, 0);
+		else
+			ret = 0;
+		sendul(b->writedonechan, ret);
 	}
 }
 
blob - 225bdc43cdf2ec7b8f85a442d8a00f51818f2518
blob + 73f8056beabd693a030efc722aa0bca94fab61b1
--- src/cmd/venti/srv/buildbuck.c
+++ src/cmd/venti/srv/buildbuck.c
@@ -21,7 +21,7 @@ initiestream(Part *part, u64int off, u64int clumps, u3
 {
 	IEStream *ies;
 
-/*ZZZ out of memory? */
+/* out of memory? */
 	ies = MKZ(IEStream);
 	ies->buf = MKN(u8int, size);
 	ies->epos = ies->buf;
@@ -61,7 +61,7 @@ peekientry(IEStream *ies)
 		nn -= n;
 		if(nn == 0)
 			return nil;
-/*fprint(2, "peek %d from %llud into %p\n", nn, ies->off, ies->epos); */
+//fprint(2, "peek %d from %llud into %p\n", nn, ies->off, ies->epos);
 		if(readpart(ies->part, ies->off, ies->epos, nn) < 0){
 			seterr(EOk, "can't read sorted index entries: %r");
 			return nil;
@@ -101,7 +101,7 @@ buildbucket(Index *ix, IEStream *ies, IBucket *ib, uin
 		b = peekientry(ies);
 		if(b == nil)
 			return TWID32;
-/*fprint(2, "b=%p ies->n=%lld ib.n=%d buck=%d score=%V\n", b, ies->n, ib->n, iebuck(ix, b, ib, ies), b); */
+/* fprint(2, "b=%p ies->n=%lld ib.n=%d buck=%d score=%V\n", b, ies->n, ib->n, iebuck(ix, b, ib, ies), b); */
 		if(ib->n == 0)
 			buck = iebuck(ix, b, ib, ies);
 		else{
blob - e70a830d340e0ee2ccd29b04875e7b2ac665d4b1
blob + b6866daf3072ebbd69681b24ccf30326bdd90d03
--- src/cmd/venti/srv/buildindex.c
+++ src/cmd/venti/srv/buildindex.c
@@ -1,164 +1,936 @@
 /*
- * Rebuild the Venti index from scratch.
+ * Rebuild the index from scratch, in place.
  */
-
 #include "stdinc.h"
 #include "dat.h"
 #include "fns.h"
 
-/*
- * Write a single bucket.  Could profit from a big buffer here
- * so that we can absorb sporadic runs of blocks into one write,
- * avoiding disk seeks.
- */
-static int
-writebucket(Index *ix, u32int buck, IBucket *ib, ZBlock *b)
+enum
 {
-	ISect *is;
+	MinBufSize = 64*1024,
+	MaxBufSize = 4*1024*1024,
+};
 
-	is = ix->sects[indexsect0(ix, buck)];
-	if(buck < is->start || buck >= is->stop){
-		seterr(EAdmin, "cannot find index section for bucket %lud\n", (ulong)buck);
-		return -1;
-	}
-	buck -= is->start;
+int		dumb;
+int		errors;
+char		**isect;
+int		nisect;
+int		bloom;
+int		zero;
 
-/*
-	qlock(&stats.lock);
-	stats.indexwrites++;
-	qunlock(&stats.lock);
-*/
-	packibucket(ib, b->data, is->bucketmagic);
-	return writepart(is->part, is->blockbase + ((u64int)buck << is->blocklog), b->data, is->blocksize);
-}
+u32int	isectmem;
+u64int	totalbuckets;
+u64int	totalclumps;
+Channel	*arenadonechan;
+Channel	*isectdonechan;
+Index	*ix;
 
-static int
-buildindex(Index *ix, Part *part, u64int off, u64int clumps, int zero)
-{
-	IEStream *ies;
-	IBucket ib, zib;
-	ZBlock *z, *b;
-	u32int next, buck;
-	int ok;
-	uint nbuck;
-	u64int found = 0;
+u64int	arenaentries;
+u64int	skipentries;
+u64int	indexentries;
 
-/*ZZZ make buffer size configurable */
-	b = alloczblock(ix->blocksize, 0, ix->blocksize);
-	z = alloczblock(ix->blocksize, 1, ix->blocksize);
-	ies = initiestream(part, off, clumps, 64*1024);
-	if(b == nil || z == nil || ies == nil){
-		ok = 0;
-		goto breakout;
-		return -1;
-	}
-	ok = 0;
-	next = 0;
-	memset(&ib, 0, sizeof ib);
-	ib.data = b->data + IBucketSize;
-	zib.data = z->data + IBucketSize;
-	zib.n = 0;
-	nbuck = 0;
-	for(;;){
-		buck = buildbucket(ix, ies, &ib, ix->blocksize-IBucketSize);
-		found += ib.n;
-		if(zero){
-			for(; next != buck; next++){
-				if(next == ix->buckets){
-					if(buck != TWID32){
-						fprint(2, "bucket out of range\n");
-						ok = -1;
-					}
-					goto breakout;
-				}
-				if(writebucket(ix, next, &zib, z) < 0){
-					fprint(2, "can't write zero bucket to buck=%d: %r", next);
-					ok = -1;
-				}
-			}
-		}
-		if(buck >= ix->buckets){
-			if(buck == TWID32)
-				break;
-			fprint(2, "bucket out of range\n");
-			ok = -1;
-			goto breakout;
-		}
-		if(writebucket(ix, buck, &ib, b) < 0){
-			fprint(2, "bad bucket found=%lld: %r\n", found);
-			ok = -1;
-		}
-		next = buck + 1;
-		if(++nbuck%10000 == 0)
-			fprint(2, "\t%,d buckets written...\n", nbuck);
-	}
-breakout:;
-	fprint(2, "wrote index with %lld entries\n", found);
-	freeiestream(ies);
-	freezblock(z);
-	freezblock(b);
-	return ok;
-}
+static int shouldprocess(ISect*);
+static void	isectproc(void*);
+static void	arenapartproc(void*);
 
 void
 usage(void)
 {
-	fprint(2, "usage: buildindex [-Z] [-B blockcachesize] config tmppart\n");
-	threadexitsall(0);
+	fprint(2, "usage: buildindex [-b] [-i isect]... [-M imem] venti.conf\n");
+	threadexitsall("usage");
 }
 
-Config conf;
-
 void
 threadmain(int argc, char *argv[])
 {
-	Part *part;
-	u64int clumps, base;
-	u32int bcmem;
-	int zero;
-
-	zero = 1;
-	bcmem = 0;
+	int fd, i, napart;
+	u32int bcmem, imem;
+	Config conf;
+	Part *p;
+	
 	ventifmtinstall();
+	imem = 256*1024*1024;
 	ARGBEGIN{
-	case 'B':
-		bcmem = unittoull(ARGF());
+	case 'b':
+		bloom = 1;
 		break;
-	case 'Z':
-		zero = 0;
+	case 'i':
+		isect = vtrealloc(isect, (nisect+1)*sizeof(isect[0]));
+		isect[nisect++] = EARGF(usage());
 		break;
+	case 'd':	/* debugging - make sure to run all 3 passes */
+		dumb = 1;
+		break;
+	case 'M':
+		imem = unittoull(EARGF(usage()));
+		break;
 	default:
 		usage();
 		break;
 	}ARGEND
-
-	if(argc != 2)
+	
+	if(argc != 1)
 		usage();
 
 	if(initventi(argv[0], &conf) < 0)
 		sysfatal("can't init venti: %r");
+	ix = mainindex;
+	if(nisect == 0 && ix->bloom)
+		bloom = 1;
+	if(bloom && ix->bloom && resetbloom(ix->bloom) < 0)
+		sysfatal("loadbloom: %r");
+	if(bloom && !ix->bloom)
+		sysfatal("-b specified but no bloom filter");
+	if(!bloom)
+		ix->bloom = nil;
+	isectmem = imem/ix->nsects;
 
-	if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16))
-		bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16);
+	/*
+	 * safety first - only need read access to arenas
+	 */
+	p = nil;
+	for(i=0; i<ix->narenas; i++){
+		if(ix->arenas[i]->part != p){
+			p = ix->arenas[i]->part;
+			if((fd = open(p->filename, OREAD)) < 0)
+				sysfatal("cannot reopen %s: %r", p->filename);
+			dup(fd, p->fd);
+			close(fd);
+		}
+	}
+	
+	/*
+	 * need a block for every arena
+	 */
+	bcmem = maxblocksize * (mainindex->narenas + 16);
 	if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
 	initdcache(bcmem);
+	
+	totalclumps = 0;
+	for(i=0; i<ix->narenas; i++)
+		totalclumps += ix->arenas[i]->diskstats.clumps;
+	
+	totalbuckets = 0;
+	for(i=0; i<ix->nsects; i++)
+		totalbuckets += ix->sects[i]->blocks;
+	fprint(2, "%,lld clumps, %,lld buckets\n", totalclumps, totalbuckets);
 
-	fprint(2, "building a new index %s using %s for temporary storage\n", mainindex->name, argv[1]);
+	/* start one proc, with its own entry channel, per index section being rebuilt */
+	fprint(2, "%T read index\n");
+	isectdonechan = chancreate(sizeof(void*), 0);
+	for(i=0; i<ix->nsects; i++)
+		if(shouldprocess(ix->sects[i])){
+			ix->sects[i]->writechan = chancreate(sizeof(IEntry), 0);
+			vtproc(isectproc, ix->sects[i]);
+		}
+	
+	for(i=0; i<nisect; i++)
+		if(isect[i])
+			fprint(2, "warning: did not find index section %s\n", isect[i]);
 
-	part = initpart(argv[1], ORDWR|ODIRECT);
-	if(part == nil)
-		sysfatal("can't initialize temporary partition: %r");
+	/* start arena procs */
+	p = nil;
+	napart = 0;
+	arenadonechan = chancreate(sizeof(void*), 0);
+	for(i=0; i<ix->narenas; i++){
+		if(ix->arenas[i]->part != p){
+			p = ix->arenas[i]->part;
+			vtproc(arenapartproc, p);
+			napart++;
+		}
+	}
 
-	clumps = sortrawientries(mainindex, part, &base, mainindex->bloom);
-	if(clumps == TWID64)
-		sysfatal("can't build sorted index: %r");
-	fprint(2, "found and sorted index entries for clumps=%lld at %lld\n", clumps, base);
+	/* wait for arena procs to finish */
+	for(i=0; i<napart; i++)
+		recvp(arenadonechan);
 
-	if(buildindex(mainindex, part, base, clumps, zero) < 0)
-		sysfatal("can't build new index: %r");
+	/* tell index procs to finish */
+	for(i=0; i<ix->nsects; i++)
+		if(ix->sects[i]->writechan)
+			send(ix->sects[i]->writechan, nil);
+
+	/* wait for index procs to finish */
+	for(i=0; i<ix->nsects; i++)
+		if(ix->sects[i]->writechan)
+			recvp(isectdonechan);
+
+	if(ix->bloom && writebloom(ix->bloom) < 0)
+		fprint(2, "writing bloom filter: %r\n");
+
+	fprint(2, "%T done arenaentries=%,lld indexed=%,lld (nskip=%,lld)\n", 
+		arenaentries, indexentries, skipentries);
+	threadexitsall(nil);
+}
+
+static int
+shouldprocess(ISect *is)
+{
+	int i;
 	
-	if(mainindex->bloom)
-		writebloom(mainindex->bloom);
+	if(nisect == 0)
+		return 1;
 
-	threadexitsall(0);
+	for(i=0; i<nisect; i++)
+		if(isect[i] && strcmp(isect[i], is->name) == 0){
+			isect[i] = nil;
+			return 1;
+		}
+	return 0;
+}
+
+/* Add n to the shared counter *a; one static lock serializes every caller. */
+static void
+add(u64int *a, u64int n)
+{
+	static Lock counterlock;
+	lock(&counterlock);
+	*a = *a + n;
+	unlock(&counterlock);
+}
+
+/*
+ * Scan every arena stored on partition v: build an IEntry for each clump
+ * directory entry and send it to its index section's writechan (if any).
+ * Corrupt-typed clumps are only counted.  Finally signal arenadonechan.
+ */
+enum
+{
+	ClumpChunks = 32*1024,	/* clump directory entries read per batch */
+};
+static void
+arenapartproc(void *v)
+{
+	int i, j, n, nskip, x;
+	u32int clump;
+	u64int addr, tot;
+	Arena *a;
+	ClumpInfo *ci, *cis;
+	IEntry ie;
+	Part *p;
+	
+	p = v;
+	threadsetname("arenaproc %s", p->name);
+
+	nskip = 0;
+	tot = 0;
+	cis = MKN(ClumpInfo, ClumpChunks);
+	for(i=0; i<ix->narenas; i++){
+		a = ix->arenas[i];
+		if(a->part != p)	/* arena lives on another partition */
+			continue;
+		if(a->memstats.clumps)
+			fprint(2, "%T arena %s: %d entries\n", 
+				a->name, a->memstats.clumps);
+		addr = ix->amap[i].start;	/* address given to this arena's first clump */
+		for(clump=0; clump<a->memstats.clumps; clump+=n){
+			n = ClumpChunks;
+			if(n > a->memstats.clumps - clump)
+				n = a->memstats.clumps - clump;
+			if(readclumpinfos(a, clump, cis, n) != n){
+				fprint(2, "%T arena %s: directory read: %r\n", a->name);
+				errors = 1;
+				break;	/* abandon this arena; the outer loop moves on to the next */
+			}
+			for(j=0; j<n; j++){
+				ci = &cis[j];
+				ie.ia.type = ci->type;
+				ie.ia.size = ci->uncsize;
+				ie.ia.addr = addr;
+				addr += ci->size + ClumpSize;	/* advance past this clump, corrupt or not */
+				ie.ia.blocks = (ci->size + ClumpSize + (1<<ABlockLog)-1) >> ABlockLog;	/* length rounded up to 1<<ABlockLog units */
+				scorecp(ie.score, ci->score);
+				if(ci->type == VtCorruptType)
+					nskip++;
+				else{
+					tot++;
+					x = indexsect(ix, ie.score);
+					assert(0 <= x && x < ix->nsects);
+					if(ix->sects[x]->writechan)	/* sections without a channel get no entries */
+						send(ix->sects[x]->writechan, &ie);
+					if(ix->bloom)
+						markbloomfilter(ix->bloom, ie.score);
+				}
+			}
+		}
+	}
+	add(&arenaentries, tot);
+	add(&skipentries, nskip);
+	sendp(arenadonechan, p);
+}
+
+/*
+ * Map a score to its bucket number counted from the start of is.
+ * A packed ientry works too, because the score is its first field.
+ */
+static u32int
+score2bucket(ISect *is, uchar *score)
+{
+	u32int b;
+	b = hashbits(score, 32)/ix->div;
+	assert(is->start <= b && b < is->stop);
+	b -= is->start;
+	return b;
+}
+
+/*
+ * Map a byte offset inside index section is to a relative bucket number.
+ */
+static u32int
+offset2bucket(ISect *is, u64int offset)
+{
+	u32int b;
+	u64int rel;
+	assert(is->blockbase <= offset);
+	rel = offset - is->blockbase;
+	b = rel/is->blocksize;
+	assert(b < is->stop-is->start);
+	return b;
+}
+
+/*
+ * Map a relative bucket number to its byte offset; b may equal the bucket count.
+ */
+static u64int
+bucket2offset(ISect *is, u32int b)
+{
+	assert(b <= is->stop-is->start);
+	return (u64int)b*is->blocksize + is->blockbase;
+}
+
+/* 
+ * IEntry buffers to hold initial round of spraying.
+ */
+typedef struct Buf Buf;
+struct Buf
+{
+	Part *part;			/* partition being written */
+	uchar *bp;		/* current block */
+	uchar *ep;		/* end of block */
+	uchar *wp;		/* write position in block */
+	u64int boffset;		/* start offset */
+	u64int woffset;		/* next write offset */
+	u64int eoffset;		/* end offset */
+	u32int nentry;		/* number of entries written */
+};
+
+/* Write buf's in-memory block to disk at woffset, then clear and rewind the block. */
+static void
+bflush(Buf *buf)
+{
+	u32int bufsize;
+	if(buf->woffset >= buf->eoffset)	/* the disk range reserved for this Buf is used up */
+		sysfatal("buf index chunk overflow - need bigger index");
+	bufsize = buf->ep - buf->bp;
+	if(writepart(buf->part, buf->woffset, buf->bp, bufsize) < 0){
+		fprint(2, "write %s: %r\n", buf->part->name);
+		errors = 1;	/* record the failure and keep going */
+	}
+	buf->woffset += bufsize;
+	memset(buf->bp, 0, bufsize);
+	buf->wp = buf->bp;
+}
+
+static void	/* append ie to buf's block, flushing the block first when it is full */
+bwrite(Buf *buf, IEntry *ie)
+{
+	if(buf->ep - buf->wp < IEntrySize)
+		bflush(buf);
+	assert(buf->bp <= buf->wp && buf->wp < buf->ep);
+	packientry(ie, buf->wp);
+	buf->wp = buf->wp + IEntrySize;
+	assert(buf->bp <= buf->wp && buf->wp <= buf->ep);
+	buf->nentry += 1;
+}
+
+/*
+ * Minibuffer.  In-memory data structure holds our place
+ * in the buffer but has no block data.  We are writing and
+ * reading the minibuffers at the same time.  (Careful!)
+ */
+typedef struct Minibuf Minibuf;
+struct Minibuf
+{
+	u64int boffset;		/* start offset */
+	u64int roffset;		/* read offset */
+	u64int woffset;		/* write offset */
+	u64int eoffset;		/* end offset */
+	u32int nentry;		/* # entries left to read */
+	u32int nwentry;	/* # entries written */
+};
+
+/*
+ * Index entry pool.  Used when trying to shuffle around 
+ * the entries in a big buffer into the corresponding M minibuffers.
+ * Sized to hold M*EntriesPerBlock entries, so that there will always
+ * either be room in the pool for another block worth of entries
+ * or there will be an entire block worth of sorted entries to 
+ * write out.
+ */
+typedef struct IEntryLink IEntryLink;
+typedef struct IPool IPool;
+
+struct IEntryLink
+{
+	uchar ie[IEntrySize];		/* raw IEntry */
+	IEntryLink *next;		/* next in chain */
+};
+
+struct IPool
+{
+	ISect *isect;
+	u32int buck0;			/* first bucket in pool */
+	u32int mbufbuckets;	/* buckets per minibuf */
+	IEntryLink *entry;		/* all IEntryLinks */
+	u32int nentry;			/* # of IEntryLinks */
+	IEntryLink *free;		/* free list */
+	u32int nfree;			/* # on free list */
+	Minibuf *mbuf;			/* all minibufs */
+	u32int nmbuf;			/* # of minibufs */
+	IEntryLink **mlist;		/* lists for each minibuf */
+	u32int *mcount;		/* # on each mlist[i] */
+	u32int bufsize;			/* block buffer size */
+	uchar *rbuf;			/* read buffer */
+	uchar *wbuf;			/* write buffer */
+	u32int epbuf;			/* entries per block buffer */
+};
+
+/*
+static int
+countsokay(IPool *p)
+{
+	int i;
+	u64int n;
+	
+	n = 0;
+	for(i=0; i<p->nmbuf; i++)
+		n += p->mcount[i];
+	n += p->nfree;
+	if(n != p->nentry){
+		print("free %ud:", p->nfree);
+		for(i=0; i<p->nmbuf; i++)
+			print(" %ud", p->mcount[i]);
+		print(" = %lld nentry: %ud\n", n, p->nentry);
+	}
+	return n == p->nentry;
+}
+*/
+
+static IPool*
+mkipool(ISect *isect, Minibuf *mbuf, u32int nmbuf, 
+	u32int mbufbuckets, u32int bufsize)
+{
+	u32int i, nentry;
+	uchar *data;
+	IPool *p;
+	IEntryLink *l;
+	/* one chunk: header, nentry links, nmbuf list heads and counts, 2 buffers + alignment pad */
+	nentry = (nmbuf+1)*bufsize / IEntrySize;
+	p = ezmalloc(sizeof(IPool)
+		+nentry*sizeof(IEntryLink)
+		+nmbuf*sizeof(IEntryLink*)
+		+nmbuf*sizeof(u32int)
+		+3*bufsize);
+	/* carve the pieces out in the order sized above; p->free/p->nfree presumably start zeroed by ezmalloc */
+	p->isect = isect;
+	p->mbufbuckets = mbufbuckets;
+	p->bufsize = bufsize;
+	p->entry = (IEntryLink*)(p+1);
+	p->nentry = nentry;
+	p->mlist = (IEntryLink**)(p->entry+nentry);
+	p->mcount = (u32int*)(p->mlist+nmbuf);
+	p->nmbuf = nmbuf;
+	p->mbuf = mbuf;
+	data = (uchar*)(p->mcount+nmbuf);
+	data += bufsize - (u32int)data%bufsize;
+	p->rbuf = data;
+	p->wbuf = data+bufsize;
+	p->epbuf = bufsize/IEntrySize;
+	/* every link starts out on the free list */
+	for(i=0; i<p->nentry; i++){
+		l = &p->entry[i];
+		l->next = p->free;
+		p->free = l;
+		p->nfree++;
+	}
+	return p;
 }
+
+/*
+ * Copy packed entry ie into a free link and push it on the list of the
+ * minibuf covering its bucket.  Caller must know there is room.
+ */
+static void
+ipoolinsert(IPool *p, uchar *ie)
+{
+	u32int buck, x;
+	IEntryLink *l;
+
+	assert(p->free != nil);
+
+	buck = score2bucket(p->isect, ie);
+	x = (buck-p->buck0) / p->mbufbuckets;	/* index of the minibuf that holds buck */
+	if(x >= p->nmbuf){	/* print the values before the assert below fails */
+		fprint(2, "buck=%ud mbufbucket=%ud x=%ud\n",
+			buck, p->mbufbuckets, x);
+	}
+	assert(x < p->nmbuf);
+
+	l = p->free;	/* take a link off the free list */
+	p->free = l->next;
+	p->nfree--;
+	memmove(l->ie, ie, IEntrySize);
+	l->next = p->mlist[x];	/* push it on the front of list x */
+	p->mlist[x] = l;
+	p->mcount[x]++;
+}	
+
+/*
+ * Move entries from minibuffer x's list into p->wbuf until the block is
+ * full or the list is empty; zero the tail.  Returns the number moved.
+ */
+static u32int
+ipoolgetbuf(IPool *p, u32int x)
+{
+	uchar *bp, *ep, *wp;
+	IEntryLink *l;
+	u32int n;
+	
+	bp = p->wbuf;
+	ep = p->wbuf + p->bufsize;
+	n = 0;
+	assert(x < p->nmbuf);
+	for(wp=bp; wp+IEntrySize<=ep && p->mlist[x]; wp+=IEntrySize){
+		l = p->mlist[x];	/* pop the head of list x */
+		p->mlist[x] = l->next;
+		p->mcount[x]--;
+		memmove(wp, l->ie, IEntrySize);
+		l->next = p->free;	/* hand the link back to the free list */
+		p->free = l;
+		p->nfree++;
+		n++;
+	}
+	memset(wp, 0, ep-wp);	/* clear whatever part of the block was not filled */
+	return n;
+}
+
+/*
+ * Read the next bufsize block of mb from disk and insert its entries into
+ * the pool.  Caller must know there is room.  A failed read drops the block.
+ */
+static void
+ipoolloadblock(IPool *p, Minibuf *mb)
+{
+	u32int i, n;
+	
+	assert(mb->nentry > 0);
+	assert(mb->roffset >= mb->woffset);
+	assert(mb->roffset < mb->eoffset);
+
+	n = p->bufsize/IEntrySize;	/* entries per block, capped by what mb still holds */
+	if(n > mb->nentry)
+		n = mb->nentry;
+	if(readpart(p->isect->part, mb->roffset, p->rbuf, p->bufsize) < 0)
+		fprint(2, "readpart %s: %r\n", p->isect->part->name);
+	else{
+		for(i=0; i<n; i++)
+			ipoolinsert(p, p->rbuf+i*IEntrySize);
+	}
+	mb->nentry -= n;	/* counted as consumed even when the read failed */
+	mb->roffset += p->bufsize;
+}
+
+/*
+ * Write one block of pooled entries to minibuffer x at its write offset.
+ * If unread data still sits at that offset, load it into the pool first.
+ */
+static void
+ipoolflush0(IPool *pool, u32int x)
+{
+	u32int bufsize;
+	Minibuf *mb;
+	
+	mb = pool->mbuf+x;
+	bufsize = pool->bufsize;
+	mb->nwentry += ipoolgetbuf(pool, x);	/* fills pool->wbuf, which is written below */
+	if(mb->nentry > 0 && mb->roffset == mb->woffset){	/* about to overwrite an unread block */
+		assert(pool->nfree >= pool->bufsize/IEntrySize);
+		/*
+		 * There will be room in the pool -- we just 
+		 * removed a block worth.
+		 */
+		ipoolloadblock(pool, mb);
+	}
+	if(writepart(pool->isect->part, mb->woffset, pool->wbuf, bufsize) < 0)
+		fprint(2, "writepart %s: %r\n", pool->isect->part->name);
+	mb->woffset += bufsize;	/* advanced whether or not the write succeeded */
+}
+
+/*
+ * Write out one full block of entries: flush the first minibuf
+ * list holding at least a block's worth (epbuf entries).
+ * (There must be one -- the pool is almost full!)
+ */
+static void
+ipoolflush1(IPool *pool)
+{
+	u32int x;
+
+	assert(pool->nfree <= pool->epbuf);
+
+	/* skip lists that cannot yet fill a whole block */
+	for(x=0; x<pool->nmbuf && pool->mcount[x] < pool->epbuf; x++)
+		;
+	/* can't be reached - someone must be full */
+	if(x == pool->nmbuf)
+		sysfatal("ipoolflush1");
+	ipoolflush0(pool, x);
+}
+
+/*
+ * Drain the pool to disk, list by list; nothing more will be read from disk.
+ */
+static void
+ipoolflush(IPool *pool)
+{
+	u32int x;
+	for(x=0; x<pool->nmbuf; )
+		if(pool->mlist[x])
+			ipoolflush0(pool, x);
+		else
+			x++;
+	assert(pool->nfree == pool->nentry);
+}
+
+/*
+ * Third pass.  Pick up each minibuffer from disk into
+ * memory and then write out the buckets.
+ */
+
+/*
+ * qsort comparison for packed index entries.
+ * Entries are ordered by ientrycmp; ties are broken by the raw
+ * 8 address bytes, larger first (this assumes duplicates are
+ * due to corruption in the lower addresses).
+ */
+static int
+ientrycmpaddr(const void *va, const void *vb)
+{
+	int order;
+	uchar *a, *b;
+
+	a = (uchar*)va;
+	b = (uchar*)vb;
+	order = ientrycmp(a, b);
+	if(order == 0)
+		order = -memcmp(a+IEntryAddrOff, b+IEntryAddrOff, 8);
+	return order;
+}
+
+static void	/* overwrite bytes [o, e) of partition p with zeros, MaxIoSize at a time */
+zerorange(Part *p, u64int o, u64int e)
+{
+	static uchar zeros[MaxIoSize];
+	u32int n;
+	while(o < e){
+		n = sizeof zeros;
+		if(o+n > e)
+			n = e-o;
+		if(writepart(p, o, zeros, n) < 0)
+			fprint(2, "writepart %s: %r\n", p->name);
+		o += n;
+	}
+}
+
+/*
+ * Read minibuffer mb from disk into buf (nbuf bytes), squeeze out the
+ * per-block padding, sort the entries and write each run as one bucket.
+ */
+static void
+sortminibuffer(ISect *is, Minibuf *mb, uchar *buf, u32int nbuf, u32int bufsize)
+{
+	uchar *buckdata, *p, *q, *ep;
+	u32int b, lastb, memsize, n;
+	u64int o;
+	IBucket ib;
+	Part *part;
+	
+	part = is->part;
+	/* nothing was written to this minibuf, so there is nothing to sort */
+	if(mb->nwentry == 0)
+		return;
+
+	/*
+	 * read entire buffer.
+	 */
+	assert(mb->nwentry*IEntrySize <= mb->woffset-mb->boffset);
+	assert(mb->woffset-mb->boffset <= nbuf);
+	if(readpart(part, mb->boffset, buf, mb->woffset-mb->boffset) < 0){
+		fprint(2, "readpart %s: %r\n", part->name);
+		errors = 1;
+		return;
+	}
+	assert(*(uint*)buf != 0xa5a5a5a5);
+	buckdata = emalloc(is->blocksize);	/* allocated after the early returns so they cannot leak it */
+
+	/*
+	 * remove fragmentation due to IEntrySize
+	 * not evenly dividing Bufsize
+	 */
+	memsize = (bufsize/IEntrySize)*IEntrySize;
+	for(o=mb->boffset, p=q=buf; o<mb->woffset; o+=bufsize){
+		memmove(p, q, memsize);
+		p += memsize;
+		q += bufsize;
+	}
+	ep = buf + mb->nwentry*IEntrySize;
+	assert(ep <= buf+nbuf);
+
+	/* 
+	 * sort entries
+	 */
+	qsort(buf, mb->nwentry, IEntrySize, ientrycmpaddr);
+
+	/*
+	 * write buckets out
+	 */
+	n = 0;
+	lastb = offset2bucket(is, mb->boffset);
+	for(p=buf; p<ep; p=q){
+		b = score2bucket(is, p);
+		for(q=p; q<ep && score2bucket(is, q)==b; q+=IEntrySize)
+			;
+		if(lastb+1 < b && zero)
+			zerorange(part, bucket2offset(is, lastb+1), bucket2offset(is, b));
+		if(IBucketSize+(q-p) > is->blocksize)
+			sysfatal("bucket overflow - make index bigger");
+		memmove(buckdata+IBucketSize, p, q-p);
+		ib.n = (q-p)/IEntrySize;
+		n += ib.n;
+		packibucket(&ib, buckdata, is->bucketmagic);
+		if(writepart(part, bucket2offset(is, b), buckdata, is->blocksize) < 0)
+			fprint(2, "write %s: %r\n", part->name);
+		lastb = b;
+	}
+	if(lastb+1 < is->stop-is->start && zero)
+		zerorange(part, bucket2offset(is, lastb+1), bucket2offset(is, is->stop - is->start));
+
+	if(n != mb->nwentry)
+		fprint(2, "sortminibuffer bug: n=%ud nwentry=%ud have=%ld\n", n, mb->nwentry, (ep-buf)/IEntrySize);
+
+	free(buckdata);
+}
+
+static void
+isectproc(void *v)
+{
+	u32int buck, bufbuckets, bufsize, epbuf, i, j;
+	u32int mbufbuckets, n, nbucket, nn, space;
+	u32int nbuf, nminibuf, xminiclump, prod;
+	u64int blocksize, offset, xclump;
+	uchar *data, *p;
+	Buf *buf;
+	IEntry ie;
+	IPool *ipool;
+	ISect *is;
+	Minibuf *mbuf, *mb;
+	
+	is = v;
+	blocksize = is->blocksize;
+	nbucket = is->stop - is->start;
+
+	/*
+	 * Three passes:
+	 *	pass 1 - write index entries from arenas into 
+	 *		large sequential sections on index disk.
+	 *		requires nbuf * bufsize memory.
+	 *
+	 *	pass 2 - split each section into minibufs.
+	 *		requires nminibuf * bufsize memory.
+	 *
+	 *	pass 3 - read each minibuf into memory and
+	 *		write buckets out. 
+	 *		requires entries/minibuf * IEntrySize memory.
+	 * 
+	 * The larger we set bufsize the less seeking hurts us.
+	 * 
+	 * The fewer sections and minibufs we have, the less
+	 * seeking hurts us.
+	 * 
+	 * The fewer sections and minibufs we have, the 
+	 * more entries we end up with in each minibuf
+	 * at the end.  
+	 *
+	 * Shoot for using half our memory to hold each
+	 * minibuf.  The chance of a random distribution 
+	 * getting off by 2x is quite low.  
+	 *
+	 * Once that is decided, figure out the smallest 
+	 * nminibuf and nsection/biggest bufsize we can use
+	 * and still fit in the memory constraints.
+	 */
+	
+	/* expected number of clump index entries we'll see */
+	xclump = nbucket * (double)totalclumps/totalbuckets;
+	
+	/* number of clumps we want to see in a minibuf */
+	xminiclump = isectmem/2/IEntrySize;
+	
+	/* total number of minibufs we need: round up, never 0 (nbuf, set from it, is a divisor) */
+	prod = xclump < xminiclump ? 1 : (xclump+xminiclump-1) / xminiclump;
+	
+	/* if possible, skip second pass */
+	if(!dumb && prod*MinBufSize < isectmem){
+		nbuf = prod;
+		nminibuf = 1;
+	}else{
+		/* otherwise use nsection = sqrt(nmini) */
+		for(nbuf=1; nbuf*nbuf<prod; nbuf++)
+			;
+		if(nbuf*MinBufSize > isectmem)
+			sysfatal("not enough memory");
+		nminibuf = nbuf;
+	}
+	/* size buffer to use extra memory */
+	bufsize = MinBufSize;
+	while(bufsize*2*nbuf <= isectmem && bufsize < MaxBufSize)
+		bufsize *= 2;
+	data = emalloc(nbuf*bufsize);
+	epbuf = bufsize/IEntrySize;
+
+	fprint(2, "%T %s: %,ud buckets, %,ud groups, %,ud minigroups, %,ud buffer\n",
+		is->part->name, nbucket, nbuf, nminibuf, bufsize);
+	/*
+	 * Accept index entries from arena procs.
+	 */
+	buf = MKNZ(Buf, nbuf);
+	p = data;
+	offset = is->blockbase;
+	bufbuckets = (nbucket+nbuf-1)/nbuf;
+	for(i=0; i<nbuf; i++){
+		buf[i].part = is->part;
+		buf[i].bp = p;
+		buf[i].wp = p;
+		p += bufsize;
+		buf[i].ep = p;
+		buf[i].boffset = offset;
+		buf[i].woffset = offset;
+		if(i < nbuf-1){
+			offset += bufbuckets*blocksize;
+			buf[i].eoffset = offset;
+		}else{
+			offset = is->blockbase + nbucket*blocksize;
+			buf[i].eoffset = offset;
+		}
+	}
+	assert(p == data+nbuf*bufsize);
+
+	n = 0;
+	while(recv(is->writechan, &ie) == 1){
+		if(ie.ia.addr == 0)
+			break;
+		buck = score2bucket(is, ie.score);
+		i = buck/bufbuckets;
+		assert(i < nbuf);
+		bwrite(&buf[i], &ie);
+		n++;
+	}
+	add(&indexentries, n);
+	
+	nn = 0;
+	for(i=0; i<nbuf; i++){
+		bflush(&buf[i]);
+		buf[i].bp = nil;
+		buf[i].ep = nil;
+		buf[i].wp = nil;
+		nn += buf[i].nentry;
+	}
+	if(n != nn)
+		fprint(2, "isectproc bug: n=%ud nn=%ud\n", n, nn);
+		
+	free(data);
+
+	fprint(2, "%T %s: reordering\n", is->part->name);
+	
+	/*
+	 * Rearrange entries into minibuffers and then
+	 * split each minibuffer into buckets.
+	 */
+	mbuf = MKN(Minibuf, nminibuf);
+	mbufbuckets = (bufbuckets+nminibuf-1)/nminibuf;
+	for(i=0; i<nbuf; i++){
+		/*
+		 * Set up descriptors.
+		 */
+		n = buf[i].nentry;
+		nn = 0;
+		offset = buf[i].boffset;
+		memset(mbuf, 0, nminibuf*sizeof(mbuf[0]));
+		for(j=0; j<nminibuf; j++){
+			mb = &mbuf[j];
+			mb->boffset = offset;
+			if(j < nminibuf-1){
+				offset += mbufbuckets*blocksize;
+				mb->eoffset = offset;
+			}else
+				mb->eoffset = buf[i].eoffset;
+			mb->roffset = mb->boffset;
+			mb->woffset = mb->boffset;
+			mb->nentry = epbuf * (mb->eoffset - mb->boffset)/bufsize;
+			if(mb->nentry > buf[i].nentry)
+				mb->nentry = buf[i].nentry;
+			buf[i].nentry -= mb->nentry;
+			nn += mb->nentry;
+		}
+		if(n != nn)
+			fprint(2, "isectproc bug2: n=%ud nn=%ud (i=%d)\n", n, nn, i);;
+		/*
+		 * Rearrange.
+		 */
+		if(!dumb && nminibuf == 1){
+			mbuf[0].nwentry = mbuf[0].nentry;
+			mbuf[0].woffset = buf[i].woffset;
+		}else{
+			ipool = mkipool(is, mbuf, nminibuf, mbufbuckets, bufsize);
+			ipool->buck0 = bufbuckets*i;
+			for(j=0; j<nminibuf; j++){
+				mb = &mbuf[j];
+				while(mb->nentry > 0){
+					if(ipool->nfree < epbuf){
+						ipoolflush1(ipool);
+						/* ipoolflush1 might change mb->nentry */	
+						continue;
+					}
+					assert(ipool->nfree >= epbuf);
+					ipoolloadblock(ipool, mb);
+				}
+			}
+			ipoolflush(ipool);
+			nn = 0;
+			for(j=0; j<nminibuf; j++)
+				nn += mbuf[j].nwentry;
+			if(n != nn)
+				fprint(2, "isectproc bug3: n=%ud nn=%ud (i=%d)\n", n, nn, i);
+			free(ipool);
+		}
+
+		/*
+		 * Make buckets.
+		 */
+		space = 0;
+		for(j=0; j<nminibuf; j++)
+			if(space < mbuf[j].woffset - mbuf[j].boffset)
+				space = mbuf[j].woffset - mbuf[j].boffset;
+
+		data = emalloc(space);
+		for(j=0; j<nminibuf; j++){
+			mb = &mbuf[j];
+			sortminibuffer(is, mb, data, space, bufsize);
+		}
+		free(data);
+	}
+		
+	sendp(isectdonechan, is);
+}
+
+
+
blob - 9397d789a4197bbc1868eea57a9d7a53ed37d7df
blob + 7639b2caef154e88f173e225603e8031d006e5c8
--- src/cmd/venti/srv/checkindex.c
+++ src/cmd/venti/srv/checkindex.c
@@ -109,7 +109,7 @@ checkindex(Index *ix, Part *part, u64int off, u64int c
 	int ok, bok;
 u64int found = 0;
 
-/*ZZZ make buffer size configurable */
+/* ZZZ make buffer size configurable */
 	b = alloczblock(ix->blocksize, 0, ix->blocksize);
 	z = alloczblock(ix->blocksize, 1, ix->blocksize);
 	ies = initiestream(part, off, clumps, 64*1024);
@@ -260,6 +260,8 @@ threadmain(int argc, char *argv[])
 
 	if(initventi(argv[0], &conf) < 0)
 		sysfatal("can't init venti: %r");
+	if(mainindex->bloom && loadbloom(mainindex->bloom) < 0)
+		sysfatal("can't load bloom filter: %r");
 	oldbloom = mainindex->bloom;
 	newbloom = nil;
 	if(oldbloom){
blob - 88ebdb50ab0740ddce38bdcd21a239fdd86ed0b7
blob + ec277864c056eec8bcf6d6e89c672bfb91e60d80
--- src/cmd/venti/srv/clump.c
+++ src/cmd/venti/srv/clump.c
@@ -91,7 +91,7 @@ clumpmagic(Arena *arena, u64int aa)
 {
 	u8int buf[U32Size];
 
-	if(readarena(arena, aa, buf, U32Size) < 0)
+	if(readarena(arena, aa, buf, U32Size) == TWID32)
 		return TWID32;
 	return unpackmagic(buf);
 }
@@ -138,6 +138,11 @@ loadclump(Arena *arena, u64int aa, int blocks, Clump *
 		freezblock(cb);
 		return nil;
 	}
+	if(cl->info.type == VtCorruptType){
+		seterr(EOk, "clump is marked corrupt");
+		freezblock(cb);
+		return nil;
+	}
 	n -= ClumpSize;
 	if(n < cl->info.size){
 		freezblock(cb);
blob - 83f51df0f990c7a6214e2fbb440a8212f8ed7665
blob + 58b3d25c0bc48c8f623b06ecf34aec035857b2d2
--- src/cmd/venti/srv/conv.c
+++ src/cmd/venti/srv/conv.c
@@ -23,7 +23,7 @@ static struct {
 	ArenaHeadMagic, "ArenaHeadMagic",
 	ArenaMagic, "ArenaMagic",
 	ISectMagic, "ISectMagic",
-	BloomMagic, "BloomMagic"
+	BloomMagic, "BloomMagic",
 };
 
 static char*
@@ -138,9 +138,6 @@ unpackarena(Arena *arena, u8int *buf)
 	p += U64Size;
 	arena->diskstats.sealed = U8GET(p);
 	p += U8Size;
-
-	arena->memstats = arena->diskstats;
-
 	switch(arena->version){
 	case ArenaVersion4:
 		sz = ArenaSize4;
@@ -153,6 +150,35 @@ unpackarena(Arena *arena, u8int *buf)
 		seterr(ECorrupt, "arena has bad version number %d", arena->version);
 		return -1;
 	}
+	/*
+	 * Additional fields for the memstats version of the stats.
+	 * Diskstats reflects what is committed to the index.
+	 * Memstats reflects what is in the arena.  Originally intended
+	 * this to be a version 5 extension, but might as well use for
+	 * all the existing version 4 arenas too.
+	 *
+	 * To maintain backwards compatibility with existing venti
+	 * installations using the older format, we define that if 
+	 * memstats == diskstats, then the extension fields are not
+	 * included (see packarena below).  That is, only partially
+	 * indexed arenas have these fields.  Fully indexed arenas
+	 * (in particular, sealed arenas) do not.
+	 */
+	if(U8GET(p) == 1){
+		sz += ArenaSize5a-ArenaSize5;
+		p += U8Size;
+		arena->memstats.clumps = U32GET(p);
+		p += U32Size;
+		arena->memstats.cclumps = U32GET(p);
+		p += U32Size;
+		arena->memstats.used = U64GET(p);
+		p += U64Size;
+		arena->memstats.uncsize = U64GET(p);
+		p += U64Size;
+		arena->memstats.sealed = U8GET(p);
+		p += U8Size;
+	}else
+		arena->memstats = arena->diskstats;
 	if(buf + sz != p)
 		sysfatal("unpackarena unpacked wrong amount");
 
@@ -162,6 +188,12 @@ unpackarena(Arena *arena, u8int *buf)
 int
 packarena(Arena *arena, u8int *buf)
 {
+	return _packarena(arena, buf, 0);
+}
+
+int
+_packarena(Arena *arena, u8int *buf, int forceext)
+{
 	int sz;
 	u8int *p;
 	u32int t32;
@@ -207,6 +239,30 @@ packarena(Arena *arena, u8int *buf)
 	p += U64Size;
 	U8PUT(p, arena->diskstats.sealed);
 	p += U8Size;
+	
+	/*
+	 * Extension fields; see above.
+	 */
+	if(forceext
+	|| arena->memstats.clumps != arena->diskstats.clumps
+	|| arena->memstats.cclumps != arena->diskstats.cclumps
+	|| arena->memstats.used != arena->diskstats.used
+	|| arena->memstats.uncsize != arena->diskstats.uncsize
+	|| arena->memstats.sealed != arena->diskstats.sealed){
+		sz += ArenaSize5a - ArenaSize5;
+		U8PUT(p, 1);
+		p += U8Size;
+		U32PUT(p, arena->memstats.clumps);
+		p += U32Size;
+		U32PUT(p, arena->memstats.cclumps);
+		p += U32Size;
+		U64PUT(p, arena->memstats.used, t32);	
+		p += U64Size;
+		U64PUT(p, arena->memstats.uncsize, t32);
+		p += U64Size;
+		U8PUT(p, arena->memstats.sealed);
+		p += U8Size;
+	}
 
 	if(buf + sz != p)
 		sysfatal("packarena packed wrong amount");
@@ -525,6 +581,8 @@ unpackientry(IEntry *ie, u8int *buf)
 	p += U32Size;
 	ie->train = U16GET(p);
 	p += U16Size;
+	if(p - buf != IEntryAddrOff)
+		sysfatal("unpackentry bad IEntryAddrOff amount");
 	ie->ia.addr = U64GET(p);
 if(ie->ia.addr>>56) print("%.8H => %llux\n", p, ie->ia.addr);
 	p += U64Size;
blob - 5101ff88908d66d573628b9c13a15b9cd6d51643
blob + 4801204faa85488f2d1de4185838dd1fc5e7fb46
--- src/cmd/venti/srv/dat.h
+++ src/cmd/venti/srv/dat.h
@@ -75,23 +75,17 @@ enum
 	/*
 	 * magic numbers on disk
 	 */
-/*	_ClumpMagic		= 0xd15cb10cU,	/ * clump header, deprecated */
-#define	_ClumpMagic	0xd15cb10cU
+	_ClumpMagic		= 0xd15cb10cU,	/* clump header, deprecated */
 	ClumpFreeMagic		= 0,		/* free clump; terminates active clump log */
 
-/*	ArenaPartMagic		= 0xa9e4a5e7U,	/ * arena partition header */
-/*	ArenaMagic		= 0xf2a14eadU,	/ * arena trailer */
-/*	ArenaHeadMagic		= 0xd15c4eadU,	/ * arena header */
-#define	ArenaPartMagic		0xa9e4a5e7U
-#define	ArenaMagic		0xf2a14eadU
-#define	ArenaHeadMagic		0xd15c4eadU
-
-/*	BloomMagic		= 0xb1004eadU,	/ * bloom filter header */
-#define	BloomMagic		0xb1004eadU
+	ArenaPartMagic		= 0xa9e4a5e7U,	/* arena partition header */
+	ArenaMagic		= 0xf2a14eadU,	/* arena trailer */
+	ArenaHeadMagic		= 0xd15c4eadU,	/* arena header */
+	
+	BloomMagic		= 0xb1004eadU,	/* bloom filter header */
 	BloomMaxHash	= 32,
 
-/*	ISectMagic		= 0xd15c5ec7U,	/ * index header */
-#define	ISectMagic		0xd15c5ec7U
+	ISectMagic		= 0xd15c5ec7U,	/* index header */
 
 	ArenaPartVersion	= 3,
 	ArenaVersion4		= 4,
@@ -120,6 +114,7 @@ enum
 	ArenaPartSize		= 4 * U32Size,
 	ArenaSize4		= 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
 	ArenaSize5			= ArenaSize4 + U32Size,
+	ArenaSize5a		= ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size,
 	ArenaHeadSize4		= U64Size + 3 * U32Size + ANameSize,
 	ArenaHeadSize5		= ArenaHeadSize4 + U32Size,
 	BloomHeadSize	= 4 * U32Size,
@@ -137,10 +132,14 @@ enum
 	 */
 	IBucketSize		= U32Size + U16Size,
 	IEntrySize		= U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
-	IEntryTypeOff		= VtScoreSize + U64Size + U32Size + 2 * U16Size,
+	IEntryTypeOff		= VtScoreSize + U32Size + U16Size + U64Size + U16Size,
+	IEntryAddrOff		= VtScoreSize + U32Size + U16Size,
 
 	MaxClumpBlocks		=  (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
+	
+	IcacheFrac		= 1000000,	/* denominator */
 
+	SleepForever		= 1000000000,	/* magic value for sleep time */
 	/*
 	 * dirty flags - order controls disk write order
 	 */
@@ -356,13 +355,11 @@ struct Arena
 	int		blocksize;		/* size of block to read or write */
 	u64int		base;			/* base address on disk */
 	u64int		size;			/* total space in the arena */
-	u64int		limit;			/* storage limit for clumps */
 	u8int		score[VtScoreSize];	/* score of the entire sealed & summed arena */
 
 	int		clumpmax;		/* ClumpInfos per block */
 	AState		mem;
 	int		inqueue;
-	DigestState	sha1;
 
 	/*
 	 * fields stored on disk
@@ -477,6 +474,8 @@ struct ISect
 	u32int		tabsize;		/* max. bytes in index config */
 	Channel	*writechan;
 	Channel	*writedonechan;
+	void		*ig;		/* used by buildindex only */
+	int		ng;
 
 	/*
 	 * fields stored on disk
@@ -716,8 +715,19 @@ extern	int		writestodevnull;	/* dangerous - for perfor
 extern	int		collectstats;
 extern	QLock	memdrawlock;
 extern	int		icachesleeptime;
+extern	int		minicachesleeptime;
 extern	int		arenasumsleeptime;
+extern	int		manualscheduling;
+extern	int		l0quantum;
+extern	int		l1quantum;
+extern	int		ignorebloom;
+extern	int		icacheprefetch;
+extern	int		syncwrites;
 
+extern	Stats	*stathist;
+extern	int	nstathist;
+extern	ulong	stattime;
+
 #ifndef PLAN9PORT
 #pragma varargck type "V" uchar*
 #define ODIRECT 0
blob - f5cc8e64057554836e2166ead7dfdfc2cefcf1c7
blob + 4d6d0865ca4555ec9b3229c55b42be76b861b2b4
--- src/cmd/venti/srv/dcache.c
+++ src/cmd/venti/srv/dcache.c
@@ -34,7 +34,7 @@ enum
 {
 	HashLog		= 9,
 	HashSize	= 1<<HashLog,
-	HashMask	= HashSize - 1
+	HashMask	= HashSize - 1,
 };
 
 struct DCache
@@ -212,8 +212,6 @@ return;
 		lastmiss.part = part;
 		lastmiss.addr = addr;
 	}
-
-/*	fprint(2, "%s %llx %s\n", part->name, addr, miss ? "miss" : "hit"); */
 }
 
 int
@@ -230,6 +228,7 @@ rareadpart(Part *part, u64int addr, u8int *buf, uint n
 	}
 	if(load != 2 || addr >= part->size){	/* addr >= part->size: let readpart do the error */	
 		runlock(&ralock);
+		diskaccess(0);
 		return readpart(part, addr, buf, n);
 	}
 
@@ -239,6 +238,7 @@ fprint(2, "raread %s %llx\n", part->name, addr);
 	nn = dcache.ramax;
 	if(addr+nn > part->size)
 		nn = part->size - addr;
+	diskaccess(0);
 	if(readpart(part, addr, dcache.rabuf, nn) < 0){
 		wunlock(&ralock);
 		return -1;
@@ -297,7 +297,6 @@ _getdblock(Part *part, u64int addr, int mode, int load
 	/*
 	 * look for the block in the cache
 	 */
-/*checkdcache(); */
 	qlock(&dcache.lock);
 again:
 	for(b = dcache.heads[h]; b != nil; b = b->next){
@@ -367,7 +366,6 @@ found:
 		fixheap(b->heap, b);
 
 	qunlock(&dcache.lock);
-/*checkdcache(); */
 
 	trace(TraceBlock, "getdblock lock");
 	addstat(StatDblockStall, 1);
@@ -427,7 +425,6 @@ putdblock(DBlock *b)
 	else
 		wunlock(&b->lock);
 
-/*checkdcache(); */
 	qlock(&dcache.lock);
 	if(--b->ref == 0 && !b->dirty){
 		if(b->heap == TWID32)
@@ -435,7 +432,6 @@ putdblock(DBlock *b)
 		rwakeupall(&dcache.full);
 	}
 	qunlock(&dcache.lock);
-/*checkdcache(); */
 }
 
 void
@@ -474,6 +470,25 @@ dirtydblock(DBlock *b, int dirty)
 	qunlock(&dcache.lock);
 }
 
+static void
+unchain(DBlock *b)
+{
+	ulong h;
+	
+	/*
+	 * unchain the block
+	 */
+	if(b->prev == nil){
+		h = pbhash(b->addr);
+		if(dcache.heads[h] != b)
+			sysfatal("bad hash chains in disk cache");
+		dcache.heads[h] = b->next;
+	}else
+		b->prev->next = b->next;
+	if(b->next != nil)
+		b->next->prev = b->prev;
+}
+
 /*
  * remove some block from use and update the free list and counters
  */
@@ -481,7 +496,6 @@ static DBlock*
 bumpdblock(void)
 {
 	DBlock *b;
-	ulong h;
 
 	trace(TraceBlock, "bumpdblock enter");
 	b = dcache.free;
@@ -512,20 +526,26 @@ bumpdblock(void)
 
 	trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
 
-	/*
-	 * unchain the block
-	 */
-	if(b->prev == nil){
-		h = pbhash(b->addr);
-		if(dcache.heads[h] != b)
-			sysfatal("bad hash chains in disk cache");
-		dcache.heads[h] = b->next;
-	}else
-		b->prev->next = b->next;
-	if(b->next != nil)
-		b->next->prev = b->prev;
-
+	unchain(b);
 	return b;
+}
+
+void
+emptydcache(void)
+{
+	DBlock *b;
+	
+	qlock(&dcache.lock);
+	while(dcache.nheap > 0){
+		b = dcache.heap[0];
+		delheap(b);
+		if(!b->ref && !b->dirty){
+			unchain(b);
+			b->next = dcache.free;
+			dcache.free = b;
+		}
+	}
+	qunlock(&dcache.lock);
 }
 
 /*
@@ -683,6 +703,7 @@ static int
 parallelwrites(DBlock **b, DBlock **eb, int dirty)
 {
 	DBlock **p, **q;
+
 	for(p=b; p<eb && (*p)->dirty == dirty; p++){
 		assert(b<=p && p<eb);
 		sendp((*p)->part->writechan, *p);
@@ -803,6 +824,7 @@ writeproc(void *v)
 		trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
 		wlock(&b->lock);
 		trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
+		diskaccess(0);
 		if(writepart(p, b->addr, b->data, b->size) < 0)
 			fprint(2, "write error: %r\n"); /* XXX details! */
 		addstat(StatApartWrite, 1);
blob - /dev/null
blob + 687616e1772838fade8380d392ef21e45299c7b6 (mode 644)
--- /dev/null
+++ src/cmd/venti/srv/disksched.c
@@ -0,0 +1,88 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+ulong lasttime[2];
+int manualscheduling;
+int l0quantum = 120;
+int l1quantum = 120;
+ulong lasticachechange;
+
+void
+disksched(void)
+{
+	int p, nwrite, nflush, ndirty, tdirty, toflush;
+	ulong t;
+	vlong cflush;
+	Stats *prev;
+	
+	/*
+	 * no locks because all the data accesses are atomic.
+	 */
+	t = time(0);
+	if(manualscheduling){
+		lasticachechange = t;
+		return;
+	}
+
+	if(t-lasttime[0] < l0quantum){
+		/* level-0 disk access going on */
+		p = icachedirtyfrac();
+		if(p < IcacheFrac*5/10){	/* can wait */
+			icachesleeptime = SleepForever;
+			lasticachechange = t;
+		}else if(p > IcacheFrac*9/10){	/* can't wait */
+			icachesleeptime = 0;
+			lasticachechange = t;
+		}else if(t-lasticachechange > 60){
+			/* have a minute's worth of data for the current rate */
+			prev = &stathist[(stattime-60+nstathist)%nstathist];
+
+			/* # entries written to index cache */
+			nwrite = stats.n[StatIcacheWrite] - prev->n[StatIcacheWrite];
+			
+			/* # dirty entries in index cache */
+			ndirty = stats.n[StatIcacheDirty] - prev->n[StatIcacheDirty];
+			
+			/* # entries flushed to disk */
+			nflush = nwrite - ndirty;
+			
+			/* want to stay around 70% dirty */
+			tdirty = (vlong)stats.n[StatIcacheSize]*700/1000;
+			
+			/* assume nflush*icachesleeptime is a constant */
+			cflush = (vlong)nflush*(icachesleeptime+1);
+			
+			/* compute number of entries to write in next minute */
+			toflush = nwrite + (stats.n[StatIcacheDirty] - tdirty);
+			
+			/* schedule for that many */
+			if(toflush <= 0 || cflush/toflush > 100000)
+				icachesleeptime = SleepForever;
+			else
+				icachesleeptime = cflush/toflush;
+		}
+		arenasumsleeptime = SleepForever;
+		return;
+	}
+	if(t-lasttime[1] < l1quantum){
+		/* level-1 disk access (icache flush) going on */
+		icachesleeptime = 0;
+		arenasumsleeptime = SleepForever;
+		return;
+	}
+	/* no disk access going on - no holds barred */
+	icachesleeptime = 0;
+	arenasumsleeptime = 0;
+}
+
+void
+diskaccess(int level)
+{
+	if(level < 0 || level >= nelem(lasttime)){
+		fprint(2, "bad level in diskaccess; caller=%lux\n", getcallerpc(&level));
+		return;
+	}
+	lasttime[level] = time(0);
+}
+
blob - 6681503dfdc1a04c5f2e32e1960ba1887234afbe
blob + 226d97aef20e794ce196f496f89e764c9245ddd4
--- src/cmd/venti/srv/findscore.c
+++ src/cmd/venti/srv/findscore.c
@@ -27,7 +27,7 @@ findscore(Arena *arena, uchar *score)
 	u32int clump;
 	int i, n, found;
 
-/*ZZZ remove fprint? */
+//ZZZ remove fprint?
 	if(arena->memstats.clumps)
 		fprint(2, "reading directory for arena=%s with %d entries\n", arena->name, arena->memstats.clumps);
 
blob - /dev/null
blob + 955159424e7bd416581bb9181484f7c05487fe03 (mode 644)
--- /dev/null
+++ src/cmd/venti/srv/fixarenas.c
@@ -0,0 +1,1894 @@
+/*
+ * Check and fix an arena partition.
+ *
+ * This is a lot grittier than the rest of Venti because
+ * it can't just give up if a byte here or there is wrong.
+ *
+ * The rule here (hopefully followed!) is that block corruption
+ * only ever has a local effect -- there are no blocks that you
+ * can wipe out that will cause large portions of 
+ * uncorrupted data blocks to be useless.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "whack.h"
+
+#pragma varargck type "z" uvlong
+#pragma varargck type "z" vlong
+#pragma varargck type "t" uint
+
+enum
+{
+	K = 1024,
+	M = 1024*1024,
+	G = 1024*1024*1024,
+	
+	Block = 4096,
+};
+
+int debugsha1;
+
+int verbose;
+Part *part;
+char *file;
+char *basename;
+char *dumpbase;
+int fix;
+int badreads;
+int unseal;
+uchar zero[MaxDiskBlock];
+
+Arena lastarena;
+ArenaPart ap;
+uvlong arenasize;
+int nbadread;
+int nbad;
+uvlong partend;
+void checkarena(vlong, int);
+
+void
+usage(void)
+{
+	fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n");
+	threadexitsall(0);
+}
+
+/*
+ * Format number in simplest way that is okay with unittoull.
+ */
+static int
+zfmt(Fmt *fmt)
+{
+	vlong x;
+	
+	x = va_arg(fmt->args, vlong);
+	if(x == 0)
+		return fmtstrcpy(fmt, "0");
+	if(x%G == 0)
+		return fmtprint(fmt, "%lldG", x/G);
+	if(x%M == 0)
+		return fmtprint(fmt, "%lldM", x/M);
+	if(x%K == 0)
+		return fmtprint(fmt, "%lldK", x/K);
+	return fmtprint(fmt, "%lld", x);
+}
+
+/*
+ * Format time like ctime without newline.
+ */
+static int
+tfmt(Fmt *fmt)
+{
+	uint t;
+	char buf[30];
+	
+	t = va_arg(fmt->args, uint);
+	strcpy(buf, ctime(t));
+	buf[28] = 0;
+	return fmtstrcpy(fmt, buf);
+}
+
+/*
+ * Coalesce messages about unreadable sectors into larger ranges.
+ * bad(nil, 0, 0) flushes the pending range; an offset of -1 discards it.
+ */
+static void
+bad(char *msg, vlong o, int len)
+{
+	static vlong lb0, lb1;
+	static char *lmsg;
+
+	if(msg == nil)
+		msg = lmsg;
+	if(o == -1){
+		lmsg = nil;
+		lb0 = 0;
+		lb1 = 0;
+		return;
+	}
+	if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){
+		if(lb0 != lb1)
+			print("%s %#llux+%#llux (%,lld+%,lld)\n",
+				lmsg, lb0, lb1-lb0, lb0, lb1-lb0);
+		lb0 = o;
+	}
+	lmsg = msg;
+	lb1 = o+len;
+}
+
+/*
+ * Read in the len bytes of data at the offset.  If can't for whatever reason,
+ * fill it with garbage but print an error.
+ */
+static uchar*
+readdisk(uchar *buf, vlong offset, int len)
+{
+	int i, j, k, n;
+
+	if(offset >= partend){
+		memset(buf, 0xFB, len);	/* buf is a pointer here: sizeof buf is not the buffer size */
+		return buf;
+	}
+	
+	if(offset+len > partend){
+		memset(buf, 0xFB, len);	/* mark the whole request, then read only the part before partend */
+		len = partend - offset;
+	}
+
+	if(readpart(part, offset, buf, len) >= 0)
+		return buf;
+	
+	/*
+	 * The read failed.  Clear the buffer to nonsense, and
+	 * then try reading in smaller pieces.  If that fails,
+	 * read in even smaller pieces.  And so on down to sectors.
+	 */
+	memset(buf, 0xFD, len);
+	for(i=0; i<len; i+=64*K){
+		n = 64*K;
+		if(i+n > len)
+			n = len-i;
+		if(readpart(part, offset+i, buf+i, n) >= 0)
+			continue;
+		for(j=i; j<len && j<i+64*K; j+=4*K){
+			n = 4*K;
+			if(j+n > len)
+				n = len-j;
+			if(readpart(part, offset+j, buf+j, n) >= 0)
+				continue;
+			for(k=j; k<len && k<j+4*K; k+=512){
+				if(readpart(part, offset+k, buf+k, 512) >= 0)
+					continue;
+				bad("disk read failed at", offset+k, 512);	/* report the disk offset, not the index into buf */
+				badreads++;
+			}
+		}
+	}
+	bad(nil, 0, 0);
+	return buf;
+}
+
+/*
+ * Buffer to support running SHA1 hash of the disk.
+ */
+typedef struct Shabuf Shabuf;
+struct Shabuf
+{
+	int fd;
+	vlong offset;
+	DigestState state;
+	int rollback;
+	vlong r0;
+	DigestState *hist;
+	int nhist;
+};
+
+void
+sbdebug(Shabuf *sb, char *file)
+{
+	int fd;
+	
+	if(sb->fd > 0){
+		close(sb->fd);
+		sb->fd = 0;
+	}
+	if((fd = create(file, OWRITE, 0666)) < 0)
+		return;
+	if(fd == 0){
+		fd = dup(fd, -1);
+		close(0);
+	}
+	sb->fd = fd;
+}
+
+void
+sbupdate(Shabuf *sb, uchar *p, vlong offset, int len)
+{
+	int n, x;
+	vlong o;
+
+	if(sb->rollback && !sb->hist){
+		sb->r0 = offset;
+		sb->nhist = 1;
+		sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist);
+		memset(sb->hist, 0, sizeof sb->hist[0]);
+	}
+	if(sb->r0 == 0)
+		sb->r0 = offset;
+
+	if(sb->offset < offset || sb->offset >= offset+len){
+		if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n",
+			p, offset, len, sb->offset);
+		return;
+	}
+	x = sb->offset - offset;
+	if(0) print("sbupdate %p %#llux+%d skip %d\n",
+		sb, offset, len, x);
+	if(x){
+		p += x;
+		offset += x;
+		len -= x;
+	}
+	assert(sb->offset == offset);
+	
+	if(sb->fd > 0)
+		pwrite(sb->fd, p, len, offset - sb->r0);
+
+	if(!sb->rollback){
+		sha1(p, len, nil, &sb->state);
+		sb->offset += len;
+		return;
+	}
+	
+	/* save state every 4M so we can roll back quickly */
+	o = offset - sb->r0;
+	while(len > 0){
+		n = 4*M - o%(4*M);
+		if(n > len)
+			n = len;
+		sha1(p, n, nil, &sb->state);
+		sb->offset += n;
+		o += n;
+		p += n;
+		len -= n;
+		if(o%(4*M) == 0){
+			x = o/(4*M);
+			if(x >= sb->nhist){
+				if(x != sb->nhist)
+					print("oops! x=%d nhist=%d\n", x, sb->nhist);
+				sb->nhist += 32;
+				sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist);
+			}
+			sb->hist[x] = sb->state;
+		}
+	}		
+}
+
+void
+sbdiskhash(Shabuf *sb, vlong eoffset)
+{
+	static uchar dbuf[4*M];
+	int n;
+	
+	while(sb->offset < eoffset){
+		n = sizeof dbuf;
+		if(sb->offset+n > eoffset)
+			n = eoffset - sb->offset;
+		readdisk(dbuf, sb->offset, n);
+		sbupdate(sb, dbuf, sb->offset, n);
+	}
+}
+
+void
+sbrollback(Shabuf *sb, vlong offset)
+{
+	int x;
+	vlong o;
+	Dir d;
+	
+	if(!sb->rollback || !sb->r0){
+		print("cannot rollback sha\n");
+		return;
+	}
+	if(offset >= sb->offset)
+		return;
+	o = offset - sb->r0;
+	x = o/(4*M);	/* index of the last 4M checkpoint at or before offset */
+	if(x >= sb->nhist){
+		print("cannot rollback sha\n");
+		return;
+	}
+	sb->state = sb->hist[x];
+	sb->offset = sb->r0 + (vlong)x*4*M;	/* widen first: x*4*M overflows int at 2G and beyond */
+	assert(sb->offset <= offset);
+	
+	if(sb->fd > 0){
+		nulldir(&d);
+		d.length = sb->offset - sb->r0;	/* truncate the debug dump to match the rolled-back hash */
+		dirfwstat(sb->fd, &d);
+	}
+}
+
+void
+sbscore(Shabuf *sb, uchar *score)
+{
+	if(sb->hist){
+		free(sb->hist);
+		sb->hist = nil;
+	}
+	sha1(nil, 0, score, &sb->state);
+}
+
+/*
+ * If we're fixing arenas, then editing this memory edits the disk!
+ * It will be written back out as new data is paged in. 
+ */
+uchar buf[4*M];
+uchar sbuf[4*M];
+vlong bufoffset;
+int buflen;
+
+static void pageout(void);
+static uchar*
+pagein(vlong offset, int len)
+{
+	pageout();
+	if(offset >= partend){
+		memset(buf, 0xFB, sizeof buf);
+		return buf;
+	}
+	
+	if(offset+len > partend){
+		memset(buf, 0xFB, sizeof buf);
+		len = partend - offset;
+	}
+	bufoffset = offset;
+	buflen = len;
+	readdisk(buf, offset, len);
+	memmove(sbuf, buf, len);
+	return buf;
+}
+
+static void
+pageout(void)
+{
+	if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){
+		buflen = 0;
+		return;
+	}
+	if(writepart(part, bufoffset, buf, buflen) < 0)
+		print("disk write failed at %#llux+%#ux (%,lld+%,d)\n",
+			bufoffset, buflen, bufoffset, buflen);
+	buflen = 0;
+}
+
+static void
+zerorange(vlong offset, int len)
+{
+	int i;
+	vlong ooff;
+	int olen;
+	enum { MinBlock = 4*K, MaxBlock = 8*K };
+	
+	if(0)
+	if(bufoffset <= offset && offset+len <= bufoffset+buflen){
+		memset(buf+(offset-bufoffset), 0, len);
+		return;
+	}
+	
+	ooff = bufoffset;
+	olen = buflen;
+	
+	i = offset%MinBlock;
+	if(i+len < MaxBlock){
+		pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1));
+		memset(buf+i, 0, len);
+	}else{
+		pagein(offset-i, MaxBlock);
+		memset(buf+i, 0, MaxBlock-i);
+		offset += MaxBlock-i;
+		len -= MaxBlock-i;
+		while(len >= MaxBlock){
+			pagein(offset, MaxBlock);
+			memset(buf, 0, MaxBlock);
+			offset += MaxBlock;
+			len -= MaxBlock;
+		}
+		pagein(offset, (len+MinBlock-1)&~(MinBlock-1));
+		memset(buf, 0, len);
+	}
+	pagein(ooff, olen);
+}
+
+/*
+ * read/write integers
+ *
+static void
+p16(uchar *p, u16int u)
+{
+	p[0] = (u>>8) & 0xFF;
+	p[1] = u & 0xFF;
+}
+*/
+
+static u16int
+u16(uchar *p)
+{
+	return (p[0]<<8)|p[1];
+}
+
+static void
+p32(uchar *p, u32int u)
+{
+	p[0] = (u>>24) & 0xFF;
+	p[1] = (u>>16) & 0xFF;
+	p[2] = (u>>8) & 0xFF;
+	p[3] = u & 0xFF;
+}
+
+static u32int
+u32(uchar *p)
+{
+	return ((u32int)p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3];	/* big-endian; cast so the top byte is not shifted into int's sign bit */
+}
+
+/*
+static void
+p64(uchar *p, u64int u)
+{
+	p32(p, u>>32);
+	p32(p, u);
+}
+*/
+
+static u64int
+u64(uchar *p)
+{
+	return ((u64int)u32(p)<<32) | u32(p+4);
+}
+
+static int
+vlongcmp(const void *va, const void *vb)
+{
+	vlong a, b;
+	
+	a = *(vlong*)va;
+	b = *(vlong*)vb;
+	if(a < b)
+		return -1;
+	if(a > b)	/* qsort comparators must report all three orderings */
+		return 1;
+	return 0;
+}
+
+/* D and S are in draw.h */
+#define D VD
+#define S VS
+
+enum
+{
+	D = 0x10000,
+	Z = 0x20000,
+	S = 0x30000,
+	T = 0x40000,
+	N = 0xFFFF
+};
+typedef struct Info Info;
+struct Info
+{
+	int len;
+	char *name;
+};
+
+Info partinfo[] = {
+	4,	"magic",
+	D|4,	"version",
+	Z|4,	"blocksize",
+	4,	"arenabase",
+	0
+};
+
+Info headinfo4[] = {
+	4,	"magic",
+	D|4,	"version",
+	S|ANameSize,	"name",
+	Z|4,	"blocksize",
+	Z|8,	"size",
+	0
+};
+
+Info headinfo5[] = {
+	4,	"magic",
+	D|4,	"version",
+	S|ANameSize,	"name",
+	Z|4,	"blocksize",
+	Z|8,	"size",
+	4,	"clumpmagic",
+	0
+};
+
+Info tailinfo4[] = {
+	4,	"magic",
+	D|4,	"version",
+	S|ANameSize,	"name",
+	D|4,	"clumps",
+	D|4,	"cclumps",
+	T|4,	"ctime",
+	T|4,	"wtime",
+	D|8,	"used",
+	D|8,	"uncsize",
+	1,	"sealed",
+	0
+};
+	
+Info tailinfo4a[] = {
+	/* tailinfo 4 */
+	4,	"magic",
+	D|4,	"version",
+	S|ANameSize,	"name",
+	D|4,	"clumps",
+	D|4,	"cclumps",
+	T|4,	"ctime",
+	T|4,	"wtime",
+	D|8,	"used",
+	D|8,	"uncsize",
+	1,	"sealed",
+
+	/* mem stats */
+	1,	"extension",
+	D|4,	"mem.clumps",
+	D|4,	"mem.cclumps",
+	D|8,	"mem.used",
+	D|8,	"mem.uncsize",
+	1,	"mem.sealed",
+	0
+};
+	
+Info tailinfo5[] = {
+	4,	"magic",
+	D|4,	"version",
+	S|ANameSize,	"name",
+	D|4,	"clumps",
+	D|4,	"cclumps",
+	T|4,	"ctime",
+	T|4,	"wtime",
+	4,	"clumpmagic",
+	D|8,	"used",
+	D|8,	"uncsize",
+	1,	"sealed",
+	0
+};
+
+Info tailinfo5a[] = {
+	/* tailinfo 5 */
+	4,	"magic",
+	D|4,	"version",
+	S|ANameSize,	"name",
+	D|4,	"clumps",
+	D|4,	"cclumps",
+	T|4,	"ctime",
+	T|4,	"wtime",
+	4,	"clumpmagic",
+	D|8,	"used",
+	D|8,	"uncsize",
+	1,	"sealed",
+
+	/* mem stats */
+	1,	"extension",
+	D|4,	"mem.clumps",
+	D|4,	"mem.cclumps",
+	D|8,	"mem.used",
+	D|8,	"mem.uncsize",
+	1,	"mem.sealed",
+	0
+};
+	
+void
+showdiffs(uchar *want, uchar *have, int len, Info *info)
+{
+	int n;
+	
+	while(len > 0 && (n=info->len&N) > 0){
+		if(memcmp(have, want, n) != 0){
+			switch(info->len){
+			case 1:
+				print("\t%s: correct=%d disk=%d\n",
+					info->name, *want, *have);
+				break;
+			case 4:
+				print("\t%s: correct=%#ux disk=%#ux\n",
+					info->name, u32(want), u32(have));
+				break;
+			case D|4:
+				print("\t%s: correct=%,ud disk=%,ud\n",
+					info->name, u32(want), u32(have));
+				break;
+			case T|4:
+				print("\t%s: correct=%t\n\t\tdisk=%t\n",
+					info->name, u32(want), u32(have));
+				break;
+			case Z|4:
+				print("\t%s: correct=%z disk=%z\n",
+					info->name, (uvlong)u32(want), (uvlong)u32(have));
+				break;
+			case D|8:
+				print("\t%s: correct=%,lld disk=%,lld\n",
+					info->name, u64(want), u64(have));
+				break;
+			case Z|8:
+				print("\t%s: correct=%z disk=%z\n",
+					info->name, u64(want), u64(have));
+				break;
+			case S|ANameSize:
+				print("\t%s: correct=%s disk=%.*s\n",
+					info->name, (char*)want, 
+					utfnlen((char*)have, ANameSize-1),
+					(char*)have);
+				break;
+			default:
+				print("\t%s: correct=%.*H disk=%.*H\n",
+					info->name, n, want, n, have);
+				break;
+			}
+		}
+		have += n;
+		want += n;
+		len -= n;
+		info++;
+	}
+	if(len > 0 && memcmp(have, want, len) != 0){
+		if(memcmp(want, zero, len) != 0)
+			print("!!\textra want data in showdiffs (bug in fixarenas)\n");
+		else
+			print("\tnon-zero data on disk after structure\n");
+		if(verbose > 1){
+			print("want: %.*H\n", len, want);
+			print("have: %.*H\n", len, have);
+		}
+	}
+}
+
+static int tabsizes[] = { 64*1024, 512*1024, };
+/*
+ * Poke around on the disk to guess what the ArenaPart numbers are.
+ */
+void
+guessgeometry(void)
+{
+	int i, j, n, bestn, ndiff, nhead, ntail;
+	uchar *p, *ep, *sp;
+	u64int diff[100], head[20], tail[20];
+	u64int offset, bestdiff;
+	
+	ap.version = ArenaPartVersion;
+
+	if(arenasize == 0 || ap.blocksize == 0){
+		/*
+		 * The ArenaPart block at offset PartBlank may be corrupt or just wrong.
+		 * Instead, look for the individual arena headers and tails, which there
+		 * are many of, and once we've seen enough, infer the spacing.
+		 *
+		 * Of course, nothing in the file format requires that arenas be evenly
+		 * spaced, but fmtarenas always does that for us.
+		 */
+		nhead = 0;
+		ntail = 0;
+		for(offset=PartBlank; offset<partend; offset+=4*M){
+			p = pagein(offset, 4*M);
+			for(sp=p, ep=p+4*M; p<ep; p+=K){
+				if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){
+					if(verbose)
+						print("arena head at %#llx\n", offset+(p-sp));
+					head[nhead++] = offset+(p-sp);
+				}
+				if(u32(p) == ArenaMagic && ntail < nelem(tail)){
+					tail[ntail++] = offset+(p-sp);
+					if(verbose)
+						print("arena tail at %#llx\n", offset+(p-sp));
+				}
+			}
+			if(nhead == nelem(head) && ntail == nelem(tail))
+				break;
+		}
+		if(nhead < 3 && ntail < 3)
+			sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail);
+	
+		/* 
+		 * Arena size is likely the most common
+		 * inter-head or inter-tail spacing.
+		 */
+		ndiff = 0;
+		for(i=1; i<nhead; i++)
+			diff[ndiff++] = head[i] - head[i-1];
+		for(i=1; i<ntail; i++)
+			diff[ndiff++] = tail[i] - tail[i-1];
+		qsort(diff, ndiff, sizeof diff[0], vlongcmp);
+		bestn = 0;
+		bestdiff = 0;
+		for(i=1, n=1; i<=ndiff; i++, n++){
+			if(i==ndiff || diff[i] != diff[i-1]){
+				if(n > bestn){
+					bestn = n;
+					bestdiff = diff[i-1];
+				}
+				n = 0;
+			}
+		}
+		print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
+		if(arenasize != 0 && arenasize != bestdiff)
+			print("using user-specified size %z instead\n", arenasize);
+		else
+			arenasize = bestdiff;
+
+		/*
+		 * The arena tail for an arena is arenasize-blocksize from the head.
+		 */
+		ndiff = 0;
+		for(i=j=0; i<nhead && j<ntail; ){
+			if(tail[j] < head[i]){
+				j++;
+				continue;
+			}
+			if(tail[j] < head[i]+arenasize){
+				diff[ndiff++] = head[i]+arenasize - tail[j];
+				j++;
+				continue;
+			}
+			i++;
+		}
+		if(ndiff < 3)
+			sysfatal("too few intact arenas: %d head, tail pairs", ndiff);
+		qsort(diff, ndiff, sizeof diff[0], vlongcmp);
+		bestn = 0;
+		bestdiff = 0;
+		for(i=1, n=1; i<=ndiff; i++, n++){
+			if(i==ndiff || diff[i] != diff[i-1]){
+				if(n > bestn){
+					bestn = n;
+					bestdiff = diff[i-1];
+				}
+				n = 0;
+			}
+		}
+		print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
+		if(ap.blocksize != 0 && ap.blocksize != bestdiff)
+			print("using user-specified size %z instead\n", (vlong)ap.blocksize);
+		else
+			ap.blocksize = bestdiff;
+		if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1))
+			sysfatal("block size not a power of two");
+		if(ap.blocksize > MaxDiskBlock)
+			sysfatal("block size too big (max=%d)", MaxDiskBlock);
+
+		/*
+		 * Use head/tail information to deduce arena base.
+		 */
+		ndiff = 0;
+		for(i=0; i<nhead; i++)
+			diff[ndiff++] = head[i]%arenasize;
+		for(i=0; i<ntail; i++)
+			diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize;
+		qsort(diff, ndiff, sizeof diff[0], vlongcmp);
+		bestn = 0;
+		bestdiff = 0;
+		for(i=1, n=1; i<=ndiff; i++, n++){
+			if(i==ndiff || diff[i] != diff[i-1]){
+				if(n > bestn){
+					bestn = n;
+					bestdiff = diff[i-1];
+				}
+				n = 0;
+			}
+		}
+		ap.arenabase = bestdiff;
+	}
+	
+	ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1);
+	/*
+	 * XXX pick up table, check arenabase.
+	 * XXX pick up table, record base name.
+	 */
+
+	/*
+	 * Somewhat standard computation.
+	 * Fmtarenas used to use 64k tab, now uses 512k tab.
+	 */
+	if(ap.arenabase == 0){
+		for(i=0; i<nelem(tabsizes); i++){
+			ap.arenabase = (PartBlank+HeadSize+tabsizes[i]+ap.blocksize-1)&~(ap.blocksize-1);
+			p = pagein(ap.arenabase, Block);
+			if(u32(p) == ArenaHeadMagic)
+				break;
+		}
+	}
+	p = pagein(ap.arenabase, Block);
+	print("arena base likely %z%s\n", (vlong)ap.arenabase, 
+		u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : "");
+
+	ap.tabsize = ap.arenabase - ap.tabbase;
+	
+}
+
+/*
+ * Check the arena partition superblock and then the arenas listed in range.
+ *
+ * range selects the arenas to check:
+ *	nil		every arena in the partition
+ *	"none"		no arenas; only the superblock is examined
+ *	otherwise	comma-separated arena numbers and lo-hi ranges,
+ *			e.g., -4,8-9,10-.  A missing lo means 0; a missing
+ *			hi means the last arena.
+ * The range string is split in place: each comma is overwritten with NUL.
+ */
+void
+checkarenas(char *range)
+{
+	char *elem, *next;
+	int a, first, last, narena;
+	uchar want[HeadSize];
+	uchar *disk;
+
+	guessgeometry();
+
+	/* ignore any partial block at the end of the partition */
+	partend -= partend%ap.blocksize;
+
+	/* compare the superblock we would write against the one found on disk */
+	memset(want, 0, sizeof want);
+	packarenapart(&ap, want);
+	disk = pagein(PartBlank, Block);
+	if(memcmp(disk, want, HeadSize) != 0){
+		print("on-disk arena part superblock incorrect\n");
+		showdiffs(want, disk, HeadSize, partinfo);
+	}
+	memmove(disk, want, HeadSize);
+
+	narena = (partend-ap.arenabase + arenasize-1)/arenasize;
+	if(range == nil){
+		for(a=0; a<narena; a++)
+			checkarena(ap.arenabase+(vlong)a*arenasize, a);
+		return;
+	}
+	if(strcmp(range, "none") == 0)
+		return;
+
+	/* parse, e.g., -4,8-9,10- */
+	for(elem=range; *elem; elem=next){
+		/* cut off this element; next is where the following one starts */
+		next = strchr(elem, ',');
+		if(next == nil)
+			next = elem+strlen(elem);
+		else
+			*next++ = 0;
+
+		first = 0;
+		if(*elem != '-')
+			first = strtol(elem, &elem, 0);
+		last = first;
+		if(*elem == '-'){
+			elem++;
+			last = narena-1;
+			if(*elem != 0)
+				last = strtol(elem, &elem, 0);
+		}
+		if(*elem != 0){
+			print("bad arena range: %s\n", elem);
+			continue;
+		}
+		for(a=first; a<=last; a++)
+			checkarena(ap.arenabase+(vlong)a*arenasize, a);
+	}
+}
+
+/*
+ * Is there a clump here at p?
+ *
+ * Parses a clump header at p into cl and verifies that the data
+ * following it (uncompressed, or after unwhack) hashes to the score
+ * stored in the header.  On success stores the header's magic in
+ * *pmagic and returns the total number of bytes (header plus data)
+ * the clump occupies.  Returns 0 if no valid clump starts at p;
+ * cl may have been partially filled in by then.
+ */
+static int
+isclump(uchar *p, Clump *cl, u32int *pmagic)
+{
+	int nout;
+	u32int m;
+	uchar *start, sc[VtScoreSize];
+	uchar raw[70*1024];
+	Unwhack uw;
+
+	start = p;
+
+	/* header: magic[4] type[1] size[2] uncsize[2] score encoding[1] creator[4] time[4] */
+	m = u32(p);
+	if(m == 0)
+		return 0;
+	p += U32Size;
+
+	cl->info.type = vtfromdisktype(*p);
+	if(cl->info.type == 0xFF)
+		return 0;
+	p++;
+
+	cl->info.size = u16(p);
+	p += U16Size;
+
+	cl->info.uncsize = u16(p);
+	if(cl->info.size > cl->info.uncsize)
+		return 0;
+	p += U16Size;
+
+	scorecp(cl->info.score, p);
+	p += VtScoreSize;
+
+	cl->encoding = *p;
+	p++;
+
+	cl->creator = u32(p);
+	p += U32Size;
+
+	cl->time = u32(p);
+	p += U32Size;
+
+	/* p now points at the data; recompute its score */
+	if(cl->encoding == ClumpENone){
+		if(cl->info.size != cl->info.uncsize)
+			return 0;
+		scoremem(sc, p, cl->info.size);
+	}else if(cl->encoding == ClumpECompress){
+		if(cl->info.size >= cl->info.uncsize)
+			return 0;
+		unwhackinit(&uw);
+		nout = unwhack(&uw, raw, cl->info.uncsize, p, cl->info.size);
+		if(nout != cl->info.uncsize)
+			return 0;
+		scoremem(sc, raw, cl->info.uncsize);
+	}else
+		return 0;
+	if(scorecmp(sc, cl->info.score) != 0)
+		return 0;
+	p += cl->info.size;
+
+	/* it all worked out in the end */
+	*pmagic = m;
+	return p - start;
+}
+
+/*
+ * All ClumpInfos seen in this arena.
+ * Kept in binary tree so we can look up by score.
+ *
+ * Entries live in the growable array cibuf in the order they were
+ * added; tree links are indices into cibuf rather than pointers,
+ * with -1 meaning "no child".  Entries describing a corrupt region
+ * (corrupt != 0) are stored in the array but never linked into the tree.
+ */
+typedef struct Cit Cit;
+struct Cit
+{
+	int left;	/* cibuf index of subtree with smaller scores, or -1 */
+	int right;	/* cibuf index of subtree with larger scores, or -1 */
+	vlong corrupt;	/* 0 for a real clump; else byte length of a corrupt region */
+	ClumpInfo ci;
+};
+Cit *cibuf;	/* array of entries, grown by addcibuf */
+int ciroot;	/* cibuf index of the tree root, or -1 if the tree is empty */
+int ncibuf, mcibuf;	/* entries in use, entries allocated */
+
+/*
+ * Forget every recorded ClumpInfo: empty the tree and
+ * mark the array as unused.  The allocation is kept for reuse.
+ */
+void
+resetcibuf(void)
+{
+	ciroot = -1;
+	ncibuf = 0;
+}
+
+/*
+ * Walk the score tree starting at the link *p.
+ * Returns the link that either refers to the entry with the
+ * given score, or holds -1 at the place such an entry belongs.
+ */
+int*
+ltreewalk(int *p, uchar *score)
+{
+	int cmp;
+
+	while(*p != -1){
+		cmp = scorecmp(cibuf[*p].ci.score, score);
+		if(cmp == 0)
+			break;
+		if(cmp < 0)
+			p = &cibuf[*p].right;
+		else
+			p = &cibuf[*p].left;
+	}
+	return p;
+}
+
+/*
+ * Append an entry to cibuf, growing the array as needed.
+ * ci is copied.  corrupt is 0 for a real clump, or the length in
+ * bytes of a corrupt region (see addcicorrupt).
+ *
+ * Real clumps are also linked into the score tree so that
+ * haveclump can find them.  If the score is already present,
+ * the new entry is only appended to the array: overwriting the
+ * existing link would detach the older entry together with its
+ * whole subtree and make those scores invisible to haveclump.
+ */
+void
+addcibuf(ClumpInfo *ci, vlong corrupt)
+{
+	Cit *cit;
+	int *link;
+
+	if(ncibuf == mcibuf){
+		mcibuf += 131072;
+		cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]);
+	}
+	cit = &cibuf[ncibuf];
+	cit->ci = *ci;
+	cit->left = -1;
+	cit->right = -1;
+	cit->corrupt = corrupt;
+	if(!corrupt){
+		link = ltreewalk(&ciroot, ci->score);
+		if(*link == -1)	/* only fill an empty slot; keep existing duplicates linked */
+			*link = ncibuf;
+	}
+	ncibuf++;
+}
+
+/*
+ * Record a corrupt region of len bytes as a placeholder
+ * entry with an all-zero ClumpInfo.
+ */
+void
+addcicorrupt(vlong len)
+{
+	static ClumpInfo blank;
+
+	addcibuf(&blank, len);
+}
+
+/*
+ * Report whether a clump with the given score has
+ * already been entered in the score tree: 1 if so, 0 if not.
+ */
+int
+haveclump(uchar *score)
+{
+	/* ltreewalk stops on an empty link (-1) exactly when the score is absent */
+	if(*ltreewalk(&ciroot, score) == -1)
+		return 0;
+	return 1;
+}
+
+/*
+ * Does the packed clump info at p (type[1] size[2] uncsize[2] score)
+ * describe the same clump as ci?  Returns 1 if every field agrees, else 0.
+ */
+int
+matchci(ClumpInfo *ci, uchar *p)
+{
+	return ci->type == vtfromdisktype(p[0])
+		&& ci->size == u16(p+1)
+		&& ci->uncsize == u16(p+3)
+		&& scorecmp(ci->score, p+5) == 0;
+}
+
+/*
+ * Does the arena tail block at p (blocksize bytes long) look sealed?
+ * Requires a known arena version at p+4, a 1 in the last byte of the
+ * packed tail, zeros from there up to the score, and a non-zero score
+ * in the final VtScoreSize bytes.  Returns 1 if so; otherwise prints
+ * the reason (except for an unknown version) and returns 0.
+ */
+int
+sealedarena(uchar *p, int blocksize)
+{
+	int vers, tailsz;
+	uchar *sc;
+
+	vers = u32(p+4);
+	if(vers == ArenaVersion4)
+		tailsz = ArenaSize4;
+	else if(vers == ArenaVersion5)
+		tailsz = ArenaSize5;
+	else
+		return 0;
+
+	if(p[tailsz-1] != 1){
+		print("arena tail says not sealed\n");
+		return 0;
+	}
+	if(memcmp(p+tailsz, zero, blocksize-VtScoreSize-tailsz) != 0){
+		print("arena tail followed by non-zero data\n");
+		return 0;
+	}
+	sc = p+blocksize-VtScoreSize;
+	if(memcmp(sc, zero, VtScoreSize) == 0){
+		print("arena score zero\n");
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Is name acceptable for arena number n?  It must pass nameok
+ * and end with the decimal digits of n.  Returns 1 if so, else 0.
+ */
+int
+okayname(char *name, int n)
+{
+	char digits[20];
+	int nlen, dlen;
+
+	if(nameok(name) < 0)
+		return 0;
+	sprint(digits, "%d", n);
+	nlen = strlen(name);
+	dlen = strlen(digits);
+	if(nlen < dlen)
+		return 0;
+	return strcmp(name+nlen-dlen, digits) == 0;
+}
+
+/*
+ * Order two ClumpInfos by type, then size, then uncsize, then score.
+ * Returns zero when all four agree, non-zero otherwise.
+ */
+int
+clumpinfocmp(ClumpInfo *a, ClumpInfo *b)
+{
+	int d;
+
+	d = a->type - b->type;
+	if(d != 0)
+		return d;
+	d = a->size - b->size;
+	if(d != 0)
+		return d;
+	d = a->uncsize - b->uncsize;
+	if(d != 0)
+		return d;
+	return scorecmp(a->score, b->score);
+}
+
+/*
+ * Read the first nci clump directory entries of the arena that
+ * starts at partition offset `offset'.  Directory blocks are read
+ * backward, starting with the block just before the arena's last
+ * block; each block holds blocksize/ClumpInfoSize entries.
+ * Returns a vtmalloc'ed array of nci entries; the caller frees it.
+ */
+ClumpInfo*
+loadci(vlong offset, Arena *arena, int nci)
+{
+	int done, k, perblock;
+	uchar *blk, *win;
+	ClumpInfo *base, *out;
+
+	perblock = arena->blocksize/ClumpInfoSize;
+	base = vtmalloc(nci*sizeof base[0]);
+	out = base;
+
+	/* offset tracks the start of the most recently consumed block */
+	offset += arena->size - arena->blocksize;
+	win = nil;
+	blk = nil;
+	for(done=0; done<nci; done+=perblock){
+		if(blk == win){
+			/* window used up (or first pass): page in the 4M below offset */
+			win = pagein(offset-4*M, 4*M);
+			blk = win+4*M;
+		}
+		blk -= arena->blocksize;
+		offset -= arena->blocksize;
+		for(k=0; k<perblock && done+k<nci; k++)
+			unpackclumpinfo(out++, blk+k*ClumpInfoSize);
+	}
+	return base;
+}
+
+/*
+ * Pack the nci entries of ci into the clump directory of the arena
+ * that starts at partition offset `offset', filling directory blocks
+ * backward from the block just before the arena's last block.
+ * Each block touched is zeroed before entries are packed into it.
+ * Calls pageout when done and returns the partition offset of the
+ * lowest directory block written.
+ */
+vlong
+writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci)
+{
+	int done, k, perblock;
+	uchar *blk, *win;
+
+	perblock = arena->blocksize/ClumpInfoSize;
+
+	/* offset tracks the start of the most recently filled block */
+	offset += arena->size - arena->blocksize;
+	win = nil;
+	blk = nil;
+	for(done=0; done<nci; done+=perblock){
+		if(blk == win){
+			/* window used up (or first pass): page in the 4M below offset */
+			win = pagein(offset-4*M, 4*M);
+			blk = win+4*M;
+		}
+		blk -= arena->blocksize;
+		offset -= arena->blocksize;
+		memset(blk, 0, arena->blocksize);
+		for(k=0; k<perblock && done+k<nci; k++)
+			packclumpinfo(ci++, blk+k*ClumpInfoSize);
+	}
+	pageout();
+	return offset;
+}
+
+/*
+ * Fill in the basic fields of head and arena (size, block size,
+ * version, clump magic, name) for arena number anum starting at
+ * partition offset offset0.  Size and block size come from the
+ * guessed geometry; version, clump magic and name are taken from
+ * the on-disk head and tail blocks when those unpack successfully.
+ * Calls sysfatal if no name can be determined.
+ */
+void
+loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena)
+{
+	static char lastbase[ANameSize];	/* base name used for the previous arena */
+	char digits[ANameSize];
+	uchar *blk;
+	ArenaHead diskhead;
+	Arena disktail;
+
+	/*
+	 * Fmtarenas makes all arenas the same size
+	 * except the last, which may be smaller.
+	 * It uses the same block size for arenas as for
+	 * the arena partition blocks.
+	 */
+	arena->size = arenasize;
+	if(offset0+arena->size > partend)
+		arena->size = partend - offset0;
+	arena->blocksize = ap.blocksize;
+
+	/*
+	 * Look for clump magic and name in head/tail blocks.
+	 * All the other info we will reconstruct just in case.
+	 */
+	memset(&diskhead, 0, sizeof diskhead);
+	blk = pagein(offset0, arena->blocksize);
+	if(unpackarenahead(&diskhead, blk) >= 0){
+		head->version = diskhead.version;
+		head->clumpmagic = diskhead.clumpmagic;
+		if(okayname(diskhead.name, anum))
+			strcpy(head->name, diskhead.name);
+	}
+
+	memset(&disktail, 0, sizeof disktail);
+	blk = pagein(offset0+arena->size-arena->blocksize, arena->blocksize);
+	if(unpackarena(&disktail, blk) >= 0){
+		arena->version = disktail.version;
+		arena->clumpmagic = disktail.clumpmagic;
+		if(okayname(disktail.name, anum))
+			strcpy(arena->name, disktail.name);
+		arena->diskstats.clumps = disktail.diskstats.clumps;
+		print("old arena: sealed=%d\n", disktail.diskstats.sealed);
+		arena->diskstats.sealed = disktail.diskstats.sealed;
+	}
+
+	/* Head trumps arena. */
+	if(head->version){
+		arena->version = head->version;
+		arena->clumpmagic = head->clumpmagic;
+	}
+	if(arena->version == 0)
+		arena->version = ArenaVersion5;
+
+	/*
+	 * Choose the name: an explicit base name wins, then the base
+	 * remembered from the previous arena, then the on-disk head,
+	 * then whatever the on-disk tail supplied.
+	 */
+	if(basename)
+		snprint(arena->name, ANameSize, "%s%d", basename, anum);
+	else if(lastbase[0])
+		snprint(arena->name, ANameSize, "%s%d", lastbase, anum);
+	else if(head->name[0])
+		strcpy(arena->name, head->name);
+	else if(arena->name[0] == 0)
+		sysfatal("cannot determine base name for arena; use -n");
+
+	/* remember the name minus its trailing arena number for next time */
+	strcpy(lastbase, arena->name);
+	sprint(digits, "%d", anum);
+	lastbase[strlen(lastbase)-strlen(digits)] = 0;
+
+	/* Was working in arena, now copy to head. */
+	head->version = arena->version;
+	memmove(head->name, arena->name, sizeof head->name);
+	head->blocksize = arena->blocksize;
+	head->size = arena->size;
+}
+
+/*
+ * Start the hash sb at offset0 by feeding it a freshly
+ * packed (zero-padded) copy of head as the first block.
+ */
+void
+shahead(Shabuf *sb, vlong offset0, ArenaHead *head)
+{
+	uchar blk[MaxDiskBlock];
+
+	memset(blk, 0, sizeof blk);
+	packarenahead(head, blk);
+	sb->offset = offset0;
+	sbupdate(sb, blk, offset0, head->blocksize);
+}
+
+/*
+ * Pick a clump magic number for an arena of the given version.
+ * Version 4 arenas always use _ClumpMagic; other versions get a
+ * random value that is neither 0 nor _ClumpMagic.
+ */
+u32int
+newclumpmagic(int version)
+{
+	u32int m;
+
+	if(version == ArenaVersion4)
+		return _ClumpMagic;
+	m = fastrand();
+	while(m == 0 || m == _ClumpMagic)
+		m = fastrand();
+	return m;
+}
+
+/*
+ * Poke around in the arena to find the clump data
+ * and compute the relevant statistics.
+ *
+ * offset0 is the partition offset of the arena and anum its number.
+ * head and arena are zeroed and then filled in with the reconstructed
+ * header and tail.  oldscore and score are zeroed; oldscore receives
+ * the hash of the existing on-disk contents when the on-disk tail says
+ * the arena is sealed, and score receives the hash of the repaired
+ * contents when the arena stays sealed and we are fixing (`fix').
+ *
+ * Along the way the clump directory is compared against the clumps
+ * actually found and rewritten (in the paged buffer) where it disagrees.
+ */
+void
+guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena,
+	uchar *oldscore, uchar *score)
+{
+	uchar dbuf[MaxDiskBlock];
+	int needtozero, clumps, nb1, nb2, minclumps;
+	int inbad, n, ncib, printed, sealing, smart;
+	u32int magic;
+	uchar *sp, *ep, *p;
+	vlong boffset, eoffset, lastclumpend, leaked;
+	vlong offset, toffset, totalcorrupt, v;
+	vlong avgsize;
+	Clump cl;
+	ClumpInfo *bci, *ci, *eci, *xci;
+	Cit *bcit, *cit, *ecit;
+	Shabuf oldsha, newsha;
+
+	/*
+	 * We expect to find an arena, with data, between offset
+	 * and offset+arenasize.  With any luck, the data starts at
+	 * offset+ap.blocksize.  The blocks have variable size and
+	 * aren't padded at all, which doesn't give us any alignment
+	 * constraints.  The blocks are compressed or high entropy,
+	 * but the headers are pretty low entropy (except the score):
+	 *
+	 *	type[1] (range 0 thru 9, 13)
+	 *	size[2]
+	 *	uncsize[2] (<= size)
+	 *
+	 * so we can look for these.  We check the scores as we go,
+	 * so we can't make any wrong turns.  If we find ourselves
+	 * in a dead end, scan forward looking for a new start.
+	 */
+
+	resetcibuf();
+	memset(head, 0, sizeof *head);
+	memset(arena, 0, sizeof *arena);
+	memset(oldscore, 0, VtScoreSize);
+	memset(score, 0, VtScoreSize);
+	memset(&oldsha, 0, sizeof oldsha);
+	memset(&newsha, 0, sizeof newsha);
+	newsha.rollback = 1;
+
+	if(0){
+		sbdebug(&oldsha, "old.sha");
+		sbdebug(&newsha, "new.sha");
+	}
+
+	loadarenabasics(offset0, anum, head, arena);
+
+	/* start the clump hunt */
+
+	clumps = 0;
+	totalcorrupt = 0;
+	sealing = 1;
+	boffset = offset0 + arena->blocksize;	/* first byte of clump data */
+	offset = boffset;
+	eoffset = offset0+arena->size - arena->blocksize;	/* end of data; shrinks as the directory grows */
+	toffset = eoffset;	/* offset of the arena tail block */
+	sp = pagein(offset0, 4*M);
+
+	if(arena->diskstats.sealed){
+		oldsha.offset = offset0;
+		sbupdate(&oldsha, sp, offset0, 4*M);
+	}
+	ep = sp+4*M;
+	p = sp + (boffset - offset0);
+	ncib = arena->blocksize / ClumpInfoSize;	/* ci per block in index */
+	lastclumpend = offset;
+	nbad = 0;
+	inbad = 0;
+	needtozero = 0;
+	minclumps = 0;
+	while(offset < eoffset){
+		/*
+		 * Shift buffer if we're running out of room.
+		 */
+		if(p+70*K >= ep){
+			/*
+			 * Start the post SHA1 buffer.   By now we should know the
+			 * clumpmagic and arena version, so we can create a
+			 * correct head block to get things going.
+			 */
+			if(sealing && fix && newsha.offset == 0){
+				newsha.offset = offset0;
+				if(arena->clumpmagic == 0){
+					if(arena->version == 0)
+						arena->version = ArenaVersion5;
+					arena->clumpmagic = newclumpmagic(arena->version);
+				}
+				head->clumpmagic = arena->clumpmagic;
+				shahead(&newsha, offset0, head);
+			}
+			n = 4*M-256*K;
+			if(sealing && fix){
+				sbdiskhash(&newsha, bufoffset);
+				sbupdate(&newsha, buf, bufoffset, 4*M-256*K);
+			}
+			pagein(bufoffset+n, 4*M);
+			p -= n;
+			if(arena->diskstats.sealed)
+				sbupdate(&oldsha, buf, bufoffset, 4*M);
+		}
+
+		/*
+		 * Check for a clump at p, which is at offset in the disk.
+		 * Duplicate clumps happen in corrupted disks
+		 * (the same pattern gets written many times in a row)
+		 * and should never happen during regular use.
+		 */
+		if((n = isclump(p, &cl, &magic)) > 0){
+			/*
+			 * If we were in the middle of some corrupted data,
+			 * flush a warning about it and then add any clump
+			 * info blocks as necessary.
+			 */
+			if(inbad){
+				inbad = 0;
+				v = offset-lastclumpend;
+				if(needtozero){
+					zerorange(lastclumpend, v);
+					sbrollback(&newsha, lastclumpend);
+					print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n",
+						lastclumpend, v, v);
+				}
+				addcicorrupt(v);
+				totalcorrupt += v;
+				/* reserve directory blocks for the entries this region needs */
+				nb1 = (minclumps+ncib-1)/ncib;
+				minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize);
+				nb2 = (minclumps+ncib-1)/ncib;
+				eoffset -= (nb2-nb1)*arena->blocksize;
+			}
+
+			if(haveclump(cl.info.score))
+				print("warning: duplicate clump %d %V\n", cl.info.type, cl.info.score);
+
+			/*
+			 * If clumps use different magic numbers, we don't care.
+			 * We'll just use the first one we find and make the others
+			 * follow suit.
+			 */
+			if(arena->clumpmagic == 0){
+				print("clump type %d size %d score %V magic %x\n",
+					cl.info.type, cl.info.size, cl.info.score, magic);
+				arena->clumpmagic = magic;
+				if(magic == _ClumpMagic)
+					arena->version = ArenaVersion4;
+				else
+					arena->version = ArenaVersion5;
+			}
+			if(magic != arena->clumpmagic)
+				p32(p, arena->clumpmagic);
+			if(clumps == 0)
+				arena->ctime = cl.time;
+
+			/*
+			 * Record the clump, update arena stats,
+			 * grow clump info blocks if needed.
+			 */
+			if(verbose > 1)
+				print("\tclump %d: %d %V at %#llux+%#ux (%d)\n", 
+					clumps, cl.info.type, cl.info.score, offset, n, n);
+			addcibuf(&cl.info, 0);
+			if(minclumps%ncib == 0)
+				eoffset -= arena->blocksize;
+			minclumps++;
+			clumps++;
+			if(cl.encoding != ClumpENone)
+				arena->diskstats.cclumps++;
+			arena->diskstats.uncsize += cl.info.uncsize;
+			arena->wtime = cl.time;
+
+			/*
+			 * Move to next clump.
+			 */
+			offset += n;
+			p += n;
+			lastclumpend = offset;
+		}else{
+			/*
+			 * Overwrite malformed clump data with zeros later.
+			 * For now, just record whether it needs to be overwritten.
+			 * Bad regions must be of size at least ClumpSize.
+			 * Postponing the overwriting keeps us from writing past
+			 * the end of the arena data (which might be directory data)
+			 * with zeros.
+			 */
+			if(!inbad){
+				inbad = 1;
+				needtozero = 0;
+				if(memcmp(p, zero, ClumpSize) != 0)
+					needtozero = 1;
+				p += ClumpSize;
+				offset += ClumpSize;
+				nbad++;
+			}else{
+				if(*p != 0)
+					needtozero = 1;
+				p++;
+				offset++;
+			}
+		}
+	}
+	pageout();
+
+	if(verbose)
+		print("readable clumps: %d; min. directory entries: %d\n", 
+			clumps, minclumps);
+	arena->diskstats.used = lastclumpend - boffset;
+	leaked = eoffset - lastclumpend;
+	if(verbose)
+		print("used from %#llux to %#llux = %,lld (%,lld unused)\n",
+			boffset, lastclumpend, arena->diskstats.used, leaked);
+
+	/*
+	 * Finish the SHA1 of the old data.
+	 */
+	if(arena->diskstats.sealed){
+		sbdiskhash(&oldsha, toffset);
+		readdisk(dbuf, toffset, arena->blocksize);
+		/* the stored score itself is hashed as zeros */
+		scorecp(dbuf+arena->blocksize-VtScoreSize, zero);
+		sbupdate(&oldsha, dbuf, toffset, arena->blocksize);
+		sbscore(&oldsha, oldscore);
+	}
+
+	/*
+	 * If we still don't know the clump magic, the arena
+	 * must be empty.  It still needs a value, so make 
+	 * something up.
+	 */
+	if(arena->version == 0)
+		arena->version = ArenaVersion5;
+	if(arena->clumpmagic == 0){
+		arena->clumpmagic = newclumpmagic(arena->version);
+		head->clumpmagic = arena->clumpmagic;
+	}
+
+	/*
+	 * Guess at number of clumpinfo blocks to load.
+	 * If we guess high, it's no big deal.  If we guess low,
+	 * we'll be forced into rewriting the whole directory.
+	 * Still not such a big deal.
+	 */
+	if(clumps == 0 || arena->diskstats.used == totalcorrupt)
+		goto Nocib;
+	if(clumps < arena->diskstats.clumps)
+		clumps = arena->diskstats.clumps;
+	if(clumps < ncibuf)
+		clumps = ncibuf;
+	/*
+	 * Estimate how many clumps the corrupt bytes once held from
+	 * the average size of a readable clump.  The clump count may
+	 * have just been raised to the (untrusted) on-disk value, so
+	 * the average can round down to zero; skip the estimate then
+	 * rather than divide by zero.  The limits below still apply.
+	 */
+	avgsize = (arena->diskstats.used - totalcorrupt)/clumps;
+	if(avgsize > 0)
+		clumps += totalcorrupt/avgsize;
+	clumps += totalcorrupt/2000;
+	if(clumps < minclumps)
+		clumps = minclumps;
+	/* round up to a whole number of directory blocks */
+	clumps += ncib-1;
+	clumps -= clumps%ncib;
+
+	/*
+	 * Can't write into the actual data.
+	 */
+	v = offset0 + arena->size - arena->blocksize;
+	v -= (clumps+ncib-1)/ncib * arena->blocksize;
+	if(v < lastclumpend){
+		v = offset0 + arena->size - arena->blocksize;
+		clumps = (v-lastclumpend)/arena->blocksize * ncib;
+	}
+
+	if(clumps < minclumps)
+		print("cannot happen?\n");
+
+	/*
+	 * Check clumpinfo blocks against directory we created.
+	 * The tricky part is handling the corrupt sections of arena.
+	 * If possible, we remark just the affected directory entries
+	 * rather than slide everything down.
+	 * 
+	 * Allocate clumps+1 blocks and check that we don't need
+	 * the last one at the end.
+	 */
+	bci = loadci(offset0, arena, clumps+1);
+	eci = bci+clumps+1;
+	bcit = cibuf;
+	ecit = cibuf+ncibuf;
+	smart = 1;
+Again:
+	nbad = 0;
+	ci = bci;
+	for(cit=bcit; cit<ecit && ci<eci; cit++){
+		if(cit->corrupt){
+			vlong n, m;
+			if(smart){
+				/*
+				 * If we can, just mark existing entries as corrupt.
+				 */
+				n = cit->corrupt;
+				for(xci=ci; n>0 && xci<eci; xci++)
+					n -= ClumpSize+xci->size;
+				if(n > 0 || xci >= eci)
+					goto Dumb;
+				printed = 0;
+				for(; ci<xci; ci++){
+					if(verbose && ci->type != VtCorruptType){
+						if(!printed){
+							print("marking directory %d-%d as corrupt\n",
+								(int)(ci-bci), (int)(xci-bci));
+							printed = 1;
+						}
+						print("\ttype=%d size=%d uncsize=%d score=%V\n",
+							ci->type, ci->size, ci->uncsize, ci->score);
+					}
+					ci->type = VtCorruptType;
+				}
+			}else{
+			Dumb:
+				print("\trewriting clump directory\n");
+				/*
+				 * Otherwise, blaze a new trail.
+				 */
+				n = cit->corrupt;
+				while(n > 0 && ci < eci){
+					if(n < ClumpSize)
+						sysfatal("bad math in clump corrupt");
+					if(n <= VtMaxLumpSize+ClumpSize)
+						m = n;
+					else{
+						m = VtMaxLumpSize+ClumpSize;
+						/* don't leave a remainder too small to describe */
+						if(n-m < ClumpSize)
+							m -= ClumpSize;
+					}
+					ci->type = VtCorruptType;
+					ci->size = m-ClumpSize;
+					ci->uncsize = m-ClumpSize;
+					memset(ci->score, 0, VtScoreSize);
+					ci++;
+					n -= m;
+				}
+			}
+			continue;
+		}
+		if(clumpinfocmp(&cit->ci, ci) != 0){
+			if(verbose && (smart || verbose>1)){
+				print("clumpinfo %d\n", (int)(ci-bci));
+				print("\twant: %d %d %d %V\n", 
+					cit->ci.type, cit->ci.size,
+					cit->ci.uncsize, cit->ci.score);
+				print("\thave: %d %d %d %V\n", 
+					ci->type, ci->size, 
+					ci->uncsize, ci->score);
+			}
+			*ci = cit->ci;
+			nbad++;
+		}
+		ci++;
+	}
+	if(ci >= eci || cit < ecit){
+		print("ran out of space editing existing directory; rewriting\n");
+		print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit);
+		assert(smart);	/* can't happen second time thru */
+		smart = 0;
+		goto Again;
+	}
+
+	assert(ci <= eci);
+	arena->diskstats.clumps = ci-bci;
+	eoffset = writeci(offset0, arena, bci, ci-bci);
+	if(sealing && fix)
+		sbrollback(&newsha, v);
+print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal);
+	if(lastclumpend > eoffset)
+		print("arena directory overwrote blocks!  cannot happen!\n");
+	free(bci);
+	if(smart && nbad)
+		print("arena directory has %d bad or missing entries\n", nbad);
+Nocib:
+	/*
+	 * With more than 64k still free, leave the arena unsealed
+	 * if it wasn't sealed before or if unsealing was requested.
+	 */
+	if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){
+		if(arena->diskstats.sealed)
+			print("unsealing arena\n");
+		sealing = 0;
+		memset(oldscore, 0, VtScoreSize);
+	}
+
+	/*
+	 * Finish the SHA1 of the new data - only meaningful
+	 * if we've been writing to disk (`fix').
+	 */
+	arena->diskstats.sealed = sealing;
+	arena->memstats = arena->diskstats;
+	if(sealing && fix){
+		uchar tbuf[MaxDiskBlock];
+
+		sbdiskhash(&newsha, toffset);
+		memset(tbuf, 0, sizeof tbuf);
+		packarena(arena, tbuf);
+		sbupdate(&newsha, tbuf, toffset, arena->blocksize);
+		sbscore(&newsha, score);
+	}
+}
+
+/*
+ * Copy the raw contents of the arena at partition offset `offset'
+ * into a new file named dumpbase.anum, 4M at a time.
+ * Errors are reported on standard error; the file descriptor is
+ * closed on every path once the file has been created.
+ */
+void
+dumparena(vlong offset, int anum, Arena *arena)
+{
+	char buf[1000];
+	vlong o, e;
+	int fd, n;
+
+	snprint(buf, sizeof buf, "%s.%d", dumpbase, anum);
+	if((fd = create(buf, OWRITE, 0666)) < 0){
+		fprint(2, "create %s: %r\n", buf);
+		return;
+	}
+	e = offset+arena->size;
+	for(o=offset; o<e; o+=n){
+		n = 4*M;
+		if(o+n > e)
+			n = e-o;	/* short final chunk */
+		if(pwrite(fd, pagein(o, n), n, o-offset) != n){
+			fprint(2, "write %s at %#llux: %r\n", buf, o-offset);
+			break;
+		}
+	}
+	close(fd);
+}
+
+/*
+ * Check arena number anum at partition offset `offset'.
+ * Reconstructs the head and tail with guessarena, reports how the
+ * on-disk head, tail and seal score differ from the reconstruction,
+ * and copies the reconstructed blocks into the paged buffer before
+ * calling pageout.  With a dump base set, the arena is dumped to a
+ * file instead and nothing is compared.
+ */
+void
+checkarena(vlong offset, int anum)
+{
+	uchar want[MaxDiskBlock];
+	uchar *disk, *diskscore, oldscore[VtScoreSize], score[VtScoreSize];
+	Arena arena, oarena;
+	ArenaHead head;
+	Info *tailfmt;
+	int v4, tailsz;
+
+	print("# arena %d: offset %#llux\n", anum, offset);
+
+	if(offset >= partend){
+		print("arena offset out of bounds\n");
+		return;
+	}
+
+	guessarena(offset, anum, &head, &arena, oldscore, score);
+
+	if(verbose){
+		print("#\tversion=%d name=%s blocksize=%d size=%z",
+			head.version, head.name, head.blocksize, head.size);
+		if(head.clumpmagic)
+			print(" clumpmagic=%#.8ux", head.clumpmagic);
+		print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n",
+			arena.diskstats.clumps, arena.diskstats.cclumps,
+			arena.diskstats.used, arena.diskstats.uncsize);
+		print("#\tctime=%t\n", arena.ctime);
+		print("#\twtime=%t\n", arena.wtime);
+		if(arena.diskstats.sealed)
+			print("#\tsealed score=%V\n", score);
+	}
+
+	if(dumpbase){
+		dumparena(offset, anum, &arena);
+		return;
+	}
+
+	/* head block: compare, then install the reconstructed version */
+	memset(want, 0, sizeof want);
+	packarenahead(&head, want);
+	disk = pagein(offset, arena.blocksize);
+	if(memcmp(want, disk, arena.blocksize) != 0){
+		print("on-disk arena header incorrect\n");
+		showdiffs(want, disk, arena.blocksize, 
+			arena.version==ArenaVersion4 ? headinfo4 : headinfo5);
+	}
+	memmove(disk, want, arena.blocksize);
+
+	/* tail block */
+	memset(want, 0, sizeof want);
+	packarena(&arena, want);
+	if(arena.diskstats.sealed)
+		scorecp(want+arena.blocksize-VtScoreSize, score);
+	disk = pagein(offset+arena.size-arena.blocksize, arena.blocksize);
+	diskscore = disk+arena.blocksize-VtScoreSize;
+	memset(&oarena, 0, sizeof oarena);
+	unpackarena(&oarena, disk);
+
+	v4 = arena.version == ArenaVersion4;
+	tailsz = v4 ? ArenaSize4 : ArenaSize5;
+	tailfmt = v4 ? tailinfo4 : tailinfo5;
+	if(disk[tailsz] == 1){
+		/* the on-disk tail carries the extension; describe it that way */
+		tailfmt = v4 ? tailinfo4a : tailinfo5a;
+		if(oarena.diskstats.sealed){
+			/*
+			 * some arenas were sealed with the extension
+			 * before we adopted the convention that if it didn't
+			 * add new information it gets dropped.
+			 */
+			_packarena(&arena, want, 1);
+		}
+	}
+	if(memcmp(want, disk, arena.blocksize-VtScoreSize) != 0){
+		print("on-disk arena tail incorrect\n");
+		showdiffs(want, disk, arena.blocksize-VtScoreSize, tailfmt);
+	}
+
+	/* seal score */
+	if(arena.diskstats.sealed){
+		if(oarena.diskstats.sealed && scorecmp(diskscore, oldscore) != 0){
+			print("on-disk arena seal score incorrect\n");
+			print("\tcorrect=%V\n", oldscore);
+			print("\t   disk=%V\n", diskscore);
+		}
+		if(fix && scorecmp(diskscore, score) != 0){
+			print("%ssealing arena%s: %V\n", 
+				oarena.diskstats.sealed ? "re" : "",
+				scorecmp(oldscore, score) == 0 ? 
+					"" : " after changes", score);
+		}
+	}
+	memmove(disk, want, arena.blocksize);
+
+	pageout();
+}
+
+/*
+ * Build an arena map by stepping through the partition one
+ * arenasize at a time from ap.arenabase and recording every
+ * position where an arena head unpacks successfully.
+ * Returns a newly allocated AMapN.
+ */
+AMapN*
+buildamap(void)
+{
+	uchar *blk;
+	vlong pos;
+	ArenaHead hd;
+	AMapN *amap;
+	AMap *ent;
+
+	amap = vtmallocz(sizeof *amap);
+	pos = ap.arenabase;
+	while(pos < partend){
+		blk = pagein(pos, Block);
+		if(unpackarenahead(&hd, blk) >= 0){
+			/* grow the map by one and fill in the new last entry */
+			amap->map = vtrealloc(amap->map, (amap->n+1)*sizeof amap->map[0]);
+			ent = &amap->map[amap->n++];
+			ent->start = pos;
+			ent->stop = pos+hd.size;
+			strcpy(ent->name, hd.name);
+		}
+		pos += arenasize;
+	}
+	return amap;
+}
+
+/*
+ * Check the arena map table stored in the partition against the
+ * map rebuilt from the arena heads (buildamap).  The expected table
+ * is the text "count\n" followed by one "name\tstart\tstop\n" line
+ * per arena; if the stored table differs, the expected text is
+ * copied into the paged buffer before pageout is called.
+ */
+void
+checkmap(void)
+{
+	char *text;
+	uchar *tab;
+	int n;
+	AMapN *amap;
+	AMap *ent;
+	Fmt f;
+
+	/* format the table we expect to find */
+	amap = buildamap();
+	fmtstrinit(&f);
+	fmtprint(&f, "%ud\n", amap->n);
+	for(ent=amap->map; ent<amap->map+amap->n; ent++)
+		fmtprint(&f, "%s\t%lld\t%lld\n",
+			ent->name, ent->start, ent->stop);
+	text = fmtstrflush(&f);
+
+	n = strlen(text);
+	if(n > ap.tabsize){
+		print("arena partition map too long: need %z bytes have %z\n",
+			(vlong)n, (vlong)ap.tabsize);
+		n = ap.tabsize;
+	}
+
+	if(ap.tabsize >= 4*M){	/* can't happen - max arenas is 2000 */
+		print("arena partition map *way* too long\n");
+		return;
+	}
+
+	tab = pagein(ap.tabbase, ap.tabsize);
+	if(memcmp(tab, text, n) != 0){
+		print("arena partition map incorrect; rewriting.\n");
+		memmove(tab, text, n);
+	}
+	pageout();
+}
+
+/* stack size for the main thread */
+int mainstacksize = 512*1024;
+
+/*
+ * Entry point: parse the flags, open the arena partition
+ * (read-only unless -f is given), check the requested arenas
+ * and then the arena map.
+ *
+ * Takes one or two arguments: the partition file and an
+ * optional arena range (see checkarenas).
+ */
+void
+threadmain(int argc, char **argv)
+{
+	int mode;
+
+	readonly = 1;
+	mode = OREAD;
+	ARGBEGIN{
+	case 'a':
+		arenasize = unittoull(EARGF(usage()));
+		break;
+	case 'b':
+		ap.blocksize = unittoull(EARGF(usage()));
+		break;
+	case 'f':
+		/* fix problems: open the partition for writing */
+		readonly = 0;
+		mode = ORDWR;
+		fix = 1;
+		break;
+	case 'n':
+		basename = EARGF(usage());
+		break;
+	case 'U':
+		unseal = 1;
+		break;
+	case 'v':
+		verbose++;
+		break;
+	case 'x':
+		dumpbase = EARGF(usage());
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	if(argc < 1 || argc > 2)
+		usage();
+
+	file = argv[0];
+
+	ventifmtinstall();
+	fmtinstall('z', zfmt);
+	fmtinstall('t', tfmt);
+	quotefmtinstall();
+
+	part = initpart(file, mode|ODIRECT);
+	if(part == nil)
+		sysfatal("can't open %s: %r", file);
+	partend = part->size;
+
+	if(argc > 1)
+		checkarenas(argv[1]);
+	else
+		checkarenas(nil);
+	checkmap();
+	threadexitsall(nil);
+}
+
blob - f35580ed48467e6f9d1cffa8210a82d491b0d731
blob + 1a6f1e4ba352b23e52f23aeed9dd5e01fb790d51
--- src/cmd/venti/srv/fns.h
+++ src/cmd/venti/srv/fns.h
@@ -24,8 +24,13 @@ void		delaykickicache(void);
 void		delaykickround(Round*);
 void		delaykickroundproc(void*);
 void		dirtydblock(DBlock*, int);
+void		diskaccess(int);
+void		disksched(void);
 AState	diskstate(void);
 void		*emalloc(ulong);
+void		emptydcache(void);
+void		emptyicache(void);
+void		emptylumpcache(void);
 void		*erealloc(void *, ulong);
 char		*estrdup(char*);
 void		*ezmalloc(ulong);
@@ -49,6 +54,7 @@ u32int		hashbits(u8int *score, int nbits);
 int		httpdinit(char *address, char *webroot);
 int		iaddrcmp(IAddr *ia1, IAddr *ia2);
 IEntry*	icachedirty(u32int, u32int, u64int);
+ulong	icachedirtyfrac(void);
 void		icacheclean(IEntry*);
 int		ientrycmp(const void *vie1, const void *vie2);
 char		*ifileline(IFile *f);
@@ -77,6 +83,7 @@ int		insertscore(u8int *score, IAddr *ia, int write);
 void		kickdcache(void);
 void		kickicache(void);
 void		kickround(Round*, int wait);
+int		loadbloom(Bloom*);
 ZBlock		*loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify);
 DBlock	*loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib);
 int		loadientry(Index *index, u8int *score, int type, IEntry *ie);
@@ -98,6 +105,7 @@ int		okamap(AMap *am, int n, u64int start, u64int stop
 int		okibucket(IBucket*, ISect*);
 int		outputamap(Fmt *f, AMap *am, int n);
 int		outputindex(Fmt *f, Index *ix);
+int		_packarena(Arena *arena, u8int *buf, int);
 int		packarena(Arena *arena, u8int *buf);
 int		packarenahead(ArenaHead *head, u8int *buf);
 int		packarenapart(ArenaPart *as, u8int *buf);
@@ -129,6 +137,7 @@ ZBlock		*readfile(char *name);
 int		readifile(IFile *f, char *name);
 Packet		*readlump(u8int *score, int type, u32int size, int *cached);
 int		readpart(Part *part, u64int addr, u8int *buf, u32int n);
+int		resetbloom(Bloom*);
 int		runconfig(char *config, Config*);
 int		scorecmp(u8int *, u8int *);
 void		scoremem(u8int *score, u8int *buf, int size);
blob - 647c74b2d23405dfc821db4ce8306fa54dc833e2
blob + 9c906ad739af5babbd43d93e8587a6211a2fb0d1
--- src/cmd/venti/srv/graph.c
+++ src/cmd/venti/srv/graph.c
@@ -55,7 +55,11 @@ ginit(void)
 		
 	first = 0;
 	memimageinit();
+#ifdef PLAN9PORT
 	smallfont = openmemsubfont(unsharp("#9/font/lucsans/lstr.10"));
+#else
+	smallfont = openmemsubfont("/lib/font/bit/lucidasans/lstr.10");
+#endif
 	black = memblack;
 	blue = allocrepl(DBlue);
 	red = allocrepl(DRed);
@@ -121,7 +125,7 @@ statgraph(Graph *g)
 	if(g->wid > nelem(bin))
 		g->wid = nelem(bin);
 	if(g->fill < 0)
-		g->fill = ((uint)(uintptr)g->arg>>8)%nelem(lofill);
+		g->fill = ((uint)g->arg>>8)%nelem(lofill);
 	if(g->fill > nelem(lofill))
 		g->fill %= nelem(lofill);
 	
@@ -151,7 +155,7 @@ statgraph(Graph *g)
 	qlock(&memdrawlock);
 	ginit();
 	if(smallfont==nil || black==nil || blue==nil || red==nil || hifill==nil || lofill==nil){
-		werrstr("graphics initialization failed");
+		werrstr("graphics initialization failed: %r");
 		qunlock(&memdrawlock);
 		return nil;
 	}
@@ -186,12 +190,12 @@ statgraph(Graph *g)
 		if(0)
 		if(lastlo != -1){
 			if(lastlo < lo)
-				memimagedraw(m, Rect(x-1, lastlo, x, lo), hifill[g->fill], ZP, memopaque, ZP, S);
+				memimagedraw(m, Rect(x-1, lastlo, x, lo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S);
 			else if(lastlo > lo)
-				memimagedraw(m, Rect(x-1, lo, x, lastlo), hifill[g->fill], ZP, memopaque, ZP, S);
+				memimagedraw(m, Rect(x-1, lo, x, lastlo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S);
 		}
-		memimagedraw(m, Rect(x, hi, x+1,lo), hifill[g->fill], ZP, memopaque, ZP, S);
-		memimagedraw(m, Rect(x, lo, x+1, r.max.y), lofill[g->fill], ZP, memopaque, ZP, S);
+		memimagedraw(m, Rect(x, hi, x+1,lo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S);
+		memimagedraw(m, Rect(x, lo, x+1, r.max.y), lofill[g->fill%nelem(lofill)], ZP, memopaque, ZP, S);
 		lastlo = lo;
 	}
 
blob - ad7222dd8c21ad77458cf336add45f524fc65788
blob + 04d19d9d325bb84b429615ad10aae5a93fbb254f
--- src/cmd/venti/srv/httpd.c
+++ src/cmd/venti/srv/httpd.c
@@ -9,7 +9,7 @@ extern QLock memdrawlock;
 enum
 {
 	ObjNameSize	= 64,
-	MaxObjs		= 16
+	MaxObjs		= 64
 };
 
 struct HttpObj
@@ -28,6 +28,12 @@ static	int		dindex(HConnect *c);
 static	int		xindex(HConnect *c);
 static	int		xlog(HConnect *c);
 static	int		sindex(HConnect *c);
+static	int		hempty(HConnect *c);
+static	int		hlcacheempty(HConnect *c);
+static	int		hdcacheempty(HConnect *c);
+static	int		hicacheempty(HConnect *c);
+static	int		hicachekick(HConnect *c);
+static	int		hdcachekick(HConnect *c);
 static	int		hicacheflush(HConnect *c);
 static	int		hdcacheflush(HConnect *c);
 static	int		notfound(HConnect *c);
@@ -53,10 +59,17 @@ httpdinit(char *address, char *dir)
 	httpdobj("/xindex", xindex);
 	httpdobj("/flushicache", hicacheflush);
 	httpdobj("/flushdcache", hdcacheflush);
+	httpdobj("/kickicache", hicachekick);
+	httpdobj("/kickdcache", hdcachekick);
 	httpdobj("/graph/", xgraph);
+	httpdobj("/set", xset);
 	httpdobj("/set/", xset);
 	httpdobj("/log", xlog);
 	httpdobj("/log/", xlog);
+	httpdobj("/empty", hempty);
+	httpdobj("/emptyicache", hicacheempty);
+	httpdobj("/emptylumpcache", hlcacheempty);
+	httpdobj("/emptydcache", hdcacheempty);
 
 	if(vtproc(listenproc, address) < 0)
 		return -1;
@@ -105,8 +118,6 @@ listenproc(void *vaddress)
 	char *address, ndir[NETPATHLEN], dir[NETPATHLEN];
 	int ctl, nctl, data;
 
-/*sleep(1000);	// let strace find us */
-
 	address = vaddress;
 	ctl = announce(address, dir);
 	if(ctl < 0){
@@ -148,7 +159,6 @@ httpproc(void *v)
 	HConnect *c;
 	int ok, i, n;
 
-/*sleep(1000);	// let strace find us */
 	c = v;
 
 	for(;;){
@@ -182,7 +192,7 @@ httpproc(void *v)
 }
 
 static int
-percent(long v, long total)
+percent(ulong v, ulong total)
 {
 	if(total == 0)
 		total = 1;
@@ -240,6 +250,31 @@ preqtext(HConnect *c)
 }
 
 static int
+herror(HConnect *c)
+{
+	int n;
+	Hio *hout;
+	
+	hout = &c->hout;
+	n = snprint(c->xferbuf, HBufSize, "<html><head><title>Error</title></head>\n<body><h1>Error</h1>\n<pre>%r</pre>\n</body></html>");
+	hprint(hout, "%s %s\r\n", hversion, "400 Bad Request");
+	hprint(hout, "Date: %D\r\n", time(nil));
+	hprint(hout, "Server: Venti\r\n");
+	hprint(hout, "Content-Type: text/html\r\n");
+	hprint(hout, "Content-Length: %d\r\n", n);
+	if(c->head.closeit)
+		hprint(hout, "Connection: close\r\n");
+	else if(!http11(c))
+		hprint(hout, "Connection: Keep-Alive\r\n");
+	hprint(hout, "\r\n");
+
+	if(c->req.meth == nil || strcmp(c->req.meth, "HEAD") != 0)
+		hwrite(hout, c->xferbuf, n);
+
+	return hflush(hout);
+}
+	
+static int
 notfound(HConnect *c)
 {
 	int r;
@@ -325,21 +360,53 @@ static struct
 	"logging",	&ventilogging,
 	"stats",	&collectstats,
 	"icachesleeptime",	&icachesleeptime,
+	"minicachesleeptime",	&minicachesleeptime,
 	"arenasumsleeptime",	&arenasumsleeptime,
+	"l0quantum",	&l0quantum,
+	"l1quantum",	&l1quantum,
+	"manualscheduling",	&manualscheduling,
+	"ignorebloom",	&ignorebloom,
+	"syncwrites",	&syncwrites,
+	"icacheprefetch",	&icacheprefetch,
 	0
 };
+
+/* Reply with a text/plain listing of every namedints entry as "name = value". */
+static int
+xsetlist(HConnect *c)
+{
+	int i;
+	if(preqtype(c, "text/plain") < 0)
+		return -1;
+	/* write to the connection, not to the server's standard output */
+	for(i=0; namedints[i].name; i++)
+		hprint(&c->hout, "%s = %d\n", namedints[i].name, *namedints[i].p);
+	hflush(&c->hout);
+	return 0;
+}
 
+
+
 static int
 xset(HConnect *c)
 {
 	int i, nf, r;
 	char *f[10], *s;
 
+	if(strcmp(c->req.uri, "/set") == 0 || strcmp(c->req.uri, "/set/") == 0)
+		return xsetlist(c);
+
 	s = estrdup(c->req.uri);
 	nf = getfields(s+strlen("/set/"), f, nelem(f), 1, "/");
 
-	if(nf < 1)
-		return notfound(c);
+	if(nf < 1){
+		r = preqtext(c);
+		if(r < 0)
+			return r;
+		for(i=0; namedints[i].name; i++)
+			hprint(&c->hout, "%s = %d\n", namedints[i].name, *namedints[i].p);
+		hflush(&c->hout);
+		return 0;
+	}
 	for(i=0; namedints[i].name; i++){
 		if(strcmp(f[0], namedints[i].name) == 0){
 			if(nf >= 2)
@@ -495,6 +562,108 @@ darena(Hio *hout, Arena *arena)
 }
 
 static int
+hempty(HConnect *c)
+{
+	Hio *hout;
+	int r;
+
+	r = preqtext(c);
+	if(r < 0)
+		return r;
+	hout = &c->hout;
+
+	emptylumpcache();
+	emptydcache();
+	emptyicache();
+	hprint(hout, "emptied all caches\n");
+	hflush(hout);
+	return 0;
+}
+
+/* HTTP handler: run emptylumpcache and print a confirmation line. */
+static int
+hlcacheempty(HConnect *c)
+{
+	int r;
+	Hio *out;
+
+	out = &c->hout;
+	if((r = preqtext(c)) < 0)
+		return r;
+
+	emptylumpcache();
+	hprint(out, "emptied lumpcache\n");
+	hflush(out);
+	return 0;
+}
+
+/* HTTP handler: run emptyicache and print a confirmation line. */
+static int
+hicacheempty(HConnect *c)
+{
+	int r;
+	Hio *out;
+
+	out = &c->hout;
+	if((r = preqtext(c)) < 0)
+		return r;
+
+	emptyicache();
+	hprint(out, "emptied icache\n");
+	hflush(out);
+	return 0;
+}
+
+/* HTTP handler: run emptydcache and print a confirmation line. */
+static int
+hdcacheempty(HConnect *c)
+{
+	int r;
+	Hio *out;
+
+	out = &c->hout;
+	if((r = preqtext(c)) < 0)
+		return r;
+
+	emptydcache();
+	hprint(out, "emptied dcache\n");
+	hflush(out);
+	return 0;
+}
+/* HTTP handler: run kickicache and print a confirmation line. */
+static int
+hicachekick(HConnect *c)
+{
+	int r;
+	Hio *out;
+
+	out = &c->hout;
+	if((r = preqtext(c)) < 0)
+		return r;
+
+	kickicache();
+	hprint(out, "kicked icache\n");
+	hflush(out);
+	return 0;
+}
+
+/* HTTP handler: run kickdcache and print a confirmation line. */
+static int
+hdcachekick(HConnect *c)
+{
+	int r;
+	Hio *out;
+
+	out = &c->hout;
+	if((r = preqtext(c)) < 0)
+		return r;
+
+	kickdcache();
+	hprint(out, "kicked dcache\n");
+	hflush(out);
+	return 0;
+}
+static int
 hicacheflush(HConnect *c)
 {
 	Hio *hout;
@@ -569,6 +738,7 @@ rawgraph(Stats *s, Stats *t, void *va)
 {
 	Arg *a;
 
+	USED(s);
 	a = va;
 	return t->n[a->index];
 }
@@ -587,6 +757,7 @@ pctgraph(Stats *s, Stats *t, void *va)
 {
 	Arg *a;
 
+	USED(s);
 	a = va;
 	return percent(t->n[a->index], t->n[a->index2]);
 }
@@ -722,7 +893,7 @@ static char* graphname[] =
 	"isectwritebyte",
 
 	"sumread",
-	"sumreadbyte"
+	"sumreadbyte",
 };
 
 static int
@@ -733,7 +904,6 @@ findname(char *s)
 	for(i=0; i<nelem(graphname); i++)
 		if(strcmp(graphname[i], s) == 0)
 			return i;
-fprint(2, "no name '%s'\n", s);
 	return -1;
 }
 
@@ -769,10 +939,14 @@ xgraph(HConnect *c)
 if(0) fprint(2, "graph %s\n" ,s);
 	memset(&g, 0, sizeof g);
 	nf = getfields(s+strlen("/graph/"), f, nelem(f), 1, "/");
-	if(nf < 1)
-		goto notfound;
-	if((arg.index = findname(f[0])) == -1 && strcmp(f[0], "*") != 0)
-		goto notfound;
+	if(nf < 1){
+		werrstr("bad syntax -- not enough fields");
+		goto error;
+	}
+	if((arg.index = findname(f[0])) == -1 && strcmp(f[0], "*") != 0){
+		werrstr("unknown name %s", f[0]);
+		goto error;
+	}
 	g.arg = &arg;
 	g.t0 = -120;
 	g.t1 = 0;
@@ -793,14 +967,18 @@ if(0) fprint(2, "graph %s\n" ,s);
 		else if(strncmp(f[i], "max=", 4) == 0)
 			g.max = atoi(f[i]+4);
 		else if(strncmp(f[i], "pct=", 4) == 0){
-			if((arg.index2 = findname(f[i]+4)) == -1)
-				goto notfound;
+			if((arg.index2 = findname(f[i]+4)) == -1){
+				werrstr("unknown name %s", f[i]+4);
+				goto error;
+			}
 			g.fn = pctgraph;
 			g.min = 0;
 			g.max = 100;
 		}else if(strncmp(f[i], "pctdiff=", 8) == 0){
-			if((arg.index2 = findname(f[i]+8)) == -1)
-				goto notfound;
+			if((arg.index2 = findname(f[i]+8)) == -1){
+				werrstr("unknown name %s", f[i]+8);
+				goto error;
+			}
 			g.fn = pctdiffgraph;
 			g.min = 0;
 			g.max = 100;
@@ -830,7 +1008,7 @@ if(0) fprint(2, "graph %s\n" ,s);
 
 	m = statgraph(&g);
 	if(m == nil)
-		goto notfound;
+		goto error;
 
 	if(preqtype(c, "image/png") < 0)
 		return -1;
@@ -843,9 +1021,9 @@ if(0) fprint(2, "graph %s\n" ,s);
 	free(s);
 	return 0;
 
-notfound:
+error:
 	free(s);
-	return notfound(c);
+	return herror(c);
 }
 
 static int
@@ -944,7 +1122,6 @@ vtloghdump(Hio *h, VtLog *l)
 	
 	name = l ? l->name : "&lt;nil&gt;";
 
-fprint(2, "hdump xfer %d\n", h->xferenc);
 	hprint(h, "<html><head>\n");
 	hprint(h, "<title>Venti Server Log: %s</title>\n", name);
 	hprint(h, "</head><body>\n");
blob - 46d411e584473e1f20cf665ca5751b2fa032cf51
blob + 49f741e7911cbb0b2df3b69101cc5657b5afb143
--- src/cmd/venti/srv/icache.c
+++ src/cmd/venti/srv/icache.c
@@ -11,6 +11,7 @@ struct ICache
 	int	bits;			/* bits to use for indexing heads */
 	u32int	size;			/* number of heads; == 1 << bits, should be < entries */
 	IEntry	*base;			/* all allocated hash table entries */
+	IEntry	*free;
 	u32int	entries;		/* elements in base */
 	IEntry	*dirty;		/* chain of dirty elements */
 	u32int	ndirty;
@@ -23,6 +24,8 @@ struct ICache
 	int		nlast;
 };
 
+int icacheprefetch = 1;	/* if zero, lookupscore skips its recent-arena bookkeeping and never requests a load */
+
 static ICache icache;
 
 static IEntry	*icachealloc(IAddr *ia, u8int *score);
@@ -45,6 +48,12 @@ initicache(int bits, int depth)
 	setstat(StatIcacheSize, icache.entries);
 }
 
+ulong
+icachedirtyfrac(void)
+{
+	return (vlong)icache.ndirty*IcacheFrac / icache.entries;	/* ndirty/entries scaled by IcacheFrac; the vlong cast widens before the multiply */
+}
+
 u32int
 hashbits(u8int *sc, int bits)
 {
@@ -141,14 +150,16 @@ lookupscore(u8int *score, int type, IAddr *ia, int *ra
 	 * load the table of contents for that arena into the cache.
 	 */
 	ie = icachealloc(&d.ia, score);
-	icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
-	aa = ie->ia.addr - aa;	/* compute base addr of arena */
-	for(i=0; i<nelem(icache.last); i++)
-		if(icache.last[i] != icache.last[0])
-			break;
-	if(i==nelem(icache.last) && icache.lastload != icache.last[0]){
-		load = icache.last[0];
-		icache.lastload = load;
+	if(icacheprefetch){
+		icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
+		aa = ie->ia.addr - aa;	/* compute base addr of arena */
+		for(i=0; i<nelem(icache.last); i++)
+			if(icache.last[i] != icache.last[0])
+				break;
+		if(i==nelem(icache.last) && icache.lastload != icache.last[0]){
+			load = icache.last[0];
+			icache.lastload = load;
+		}
 	}
 
 found:
@@ -249,6 +260,11 @@ icachealloc(IAddr *ia, u8int *score)
 		trace(TraceLump, "icachealloc unused");
 		goto Found;
 	}
+	
+	if((ie = icache.free) != nil){
+		icache.free = ie->next;
+		goto Found;
+	}
 
 	h = icache.stolen;
 	for(i=0;; i++){
@@ -346,3 +362,21 @@ icacheclean(IEntry *ie)
 	trace(TraceProc, "icachedirty exit");
 }
 
+/* Under the icache lock, move every clean entry off the hash chains onto icache.free. */
+void
+emptyicache(void)
+{
+	IEntry *e, **link;
+	int h;
+	qlock(&icache.lock);
+	for(h=0; h<icache.size; h++)
+		for(link=&icache.heads[h]; (e = *link) != nil; )
+			if(e->dirty != 0)
+				link = &e->next;	/* dirty entries stay on their chain */
+			else{
+				*link = e->next;
+				e->next = icache.free;
+				icache.free = e;
+			}
+	qunlock(&icache.lock);
+}
blob - 9c36ba2ca54e48f55acc645d01f79b1b2d487418
blob + 003abb1856ba386f5402cc94860febb24ef99985
--- src/cmd/venti/srv/icachewrite.c
+++ src/cmd/venti/srv/icachewrite.c
@@ -12,6 +12,7 @@ static void icachewritecoord(void*);
 static IEntry *iesort(IEntry*);
 
 int icachesleeptime = 1000;	/* milliseconds */
+int minicachesleeptime = 50;	/* floor icachewritesect applies to icachesleeptime before sleeping (presumably milliseconds too -- confirm) */
 
 enum
 {
@@ -74,7 +75,7 @@ nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *
 static int
 icachewritesect(Index *ix, ISect *is, u8int *buf)
 {
-	int err, h, bsize;
+	int err, h, bsize, t;
 	u32int lo, hi;
 	u64int addr, naddr;
 	uint nbuf, off;
@@ -96,7 +97,14 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
 	err = 0;
 
 	while(iedirty){
-		sleep(icachesleeptime);
+		disksched();
+		while((t=icachesleeptime) == SleepForever){
+			sleep(1000);
+			disksched();
+		}
+		if(t < minicachesleeptime)
+			t = minicachesleeptime;
+		sleep(t);
 		trace(TraceProc, "icachewritesect nextchunk");
 		chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf);
 
@@ -146,12 +154,15 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
 					break;
 			}
 			packibucket(&ib, buf+off, is->bucketmagic);
+			/* XXX not right - must update cache after writepart */
 			if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){
 				memmove(b->data, buf+off, bsize);
 				putdblock(b);
 			}
 		}
 
+		diskaccess(1);
+
 		trace(TraceProc, "icachewritesect writepart", addr, nbuf);
 		if(writepart(is->part, addr, buf, nbuf) < 0){
 			/* XXX */
@@ -171,6 +182,7 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
 static void
 icachewriteproc(void *v)
 {
+	int ret;
 	uint bsize;
 	ISect *is;
 	Index *ix;
@@ -188,17 +200,17 @@ icachewriteproc(void *v)
 		trace(TraceProc, "icachewriteproc recv");
 		recv(is->writechan, 0);
 		trace(TraceWork, "start");
-		icachewritesect(ix, is, buf);
+		ret = icachewritesect(ix, is, buf);
 		trace(TraceProc, "icachewriteproc send");
 		trace(TraceWork, "finish");
-		send(is->writedonechan, 0);
+		sendul(is->writedonechan, ret);
 	}
 }
 
 static void
 icachewritecoord(void *v)
 {
-	int i;
+	int i, err;
 	Index *ix;
 	AState as;
 
@@ -216,9 +228,9 @@ icachewritecoord(void *v)
 		as = diskstate();
 		if(as.arena==iwrite.as.arena && as.aa==iwrite.as.aa){
 			/* will not be able to do anything more than last flush - kick disk */
-			trace(TraceProc, "icachewritecoord flush dcache");
+			trace(TraceProc, "icachewritecoord kick dcache");
 			kickdcache();
-			trace(TraceProc, "icachewritecoord flushed dcache");
+			trace(TraceProc, "icachewritecoord kicked dcache");
 		}
 		iwrite.as = as;
 
@@ -229,13 +241,15 @@ icachewritecoord(void *v)
 			if(ix->bloom)
 				send(ix->bloom->writechan, 0);
 		
+			err = 0;
 			for(i=0; i<ix->nsects; i++)
-				recv(ix->sects[i]->writedonechan, 0);
+				err |= recvul(ix->sects[i]->writedonechan);
 			if(ix->bloom)
-				recv(ix->bloom->writedonechan, 0);
+				err |= recvul(ix->bloom->writedonechan);
 
-			trace(TraceProc, "icachewritecoord donewrite");
-			setatailstate(&iwrite.as);
+			trace(TraceProc, "icachewritecoord donewrite err=%d", err);
+			if(err == 0)
+				setatailstate(&iwrite.as);
 		}
 		icacheclean(nil);	/* wake up anyone waiting */
 		trace(TraceWork, "finish");
blob - 8cff4180ee3f5e5d4ed318fec42ee11983687fb4
blob + c69192a7e1f8b0ee3d2eeeea7161affe6687e3c7
--- src/cmd/venti/srv/index.c
+++ src/cmd/venti/srv/index.c
@@ -23,16 +23,10 @@
 #include "dat.h"
 #include "fns.h"
 
-/*static int	bucklook(u8int *score, int type, u8int *data, int n); */
-/*static int	writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b); */
-/*static int	okibucket(IBucket *ib, ISect *is); */
 static int	initindex1(Index*);
 static ISect	*initisect1(ISect *is);
-/*static int	splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib); */
 
 #define KEY(k,d)	((d) ? (k)>>(32-(d)) : 0)
-
-/*static QLock	indexlock;	//ZZZ */
 
 static char IndexMagic[] = "venti index configuration";
 
@@ -375,6 +369,8 @@ initisect(Part *part)
 		seterr(EAdmin, "can't read index section header: %r");
 		return nil;
 	}
+print("read %s at %d: %.2ux %.2ux %.2ux %.2ux\n",
+	part->name, PartBlank, b->data[0], b->data[1], b->data[2], b->data[3]);
 
 	is = MKZ(ISect);
 	if(is == nil){
@@ -457,9 +453,10 @@ initisect1(ISect *is)
 	v = is->part->size & ~(u64int)(is->blocksize - 1);
 	if(is->blockbase + (u64int)is->blocks * is->blocksize != v){
 		seterr(ECorrupt, "invalid blocks in index section %s", is->name);
-/*ZZZZZZZZZ */
-/*		freeisect(is); */
-/*		return nil; */
+		/* ZZZ what to do? 
+		freeisect(is);
+		return nil;
+		*/
 	}
 
 	if(is->stop - is->start > is->blocks){
@@ -482,9 +479,10 @@ wbisect(ISect *is)
 	ZBlock *b;
 
 	b = alloczblock(HeadSize, 1, 0);
-	if(b == nil)
-/*ZZZ set error? */
+	if(b == nil){
+		/* ZZZ set error? */
 		return -1;
+	}
 
 	if(packisect(is, b->data) < 0){
 		seterr(ECorrupt, "can't make index section header: %r");
@@ -789,7 +787,7 @@ loadibucket0(Index *ix, u32int buck, ISect **pis, u32i
 /*
  * find the number of the index section holding score
  */
-static int
+int
 indexsect1(Index *ix, u8int *score)
 {
 	return indexsect0(ix, hashbits(score, 32) / ix->div);
blob - 1fe3cf5c375cdf7e38cff67bfc951a8c7dcd96a4
blob + 13e6fe6aa12f0d8e52c9695a79f38df9bcf12383
--- src/cmd/venti/srv/lump.c
+++ src/cmd/venti/srv/lump.c
@@ -2,6 +2,7 @@
 #include "dat.h"
 #include "fns.h"
 
+int			syncwrites = 0;	/* if set, writeqlump calls flushdcache, flushicache, flushdcache after storing a lump */
 int			queuewrites = 0;
 int			writestodevnull = 0;
 
@@ -45,7 +46,7 @@ readlump(u8int *score, int type, u32int size, int *cac
 		*cached = 0;
 
 	if(lookupscore(score, type, &ia, &rac) < 0){
-		/*ZZZ place to check for someone trying to guess scores */
+		/* ZZZ place to check for someone trying to guess scores */
 		seterr(EOk, "no block with score %V/%d exists", score, type);
 
 		putlump(u);
@@ -92,7 +93,15 @@ writelump(Packet *p, u8int *score, int type, u32int cr
 	if(u->data != nil){
 		ok = 0;
 		if(packetcmp(p, u->data) != 0){
-			seterr(EStrange, "score collision");
+			uchar nscore[VtScoreSize];
+
+			packetsha1(u->data, nscore);
+			if(scorecmp(u->score, score) != 0)
+				seterr(EStrange, "lookuplump returned bad score %V not %V", u->score, score);
+			else if(scorecmp(u->score, nscore) != 0)
+				seterr(EStrange, "lookuplump returned bad data %V not %V", nscore, u->score);
+			else
+				seterr(EStrange, "score collision %V", score);
 			ok = -1;
 		}
 		packetfree(p);
@@ -138,7 +147,13 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 		if(old != nil){
 			ok = 0;
 			if(packetcmp(p, old) != 0){
-				seterr(EStrange, "score collision");
+				uchar nscore[VtScoreSize];
+
+				packetsha1(old, nscore);
+				if(scorecmp(u->score, nscore) != 0)
+					seterr(EStrange, "readilump returned bad data %V not %V", nscore, u->score);
+				else
+					seterr(EStrange, "score collision %V", u->score);
 				ok = -1;
 			}
 			packetfree(p);
@@ -160,6 +175,12 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 		insertlump(u, p);
 	else
 		packetfree(p);
+	
+	if(syncwrites){
+		flushdcache();
+		flushicache();
+		flushdcache();
+	}
 
 	ms = msec() - ms;
 	addstat2(StatRpcWriteNew, 1, StatRpcWriteNewTime, ms);
blob - f183e128d06ebe8bf7840a4590a4f759a06e7dc7
blob + b989c3cbfd45658a94a3df0283ccc03b449d8696
--- src/cmd/venti/srv/lumpcache.c
+++ src/cmd/venti/srv/lumpcache.c
@@ -11,7 +11,7 @@ enum
 {
 	HashLog		= 9,
 	HashSize	= 1<<HashLog,
-	HashMask	= HashSize - 1
+	HashMask	= HashSize - 1,
 };
 
 struct LumpCache
@@ -175,7 +175,6 @@ again:
 	 * remove it from the heap, and fix up the heap.
 	 */
 	size = packetasize(p);
-/*ZZZ */
 	while(lumpcache.avail < size){
 		trace(TraceLump, "insertlump bump");
 		CHECK(checklumpcache());
@@ -277,6 +276,15 @@ bumplump(void)
 	return b;
 }
 
+/* Under the lumpcache lock, call bumplump over and over until it returns zero. */
+void
+emptylumpcache(void)
+{
+	qlock(&lumpcache.lock);
+	while(bumplump()){ }
+	qunlock(&lumpcache.lock);
+}
+
 /*
  * delete an arbitrary block from the heap
  */
@@ -415,3 +423,4 @@ checklumpcache(void)
 	if(lumpcache.nheap + nfree + refed != lumpcache.nblocks)
 		sysfatal("lc: missing blocks: %d %d %d %d", lumpcache.nheap, refed, nfree, lumpcache.nblocks);
 }
+
blob - 1b03f41ce7a65bee9625e3fb0dbe6659f386a10e
blob + 869eaeae0caadda9a8bbfe19be74b85bc7f204c0
--- src/cmd/venti/srv/lumpqueue.c
+++ src/cmd/venti/srv/lumpqueue.c
@@ -58,22 +58,6 @@ initlumpqueues(int nq)
 			seterr(EOk, "can't start write queue slave: %r");
 			return -1;
 		}
-		if(vtproc(queueproc, q) < 0){
-			seterr(EOk, "can't start write queue slave: %r");
-			return -1;
-		}
-		if(vtproc(queueproc, q) < 0){
-			seterr(EOk, "can't start write queue slave: %r");
-			return -1;
-		}
-		if(vtproc(queueproc, q) < 0){
-			seterr(EOk, "can't start write queue slave: %r");
-			return -1;
-		}
-		if(vtproc(queueproc, q) < 0){
-			seterr(EOk, "can't start write queue slave: %r");
-			return -1;
-		}
 	}
 
 	return 0;
blob - c50539fc7c44995ea0f4596eca3771d29ed081da
blob + 2f9a89daf96855c9e243f8f125103ac6571b68b6
--- src/cmd/venti/srv/mkfile
+++ src/cmd/venti/srv/mkfile
@@ -11,6 +11,7 @@ LIBOFILES=\
 	config.$O\
 	conv.$O\
 	dcache.$O\
+	disksched.$O\
 	dump.$O\
 	graph.$O\
 	httpd.$O\
@@ -52,11 +53,13 @@ TARG=\
 	fmtbloom\
 	fmtisect\
 	fmtindex\
+	fixarenas\
 	buildindex\
 	checkarenas\
 	checkindex\
 	clumpstats\
 	findscore\
+	mirrorarenas\
 	rdarena\
 	wrarena\
 	syncindex\
blob - ff5c98e50b849de3b73b74439a27a2bfb0514b82
blob + 1f8238c49124e57f30ad920270de68926fca050a
--- src/cmd/venti/srv/part.c
+++ src/cmd/venti/srv/part.c
@@ -145,8 +145,6 @@ initpart(char *name, int mode)
 	if(hi == 0)
 		hi = dir->length;
 	part->size = hi - part->offset;
-fprint(2, "part %s: file %s offset %,lld size %,lld\n", 
-	name, file, part->offset, part->size);
 #ifdef CANBLOCKSIZE
 	{
 		struct statfs sfs;
@@ -203,10 +201,32 @@ prwb(char *name, int fd, int isread, u64int offset, vo
 	u32int c, delta, icount, opsize;
 	int r;
 
+	icount = count;
 	buf = vbuf;
+
+#ifndef PLAN9PORT
+	op = isread ? "read" : "write";
+	dst = buf;
+	freetmp = nil;
+	while(count > 0){
+		opsize = min(count, 131072 /* blocksize */);
+		if(isread)
+			r = pread(fd, dst, opsize, offset);
+		else
+			r = pwrite(fd, dst, opsize, offset);
+		if(r <= 0)
+			goto Error;
+		offset += r;
+		count -= r;
+		dst += r;
+		if(r != opsize)
+			goto Error;
+	}
+	return icount;
+#endif
+
 	tmp = nil;
 	freetmp = nil;
-	icount = count;
 	opsize = blocksize;
 
 	if(count == 0){
@@ -313,7 +333,7 @@ print("FAILED isread=%d r=%d count=%d blocksize=%d\n",
 			memmove(buf, tmp, count);
 		else{
 			memmove(tmp, buf, count);
-			if(pwrite(fd, tmp, blocksize, offset) != blocksize){
+			if(pwrite(fd, tmp, opsize, offset) != blocksize){
 				dst = tmp;
 				op = "write";
 				goto Error;
@@ -332,9 +352,16 @@ Error:
 	return -1;
 }
 
+#ifndef PLAN9PORT
+static int sdreset(Part*);
+static int reopen(Part*);
+static int threadspawnl(int[3], char*, char*, ...);
+#endif
+
 int
 rwpart(Part *part, int isread, u64int offset, u8int *buf, u32int count)
 {
+	int n, try;
 	u32int blocksize;
 
 	trace(TraceDisk, "%s %s %ud at 0x%llx", 
@@ -351,9 +378,33 @@ rwpart(Part *part, int isread, u64int offset, u8int *b
 	if(blocksize == 0)
 		blocksize = 4096;
 
-	return prwb(part->filename, part->fd, isread, part->offset+offset, buf, count, blocksize);
-}
+	for(try=0;; try++){
+		n = prwb(part->filename, part->fd, isread, part->offset+offset, buf, count, blocksize);
+		if(n >= 0 || try > 10)
+			break;
 
+#ifndef PLAN9PORT
+	    {
+		char err[ERRMAX];
+		/*
+		 * This happens with the sdmv disks frustratingly often.
+		 * Try to fix things up and continue.
+		 */
+		rerrstr(err, sizeof err);
+		if(strstr(err, "i/o timeout") || strstr(err, "i/o error")){
+			if(sdreset(part) >= 0)
+				reopen(part);
+			continue;
+		}else if(strstr(err, "partition has changed")){
+			reopen(part);
+			continue;
+		}
+	    }
+#endif
+		break;
+	}
+	return n;
+}
 int
 readpart(Part *part, u64int offset, u8int *buf, u32int count)
 {
@@ -389,5 +440,202 @@ readfile(char *name)
 	}
 	freepart(p);
 	return b;
+}
+
+
+
+
+
+
+
+
+#ifndef PLAN9PORT
+/*
+ * Revive the disk behind part: write "reset" to its ctl file and, if needed,
+ * re-run disk/fdisk and disk/prep.  Returns 0 if part->filename exists afterward, else -1.
+ */
+static int
+sdreset(Part *part)
+{
+	char *name, *p;
+	int i, fd, xfd[3], rv;
+	static QLock resetlk;
+	Dir *d, *dd;
+	
+	fprint(2, "sdreset %s\n", part->name);
+	name = emalloc(strlen(part->filename)+20);
+	strcpy(name, part->filename);
+	p = strrchr(name, '/');
+	p = p ? p+1 : name;	/* p: last path element, rewritten below to name sibling files */
+	strcpy(p, "ctl");
+	d = dirstat(name);
+	if(d == nil){
+		free(name);
+		return -1;
+	}
+	dd = nil;	/* so the common exit can free it unconditionally */
+
+	/* We don't need multiple people resetting the disk. */
+	qlock(&resetlk);
+	if((fd = open(name, OWRITE)) < 0)
+		goto error;
+	dd = dirfstat(fd);
+	if(d && dd && d->qid.vers != dd->qid.vers){
+		fprint(2, "sdreset %s: got scooped\n", part->name);
+		/* Someone else got here first. */
+		if(access(part->filename, AEXIST) >= 0)
+			goto ok;
+		goto error;
+	}
+
+	/*
+	 * Write "reset" to the ctl file to cause the chipset
+	 * to reinitialize itself (specific to sdmv driver).
+	 * Ignore error in case using other disk.
+	 */
+	fprint(2, "sdreset %s: reset ctl\n", part->name);
+	write(fd, "reset", 5);
+
+	if(access(part->filename, AEXIST) >= 0)
+		goto ok;
+
+	/*
+	 * Re-run fdisk and prep.  Don't use threadwaitchan
+	 * to avoid coordinating for it.  Reopen ctl because
+	 * we reset the disk.
+	 */
+	strcpy(p, "ctl");
+	close(fd);
+	if((fd = open(name, OWRITE)) < 0)
+		goto error;
+	strcpy(p, "data");
+	xfd[0] = open("/dev/null", OREAD);
+	xfd[1] = dup(fd, -1);
+	xfd[2] = dup(2, -1);
+	fprint(2, "sdreset %s: run fdisk %s\n", part->name, name);
+	if(threadspawnl(xfd, "/bin/disk/fdisk", "disk/fdisk", "-p", name, nil) < 0){
+		close(xfd[0]);
+		close(xfd[1]);
+		close(xfd[2]);
+		goto error;
+	}
+	strcpy(p, "plan9");
+	for(i=0; i<=20; i++){
+		sleep(i*100);
+		if(access(part->filename, AEXIST) >= 0)
+			goto ok;
+		if(access(name, AEXIST) >= 0)
+			goto prep;
+	}
+	goto error;
+	
+prep:
+	strcpy(p, "ctl");
+	close(fd);
+	if((fd = open(name, OWRITE)) < 0)
+		goto error;
+	strcpy(p, "plan9");
+	xfd[0] = open("/dev/null", OREAD);
+	xfd[1] = dup(fd, -1);
+	xfd[2] = dup(2, -1);
+	fprint(2, "sdreset %s: run prep\n", part->name);
+	if(threadspawnl(xfd, "/bin/disk/prep", "disk/prep", "-p", name, nil) < 0){
+		close(xfd[0]);
+		close(xfd[1]);
+		close(xfd[2]);
+		goto error;
+	}
+	for(i=0; i<=20; i++){
+		sleep(i*100);
+		if(access(part->filename, AEXIST) >= 0)
+			goto ok;
+	}
+
+error:
+	fprint(2, "sdreset %s: error: %r\n", part->name);
+	rv = -1;
+	goto out;
+
+ok:
+	fprint(2, "sdreset %s: all okay\n", part->name);
+	rv = 0;
+
+out:
+	if(fd >= 0)	/* the ctl descriptor is released on success as well as on error */
+		close(fd);
+	free(d);
+	free(dd);
+	free(name);
+	qunlock(&resetlk);
+	return rv;
+}
 
+/* Open part->filename again and dup the new descriptor onto part->fd; -1 if the open fails. */
+static int
+reopen(Part *part)
+{
+	int nfd;
+	fprint(2, "reopen %s\n", part->filename);
+	if((nfd = open(part->filename, ORDWR)) < 0){
+		fprint(2, "reopen %s: %r\n", part->filename);
+		return -1;
+	}
+	if(nfd == part->fd)
+		return 0;
+	dup(nfd, part->fd);	/* result is not checked */
+	close(nfd);
+	return 0;
+}
+
+typedef struct Spawn Spawn;
+struct Spawn	/* arguments handed from threadspawnl to spawnproc */
+{
+	Channel *c;	/* passed to procexec; threadspawnl reads the pid from it */
+	int fd[3];	/* descriptors that spawnproc dups onto 0, 1 and 2 */
+	char *file;	/* program path given to procexec */
+	char **argv;	/* argument vector given to procexec */
+};
+
+/* Proc body started by threadspawnl: dup s->fd[0..2] onto descriptors 0-2, then procexec s->file. */
+static void
+spawnproc(void *v)
+{
+	int i, *fd;
+	Spawn *s;
+	rfork(RFFDG);	/* presumably gives this proc a private fd table so the dups stay local -- confirm against rfork(2) */
+	s = v;
+	fd = s->fd;
+	for(i=0; i<3; i++)
+		dup(fd[i], i);
+	if(fd[0] > 2)	/* close each distinct original numbered above 2 exactly once */
+		close(fd[0]);
+	if(fd[1] > 2 && fd[1] != fd[0])
+		close(fd[1]);
+	if(fd[2] > 2 && fd[2] != fd[1] && fd[2] != fd[0])
+		close(fd[2]);
+	procexec(s->c, s->file, s->argv);
+}
+
+/* Start file in a new proc with fd[0..2] as descriptors 0-2 and &argv0 as argv; return its
+ * pid and close fd[].  On failure (including vtproc failing) return -1, fd[] left open. */
+static int
+threadspawnl(int fd[3], char *file, char *argv0, ...)
+{
+	int pid;
+	Spawn s;
+
+	s.c = chancreate(sizeof(void*), 0);
+	memmove(s.fd, fd, sizeof(s.fd));
+	s.file = file;
+	s.argv = &argv0;
+	if(vtproc(spawnproc, &s) < 0 || (pid = recvul(s.c)) < 0)	/* no proc means nothing would ever send on s.c */
+		return -1;
+	close(fd[0]);
+	if(fd[1] != fd[0])
+		close(fd[1]);
+	if(fd[2] != fd[1] && fd[2] != fd[0])
+		close(fd[2]);
+	return pid;
+}
+
+#endif
blob - /dev/null
blob + 253b4edbc5dc8591265f2abfa744fd77f44895cc (mode 644)
--- /dev/null
+++ src/cmd/venti/srv/mirrorarenas.c
@@ -0,0 +1,464 @@
+/*
+ * Mirror one arena partition onto another.  
+ * Be careful to copy only new data.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+Channel *writechan;	/* carries Write* requests from copy to writeproc; a nil pointer ends writeproc */
+
+typedef struct Write Write;
+struct Write	/* one request handed to writeproc */
+{
+	uchar *p;	/* data to write */
+	int n;	/* byte count; 0 makes writeproc skip the request */
+	uvlong o;	/* offset passed to ewritepart */
+	int error;	/* set to 1 by writeproc when ewritepart fails */
+};
+
+Part *src;	/* partition read from (argv[0]) */
+Part *dst;	/* partition written to (argv[1]) */
+int force;	/* -F: mirror copies the whole arena without comparing */
+int verbose;	/* -v: print progress messages */
+char *status;	/* exit string handed to threadexitsall; set to "errors" when a mismatch is found */
+uvlong astart, aend;	/* byte range of the arena being mirrored; set by mirror, asserted in copy */
+
+void
+usage(void)
+{
+	fprint(2, "usage: mirrorarenas [-v] src dst [ranges]\n");	/* NOTE(review): threadmain also accepts -F, not listed here -- confirm that is intentional */
+	threadexitsall("usage");
+}
+
+int
+ereadpart(Part *p, u64int offset, u8int *buf, u32int count)
+{
+	if(readpart(p, offset, buf, count) == count)	/* all count bytes arrived */
+		return 0;
+	print("%T readpart %s at %#llux+%ud: %r\n", p->name, offset, count);
+	status = "errors";	/* callers only return on -1, so record the failure for the exit status */
+	return -1;
+}
+		
+int
+ewritepart(Part *p, u64int offset, u8int *buf, u32int count)
+{
+	if(writepart(p, offset, buf, count) == count)	/* all count bytes were written */
+		return 0;
+	print("%T writepart %s at %#llux+%ud: %r\n", p->name, offset, count);
+	status = "errors";	/* callers only return on -1, so record the failure for the exit status */
+	return -1;
+}
+
+/*
+ * Extra proc to do writes to dst, so that we can overlap reading
+ * src with writing dst during copy.  This is an easy factor of two
+ * (almost) in performance.
+ */
+static void
+writeproc(void *v)
+{
+	Write *req;
+
+	USED(v);
+	/*
+	 * nil on writechan ends the loop; a request with n == 0 is received but not written
+	 */
+	while((req = recvp(writechan)) != nil)
+		if(req->n != 0 && ewritepart(dst, req->o, req->p, req->n) < 0)
+			req->error = 1;
+}
+
+/* Copy bytes [start, end) from src to dst, adding them to ds when ds is non-nil; 0 on success, -1 on error. */
+int
+copy(uvlong start, uvlong end, char *what, DigestState *ds)
+{
+	int i, n;
+	uvlong o;
+	static uchar tmp[2][1024*1024];	/* two buffers, used alternately */
+	Write w[2];	/* w[k] is the request that goes with tmp[k] */
+	
+	assert(start <= end);
+	assert(astart <= start && start < aend);
+	assert(astart <= end && end <= aend);
+
+	if(verbose && start != end)
+		print("%T   copy %,llud-%,llud %s\n", start, end, what);
+
+	i = 0;
+	memset(w, 0, sizeof w);
+	for(o=start; o<end; o+=n){
+		if(w[i].error)	/* the earlier write that used this slot failed */
+			goto error;
+		n = sizeof tmp[i];
+		if(o+n > end)
+			n = end - o;
+		if(ereadpart(src, o, tmp[i], n) < 0)
+			goto error;
+		w[i].p = tmp[i];
+		w[i].o = o;
+		w[i].n = n;
+		w[i].error = 0;
+		sendp(writechan, &w[i]);	/* hand the buffer to writeproc */
+		if(ds)
+			sha1(tmp[i], n, nil, ds);
+		i = 1-i;	/* switch to the other buffer */
+	}
+	if(w[i].error)
+		goto error;
+	/*
+	 * wait for queued write to finish: writeproc takes this
+	 * empty request (n == 0) only after it is done with the previous one
+	 */
+	w[i].p = nil;
+	w[i].o = 0;
+	w[i].n = 0;
+	w[i].error = 0;
+	sendp(writechan, &w[i]);
+	i = 1-i;
+	if(w[i].error)	/* result of the last real write */
+		return -1;
+	return 0;
+
+error:
+	/*
+	 * sync with write proc
+	 */
+	w[i].p = nil;
+	w[i].o = 0;
+	w[i].n = 0;
+	w[i].error = 0;
+	sendp(writechan, &w[i]);
+	return -1;
+}
+
+/* Reference version of copy: one buffer, each chunk read, hashed and written in turn. */
+int
+copy1(uvlong start, uvlong end, char *what, DigestState *ds)
+{
+	static uchar tmp[1024*1024];
+	uvlong off;
+	int len;
+
+	assert(start <= end);
+	assert(astart <= start && start < aend);
+	assert(astart <= end && end <= aend);
+	if(verbose && start != end)
+		print("%T   copy %,llud-%,llud %s\n", start, end, what);
+	off = start;
+	while(off < end){
+		len = sizeof tmp;
+		if(off+len > end)
+			len = end - off;
+		if(ereadpart(src, off, tmp, len) < 0)
+			return -1;
+		if(ds != nil)
+			sha1(tmp, len, nil, ds);
+		if(ewritepart(dst, off, tmp, len) < 0)
+			return -1;
+		off += len;
+	}
+	return 0;
+}
+
+int
+asha1(Part *p, uvlong start, uvlong end, DigestState *ds)
+{
+	int n;
+	uvlong o;
+	static uchar tmp[1024*1024];
+
+	if(start == end)
+		return 0;
+	assert(start < end);
+
+	if(verbose)
+		print("%T   sha1 %,llud-%,llud\n", start, end);
+
+	for(o=start; o<end; o+=n){
+		n = sizeof tmp;
+		if(o+n > end)
+			n = end - o;
+		if(ereadpart(p, o, tmp, n) < 0)
+			return -1;
+		sha1(tmp, n, nil, ds);
+	}
+	return 0;
+}
+
+uvlong
+rdown(uvlong a, int b)
+{
+	return a/b*b;	/* drop the remainder: the largest multiple of b not above a */
+}
+
+/* Round a up to a multiple of b; an exact multiple is returned unchanged. */
+uvlong
+rup(uvlong a, int b)
+{
+	uvlong rem = a%b;
+	return rem == 0 ? a : a+b-rem;
+}
+
+/* Bring arena da on dst up to date with sa on src: copy only the missing data and directory blocks, rewrite head and trailer, reseal if src is sealed. */
+void
+mirror(Arena *sa, Arena *da)
+{
+	vlong v, si, di, end;
+	int clumpmax, blocksize;
+	static uchar buf[MaxIoSize];
+	ArenaHead h;
+	DigestState xds, *ds;
+	vlong shaoff, base;
+	
+	base = sa->base;
+	blocksize = sa->blocksize;
+	end = sa->base + sa->size;
+	
+	astart = base - blocksize;	/* the head block is written one block before base */
+	aend = end + blocksize;	/* the trailer block is written at end */
+	shaoff = 0;	/* offset up to which ds has been fed so far */
+
+	if(force){
+		copy(astart, aend, "all", nil);	/* NOTE(review): the result of copy is ignored here */
+		return;
+	}
+
+	if(verbose)
+		print("%T %s (%,llud-%,llud)\n", sa->name, astart, aend);
+
+	if(sa->diskstats.sealed && da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){
+		if(scorecmp(sa->score, da->score) == 0)
+			return;	/* both sealed with the same score: nothing to do */
+		print("%T arena %s: sealed score mismatch %V vs %V\n", sa->name, sa->score, da->score);
+		status = "errors";
+		return;
+	}
+	if(da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){
+		print("%T arena %s: dst is sealed, src is not\n", sa->name);
+		status = "errors";
+		return;
+	}
+	if(sa->diskstats.used < da->diskstats.used){
+		print("%T arena %s: src used %,lld < dst used %,lld\n", sa->name, sa->diskstats.used, da->diskstats.used);
+		status = "errors";
+		return;
+	}
+
+	if(da->clumpmagic != sa->clumpmagic){
+		/*
+		 * Write this now to reduce the window in which
+		 * the head and tail disagree about clumpmagic.
+		 */
+		da->clumpmagic = sa->clumpmagic;
+		memset(buf, 0, sizeof buf);
+		packarena(da, buf);
+		if(ewritepart(dst, end, buf, blocksize) < 0)
+			return;
+	}
+	
+	memset(&h, 0, sizeof h);
+	h.version = da->version;
+	strcpy(h.name, da->name);
+	h.blocksize = da->blocksize;
+	h.size = da->size + 2*da->blocksize;	/* arena size plus the head and trailer blocks */
+	h.clumpmagic = da->clumpmagic;
+	memset(buf, 0, sizeof buf);
+	packarenahead(&h, buf);
+	if(ewritepart(dst, base - blocksize, buf, blocksize) < 0)
+		return;
+
+	ds = nil;	/* stays nil unless src is sealed with a nonzero score */
+	if(sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0){
+		/* start sha1 state with header */
+		memset(&xds, 0, sizeof xds);
+		ds = &xds;
+		sha1(buf, blocksize, nil, ds);
+		shaoff = base;
+	}
+	
+	if(sa->diskstats.used != da->diskstats.used){
+		di = base+rdown(da->diskstats.used, blocksize);	/* first block dst may not have completely */
+		si = base+rup(sa->diskstats.used, blocksize);	/* end of the data src has */
+		if(ds && asha1(dst, shaoff, di, ds) < 0)	/* hash what dst already holds */
+			return;
+		if(copy(di, si, "data", ds) < 0)
+			return;
+		shaoff = si;
+	}
+	
+	clumpmax = sa->clumpmax;
+	di = end - da->diskstats.clumps/clumpmax * blocksize;	/* rounds dst's directory block count down */
+	si = end - (sa->diskstats.clumps+clumpmax-1)/clumpmax * blocksize;	/* rounds src's directory block count up */
+
+	if(sa->diskstats.sealed){
+		/*
+		 * might be a small hole between the end of the
+		 * data and the beginning of the directory.
+		 */
+		v = base+rup(sa->diskstats.used, blocksize);
+		if(ds && asha1(dst, shaoff, v, ds) < 0)
+			return;
+		if(copy(v, si, "hole", ds) < 0)
+			return;
+		shaoff = si;
+	}
+
+	if(da->diskstats.clumps != sa->diskstats.clumps){
+		if(ds && asha1(dst, shaoff, si, ds) < 0)
+			return;
+		if(copy(si, di, "directory", ds) < 0)	/* si < di  because clumpinfo blocks grow down */
+			return;
+		shaoff = di;
+	}
+
+	da->ctime = sa->ctime;
+	da->wtime = sa->wtime;
+	da->diskstats = sa->diskstats;
+	da->diskstats.sealed = 0;	/* trailer is written unsealed first; sealed below only if the score matches */
+	
+	memset(buf, 0, sizeof buf);
+	packarena(da, buf);
+	if(ewritepart(dst, end, buf, blocksize) < 0)
+		return;
+
+	if(ds){
+		asha1(dst, shaoff, end, ds);	/* NOTE(review): result not checked */
+		da->diskstats.sealed = 1;
+		memset(buf, 0, sizeof buf);
+		packarena(da, buf);
+		sha1(buf, blocksize, da->score, ds);	/* finish the digest over the trailer block into da->score */
+		if(scorecmp(sa->score, da->score) == 0){
+			if(verbose)
+				print("%T arena %s: %V\n", sa->name, da->score);
+			scorecp(buf+blocksize-VtScoreSize, da->score);	/* score goes in the last VtScoreSize bytes of the trailer */
+			if(ewritepart(dst, end, buf, blocksize) < 0)
+				return;
+		}else{
+			print("%T arena %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score);
+			memset(&xds, 0, sizeof xds);
+			asha1(dst, base-blocksize, end, &xds);	/* rehash dst from its head block for the diagnostic below */
+			sha1(buf, blocksize, da->score, &xds);
+			print("%T   reseal: %V\n", da->score);
+			status = "errors";
+		}
+	}
+}
+
+/* Mirror the arenas selected by range: nil means all, "none" means none, otherwise a
+ * comma-separated list of n, lo-hi, -hi, or lo- entries (indices into sp->arenas).
+ */
+void
+mirrormany(ArenaPart *sp, ArenaPart *dp, char *range)
+{
+	int i, lo, hi;
+	char *s, *t;
+
+	if(range == nil){
+		for(i=0; i<sp->narenas; i++)
+			mirror(sp->arenas[i], dp->arenas[i]);
+		return;
+	}
+	if(strcmp(range, "none") == 0)
+		return;
+
+	for(s=range; *s; s=t){
+		t = strchr(s, ',');
+		if(t)
+			*t++ = 0;
+		else
+			t = s+strlen(s);
+		if(*s == '-')
+			lo = 0;
+		else
+			lo = strtol(s, &s, 0);
+		hi = lo;
+		if(*s == '-'){
+			s++;
+			if(*s == 0)
+				hi = sp->narenas-1;
+			else
+				hi = strtol(s, &s, 0);
+		}
+		if(*s != 0){
+			print("%T bad arena range: %s\n", s);
+			continue;
+		}
+		if(lo < 0 || hi >= sp->narenas){	/* lo..hi index the arena arrays directly */
+			print("%T arena range %d-%d out of bounds (have %d arenas)\n", lo, hi, sp->narenas);
+			continue;
+		}
+		for(i=lo; i<=hi; i++)
+			mirror(sp->arenas[i], dp->arenas[i]);
+	}
+}
+
+
+/* Parse flags, open src and dst as arena partitions, insist their geometries match, then mirror the requested arenas. */
+void
+threadmain(int argc, char **argv)
+{
+	int i;
+	Arena *sa, *da;
+	ArenaPart *s, *d;
+	char *ranges;
+	
+	ventifmtinstall();
+
+	ARGBEGIN{
+	case 'F':
+		force = 1;
+		break;
+	case 'v':
+		verbose++;
+		break;
+	default:
+		usage();
+	}ARGEND
+	
+	if(argc != 2 && argc != 3)
+		usage();
+	ranges = nil;
+	if(argc == 3)
+		ranges = argv[2];
+
+	if((src = initpart(argv[0], OREAD)) == nil)
+		sysfatal("initpart %s: %r", argv[0]);
+	if((dst = initpart(argv[1], ORDWR)) == nil)
+		sysfatal("initpart %s: %r", argv[1]);
+	if((s = initarenapart(src)) == nil)
+		sysfatal("initarenapart %s: %r", argv[0]);
+	for(i=0; i<s->narenas; i++)
+		delarena(s->arenas[i]);
+	if((d = initarenapart(dst)) == nil)
+		sysfatal("initarenapart %s: %r", argv[1]);
+	for(i=0; i<d->narenas; i++)
+		delarena(d->arenas[i]);
+	/*
+	 * The arena geometries must match or all bets are off.
+	 */
+	if(s->narenas != d->narenas)
+		sysfatal("arena count mismatch: %d vs %d", s->narenas, d->narenas);
+	for(i=0; i<s->narenas; i++){
+		sa = s->arenas[i];
+		da = d->arenas[i];
+		if(sa->version != da->version)
+			sysfatal("arena %d: version mismatch: %d vs %d", i, sa->version, da->version);
+		if(sa->blocksize != da->blocksize)
+			sysfatal("arena %d: blocksize mismatch: %d vs %d", i, sa->blocksize, da->blocksize);
+		if(sa->size != da->size)
+			sysfatal("arena %d: size mismatch: %,lld vs %,lld", i, sa->size, da->size);
+		if(strcmp(sa->name, da->name) != 0)
+			sysfatal("arena %d: name mismatch: %s vs %s", i, sa->name, da->name);
+	}
+	/*
+	 * Mirror one arena at a time.  Without the write proc,
+	 * copy would block forever in sendp, so give up if it cannot start.
+	 */
+	writechan = chancreate(sizeof(void*), 0);
+	if(vtproc(writeproc, nil) < 0)
+		sysfatal("can't start write proc: %r");
+	mirrormany(s, d, ranges);
+	sendp(writechan, nil);	/* tell writeproc to stop */
+	threadexitsall(status);
+}
blob - 90c74ccceed97f90d3af4b5d3c0d54762ad88ebc
blob + 111db0187fa47017c41c628517fbafdd4602b060
--- src/cmd/venti/srv/printarenas.c
+++ src/cmd/venti/srv/printarenas.c
@@ -36,7 +36,7 @@ shoulddump(char *name, int argc, char **argv)
 
 enum
 {
-	ClumpChunks = 32*1024
+	ClumpChunks = 32*1024,
 };
 
 void
blob - /dev/null
blob + 25418beb1ff3d66be453f916901b24ea242c3e0f (mode 644)
--- /dev/null
+++ src/cmd/venti/srv/printarenapart.c
@@ -0,0 +1,180 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/* scratch buffer shared by the partition, arena header and arena trailer reads */
+uchar buf[64*1024];
+
+/*
+ * Print the usage message on standard error and exit.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: printarenapart arenafile [offset]\n");
+	threadexitsall("usage");
+}
+
+/*
+ * Walk the clumps of arena, starting at offset (or at 0 when offset
+ * is ~0), and print address, score, type and uncompressed size for
+ * each one.  The walk stops at the first free-clump magic, bad magic,
+ * unreadable clump, score mismatch or invalid type; the address where
+ * it stopped is then printed as the end offset.
+ * NOTE(review): nothing in this file calls rdarena.
+ */
+static void
+rdarena(Arena *arena, u64int offset)
+{
+	u64int a, aa, e;
+	u32int magic;
+	Clump cl;
+	uchar score[VtScoreSize];
+	ZBlock *lump;
+
+	printarena(2, arena);
+
+	a = arena->base;
+	e = arena->base + arena->size;
+	if(offset != ~(u64int)0) {
+		if(offset >= e-a)
+			sysfatal("bad offset %llud >= %llud\n",
+				offset, e-a);
+		aa = offset;
+	} else
+		aa = 0;
+
+	for(; aa < e; aa += ClumpSize+cl.info.size) {
+		magic = clumpmagic(arena, aa);
+		if(magic == ClumpFreeMagic)
+			break;
+		if(magic != arena->clumpmagic) {
+			fprint(2, "illegal clump magic number %#8.8ux offset %llud\n",
+				magic, aa);
+			break;
+		}
+		lump = loadclump(arena, aa, 0, &cl, score, 0);
+		if(lump == nil) {
+			fprint(2, "clump %llud failed to read: %r\n", aa);
+			break;
+		}
+		if(cl.info.type != VtCorruptType) {
+			scoremem(score, lump->data, cl.info.uncsize);
+			if(scorecmp(cl.info.score, score) != 0) {
+				fprint(2, "clump %llud has mismatched score\n", aa);
+				freezblock(lump);	/* don't leak the block on the error exit */
+				break;
+			}
+			if(vttypevalid(cl.info.type) < 0) {
+				fprint(2, "clump %llud has bad type %d\n", aa, cl.info.type);
+				freezblock(lump);
+				break;
+			}
+		}
+		print("%22llud %V %3d %5d\n", aa, score, cl.info.type, cl.info.uncsize);
+		freezblock(lump);
+	}
+	print("end offset %llud\n", aa);
+}
+
+/*
+ * Read the arena partition header and arena table from the named
+ * file, then print a summary of each arena listed in the table,
+ * taken from that arena's header and trailer blocks.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	char *file, *p, *name;
+	char *table;
+	u64int offset;
+	Part *part;
+	ArenaPart ap;
+	ArenaHead head;
+	Arena tail;
+	char ct[40], mt[40];
+
+	readonly = 1;	/* for part.c */
+	ARGBEGIN{
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	/* NOTE(review): only the one-argument form is accepted; usage's [offset] is not handled here */
+	switch(argc) {
+	default:
+		usage();
+	case 1:
+		file = argv[0];
+	}
+
+	ventifmtinstall();
+	statsinit();
+
+	part = initpart(file, OREAD|ODIRECT);
+	if(part == nil)
+		sysfatal("can't open file %s: %r", file);
+	if(readpart(part, PartBlank, buf, sizeof buf) < 0)
+		sysfatal("can't read file %s: %r", file);
+
+	if(unpackarenapart(&ap, buf) < 0)
+		sysfatal("corrupted arena part header: %r");
+
+	print("# arena part version=%d blocksize=%d arenabase=%d\n",
+		ap.version, ap.blocksize, ap.arenabase);
+	/* the arena table starts at the first block boundary past the header */
+	ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1);
+	ap.tabsize = ap.arenabase - ap.tabbase;
+
+	table = malloc(ap.tabsize+1);
+	if(table == nil)
+		sysfatal("out of memory allocating arena table");
+	if(readpart(part, ap.tabbase, (uchar*)table, ap.tabsize) < 0)
+		sysfatal("read %s: %r", file);
+	table[ap.tabsize] = 0;	/* terminate so the string functions below stop */
+
+	partblocksize(part, ap.blocksize);
+	initdcache(8 * MaxDiskBlock);
+
+/* XXX - read the number of arenas from the first line */
+	for(p=table; p && *p; p=strchr(p, '\n')){
+		if(*p == '\n')
+			p++;
+		name = p;
+		p = strpbrk(p, " \t");
+		if(p == nil){
+			fprint(2, "bad line: %s\n", name);
+			break;
+		}
+		offset = strtoull(p, nil, 0);
+		if(readpart(part, offset, buf, sizeof buf) < 0){
+			fprint(2, "%s: read %s: %r\n", argv0, file);
+			continue;
+		}
+		if(unpackarenahead(&head, buf) < 0){
+			fprint(2, "%s: unpackarenahead: %r\n", argv0);
+			continue;
+		}
+		/* the trailer is the last block of the arena */
+		if(readpart(part, offset+head.size-head.blocksize, buf, head.blocksize) < 0){
+			fprint(2, "%s: read %s: %r\n", argv0, file);
+			continue;
+		}
+		if(unpackarena(&tail, buf) < 0){
+			fprint(2, "%s: unpackarena: %r\n", argv0);
+			continue;
+		}
+		print("arena %s %lld clumps=%,d cclumps=%,d used=%,lld uncsize=%,lld%s\n",
+			tail.name, offset,
+			tail.diskstats.clumps, tail.diskstats.cclumps,
+			tail.diskstats.used, tail.diskstats.uncsize,
+			tail.diskstats.sealed ? " sealed" : "");
+		strcpy(ct, ctime(tail.ctime));
+		ct[28] = 0;
+		strcpy(mt, ctime(tail.wtime));
+		mt[28] = 0;
+		print("\tctime=%s\n\tmtime=%s\n", ct, mt);
+	}
+	threadexitsall(0);
+}
blob - 7ed9ba3a08f10c69146eadfff47d1e2c5254f7b5
blob + fc5e85e7102e2dbcac108c53b14083a7eb8dc723
--- src/cmd/venti/srv/sortientry.c
+++ src/cmd/venti/srv/sortientry.c
@@ -61,7 +61,7 @@ sortrawientries(Index *ix, Part *tmp, u64int *base, Bl
 	u32int n;
 	int i, ok;
 
-/*ZZZ should allow configuration of bits, bucket size */
+/* ZZZ should allow configuration of bits, bucket size */
 	ib = initiebucks(tmp, 8, 64*1024);
 	if(ib == nil){
 		seterr(EOk, "can't create sorting buckets: %r");
@@ -116,10 +116,7 @@ readarenainfo(IEBucks *ib, Arena *arena, u64int a, Blo
 	ClumpInfo *ci, *cis;
 	u32int clump;
 	int i, n, ok, nskip;
-/*	static Biobuf bout; */
 
-/*ZZZ remove fprint? */
-/*fprint(2, "ra %s %d %d\n", arena->name, arena->memstats.clumps, arena->diskstats.clumps); */
 	if(arena->memstats.clumps)
 		fprint(2, "\tarena %s: %d entries\n", arena->name, arena->memstats.clumps);
 	else
@@ -129,7 +126,6 @@ readarenainfo(IEBucks *ib, Arena *arena, u64int a, Blo
 	ok = 0;
 	nskip = 0;
 	memset(&ie, 0, sizeof(IEntry));
-/*	Binit(&bout, 1, OWRITE); */
 	for(clump = 0; clump < arena->memstats.clumps; clump += n){
 		n = ClumpChunks;
 		if(n > arena->memstats.clumps - clump)
@@ -148,18 +144,15 @@ readarenainfo(IEBucks *ib, Arena *arena, u64int a, Blo
 			a += ci->size + ClumpSize;
 			ie.ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
 			scorecp(ie.score, ci->score);
-		/*	Bprint(&bout, "%22lld %V %3d %5d\n", */
-		/*		ie.ia.addr, ie.score, ie.ia.type, ie.ia.size); */
 			if(ci->type == VtCorruptType){
-			/*	print("! %V %22lld %3d %5d %3d\n", */
-			/*		ie.score, ie.ia.addr, ie.ia.type, ie.ia.size, ie.ia.blocks); */
+				if(0) print("! %V %22lld %3d %5d %3d\n",
+					ie.score, ie.ia.addr, ie.ia.type, ie.ia.size, ie.ia.blocks);
 				nskip++;
 			}else
 				sprayientry(ib, &ie);
 			markbloomfilter(b, ie.score);
 		}
 	}
-/*	Bterm(&bout); */
 	free(cis);
 	if(ok < 0)
 		return TWID32;
@@ -358,8 +351,8 @@ readiebuck(IEBucks *ib, int b)
 	m = ib->bucks[b].used;
 	if(m == 0)
 		m = ib->usable;
-/*	if(ib->bucks[b].total) */
-/*		fprint(2, "\tbucket %d: %d entries\n", b, ib->bucks[b].total/IEntrySize); */
+	if(0) if(ib->bucks[b].total)
+		fprint(2, "\tbucket %d: %d entries\n", b, ib->bucks[b].total/IEntrySize);
 	while(head != TWID32){
 		if(readpart(ib->part, (u64int)head * ib->size, &ib->buf[n], m+U32Size) < 0){
 			seterr(EOk, "can't read index sort bucket: %r");
blob - f578860adab94622db2d25b9add9e51d8967e90e
blob + 874f7d27065398e0d6277cd5d18d54f235ba856c
--- src/cmd/venti/srv/stats.c
+++ src/cmd/venti/srv/stats.c
@@ -80,7 +80,7 @@ Statdesc statdesc[NStat] =
 	{ "isect block write bytes", },
 
 	{ "sum reads", },
-	{ "sum read bytes", }
+	{ "sum read bytes", },
 };
 
 QLock statslock;
blob - 7a5d6f9dd4daf5d139c5f4ee014a9f5dc589a2c5
blob + 89546f55626f00d85d325e63c43941293820500b
--- src/cmd/venti/srv/syncarena.c
+++ src/cmd/venti/srv/syncarena.c
@@ -30,12 +30,11 @@ syncarena(Arena *arena, u64int start, u32int n, int zo
 	ZBlock *lump;
 	Clump cl;
 	ClumpInfo ci;
-	static ClumpInfo zci = { -1 };
+	static ClumpInfo zci = { .type = -1 };
 	u8int score[VtScoreSize];
 	u64int uncsize, used, aa;
 	u32int clump, clumps, cclumps, magic;
 	int err, flush, broken;
-	AState as;
 
 	used = arena->memstats.used;
 	clumps = arena->memstats.clumps;
@@ -133,19 +132,21 @@ syncarena(Arena *arena, u64int start, u32int n, int zo
 		flushdcache();
 	}
 
+fprint(2, "arena %s: start=%lld fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n",
+	arena->name,
+	start,
+	fix,
+	flush,
+	used, arena->memstats.used,
+	clumps, arena->memstats.clumps,
+	cclumps, arena->memstats.cclumps,
+	uncsize, arena->memstats.uncsize);
+
 	if(used != arena->memstats.used
 	|| clumps != arena->memstats.clumps
 	|| cclumps != arena->memstats.cclumps
 	|| uncsize != arena->memstats.uncsize)
 		err |= SyncHeader;
-	if(start && (err&SyncHeader)){
-		trace(TraceProc, "syncarena setdcachestate");
-		as.arena = arena;
-		as.aa = start+arena->memstats.used;
-		as.stats = arena->memstats;
-		setdcachestate(&as);
-	}
-
 	return err;
 }
 
blob - 56bf1527267934c625868a9ad0be115d477f4378
blob + 72d45f18fc96a59c97c9fa2dc4256d0ae59c97b9
--- src/cmd/venti/srv/syncindex.c
+++ src/cmd/venti/srv/syncindex.c
@@ -48,6 +48,8 @@ threadmain(int argc, char *argv[])
 	ventifmtinstall();
 	if(initventi(argv[0], &conf) < 0)
 		sysfatal("can't init venti: %r");
+	if(mainindex->bloom && loadbloom(mainindex->bloom) < 0)
+		sysfatal("can't load bloom filter: %r");
 
 	if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16))
 		bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16);
blob - 12b69ed2c3aa5a0f8ea2be71b7769903bf4473e4
blob + e214d712718f11eae3f8687ba4dcf0fee2fc1486
--- src/cmd/venti/srv/syncindex0.c
+++ src/cmd/venti/srv/syncindex0.c
@@ -121,6 +121,7 @@ int
 syncindex(Index *ix, int fix, int mustflush, int check)
 {
 	Arena *arena;
+	AState as;
 	u64int a;
 	u32int clump;
 	int i, e, e1, ok, ok1, flush;
@@ -130,7 +131,12 @@ syncindex(Index *ix, int fix, int mustflush, int check
 	for(i = 0; i < ix->narenas; i++){
 		trace(TraceProc, "syncindex start %d", i);
 		arena = ix->arenas[i];
-		clump = arena->memstats.clumps;
+		/*
+		 * Syncarena will scan through the arena looking for blocks
+		 * that have been forgotten.  It will update arena->memstats.used,
+		 * so save the current copy as the place to start the
+		 * syncarenaindex scan.
+		 */
 		a = arena->memstats.used;
 		e = syncarena(arena, ix->amap[i].start, TWID32, fix, fix);
 		e1 = e;
@@ -138,15 +144,23 @@ syncindex(Index *ix, int fix, int mustflush, int check
 			e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr);
 		if(e1 == SyncHeader)
 			fprint(2, "arena %s: header is out-of-date\n", arena->name);
+		clump = arena->diskstats.clumps;
 		if(e1)
 			ok = -1;
 		else{
 			ok1 = syncarenaindex(ix, arena, clump, a + ix->amap[i].start, fix, &flush, check);
 			if(ok1 < 0)
 				fprint(2, "syncarenaindex: %r\n");
+fprint(2, "arena %s: wbarena in syncindex\n", arena->name);
 			if(fix && ok1==0 && (e & SyncHeader) && wbarena(arena) < 0)
 				fprint(2, "arena=%s header write failed: %r\n", arena->name);
 			ok |= ok1;
+
+fprint(2, "arena %s: setdcachestate\n", arena->name);
+			as.arena = arena;
+			as.aa = ix->amap[i].start + arena->memstats.used;
+			as.stats = arena->memstats;
+			setdcachestate(&as);
 		}
 	}
 	if(missing || wrong)
blob - 587046cc5aea3b0583729932f6d85207d7cf89ee
blob + 5530bd07d003f43c444940d600c9b41cae07a809
--- src/cmd/venti/srv/unwhack.c
+++ src/cmd/venti/srv/unwhack.c
@@ -23,7 +23,7 @@ static uchar lenval[1 << (DBigLenBits - 1)] =
 static uchar lenbits[] =
 {
 	0, 0, 0,
-	2, 3, 5, 5
+	2, 3, 5, 5,
 };
 
 static uchar offbits[16] =
blob - 03fd9065117709e039ad8c4c5c9f18d1f0f8b5cb
blob + 0fd0f04ff5c4552437cf034b94e78dee0f7a52e6
--- src/cmd/venti/srv/utils.c
+++ src/cmd/venti/srv/utils.c
@@ -148,6 +148,7 @@ emalloc(ulong n)
 		sysfatal("out of memory allocating %lud", n);
 	}
 	memset(p, 0xa5, n);
+	setmalloctag(p, getcallerpc(&n));
 if(0)print("emalloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&n));
 	return p;
 }
@@ -164,6 +165,7 @@ ezmalloc(ulong n)
 		sysfatal("out of memory allocating %lud", n);
 	}
 	memset(p, 0, n);
+	setmalloctag(p, getcallerpc(&n));
 if(0)print("ezmalloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&n));
 	return p;
 }
@@ -177,6 +179,7 @@ erealloc(void *p, ulong n)
 			abort();
 		sysfatal("out of memory allocating %lud", n);
 	}
+	setrealloctag(p, getcallerpc(&p));
 if(0)print("erealloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&p));
 	return p;
 }
@@ -190,6 +193,7 @@ estrdup(char *s)
 	n = strlen(s) + 1;
 	t = emalloc(n);
 	memmove(t, s, n);
+	setmalloctag(t, getcallerpc(&s));
 if(0)print("estrdup %p-%p by %lux\n", t, (char*)t+n, getcallerpc(&s));
 	return t;
 }
@@ -231,6 +235,7 @@ ventifmtinstall(void)
 	fmtinstall('F', vtfcallfmt);
 	fmtinstall('H', encodefmt);
 	fmtinstall('I', ientryfmt);
+	fmtinstall('T', vttimefmt);
 	fmtinstall('V', vtscorefmt);
 }
 
blob - 1e924aebb3d33394fc3fce0cc925261a88f5f8f2
blob + e9ca05364653a4486e854bc578ae14b8a0c4494c
--- src/cmd/venti/srv/venti.c
+++ src/cmd/venti/srv/venti.c
@@ -105,6 +105,8 @@ threadmain(int argc, char *argv[])
 	fprint(2, "conf...");
 	if(initventi(configfile, &config) < 0)
 		sysfatal("can't init server: %r");
+	if(mainindex->bloom && loadbloom(mainindex->bloom) < 0)
+		sysfatal("can't load bloom filter: %r");
 
 	if(mem == 0)
 		mem = config.mem;
@@ -210,8 +212,8 @@ ventiserver(void *v)
 		trace(TraceRpc, "<- %F", &r->tx);
 		r->rx.msgtype = r->tx.msgtype+1;
 		addstat(StatRpcTotal, 1);
-	/*	print("req (arenas[0]=%p sects[0]=%p) %F\n", */
-	/*		mainindex->arenas[0], mainindex->sects[0], &r->tx); */
+		if(0) print("req (arenas[0]=%p sects[0]=%p) %F\n",
+			mainindex->arenas[0], mainindex->sects[0], &r->tx);
 		switch(r->tx.msgtype){
 		default:
 			vtrerror(r, "unknown request");
blob - 5236c0934d87c1a1a1255f372f89aef895b773a3
blob + 2cdb7ba05356c28e70d3c04a0ea5b2e0bac11c5a
--- src/cmd/venti/srv/verifyarena.c
+++ src/cmd/venti/srv/verifyarena.c
@@ -3,65 +3,102 @@
 #include "fns.h"
 
 static int	verbose;
+static int	fd;
+static uchar	*data;
+static int	blocksize;
+static int	sleepms;
 
 void
 usage(void)
 {
-	fprint(2, "usage: verifyarena [-v]\n");
+	fprint(2, "usage: verifyarena [-b blocksize] [-s ms] [-v] [arenapart [name...]]\n");
 	threadexitsall(0);
 }
 
-static void
+static int
+preadblock(uchar *buf, int n, vlong off)
+{
+	int nr, m;
+
+	for(nr = 0; nr < n; nr += m){
+		m = n - nr;
+		m = pread(fd, &buf[nr], m, off+nr);
+		if(m <= 0){
+			if(m == 0)
+				werrstr("early eof");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int
 readblock(uchar *buf, int n)
 {
 	int nr, m;
 
 	for(nr = 0; nr < n; nr += m){
 		m = n - nr;
-		m = read(0, &buf[nr], m);
-		if(m <= 0)
-			sysfatal("can't read arena from standard input: %r");
+		m = read(fd, &buf[nr], m);
+		if(m <= 0){
+			if(m == 0)
+				werrstr("early eof");
+			return -1;
+		}
 	}
+	return 0;
 }
 
 static void
-verifyarena(void)
+verifyarena(char *name, vlong len)
 {
 	Arena arena;
 	ArenaHead head;
-	ZBlock *b;
 	DigestState s;
 	u64int n, e;
 	u32int bs;
 	u8int score[VtScoreSize];
 
-	fprint(2, "verify arena from standard input\n");
+	fprint(2, "verify %s\n", name);
 
 	memset(&arena, 0, sizeof arena);
 	memset(&s, 0, sizeof s);
 
 	/*
-	 * read the little bit, which will included the header
+	 * read a little bit, which will include the header
 	 */
-	bs = MaxIoSize;
-	b = alloczblock(bs, 0, 0);
-	readblock(b->data, HeadSize);
-	sha1(b->data, HeadSize, nil, &s);
-	if(unpackarenahead(&head, b->data) < 0)
-		sysfatal("corrupted arena header: %r");
+	if(readblock(data, HeadSize) < 0){
+		fprint(2, "%s: reading header: %r\n", name);
+		return;
+	}
+	sha1(data, HeadSize, nil, &s);
+	if(unpackarenahead(&head, data) < 0){
+		fprint(2, "%s: corrupt arena header: %r\n", name);
+		return;
+	}
 	if(head.version != ArenaVersion4 && head.version != ArenaVersion5)
-		fprint(2, "warning: unknown arena version %d\n", head.version);
+		fprint(2, "%s: warning: unknown arena version %d\n", name, head.version);
+	if(len != 0 && len != head.size)
+		fprint(2, "%s: warning: unexpected length %lld != %lld\n", name, head.size, len);
+	if(strcmp(name, "<stdin>") != 0 && strcmp(head.name, name) != 0)
+		fprint(2, "%s: warning: unexpected name %s\n", name, head.name);
 
 	/*
 	 * now we know how much to read
 	 * read everything but the last block, which is special
 	 */
 	e = head.size - head.blocksize;
+	bs = blocksize;
 	for(n = HeadSize; n < e; n += bs){
 		if(n + bs > e)
 			bs = e - n;
-		readblock(b->data, bs);
-		sha1(b->data, bs, nil, &s);
+		if(readblock(data, bs) < 0){
+			fprint(2, "%s: read data: %r\n", name);
+			return;
+		}
+		sha1(data, bs, nil, &s);
+		if(sleepms)
+			sleep(sleepms);
 	}
 
 	/*
@@ -69,8 +106,11 @@ verifyarena(void)
 	 * the sum is calculated assuming the slot for the sum is zero.
 	 */
 	bs = head.blocksize;
-	readblock(b->data, bs);
-	sha1(b->data, bs-VtScoreSize, nil, &s);
+	if(readblock(data, bs) < 0){
+		fprint(2, "%s: read last block: %r\n", name);
+		return;
+	}
+	sha1(data, bs-VtScoreSize, nil, &s);
 	sha1(zeroscore, VtScoreSize, nil, &s);
 	sha1(nil, 0, score, &s);
 
@@ -78,37 +118,73 @@ verifyarena(void)
 	 * validity check on the trailer
 	 */
 	arena.blocksize = head.blocksize;
-	if(unpackarena(&arena, b->data) < 0)
-		sysfatal("corrupted arena trailer: %r");
-	scorecp(arena.score, &b->data[arena.blocksize - VtScoreSize]);
+	if(unpackarena(&arena, data) < 0){
+		fprint(2, "%s: corrupt arena trailer: %r\n", name);
+		return;
+	}
+	scorecp(arena.score, &data[arena.blocksize - VtScoreSize]);
 
-	if(namecmp(arena.name, head.name) != 0)
-		sysfatal("arena header and trailer names clash: %s vs. %s\n", head.name, arena.name);
-	if(arena.version != head.version)
-		sysfatal("arena header and trailer versions clash: %d vs. %d\n", head.version, arena.version);
+	if(namecmp(arena.name, head.name) != 0){
+		fprint(2, "%s: wrong name in trailer: %s vs. %s\n", 
+			name, head.name, arena.name);
+		return;
+	}
+	if(arena.version != head.version){
+		fprint(2, "%s: wrong version in trailer: %d vs. %d\n", 
+			name, head.version, arena.version);
+		return;
+	}
 	arena.size = head.size - 2 * head.blocksize;
 
 	/*
 	 * check for no checksum or the same
 	 */
-	if(scorecmp(score, arena.score) != 0){
-		if(scorecmp(zeroscore, arena.score) != 0)
-			fprint(2, "warning: mismatched checksums for arena=%s, found=%V calculated=%V",
-				arena.name, arena.score, score);
-		scorecp(arena.score, score);
-	}else
-		fprint(2, "matched score\n");
-
+	if(scorecmp(score, arena.score) == 0)
+		fprint(2, "%s: verified score\n", name);
+	else if(scorecmp(zeroscore, arena.score) == 0)
+		fprint(2, "%s: unsealed\n", name);
+	else{
+		fprint(2, "%s: mismatch checksum - found=%V calculated=%V\n",
+			name, arena.score, score);
+		return;
+	}
 	printarena(2, &arena);
 }
 
+static int
+shouldcheck(char *name, char **s, int n)
+{
+	int i;
+	
+	if(n == 0)
+		return 1;
+
+	for(i=0; i<n; i++){
+		if(s[i] && strcmp(name, s[i]) == 0){
+			s[i] = nil;
+			return 1;
+		}
+	}
+	return 0;
+}
+
 void
 threadmain(int argc, char *argv[])
 {
+	int i, nline;
+	char *p, *q, *table, *f[10], line[256];
+	vlong start, stop;
+	ArenaPart ap;
+	
 	ventifmtinstall();
-	statsinit();
-
+	blocksize = MaxIoSize;
 	ARGBEGIN{
+	case 'b':
+		blocksize = unittoull(EARGF(usage()));
+		break;
+	case 's':
+		sleepms = atoi(EARGF(usage()));
+		break;
 	case 'v':
 		verbose++;
 		break;
@@ -117,11 +193,78 @@ threadmain(int argc, char *argv[])
 		break;
 	}ARGEND
 
-	readonly = 1;
+	data = vtmalloc(blocksize);
+	if(argc == 0){
+		/* no arena partition named: verify a single arena from standard input */
+		fd = 0;
+		verifyarena("<stdin>", 0);
+		threadexitsall(nil);
+	}
+	
+	if((fd = open(argv[0], OREAD)) < 0)
+		sysfatal("open %s: %r", argv[0]);
 
-	if(argc != 0)
-		usage();
+	/* data holds blocksize bytes; the header read below needs 8192 of them */
+	if(blocksize < 8192)
+		sysfatal("block size %d too small for arena part header", blocksize);
+	if(preadblock(data, 8192, PartBlank) < 0)
+		sysfatal("read arena part header: %r");
+	if(unpackarenapart(&ap, data) < 0)
+		sysfatal("corrupted arena part header: %r");
+	fprint(2, "# arena part version=%d blocksize=%d arenabase=%d\n",
+		ap.version, ap.blocksize, ap.arenabase);
+	ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1);
+	ap.tabsize = ap.arenabase - ap.tabbase;
+	table = vtmalloc(ap.tabsize+1);	/* same allocator as data, instead of unchecked malloc */
+	if(preadblock((uchar*)table, ap.tabsize, ap.tabbase) < 0)
+		sysfatal("reading arena part directory: %r");
+	table[ap.tabsize] = 0;
+	
+	/* first table line is the arena count; each later line is "name start stop" */
+	nline = atoi(table);
+	p = strchr(table, '\n');
+	if(p)
+		p++;
+	for(i=0; i<nline; i++){
+		if(p == nil){
+			fprint(2, "warning: unexpected arena table end\n");
+			break;
+		}
+		q = strchr(p, '\n');
+		if(q)
+			*q++ = 0;
+		if(strlen(p) >= sizeof line){
+			fprint(2, "warning: long arena table line: %s\n", p);
+			p = q;
+			continue;
+		}
+		strcpy(line, p);
+		memset(f, 0, sizeof f);
+		if(tokenize(line, f, nelem(f)) < 3){
+			fprint(2, "warning: bad arena table line: %s\n", p);
+			p = q;
+			continue;
+		}
+		p = q;
+		if(shouldcheck(f[0], argv+1, argc-1)){
+			start = strtoull(f[1], 0, 0);
+			stop = strtoull(f[2], 0, 0);
+			if(stop <= start){
+				fprint(2, "%s: bad start,stop %lld,%lld\n", f[0], start, stop);
+				continue;
+			}
+			if(seek(fd, start, 0) < 0){
+				/* verifying from the wrong position would only produce noise */
+				fprint(2, "%s: seek to start: %r\n", f[0]);
+				continue;
+			}
+			verifyarena(f[0], stop - start);
+		}
+	}
+	/* shouldcheck clears each name it matches; anything left was never found */
+	for(i=1; i<argc; i++)
+		if(argv[i] != 0)
+			fprint(2, "%s: did not find arena\n", argv[i]);
 
-	verifyarena();
-	threadexitsall(0);
+	threadexitsall(nil);
 }
blob - 8a2e9299e61380777ea75004b56c23b1e56028e1
blob + a0d57f2e3ac3ff3e2f5fa57c2a2b0e207ad1075d
--- src/cmd/venti/srv/wrarena.c
+++ src/cmd/venti/srv/wrarena.c
@@ -83,8 +83,8 @@ rdarena(Arena *arena, u64int offset)
 		if(magic == ClumpFreeMagic)
 			break;
 		if(magic != arena->clumpmagic) {
-		/*	fprint(2, "illegal clump magic number %#8.8ux offset %llud\n", */
-		/*		magic, aa); */
+			if(0) fprint(2, "illegal clump magic number %#8.8ux offset %llud\n",
+				magic, aa);
 			break;
 		}
 		lump = loadclump(arena, aa, 0, &cl, score, 0);
blob - 4cc96dd42a500ebab8b8e9ff2960267ac7a9f9d6
blob + 4aa11f45a8c02e1be7841246e2869206c0e8d163
--- src/cmd/venti/srv/zblock.c
+++ src/cmd/venti/srv/zblock.c
@@ -5,11 +5,13 @@
 void
 fmtzbinit(Fmt *f, ZBlock *b)
 {
-	memset(f, 0, sizeof *f);
-	fmtlocaleinit(f, nil, nil, nil);
+	f->runes = 0;
 	f->start = b->data;
 	f->to = f->start;
 	f->stop = (char*)f->start + b->len;
+	f->flush = nil;
+	f->farg = nil;
+	f->nfmt = 0;
 }
 
 #define ROUNDUP(p, n) ((void*)(((uintptr)(p)+(n)-1)&~(uintptr)((n)-1)))
blob - 70c25b73d2d94f37f51d8a4679c2eb18bda496fa
blob + 7602627cc5069e2a6423498579edbca3474210fd
--- src/cmd/venti/srv/zeropart.c
+++ src/cmd/venti/srv/zeropart.c
@@ -10,10 +10,6 @@ zeropart(Part *part, int blocksize)
 	int w;
 
 	fprint(2, "clearing the partition\n");
-/*fprint(2, "NOT!\n"); */
-/*return; */
-/*b=alloczblock(MaxIoSize, 1, blocksize); */
-/*freezblock(b); */
 	b = alloczblock(MaxIoSize, 1, blocksize);
 
 	w = 0;