Commit Diff


commit - 25a4e89fa907ed5a5f5d84eccfb66180007d9c68
commit + 7a400ee957a0815287af806e18ef90dd18b47f82
blob - 7cd2eb73c4244b555587e0fee267d74a3461e5d8
blob + c576e9aaf4485e7457af1e716ca93e7ed8648a01
--- src/cmd/venti/srv/arena.c
+++ src/cmd/venti/srv/arena.c
@@ -16,6 +16,7 @@ static int	loadarena(Arena *arena);
 static CIBlock	*getcib(Arena *arena, int clump, int writing, CIBlock *rock);
 static void	putcib(Arena *arena, CIBlock *cib);
 static void	sumproc(void *);
+static void loadcig(Arena *arena);
 
 static QLock	sumlock;
 static Rendez	sumwait;
@@ -137,14 +138,23 @@ readclumpinfos(Arena *arena, int clump, ClumpInfo *cis
 	CIBlock *cib, r;
 	int i;
 
-	for(i = 0; i < n; i++){
+	/*
+	 * because the clump blocks are laid out
+	 * in reverse order at the end of the arena,
+	 * it can be a few percent faster to read
+	 * the clumps backwards, which reads the
+	 * disk blocks forwards.
+	 */
+	for(i = n-1; i >= 0; i--){
 		cib = getcib(arena, clump + i, 0, &r);
-		if(cib == nil)
-			break;
+		if(cib == nil){
+			n = i;
+			continue;
+		}
 		unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]);
 		putcib(arena, cib);
 	}
-	return i;
+	return n;
 }
 
 /*
@@ -349,7 +359,28 @@ writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64i
 	if(c->info.size < c->info.uncsize)
 		arena->memstats.cclumps++;
 
-	clump = arena->memstats.clumps++;
+	clump = arena->memstats.clumps;
+	if(clump % ArenaCIGSize == 0){
+		if(arena->cig == nil){
+			loadcig(arena);
+			if(arena->cig == nil)
+				goto NoCIG;
+		}
+		/* add aa as start of next cig */
+		if(clump/ArenaCIGSize != arena->ncig){
+			fprint(2, "bad arena cig computation %s: writing clump %d but %d cigs\n",
+				arena->name, clump, arena->ncig);
+			arena->ncig = -1;
+			vtfree(arena->cig);
+			arena->cig = nil;
+			goto NoCIG;
+		}
+		arena->cig = vtrealloc(arena->cig, (arena->ncig+1)*sizeof arena->cig[0]);
+		arena->cig[arena->ncig++].offset = aa;
+	}
+NoCIG:
+	arena->memstats.clumps++;
+
 	if(arena->memstats.clumps == 0)
 		sysfatal("clumps wrapped");
 	arena->wtime = now();
@@ -752,3 +783,157 @@ putcib(Arena *arena, CIBlock *cib)
 	putdblock(cib->data);
 	cib->data = nil;
 }
+
+
+/*
+ * For index entry readahead purposes, the arenas are 
+ * broken into smaller subpieces, called clump info groups
+ * or cigs.  Each cig has ArenaCIGSize clumps (ArenaCIGSize
+ * is chosen to make the index entries take up about half
+ * a megabyte).  The index entries do not contain enough
+ * information to determine what the clump index is for
+ * a given address in an arena.  That info is needed both for
+ * figuring out which clump group an address belongs to 
+ * and for prefetching a clump group's index entries from
+ * the arena table of contents.  The first time clump groups
+ * are accessed, we scan the entire arena table of contents
+ * (which might be 10s of megabytes), recording the data 
+ * offset of each clump group.
+ */
+
+/* 
+ * load clump info group information by scanning entire toc.
+ */
+static void
+loadcig(Arena *arena)
+{
+	u32int i, j, ncig, nci;
+	ArenaCIG *cig;
+	ClumpInfo *ci;
+	u64int offset;
+	int ms;
+
+	if(arena->cig || arena->ncig < 0)
+		return;
+
+//	fprint(2, "loadcig %s\n", arena->name);
+	
+	ncig = (arena->memstats.clumps+ArenaCIGSize-1) / ArenaCIGSize;
+	if(ncig == 0){
+		arena->cig = vtmalloc(1);
+		arena->ncig = 0;
+		return;
+	}
+
+	ms = msec();
+	cig = vtmalloc(ncig*sizeof cig[0]);
+	ci = vtmalloc(ArenaCIGSize*sizeof ci[0]);
+	offset = 0;
+	for(i=0; i<ncig; i++){
+		nci = readclumpinfos(arena, i*ArenaCIGSize, ci, ArenaCIGSize);
+		cig[i].offset = offset;
+		for(j=0; j<nci; j++)
+			offset += ClumpSize + ci[j].size;
+		if(nci < ArenaCIGSize){
+			if(i != ncig-1){
+				vtfree(ci);
+				vtfree(cig);
+				arena->ncig = -1;
+				fprint(2, "loadcig %s: got %ud cigs, expected %ud\n", arena->name, i+1, ncig);
+				goto out;
+			}
+		}
+	}
+	vtfree(ci);
+	
+	arena->ncig = ncig;
+	arena->cig = cig;
+
+out:
+	ms = msec() - ms;
+	addstat2(StatCigLoad, 1, StatCigLoadTime, ms);
+}
+
+/*
+ * convert arena address into arena group + data boundaries.
+ */
+int
+arenatog(Arena *arena, u64int addr, u64int *gstart, u64int *glimit, int *g)
+{
+	int r, l, m;
+
+	qlock(&arena->lock);
+	if(arena->cig == nil)
+		loadcig(arena);
+	if(arena->cig == nil || arena->ncig == 0){
+		qunlock(&arena->lock);
+		return -1;
+	}
+
+	l = 1;
+	r = arena->ncig - 1;
+	while(l <= r){
+		m = (r + l) / 2;
+		if(arena->cig[m].offset <= addr)
+			l = m + 1;
+		else
+			r = m - 1;
+	}
+	l--;
+
+	*g = l;
+	*gstart = arena->cig[l].offset;
+	if(l+1 < arena->ncig)
+		*glimit = arena->cig[l+1].offset;
+	else
+		*glimit = arena->memstats.used;
+	qunlock(&arena->lock);
+	return 0;
+}
+
+/*
+ * load the clump info for group g into the index entries.
+ */
+int
+asumload(Arena *arena, int g, IEntry *entries, int nentries)
+{
+	int i, base, limit;
+	u64int addr;
+	ClumpInfo ci;
+	IEntry *ie;
+
+	if(nentries < ArenaCIGSize){
+		fprint(2, "asking for too few entries\n");
+		return -1;
+	}
+	
+	qlock(&arena->lock);
+	if(arena->cig == nil)
+		loadcig(arena);
+	if(arena->cig == nil || arena->ncig == 0 || g >= arena->ncig){
+		qunlock(&arena->lock);
+		return -1;
+	}
+	
+	addr = 0;
+	base = g*ArenaCIGSize;
+	limit = base + ArenaCIGSize;
+	if(base > arena->memstats.clumps)
+		base = arena->memstats.clumps;
+	ie = entries;
+	for(i=base; i<limit; i++){
+		if(readclumpinfo(arena, i, &ci) < 0)
+			break;
+		if(ci.type != VtCorruptType){
+			scorecp(ie->score, ci.score);
+			ie->ia.type = ci.type;
+			ie->ia.size = ci.uncsize;
+			ie->ia.blocks = (ci.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
+			ie->ia.addr = addr;
+			ie++;
+		}
+		addr += ClumpSize + ci.size;
+	}
+	qunlock(&arena->lock);
+	return ie - entries;
+}
blob - ef0e4ec94a5e154b897578e11fa2a83406a2972c
blob + ca95573024d3695075757d5496f3cc729865cff3
--- src/cmd/venti/srv/checkindex.c
+++ src/cmd/venti/srv/checkindex.c
@@ -8,7 +8,7 @@ static void
 phdr(DBlock *eb)
 {
 	static int did;
-	
+
 	if(!did){
 		did = 1;
 		print("# diff actual correct\n");
@@ -168,7 +168,7 @@ checkbloom(Bloom *b1, Bloom *b2, int fix)
 {
 	u32int *a1, *a2;
 	int i, n, extra, missing;
-	
+
 	if(b1==nil && b2==nil)
 		return 0;
 	if(b1==nil || b2==nil){
@@ -188,13 +188,14 @@ checkbloom(Bloom *b1, Bloom *b2, int fix)
 	missing = 0;
 	for(i=BloomHeadSize/4; i<n; i++){
 		if(a1[i] != a2[i]){
-print("%.8ux/%.8ux.", a1[i], a2[i]);
-			extra += countbits(a1[i] & ~a2[i]);
+// print("%.8ux/%.8ux.", a1[i], a2[i]);
+			extra   += countbits(a1[i] & ~a2[i]);
 			missing += countbits(a2[i] & ~a1[i]);
 		}
 	}
 	if(extra || missing)
-		fprint(2, "bloom filter: %d spurious bits, %d missing bits\n", extra, missing);
+		fprint(2, "bloom filter: %d spurious bits, %d missing bits\n",
+			extra, missing);
 	else
 		fprint(2, "bloom filter: correct\n");
 	if(!fix && missing){
blob - 2a0a0bc1840f7976134364851602be8497258d6b
blob + f72511b1770ddf545746ba94050071eaf6b83396
--- src/cmd/venti/srv/conv.c
+++ src/cmd/venti/srv/conv.c
@@ -581,9 +581,9 @@ unpackientry(IEntry *ie, u8int *buf)
 
 	scorecp(ie->score, p);
 	p += VtScoreSize;
-	ie->wtime = U32GET(p);
+	/* ie->wtime = U32GET(p); */
 	p += U32Size;
-	ie->train = U16GET(p);
+	/* ie->train = U16GET(p); */
 	p += U16Size;
 	if(p - buf != IEntryAddrOff)
 		sysfatal("unpackentry bad IEntryAddrOff amount");
@@ -613,9 +613,9 @@ packientry(IEntry *ie, u8int *buf)
 
 	scorecp(p, ie->score);
 	p += VtScoreSize;
-	U32PUT(p, ie->wtime);
+	U32PUT(p, 0); /* wtime */
 	p += U32Size;
-	U16PUT(p, ie->train);
+	U16PUT(p, 0); /* train */
 	p += U16Size;
 	U64PUT(p, ie->ia.addr, t32);
 	p += U64Size;
blob - 07be62f4c0f5d2249fbd4f42c55b4ec5043f903c
blob + e0a9e18a538aa0abd449716267afed83c71128d3
--- src/cmd/venti/srv/dat.h
+++ src/cmd/venti/srv/dat.h
@@ -3,6 +3,7 @@ typedef struct AMap		AMap;
 typedef struct AMapN		AMapN;
 typedef struct Arena		Arena;
 typedef struct AState	AState;
+typedef struct ArenaCIG	ArenaCIG;
 typedef struct ArenaHead	ArenaHead;
 typedef struct ArenaPart	ArenaPart;
 typedef struct ArenaTail	ArenaTail;
@@ -28,8 +29,10 @@ typedef struct ZBlock		ZBlock;
 typedef struct Round	Round;
 typedef struct Bloom	Bloom;
 
-#define TWID32	((u32int)~(u32int)0)
-#define TWID64	((u64int)~(u64int)0)
+#pragma incomplete IEStream
+
+#define	TWID32	((u32int)~(u32int)0)
+#define	TWID64	((u64int)~(u64int)0)
 #define	TWID8	((u8int)~(u8int)0)
 
 enum
@@ -44,7 +47,6 @@ enum
 	IndexBase		= 1024*1024,	/* initial address to use in an index */
 	MaxIo			= 64*1024,	/* max size of a single read or write operation */
 	ICacheBits		= 16,		/* default bits for indexing icache */
-	ICacheDepth		= 4,		/* default depth of an icache hash chain */
 	MaxAMap			= 2*1024,	/* max. allowed arenas in an address mapping; must be < 32*1024 */
 
 	/*
@@ -147,6 +149,8 @@ enum
 	DirtyArenaCib,
 	DirtyArenaTrailer,
 	DirtyMax,
+	
+	ArenaCIGSize = 10*1024,	// about 0.5 MB worth of IEntry.
 
 	VentiZZZZZZZZ
 };
@@ -371,8 +375,16 @@ struct Arena
 	u32int		ctime;			/* first time a block was written */
 	u32int		wtime;			/* last time a block was written */
 	u32int		clumpmagic;
+	
+	ArenaCIG	*cig;
+	int	ncig;
 };
 
+struct ArenaCIG
+{
+	u64int	offset;  // from arena base
+};
+
 /*
  * redundant storage of some fields at the beginning of each arena
  */
@@ -505,15 +517,21 @@ struct IAddr
  */
 struct IEntry
 {
-	u8int		score[VtScoreSize];
-	IEntry		*next;			/* next in hash chain */
-	IEntry		*nextdirty; 		/* next in dirty chain */
-	u32int		wtime;			/* last write time */
-	u16int		train;			/* relative train containing the most recent ref; 0 if no ref, 1 if in same car */
-	u8int		rac;			/* read ahead count */
-	u8int		dirty;		/* is dirty */
-	IAddr		ia;
+	/* on disk data - 32 bytes*/
+	u8int	score[VtScoreSize];
+	IAddr	ia;
+	
+	IEntry	*nexthash;
+	IEntry	*nextdirty;
+	IEntry	*next;
+	IEntry	*prev;
+	u8int	state;
 };
+enum {
+	IEClean = 0,
+	IEDirty = 1,
+	IESummary = 2,
+};
 
 /*
  * buckets in the on disk index table
@@ -607,6 +625,9 @@ enum
 	StatIcacheFlush,
 	StatIcacheStall,
 	StatIcacheReadTime,
+	StatIcacheLookup,
+	StatScacheHit,
+	StatScachePrefetch,
 
 	StatBloomHit,
 	StatBloomMiss,
@@ -628,6 +649,9 @@ enum
 
 	StatSumRead,
 	StatSumReadBytes,
+	
+	StatCigLoad,
+	StatCigLoadTime,
 
 	NStat
 };
blob - cf5299531c3d720c5f3ba607a21f8d87fd731e79
blob + 05a3e93726697ba8f843c5cc4a87792cf1b7a795
--- src/cmd/venti/srv/fns.h
+++ src/cmd/venti/srv/fns.h
@@ -6,8 +6,11 @@ void		addstat(int, int);
 void		addstat2(int, int, int, int);
 ZBlock		*alloczblock(u32int size, int zeroed, uint alignment);
 Arena		*amapitoa(Index *index, u64int a, u64int *aa);
+Arena		*amapitoag(Index *index, u64int a, u64int *gstart, u64int *glimit, int *g);
 u64int		arenadirsize(Arena *arena, u32int clumps);
+int		arenatog(Arena *arena, u64int aa, u64int *gstart, u64int *glimit, int *g);
 void		arenaupdate(Arena *arena, u32int size, u8int *score);
+int		asumload(Arena *arena, int g, IEntry *entries, int maxentries);
 void		backsumarena(Arena *arena);
 void	binstats(long (*fn)(Stats *s0, Stats *s1, void*), void *arg, long t0, long t1, Statbin *bin, int nbin);
 int		bloominit(Bloom*, vlong, uchar*);
@@ -64,6 +67,7 @@ int		iaddrcmp(IAddr *ia1, IAddr *ia2);
 IEntry*	icachedirty(u32int, u32int, u64int);
 ulong	icachedirtyfrac(void);
 void		icacheclean(IEntry*);
+int		icachelookup(u8int *score, int type, IAddr *ia);
 int		ientrycmp(const void *vie1, const void *vie2);
 char		*ifileline(IFile *f);
 int		ifilename(IFile *f, char *dst);
@@ -76,7 +80,7 @@ ArenaPart	*initarenapart(Part *part);
 int		initarenasum(void);
 void		initbloomfilter(Index*);
 void		initdcache(u32int mem);
-void		initicache(int bits, int depth);
+void		initicache(u32int mem);
 void		initicachewrite(void);
 IEStream	*initiestream(Part *part, u64int off, u64int clumps, u32int size);
 ISect		*initisect(Part *part);
@@ -87,7 +91,7 @@ Part*		initpart(char *name, int mode);
 void		initround(Round*, char*, int);
 int		initventi(char *config, Config *conf);
 void		insertlump(Lump *lump, Packet *p);
-int		insertscore(u8int *score, IAddr *ia, int write);
+int		insertscore(u8int *score, IAddr *ia, int state);
 void		kickdcache(void);
 void		kickicache(void);
 void		kickround(Round*, int wait);
@@ -97,8 +101,7 @@ DBlock	*loadibucket(Index *index, u8int *score, ISect 
 int		loadientry(Index *index, u8int *score, int type, IEntry *ie);
 void		logerr(int severity, char *fmt, ...);
 Lump		*lookuplump(u8int *score, int type);
-int		_lookupscore(u8int *score, int type, IAddr *ia, int *rac);
-int		lookupscore(u8int *score, int type, IAddr *ia, int *rac);
+int		lookupscore(u8int *score, int type, IAddr *ia);
 int		maparenas(AMap *am, Arena **arenas, int n, char *what);
 void		markbloomfilter(Bloom*, u8int*);
 uint		msec(void);
blob - 266218b19b9580ce1f593d1f3780de460163c28f
blob + 8cf937d1b49ee6829f5e43041672bd1773322eda
--- src/cmd/venti/srv/hdisk.c
+++ src/cmd/venti/srv/hdisk.c
@@ -547,7 +547,7 @@ debugread(HConnect *c, u8int *score)
 	Lump *u;
 	IAddr ia;
 	IEntry ie;
-	int i, rac;
+	int i;
 	Arena *arena;
 	u64int aa;
 	ZBlock *zb;
@@ -561,7 +561,7 @@ debugread(HConnect *c, u8int *score)
 	}
 	
 	hprint(&c->hout, "<h2>index search %V</h2><pre>\n", score);
-	if(_lookupscore(score, -1, &ia, nil) < 0)
+	if(icachelookup(score, -1, &ia) < 0)
 		hprint(&c->hout, "  icache: not found\n");
 	else
 		hprint(&c->hout, "  icache: addr=%#llx size=%d type=%d blocks=%d\n",
@@ -585,12 +585,12 @@ debugread(HConnect *c, u8int *score)
 			hprint(&c->hout, " -cache");
 		putlump(u);
 		
-		if(lookupscore(score, type, &ia, &rac) < 0){
+		if(lookupscore(score, type, &ia) < 0){
 			hprint(&c->hout, " -lookup\n");
 			continue;
 		}
-		hprint(&c->hout, "\n  lookupscore: addr=%#llx size=%d blocks=%d rac=%d\n",
-			ia.addr, ia.size, ia.blocks, rac);
+		hprint(&c->hout, "\n  lookupscore: addr=%#llx size=%d blocks=%d\n",
+			ia.addr, ia.size, ia.blocks);
 		
 		arena = amapitoa(mainindex, ia.addr, &aa);
 		if(arena == nil){
blob - 6b7f20fab74a9807ea3c5808545bdfcaa4be37da
blob + 51d8b9a1fe18efb0d1e7aaea212d97da373f4e7b
--- src/cmd/venti/srv/httpd.c
+++ src/cmd/venti/srv/httpd.c
@@ -895,7 +895,7 @@ static char* graphname[] =
 
 	"icachehit",
 	"icachemiss",
-	"icachelookup",
+	"icacheread",
 	"icachewrite",
 	"icachefill",
 	"icacheprefetch",
@@ -904,6 +904,9 @@ static char* graphname[] =
 	"icacheflush",
 	"icachestall",
 	"icachelookuptime",
+	"icachelookup",
+	"scachehit",
+	"scacheprefetch",
 
 	"bloomhit",
 	"bloommiss",
@@ -925,6 +928,9 @@ static char* graphname[] =
 
 	"sumread",
 	"sumreadbyte",
+	
+	"cigload",
+	"cigloadtime",
 };
 
 static int
blob - 32cbb5d340dc4efc64069fffa0aea9fa6f6ce6bc
blob + 384fd2c1437a09aa33ab7c19bc53e9a252dbf583
--- src/cmd/venti/srv/icache.c
+++ src/cmd/venti/srv/icache.c
@@ -2,236 +2,421 @@
 #include "dat.h"
 #include "fns.h"
 
+int icacheprefetch = 1;
+
 typedef struct ICache ICache;
+typedef struct IHash IHash;
+typedef struct ISum ISum;
+
 struct ICache
 {
-	QLock	lock;			/* locks hash table & all associated data */
+	QLock	lock;
 	Rendez	full;
-	IEntry	**heads;		/* heads of all the hash chains */
-	int	bits;			/* bits to use for indexing heads */
-	u32int	size;			/* number of heads; == 1 << bits, should be < entries */
-	IEntry	*base;			/* all allocated hash table entries */
-	IEntry	*free;
-	u32int	entries;		/* elements in base */
-	IEntry	*dirty;		/* chain of dirty elements */
-	u32int	ndirty;
+	IHash	*hash;
+	IEntry	*entries;
+	int		nentries;
+	IEntry	free;
+	IEntry	clean;
+	IEntry	dirty;
 	u32int	maxdirty;
-	u32int	unused;			/* index of first unused element in base */
-	u32int	stolen;			/* last head from which an element was stolen */
+	u32int	ndirty;
 
-	Arena	*last[4];
-	Arena	*lastload;
-	int		nlast;
+	ISum	**sum;
+	int		nsum;
+	IHash	*shash;
+	IEntry	*sentries;
+	int		nsentries;
 };
 
-int icacheprefetch = 1;
-
 static ICache icache;
 
-static IEntry	*icachealloc(IAddr *ia, u8int *score);
-
 /*
- * bits is the number of bits in the icache hash table
- * depth is the average depth
- * memory usage is about (1<<bits) * depth * sizeof(IEntry) + (1<<bits) * sizeof(IEntry*)
+ * Hash table of IEntries
  */
-void
-initicache(int bits, int depth)
+
+struct IHash
 {
-	icache.bits = bits;
-	icache.size = 1 << bits;
-	icache.entries = depth * icache.size;
-	icache.maxdirty = icache.entries/2;
-	icache.base = MKNZ(IEntry, icache.entries);
-	icache.heads = MKNZ(IEntry*, icache.size);
-	icache.full.l = &icache.lock;
-	setstat(StatIcacheSize, icache.entries);
-}
+	int bits;
+	u32int size;
+	IEntry **table;
+};
 
-ulong
-icachedirtyfrac(void)
+static IHash*
+mkihash(int size1)
 {
-	return (vlong)icache.ndirty*IcacheFrac / icache.entries;
+	u32int size;
+	int bits;
+	IHash *ih;
+	
+	bits = 0;
+	size = 1;
+	while(size < size1){
+		bits++;
+		size <<= 1;
+	}
+	
+	ih = vtmallocz(sizeof(IHash)+size*sizeof(ih->table[0]));
+	ih->table = (IEntry**)(ih+1);
+	ih->bits = bits;
+	ih->size = size;
+	return ih;
 }
 
-u32int
-hashbits(u8int *sc, int bits)
+static IEntry*
+ihashlookup(IHash *ih, u8int score[VtScoreSize], int type)
 {
-	u32int v;
-
-	v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3];
-	if(bits < 32)
-		 v >>= (32 - bits);
-	return v;
+	u32int h;
+	IEntry *ie;
+	
+	h = hashbits(score, ih->bits);
+	for(ie=ih->table[h]; ie; ie=ie->nexthash)
+		if((type == -1 || type == ie->ia.type) && scorecmp(score, ie->score) == 0)
+			return ie;
+	return nil;
 }
 
 static void
-loadarenaclumps(Arena *arena, u64int aa)
+ihashdelete(IHash *ih, IEntry *ie, char *what)
 {
-	ulong i;
-	ClumpInfo ci;
-	IAddr ia;
-
-	for(i=0; i<arena->memstats.clumps; i++){
-		if(readclumpinfo(arena, i, &ci) < 0)
-			break;
-		ia.type = ci.type;
-		ia.size = ci.uncsize;
-		ia.blocks = (ci.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-		ia.addr = aa;
-		aa += ClumpSize + ci.size;
-		if(ia.type != VtCorruptType)
-			insertscore(ci.score, &ia, 0);
-	}
+	u32int h;
+	IEntry **l;
+	
+	h = hashbits(ie->score, ih->bits);
+	for(l=&ih->table[h]; *l; l=&(*l)->nexthash)
+		if(*l == ie){
+			*l = ie->nexthash;
+			return;
+		}
+	fprint(2, "warning: %s %V not found in ihashdelete\n", what, ie->score);
 }
 
-int
-_lookupscore(u8int *score, int type, IAddr *ia, int *rac)
+static void
+ihashinsert(IHash *ih, IEntry *ie)
 {
 	u32int h;
-	IEntry *ie, *last;
+	
+	h = hashbits(ie->score, ih->bits);
+	ie->nexthash = ih->table[h];
+	ih->table[h] = ie;
+}
 
-	qlock(&icache.lock);
-	h = hashbits(score, icache.bits);
-	last = nil;
-	for(ie = icache.heads[h]; ie != nil; ie = ie->next){
-		if((ie->ia.type == type || type == -1) && scorecmp(ie->score, score)==0){
-			if(last != nil)
-				last->next = ie->next;
-			else
-				icache.heads[h] = ie->next;
-			addstat(StatIcacheHit, 1);
-			if(rac)
-				ie->rac = 1;
-			trace(TraceLump, "lookupscore incache");
-			ie->next = icache.heads[h];
-			icache.heads[h] = ie;
 
-			*ia = ie->ia;
-			if(rac)
-				*rac = ie->rac;
-			qunlock(&icache.lock);
-			return 0;
-		}
-		last = ie;
-	}
-	addstat(StatIcacheMiss, 1);
-	qunlock(&icache.lock);
-	return -1;
+/*
+ * IEntry lists.
+ */
+
+static IEntry*
+popout(IEntry *ie)
+{
+	if(ie->prev == nil && ie->next == nil)
+		return ie;
+	ie->prev->next = ie->next;
+	ie->next->prev = ie->prev;
+	ie->next = nil;
+	ie->prev = nil;
+	return ie;
 }
 
+static IEntry*
+poplast(IEntry *list)
+{
+	if(list->prev == list)
+		return nil;
+	return popout(list->prev);
+}
 
+static IEntry*
+pushfirst(IEntry *list, IEntry *ie)
+{
+	popout(ie);
+	ie->prev = list;
+	ie->next = list->next;
+	ie->prev->next = ie;
+	ie->next->prev = ie;
+	return ie;
+}
+
 /*
-ZZZ need to think about evicting the correct IEntry,
-and writing back the wtime.
- * look up data score in the index cache
- * if this fails, pull it in from the disk index table, if it exists.
- *
- * must be called with the lump for this score locked
+ * Arena summary cache.
  */
-int
-lookupscore(u8int *score, int type, IAddr *ia, int *rac)
+struct ISum
 {
-	IEntry d, *ie;
-	u32int h;
-	u64int aa;
-	Arena *load;
-	int i, ret;
-	uint ms;
+	QLock	lock;
+	IEntry	*entries;
+	int	nentries;
+	int	loaded;
+	u64int addr;
+	u64int limit;
+	Arena *arena;
+	int g;
+};
 
-	aa = 0;
-	ms = msec();
-	
-	trace(TraceLump, "lookupscore %V.%d", score, type);
+static ISum*
+scachelookup(u64int addr)
+{
+	int i;
+	ISum *s;
 
-	ret = 0;
-	if(_lookupscore(score, type, ia, rac) < 0){
-		if(loadientry(mainindex, score, type, &d) < 0){
-			ret = -1;
-			goto out;
+	for(i=0; i<icache.nsum; i++){
+		s = icache.sum[i];
+		if(s->addr <= addr && addr < s->limit){
+			if(i > 0){
+				memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+				icache.sum[0] = s;
+			}
+			return s;
 		}
+	}
+	return nil;
+}
 
-		/* failed in cache but found on disk - fill cache. */
-		trace(TraceLump, "lookupscore loaded");
-		addstat(StatIcacheFill, 1);
+static void
+sumclear(ISum *s)
+{
+	int i;
 
-		/*
-		 * no one else can load an entry for this score,
-		 * since we have this score's lump's lock.
-		 */
-		qlock(&icache.lock);
+	for(i=0; i<s->nentries; i++)
+		ihashdelete(icache.shash, &s->entries[i], "scache");
+	s->nentries = 0;
+	s->loaded = 0;
+	s->addr = 0;
+	s->limit = 0;
+	s->arena = nil;
+	s->g = 0;
+}
+
+static ISum*
+scacheevict(void)
+{
+	ISum *s;
+	int i;
 	
-		/*
-		 * If we notice that all the hits are coming from one arena,
-		 * load the table of contents for that arena into the cache.
-		 */
-		load = nil;
-		h = hashbits(score, icache.bits);
-		ie = icachealloc(&d.ia, score);
-		if(icacheprefetch){
-			icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
-			aa = ie->ia.addr - aa;	/* compute base addr of arena */
-			for(i=0; i<nelem(icache.last); i++)
-				if(icache.last[i] != icache.last[0])
-					break;
-			if(i==nelem(icache.last) && icache.lastload != icache.last[0]){
-				load = icache.last[0];
-				icache.lastload = load;
+	for(i=icache.nsum-1; i>=0; i--){
+		s = icache.sum[i];
+		if(canqlock(&s->lock)){
+			if(i > 0){
+				memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+				icache.sum[0] = s;
 			}
+			sumclear(s);
+			return s;
 		}
-	
-		ie->next = icache.heads[h];
-		icache.heads[h] = ie;
-	
-		*ia = ie->ia;
-		*rac = ie->rac;
-	
-		qunlock(&icache.lock);
-		if(load){
-			trace(TraceProc, "preload 0x%llux", aa);
-			loadarenaclumps(load, aa);
-		}
 	}
+	return nil;
+}
 
-out:
-	ms = msec() - ms;
-	addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
-
-	return ret;
+static void
+scachehit(u64int addr)
+{
+	scachelookup(addr);	/* for move-to-front */
 }
 
-/*
- * insert a new element in the hash table.
- */
-int
-insertscore(u8int *score, IAddr *ia, int write)
+static void
+scachesetup(ISum *s, u64int addr)
 {
-	IEntry *ie, se;
-	u32int h;
+	u64int addr0, limit;
+	int g;
 
-	trace(TraceLump, "insertscore enter");
-	if(write)
-		addstat(StatIcacheWrite, 1);
-	else
-		addstat(StatIcachePrefetch, 1);
+	s->arena = amapitoag(mainindex, addr, &addr0, &limit, &g);
+	s->addr = addr0;
+	s->limit = limit;
+	s->g = g;
+}
 
-	qlock(&icache.lock);
-	h = hashbits(score, icache.bits);
+static void
+scacheload(ISum *s)
+{
+	int i, n;
 
-	ie = icachealloc(ia, score);
-	if(write){
-		icache.ndirty++;
-		setstat(StatIcacheDirty, icache.ndirty);
-		delaykickicache();
-		ie->dirty = 1;
+	s->loaded = 1;
+	n = asumload(s->arena, s->g, s->entries, ArenaCIGSize);
+	/*
+	 * n can be less then ArenaCIGSize, either if the clump group
+	 * is the last in the arena and is only partially filled, or if there
+	 * are corrupt clumps in the group -- those are not returned.
+	 */
+	for(i=0; i<n; i++){
+		s->entries[i].ia.addr += s->addr;
+		ihashinsert(icache.shash, &s->entries[i]);
 	}
-	ie->next = icache.heads[h];
-	icache.heads[h] = ie;
+//fprint(2, "%T scacheload %s %d - %d entries\n", s->arena->name, s->g, n);
+	addstat(StatScachePrefetch, n);
+	s->nentries = n;
+}
 
-	se = *ie;
+static ISum*
+scachemiss(u64int addr)
+{
+	ISum *s;
+
+	s = scachelookup(addr);
+	if(s == nil){
+		/* first time: make an entry in the cache but don't populate it yet */
+		s = scacheevict();
+		if(s == nil)
+			return nil;
+		scachesetup(s, addr);
+		qunlock(&s->lock);
+		return nil;
+	}
+
+	/* second time: load from disk */
+	qlock(&s->lock);
+	if(s->loaded || !icacheprefetch){
+		qunlock(&s->lock);
+		return nil;
+	}
+	
+	return s;	/* locked */
+}
+
+/*
+ * Index cache.
+ */
+
+void
+initicache(u32int mem0)
+{
+	u32int mem;
+	int i, entries, scache;
+	
+	icache.full.l = &icache.lock;
+
+	mem = mem0;
+	entries = mem / (sizeof(IEntry)+sizeof(IEntry*));
+	scache = (entries/8) / ArenaCIGSize;
+	entries -= entries/8;
+	if(scache < 4)
+		scache = 4;
+	if(scache > 16)
+		scache = 16;
+	if(entries < 1000)
+		entries = 1000;
+fprint(2, "icache %,d bytes = %,d entries; %d scache\n", mem0, entries, scache);
+
+	icache.clean.prev = icache.clean.next = &icache.clean;
+	icache.dirty.prev = icache.dirty.next = &icache.dirty;
+	icache.free.prev = icache.free.next = &icache.free;
+	
+	icache.hash = mkihash(entries);
+	icache.nentries = entries;
+	setstat(StatIcacheSize, entries);
+	icache.entries = vtmallocz(entries*sizeof icache.entries[0]);
+	icache.maxdirty = entries / 2;
+	for(i=0; i<entries; i++)
+		pushfirst(&icache.free, &icache.entries[i]);
+
+	icache.nsum = scache;
+	icache.sum = vtmallocz(scache*sizeof icache.sum[0]);
+	icache.sum[0] = vtmallocz(scache*sizeof icache.sum[0][0]);
+	icache.nsentries = scache * ArenaCIGSize;
+	icache.sentries = vtmallocz(scache*ArenaCIGSize*sizeof icache.sentries[0]);
+	icache.shash = mkihash(scache*ArenaCIGSize);
+	for(i=0; i<scache; i++){
+		icache.sum[i] = icache.sum[0] + i;
+		icache.sum[i]->entries = icache.sentries + i*ArenaCIGSize;
+	}
+}
+
+
+static IEntry*
+evictlru(void)
+{
+	IEntry *ie;
+	
+	ie = poplast(&icache.clean);
+	if(ie == nil)
+		return nil;
+	ihashdelete(icache.hash, ie, "evictlru");
+	return ie;
+}
+
+static void
+icacheinsert(u8int score[VtScoreSize], IAddr *ia, int state)
+{
+	IEntry *ie;
+
+	if((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
+		addstat(StatIcacheStall, 1);
+		while((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
+			// Could safely return here if state == IEClean.
+			// But if state == IEDirty, have to wait to make
+			// sure we don't lose an index write.  
+			// Let's wait all the time.
+			flushdcache();
+			kickicache();
+			rsleep(&icache.full);
+		}
+		addstat(StatIcacheStall, -1);
+	}
+
+	memmove(ie->score, score, VtScoreSize);
+	ie->state = state;
+	ie->ia = *ia;
+	if(state == IEClean){
+		addstat(StatIcachePrefetch, 1);
+		pushfirst(&icache.clean, ie);
+	}else{
+		addstat(StatIcacheWrite, 1);
+		assert(state == IEDirty);
+		icache.ndirty++;
+		setstat(StatIcacheDirty, icache.ndirty);
+		delaykickicache();
+		pushfirst(&icache.dirty, ie);
+	}
+	ihashinsert(icache.hash, ie);
+}
+
+int
+icachelookup(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+	IEntry *ie;
+
+	qlock(&icache.lock);
+	addstat(StatIcacheLookup, 1);
+	if((ie = ihashlookup(icache.hash, score, type)) != nil){
+		*ia = ie->ia;
+		if(ie->state == IEClean)
+			pushfirst(&icache.clean, ie);
+		addstat(StatIcacheHit, 1);
+		qunlock(&icache.lock);
+		return 0;
+	}
+
+	if((ie = ihashlookup(icache.shash, score, type)) != nil){
+		*ia = ie->ia;
+		icacheinsert(score, &ie->ia, IEClean);
+		scachehit(ie->ia.addr);
+		addstat(StatScacheHit, 1);
+		qunlock(&icache.lock);
+		return 0;
+	}
+	addstat(StatIcacheMiss, 1);
 	qunlock(&icache.lock);
 
-	if(write && icache.ndirty >= icache.maxdirty)
+	return -1;
+}
+
+int
+insertscore(u8int score[VtScoreSize], IAddr *ia, int state)
+{
+	ISum *toload;
+
+	qlock(&icache.lock);
+	icacheinsert(score, ia, state);
+	if(state == IEClean)
+		toload = scachemiss(ia->addr);
+	else{
+		assert(state == IEDirty);
+		toload = nil;
+	}
+	qunlock(&icache.lock);
+	if(toload){
+		scacheload(toload);
+		qunlock(&toload->lock);
+	}
+	
+	if(icache.ndirty >= icache.maxdirty)
 		kickicache();
 
 	/*
@@ -240,125 +425,81 @@ insertscore(u8int *score, IAddr *ia, int write)
 	 * the lump, meaning any searches for this block
 	 * will hit in the lump cache until after we return.
 	 */
-	markbloomfilter(mainindex->bloom, score);
+	if(state == IEDirty)
+		markbloomfilter(mainindex->bloom, score);
 
 	return 0;
 }
 
-/*
- * allocate a index cache entry which hasn't been used in a while.
- * must be called with icache.lock locked
- * if the score is already in the table, update the entry.
- */
-static IEntry *
-icachealloc(IAddr *ia, u8int *score)
+static int
+lookupscore_untimed(u8int score[VtScoreSize], int type, IAddr *ia)
 {
-	int i;
-	IEntry *ie, *last, *clean, *lastclean;
-	u32int h;
+	IEntry d;
 
-	h = hashbits(score, icache.bits);
-	last = nil;
-	for(ie = icache.heads[h]; ie != nil; ie = ie->next){
-		if(ie->ia.type == ia->type && scorecmp(ie->score, score)==0){
-			if(last != nil)
-				last->next = ie->next;
-			else
-				icache.heads[h] = ie->next;
-			trace(TraceLump, "icachealloc hit");
-			ie->rac = 1;
-			return ie;
-		}
-		last = ie;
-	}
+	if(icachelookup(score, type, ia) >= 0)
+		return 0;
 
-	h = icache.unused;
-	if(h < icache.entries){
-		ie = &icache.base[h++];
-		icache.unused = h;
-		trace(TraceLump, "icachealloc unused");
-		goto Found;
-	}
+	addstat(StatIcacheFill, 1);
+	if(loadientry(mainindex, score, type, &d) < 0)
+		return -1;
 	
-	if((ie = icache.free) != nil){
-		icache.free = ie->next;
-		goto Found;
-	}
+	insertscore(score, &d.ia, IEClean);
+	*ia = d.ia;
+	return 0;
+}
 
-	h = icache.stolen;
-	for(i=0;; i++){
-		h++;
-		if(h >= icache.size)
-			h = 0;
-		if(i == icache.size){
-			trace(TraceLump, "icachealloc sleep");
-			addstat(StatIcacheStall, 1);
-			while(icache.ndirty == icache.entries){
-				/*
-				 * This is a bit suspect.  Kickicache will wake up the
-				 * icachewritecoord, but if all the index entries are for
-				 * unflushed disk blocks, icachewritecoord won't be
-				 * able to do much.  It always rewakes everyone when
-				 * it thinks it is done, though, so at least we'll go around
-				 * the while loop again.  Also, if icachewritecoord sees
-				 * that the disk state hasn't change at all since the last
-				 * time around, it kicks the disk.  This needs to be
-				 * rethought, but it shouldn't deadlock anymore.
-				 */
-				kickicache();
-				rsleep(&icache.full);
-			}
-			addstat(StatIcacheStall, -1);
-			i = 0;
-		}
-		lastclean = nil;
-		clean = nil;
-		last = nil;
-		for(ie=icache.heads[h]; ie; last=ie, ie=ie->next){
-			if(!ie->dirty){
-				clean = ie;
-				lastclean = last;
-			}
-		}
-		if(clean){
-			if(lastclean)
-				lastclean->next = clean->next;
-			else
-				icache.heads[h] = clean->next;
-			clean->next = nil;
-			icache.stolen = h;
-			ie = clean;
-			trace(TraceLump, "icachealloc steal");
-			goto Found;
-		}
-	}
+int
+lookupscore(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+	int ms, ret;
+	
+	ms = msec();
+	ret = lookupscore_untimed(score, type, ia);
+	ms = msec() - ms;
+	addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
+	return ret;
+}
+	
+u32int
+hashbits(u8int *sc, int bits)
+{
+	u32int v;
 
-Found:
-	ie->ia = *ia;
-	scorecp(ie->score, score);
-	ie->rac = 0;	
-	return ie;
+	v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3];
+	if(bits < 32)
+		 v >>= (32 - bits);
+	return v;
 }
 
+ulong
+icachedirtyfrac(void)
+{
+	return (vlong)icache.ndirty*IcacheFrac / icache.nentries;
+}
+
+/*
+ * Return a singly-linked list of dirty index entries.
+ * with 32-bit hash numbers between lo and hi
+ * and address < limit.
+ */
 IEntry*
 icachedirty(u32int lo, u32int hi, u64int limit)
 {
-	int i;
 	u32int h;
 	IEntry *ie, *dirty;
 
 	dirty = nil;
 	trace(TraceProc, "icachedirty enter");
 	qlock(&icache.lock);
-	for(i=0; i<icache.size; i++)
-	for(ie = icache.heads[i]; ie; ie=ie->next)
-		if(ie->dirty && ie->ia.addr != 0 && ie->ia.addr < limit){
+	for(ie = icache.dirty.next; ie != &icache.dirty; ie=ie->next){
+		if(ie->state == IEDirty && ie->ia.addr < limit){
 			h = hashbits(ie->score, 32);
 			if(lo <= h && h <= hi){
 				ie->nextdirty = dirty;
 				dirty = ie;
 			}
 		}
+	}
 	qunlock(&icache.lock);
 	trace(TraceProc, "icachedirty exit");
 	if(dirty == nil)
@@ -366,36 +507,49 @@ icachedirty(u32int lo, u32int hi, u64int limit)
 	return dirty;
 }
 
+
+/*
+ * The singly-linked non-circular list of index entries ie
+ * has been written to disk.  Move them to the clean list.
+ */
 void
 icacheclean(IEntry *ie)
 {
-	trace(TraceProc, "icachedirty enter");
+	IEntry *next;
+	
+	trace(TraceProc, "icacheclean enter");
 	qlock(&icache.lock);
-	for(; ie; ie=ie->nextdirty){
+	for(; ie; ie=next){
+		assert(ie->state == IEDirty);
+		next = ie->nextdirty;
+		ie->nextdirty = nil;
+		popout(ie); /* from icache.dirty */
 		icache.ndirty--;
-		ie->dirty = 0;
+		ie->state = IEClean;
+		pushfirst(&icache.clean, ie);
 	}
 	setstat(StatIcacheDirty, icache.ndirty);
 	rwakeupall(&icache.full);
 	qunlock(&icache.lock);
-	trace(TraceProc, "icachedirty exit");
+	trace(TraceProc, "icacheclean exit");
 }
 
 void
 emptyicache(void)
 {
 	int i;
-	IEntry *ie, **lie;
+	IEntry *ie;
+	ISum *s;
 	
 	qlock(&icache.lock);
-	for(i=0; i<icache.size; i++)
-	for(lie=&icache.heads[i]; (ie=*lie); ){
-		if(ie->dirty == 0){
-			*lie = ie->next;
-			ie->next = icache.free;
-			icache.free = ie;
-		}else
-			lie = &ie->next;
+	while((ie = evictlru()) != nil)
+		pushfirst(&icache.free, ie);
+	for(i=0; i<icache.nsum; i++){
+		s = icache.sum[i];
+		qlock(&s->lock);
+		sumclear(s);
+		qunlock(&s->lock);
 	}
 	qunlock(&icache.lock);
 }
+
blob - 49344c9f1de978cbad3c5181952e2a5e320c46f2
blob + 0805001c65aa70f92dffb8fc197778ff13d1e925
--- src/cmd/venti/srv/icachewrite.c
+++ src/cmd/venti/srv/icachewrite.c
@@ -45,6 +45,16 @@ initicachewrite(void)
 	vtproc(delaykickroundproc, &iwrite.round);
 }
 
+static u64int
+ie2diskaddr(Index *ix, ISect *is, IEntry *ie)
+{
+	u64int bucket, addr;
+
+	bucket = hashbits(ie->score, 32)/ix->div;
+	addr = is->blockbase + ((bucket - is->start) << is->blocklog);
+	return addr;
+}
+
 static IEntry*
 nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *paddr, uint *pnbuf)
 {
@@ -55,13 +65,13 @@ nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *
 
 	bsize = 1<<is->blocklog;
 	iefirst = *pie;
-	addr = is->blockbase + ((u64int)(hashbits(iefirst->score, 32) / ix->div - is->start) << is->blocklog);
+	addr = ie2diskaddr(ix, is, iefirst);
 	nbuf = 0;
-	for(l=&iefirst->nextdirty; (ie=*l)!=nil; l=&(*l)->nextdirty){
-		naddr = is->blockbase + ((u64int)(hashbits(ie->score, 32) / ix->div - is->start) << is->blocklog);
+	for(l = &iefirst->nextdirty; (ie = *l) != nil; l = &(*l)->nextdirty){
+		naddr = ie2diskaddr(ix, is, ie);
 		if(naddr - addr >= Bufsize)
 			break;
-		nbuf = naddr-addr;
+		nbuf = naddr - addr;
 	}
 	nbuf += bsize;
 
@@ -75,7 +85,7 @@ nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *
 static int
 icachewritesect(Index *ix, ISect *is, u8int *buf)
 {
-	int err, h, bsize, t;
+	int err, i, werr, h, bsize, t;
 	u32int lo, hi;
 	u64int addr, naddr;
 	uint nbuf, off;
@@ -89,29 +99,32 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
 	else
 		hi = is->stop * ix->div - 1;
 
-	trace(TraceProc, "icachewritesect enter %ud %ud %llud", lo, hi, iwrite.as.aa);
+	trace(TraceProc, "icachewritesect enter %ud %ud %llud",
+		lo, hi, iwrite.as.aa);
 
 	iedirty = icachedirty(lo, hi, iwrite.as.aa);
 	iedirty = iesort(iedirty);
-	bsize = 1<<is->blocklog;
+	bsize = 1 << is->blocklog;
 	err = 0;
 
 	while(iedirty){
 		disksched();
-		while((t=icachesleeptime) == SleepForever){
+		while((t = icachesleeptime) == SleepForever){
 			sleep(1000);
 			disksched();
 		}
 		if(t < minicachesleeptime)
 			t = minicachesleeptime;
-		sleep(t);
+		if(t > 0)
+			sleep(t);
 		trace(TraceProc, "icachewritesect nextchunk");
 		chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf);
 
-		trace(TraceProc, "icachewritesect readpart 0x%llux+0x%ux", addr, nbuf);
+		trace(TraceProc, "icachewritesect readpart 0x%llux+0x%ux",
+			addr, nbuf);
 		if(readpart(is->part, addr, buf, nbuf) < 0){
-			/* XXX more details here */
-			fprint(2, "icachewriteproc readpart: %r\n");
+			fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
+				"readpart: %r\n", argv0, is->part->name, addr);
 			err  = -1;
 			continue;
 		}
@@ -120,31 +133,34 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
 		addstat(StatIsectRead, 1);
 
 		for(l=&chunk; (ie=*l)!=nil; l=&ie->nextdirty){
-		again:
-			naddr = is->blockbase + ((u64int)(hashbits(ie->score, 32) / ix->div - is->start) << is->blocklog);
+again:
+			naddr = ie2diskaddr(ix, is, ie);
 			off = naddr - addr;
 			if(off+bsize > nbuf){
-				fprint(2, "whoops! addr=0x%llux nbuf=%ud addr+nbuf=0x%llux naddr=0x%llux\n",
-					addr, nbuf, addr+nbuf, naddr);
+				fprint(2, "%s: whoops! addr=0x%llux nbuf=%ud "
+					"addr+nbuf=0x%llux naddr=0x%llux\n",
+					argv0, addr, nbuf, addr+nbuf, naddr);
 				assert(off+bsize <= nbuf);
 			}
 			unpackibucket(&ib, buf+off, is->bucketmagic);
 			if(okibucket(&ib, is) < 0){
-				fprint(2, "bad bucket XXX\n");
+				fprint(2, "%s: bad bucket XXX\n", argv0);
 				goto skipit;
 			}
-			trace(TraceProc, "icachewritesect add %V at 0x%llux", ie->score, naddr);
+			trace(TraceProc, "icachewritesect add %V at 0x%llux",
+				ie->score, naddr);
 			h = bucklook(ie->score, ie->ia.type, ib.data, ib.n);
 			if(h & 1){
 				h ^= 1;
 				packientry(ie, &ib.data[h]);
 			}else if(ib.n < is->buckmax){
-				memmove(&ib.data[h+IEntrySize], &ib.data[h], ib.n*IEntrySize - h);
+				memmove(&ib.data[h + IEntrySize], &ib.data[h],
+					ib.n*IEntrySize - h);
 				ib.n++;
 				packientry(ie, &ib.data[h]);
 			}else{
-				fprint(2, "bucket overflow XXX\n");
-			skipit:
+				fprint(2, "%s: bucket overflow XXX\n", argv0);
+skipit:
 				err = -1;
 				*l = ie->nextdirty;
 				ie = *l;
@@ -154,33 +170,29 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
 					break;
 			}
 			packibucket(&ib, buf+off, is->bucketmagic);
-			/* XXX
-			 * This is not quite right - it's good that we 
-			 * update the cached block (if any) here, but
-			 * since the block doesn't get written until writepart
-			 * below, we also need to make sure that the cache 
-			 * doesn't load the stale block before we write it to
-			 * disk below.  We could lock the disk cache during
-			 * the writepart, but that's pretty annoying.
-			 * Another possibility would be never to cache
-			 * index partition blocks.  The hit rate on those is
-			 * miniscule anyway.
-			 */
-			if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){
-				memmove(b->data, buf+off, bsize);
-				putdblock(b);
-			}
 		}
 
 		diskaccess(1);
 
 		trace(TraceProc, "icachewritesect writepart", addr, nbuf);
-		if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0){
-			/* XXX more details here */
-			fprint(2, "icachewriteproc writepart: %r\n");
+		werr = 0;
+		if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0)
+			werr = -1;
+
+		for(i=0; i<nbuf; i+=bsize){
+			if((b = _getdblock(is->part, addr+i, ORDWR, 0)) != nil){
+				memmove(b->data, buf+i, bsize);
+				putdblock(b);
+			}
+		}
+
+		if(werr < 0){
+			fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
+				"writepart: %r\n", argv0, is->part->name, addr);
 			err = -1;
 			continue;
 		}
+		
 		addstat(StatIsectWriteBytes, nbuf);
 		addstat(StatIsectWrite, 1);
 		icacheclean(chunk);
blob - e5d5460aea59905ea4c6c36a83d86beb225a31e0
blob + 36d96b941608bebbb22f4e1eecab5475abe22976
--- src/cmd/venti/srv/ifile.c
+++ src/cmd/venti/srv/ifile.c
@@ -2,46 +2,57 @@
 #include "dat.h"
 #include "fns.h"
 
+static char vcmagic[] = "venti config\n";
+
+enum {
+	Maxconfig = 8 * 1024,
+	Maglen = sizeof vcmagic - 1,
+};
+
 int
 readifile(IFile *f, char *name)
 {
-	int m;
 	Part *p;
 	ZBlock *b;
 	u8int *z;
-	
+
 	p = initpart(name, OREAD);
 	if(p == nil)
 		return -1;
-	b = alloczblock(8192, 1, 0);
+	b = alloczblock(Maxconfig+1, 1, 0);
 	if(b == nil){
 		seterr(EOk, "can't alloc for %s: %R", name);
 		return -1;
 	}
 	if(p->size > PartBlank){
 		/*
-		 * this is likely a real venti partition, in which case
-		 * we're looking for the config file stored as 8k at end of PartBlank.
+		 * this is likely a real venti partition, in which case we're
+		 * looking for the config file stored as 8k at end of PartBlank.
 		 */
-		if(readpart(p, PartBlank-8192, b->data, 8192) < 0){
+		if(readpart(p, PartBlank-Maxconfig, b->data, Maxconfig) < 0){
 			seterr(EOk, "can't read %s: %r", name);
 			freezblock(b);
 			freepart(p);
 			return -1;
 		}
-		m = 5+1+6+1;
-		if(memcmp(b->data, "venti config\n", m) != 0){
+		b->data[Maxconfig] = '\0';
+		if(memcmp(b->data, vcmagic, Maglen) != 0){
 			seterr(EOk, "bad venti config magic in %s", name);
 			freezblock(b);
 			freepart(p);
 			return -1;
 		}
-		b->data += m;
-		b->len -= m;
-		z = memchr(b->data, 0, b->len);
+		/*
+		 * if we change b->data+b->_size, freezblock
+		 * will blow an assertion, so don't.
+		 */
+		b->data  += Maglen;
+		b->_size -= Maglen;
+		b->len   -= Maglen;
+		z = memchr(b->data, '\0', b->len);
 		if(z)
 			b->len = z - b->data;
-	}else if(p->size > 8192){
+	}else if(p->size > Maxconfig){
 		seterr(EOk, "config file is too large");
 		freepart(p);
 		freezblock(b);
blob - a5ffeef9cab59fd8a4d5551bf29c7cf3fdc1605c
blob + dd49e05505945c6cbe456ca79eadd7c2a134869e
--- src/cmd/venti/srv/index.c
+++ src/cmd/venti/srv/index.c
@@ -596,6 +596,25 @@ print("want arena %d for %llux\n", l, a);
 	return ix->arenas[l];
 }
 
+/*
+ * convert an arena index to the bounds of the containing arena group.
+ */
+Arena*
+amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g)
+{
+	u64int aa;
+	Arena *arena;
+	
+	arena = amapitoa(ix, a, &aa);
+	if(arena == nil)
+		return nil;
+	if(arenatog(arena, aa, gstart, glimit, g) < 0)
+		return nil;
+	*gstart += a - aa;
+	*glimit += a - aa;
+	return arena;
+}
+
 int
 iaddrcmp(IAddr *ia1, IAddr *ia2)
 {
blob - 1db62717194ec1ccfc178ce703c1a007b8775f99
blob + 206d5e064352953fc097a3c95e2440e9d440ed0f
--- src/cmd/venti/srv/lump.c
+++ src/cmd/venti/srv/lump.c
@@ -7,7 +7,7 @@ int			queuewrites = 0;
 int			writestodevnull = 0;
 int			verifywrites = 0;
 
-static Packet		*readilump(Lump *u, IAddr *ia, u8int *score, int rac);
+static Packet		*readilump(Lump *u, IAddr *ia, u8int *score);
 
 /*
  * Some of this logic is duplicated in hdisk.c
@@ -19,7 +19,6 @@ readlump(u8int *score, int type, u32int size, int *cac
 	Packet *p;
 	IAddr ia;
 	u32int n;
-	int rac;
 
 	trace(TraceLump, "readlump enter");
 /*
@@ -49,7 +48,7 @@ readlump(u8int *score, int type, u32int size, int *cac
 	if(cached)
 		*cached = 0;
 
-	if(lookupscore(score, type, &ia, &rac) < 0){
+	if(lookupscore(score, type, &ia) < 0){
 		/* ZZZ place to check for someone trying to guess scores */
 		seterr(EOk, "no block with score %V/%d exists", score, type);
 
@@ -64,7 +63,7 @@ readlump(u8int *score, int type, u32int size, int *cac
 	}
 
 	trace(TraceLump, "readlump readilump");
-	p = readilump(u, &ia, score, rac);
+	p = readilump(u, &ia, score);
 	putlump(u);
 
 	trace(TraceLump, "readlump exit");
@@ -134,9 +133,8 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 	Packet *old;
 	IAddr ia;
 	int ok;
-	int rac;
 
-	if(lookupscore(u->score, u->type, &ia, &rac) == 0){
+	if(lookupscore(u->score, u->type, &ia) == 0){
 		if(verifywrites == 0){
 			/* assume the data is here! */
 			packetfree(p);
@@ -149,7 +147,7 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 		 * if the read fails,
 		 * assume it was corrupted data and store the block again
 		 */
-		old = readilump(u, &ia, u->score, rac);
+		old = readilump(u, &ia, u->score);
 		if(old != nil){
 			ok = 0;
 			if(packetcmp(p, old) != 0){
@@ -176,7 +174,7 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 	ok = storeclump(mainindex, flat, u->score, u->type, creator, &ia);
 	freezblock(flat);
 	if(ok == 0)
-		ok = insertscore(u->score, &ia, 1);
+		ok = insertscore(u->score, &ia, IEDirty);
 	if(ok == 0)
 		insertlump(u, p);
 	else
@@ -191,41 +189,16 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 	ms = msec() - ms;
 	addstat2(StatRpcWriteNew, 1, StatRpcWriteNewTime, ms);
 	return ok;
-}
-
-static void
-lreadahead(u64int a, Arena *arena, u64int aa, int n)
-{	
-	u8int buf[ClumpSize];
-	Clump cl;
-	IAddr ia;
-
-	while(n > 0) {
-		if (aa >= arena->memstats.used)
-			break;
-		if(readarena(arena, aa, buf, ClumpSize) < ClumpSize)
-			break;
-		if(unpackclump(&cl, buf, arena->clumpmagic) < 0)
-			break;
-		ia.addr = a;
-		ia.type = cl.info.type;
-		ia.size = cl.info.uncsize;
-		ia.blocks = (cl.info.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-		insertscore(cl.info.score, &ia, 0);
-		a += ClumpSize + cl.info.size;
-		aa += ClumpSize + cl.info.size;
-		n--;
-	}
 }
 
 static Packet*
-readilump(Lump *u, IAddr *ia, u8int *score, int rac)
+readilump(Lump *u, IAddr *ia, u8int *score)
 {
 	Arena *arena;
 	ZBlock *zb;
 	Packet *p, *pp;
 	Clump cl;
-	u64int a, aa;
+	u64int aa;
 	u8int sc[VtScoreSize];
 
 	trace(TraceLump, "readilump enter");
@@ -258,13 +231,6 @@ readilump(Lump *u, IAddr *ia, u8int *score, int rac)
 		return nil;
 	}
 
-	if(rac == 0) {
-		trace(TraceLump, "readilump readahead");
-		a = ia->addr + ClumpSize + cl.info.size;
-		aa += ClumpSize + cl.info.size;
-		lreadahead(a, arena, aa, 20);
-	}
-
 	trace(TraceLump, "readilump success");
 	p = zblock2packet(zb, cl.info.uncsize);
 	freezblock(zb);
blob - 3a66bf6dea1ab67363c6e819cad4e82f4d9a7fa3
blob + 5ee4d91fc9802c1d237df60ecbb8afa0e20f523c
--- src/cmd/venti/srv/stats.c
+++ src/cmd/venti/srv/stats.c
@@ -60,6 +60,9 @@ Statdesc statdesc[NStat] =
 	{ "index cache flushes", },
 	{ "index cache stalls", },
 	{ "index cache read time", },
+	{ "index cache lookups" },
+	{ "index cache summary hits" },
+	{ "index cache summary prefetches" },
 
 	{ "bloom filter hits", },
 	{ "bloom filter misses", },
@@ -81,6 +84,9 @@ Statdesc statdesc[NStat] =
 
 	{ "sum reads", },
 	{ "sum read bytes", },
+	
+	{ "cig loads" },
+	{ "cig load time" },
 };
 
 QLock statslock;
blob - 72d45f18fc96a59c97c9fa2dc4256d0ae59c97b9
blob + fb3c4ce284077e58148da0916b0dbc3adf501e01
--- src/cmd/venti/srv/syncindex.c
+++ src/cmd/venti/srv/syncindex.c
@@ -56,13 +56,7 @@ threadmain(int argc, char *argv[])
 	if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
 	initdcache(bcmem);
 	initlumpcache(1*1024*1024, 1024/8);
-	icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
-	if(icmem < 4)
-		icmem = 4;
-	if(1) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
-		(sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
-		(1 << icmem) * ICacheDepth);
-	initicache(icmem, ICacheDepth);
+	initicache(icmem);
 	initicachewrite();
 	if(mainindex->bloom)
 		startbloomproc(mainindex->bloom);
blob - 6a008bad6e740e3dabdbfd537e141c28b10a5611
blob + 98f6adf11db9d584fc0fc74a81c3775e3230c663
--- src/cmd/venti/srv/syncindex0.c
+++ src/cmd/venti/srv/syncindex0.c
@@ -101,7 +101,7 @@ syncarenaindex(Index *ix, Arena *arena, u32int clump, 
 			}
 			flush = 1;
 			trace(TraceProc, "syncarenaindex insert %V", ci->score);
-			insertscore(ci->score, &ia, 1);
+			insertscore(ci->score, &ia, IEDirty);
 		}
 
 		if(0 && clump / 1000 != (clump + n) / 1000)
blob - fab837824aa568c552f4fb9f5332127948fac8b5
blob + 4d59dfc6d483735ab4be20600894ec472e43a021
--- src/cmd/venti/srv/venti.c
+++ src/cmd/venti/srv/venti.c
@@ -18,7 +18,7 @@ static void	ventiserver(void*);
 void
 usage(void)
 {
-	fprint(2, "usage: venti [-Ldrs] [-a ventiaddr] [-c config] "
+	fprint(2, "usage: venti [-Ldrsw] [-a ventiaddr] [-c config] "
 "[-h httpaddr] [-B blockcachesize] [-C cachesize] [-I icachesize] [-W webroot]\n");
 	threadexitsall("usage");
 }
@@ -73,6 +73,9 @@ threadmain(int argc, char *argv[])
 	case 's':
 		nofork = 1;
 		break;
+	case 'w':			/* compatibility with old venti */
+		queuewrites = 1;
+		break;
 	case 'W':
 		webroot = EARGF(usage());
 		break;
@@ -103,9 +106,6 @@ threadmain(int argc, char *argv[])
 	if(configfile == nil)
 		configfile = "venti.conf";
 
-	if(initarenasum() < 0)
-		fprint(2, "warning: can't initialize arena summing process: %r");
-
 	fprint(2, "conf...");
 	if(initventi(configfile, &config) < 0)
 		sysfatal("can't init server: %r");
@@ -143,13 +143,7 @@ threadmain(int argc, char *argv[])
 		mem, mem / (8 * 1024));
 	initlumpcache(mem, mem / (8 * 1024));
 
-	icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
-	if(icmem < 4)
-		icmem = 4;
-	if(0) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
-		(sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
-		(1 << icmem) * ICacheDepth);
-	initicache(icmem, ICacheDepth);
+	initicache(icmem);
 	initicachewrite();
 
 	/*
@@ -179,6 +173,9 @@ threadmain(int argc, char *argv[])
 		}
 	}
 
+	if(initarenasum() < 0)
+		fprint(2, "warning: can't initialize arena summing process: %r");
+
 	fprint(2, "announce %s...", vaddr);
 	ventisrv = vtlisten(vaddr);
 	if(ventisrv == nil)
@@ -272,5 +269,3 @@ ventiserver(void *v)
 	flushicache();
 	threadexitsall(0);
 }
-
-
blob - 76e9f276ac4ff9ce0c9b5ebfb8aad10f44537833
blob + 64de5bbb5846277af173c6acf3b5765c332e88ab
--- src/cmd/venti/srv/www/stats.js
+++ src/cmd/venti/srv/www/stats.js
@@ -38,6 +38,10 @@ graphname = new Array(
 		"icache dirty %",
 	"arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100",
 		"icache hit %",
+	"arg=scachehit&graph=pctdiff&arg2=icachelookup&max=100",
+		"scache hit %",
+	"arg=icachemiss&graph=pctdiff&arg2=icachelookup&max=100",
+		"icache miss %",
 	"arg=icachelookuptime&graph=divdiff&arg2=icachelookup",
 		"icache lookup time",
 	"arg=icacheprefetch&graph=diff",
@@ -75,6 +79,8 @@ graphname = new Array(
 		"fresh write RPC time",
 	"arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold",
 		"dup write RPC time",
+	"arg=cigloadtime&graph=divdiff&arg2=cigload",
+		"cig load time",
 
 	"arg=sumreadbyte&graph=diff",
 		"checksum bytes/second",
@@ -118,8 +124,11 @@ column1 = new Array(
 	"!icache",
 	"arg=icachedirty&graph=pct&arg2=icachesize&max=100",
 	"arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100",
+	"arg=scachehit&graph=pctdiff&arg2=icachelookup&max=100",
+	"arg=icachemiss&graph=pctdiff&arg2=icachelookup&max=100",
 	"arg=icachewrite&graph=diff",
 	"arg=icacheprefetch&graph=diff",
+	"arg=scacheprefetch&graph=diff",
 	
 	"!dcache",
 	"arg=dcachedirty&graph=pct&arg2=dcachesize&max=100",
@@ -154,6 +163,7 @@ column2 = new Array(
 	"arg=rpcreaduncachedtime&graph=divdiff&arg2=rpcreaduncached",
 	"arg=rpcwritenewtime&graph=divdiff&arg2=rpcwritenew",
 	"arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold",
+	"arg=cigloadtime&graph=divdiff&arg2=cigload",
 	
 	"END"
 )