commit 7a400ee957a0815287af806e18ef90dd18b47f82 from: Russ Cox date: Tue Sep 25 13:47:31 2007 UTC venti: new icache commit - 25a4e89fa907ed5a5f5d84eccfb66180007d9c68 commit + 7a400ee957a0815287af806e18ef90dd18b47f82 blob - 7cd2eb73c4244b555587e0fee267d74a3461e5d8 blob + c576e9aaf4485e7457af1e716ca93e7ed8648a01 --- src/cmd/venti/srv/arena.c +++ src/cmd/venti/srv/arena.c @@ -16,6 +16,7 @@ static int loadarena(Arena *arena); static CIBlock *getcib(Arena *arena, int clump, int writing, CIBlock *rock); static void putcib(Arena *arena, CIBlock *cib); static void sumproc(void *); +static void loadcig(Arena *arena); static QLock sumlock; static Rendez sumwait; @@ -137,14 +138,23 @@ readclumpinfos(Arena *arena, int clump, ClumpInfo *cis CIBlock *cib, r; int i; - for(i = 0; i < n; i++){ + /* + * because the clump blocks are laid out + * in reverse order at the end of the arena, + * it can be a few percent faster to read + * the clumps backwards, which reads the + * disk blocks forwards. + */ + for(i = n-1; i >= 0; i--){ cib = getcib(arena, clump + i, 0, &r); - if(cib == nil) - break; + if(cib == nil){ + n = i; + continue; + } unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]); putcib(arena, cib); } - return i; + return n; } /* @@ -349,7 +359,28 @@ writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64i if(c->info.size < c->info.uncsize) arena->memstats.cclumps++; - clump = arena->memstats.clumps++; + clump = arena->memstats.clumps; + if(clump % ArenaCIGSize == 0){ + if(arena->cig == nil){ + loadcig(arena); + if(arena->cig == nil) + goto NoCIG; + } + /* add aa as start of next cig */ + if(clump/ArenaCIGSize != arena->ncig){ + fprint(2, "bad arena cig computation %s: writing clump %d but %d cigs\n", + arena->name, clump, arena->ncig); + arena->ncig = -1; + vtfree(arena->cig); + arena->cig = nil; + goto NoCIG; + } + arena->cig = vtrealloc(arena->cig, (arena->ncig+1)*sizeof arena->cig[0]); + arena->cig[arena->ncig++].offset = aa; + } +NoCIG: + arena->memstats.clumps++; + if(arena->memstats.clumps == 0) sysfatal("clumps wrapped"); arena->wtime = now(); @@ -752,3 +783,157 @@ putcib(Arena *arena, CIBlock *cib) putdblock(cib->data); cib->data = nil; } + + +/* + * For index entry readahead purposes, the arenas are + * broken into smaller subpieces, called clump info groups + * or cigs. Each cig has ArenaCIGSize clumps (ArenaCIGSize + * is chosen to make the index entries take up about half + * a megabyte). The index entries do not contain enough + * information to determine what the clump index is for + * a given address in an arena. That info is needed both for + * figuring out which clump group an address belongs to + * and for prefetching a clump group's index entries from + * the arena table of contents. The first time clump groups + * are accessed, we scan the entire arena table of contents + * (which might be 10s of megabytes), recording the data + * offset of each clump group. + */ + +/* + * load clump info group information by scanning entire toc. + */ +static void +loadcig(Arena *arena) +{ + u32int i, j, ncig, nci; + ArenaCIG *cig; + ClumpInfo *ci; + u64int offset; + int ms; + + if(arena->cig || arena->ncig < 0) + return; + +// fprint(2, "loadcig %s\n", arena->name); + + ncig = (arena->memstats.clumps+ArenaCIGSize-1) / ArenaCIGSize; + if(ncig == 0){ + arena->cig = vtmalloc(1); + arena->ncig = 0; + return; + } + + ms = msec(); + cig = vtmalloc(ncig*sizeof cig[0]); + ci = vtmalloc(ArenaCIGSize*sizeof ci[0]); + offset = 0; + for(i=0; incig = -1; + fprint(2, "loadcig %s: got %ud cigs, expected %ud\n", arena->name, i+1, ncig); + goto out; + } + } + } + vtfree(ci); + + arena->ncig = ncig; + arena->cig = cig; + +out: + ms = msec() - ms; + addstat2(StatCigLoad, 1, StatCigLoadTime, ms); +} + +/* + * convert arena address into arena group + data boundaries. + */ +int +arenatog(Arena *arena, u64int addr, u64int *gstart, u64int *glimit, int *g) +{ + int r, l, m; + + qlock(&arena->lock); + if(arena->cig == nil) + loadcig(arena); + if(arena->cig == nil || arena->ncig == 0){ + qunlock(&arena->lock); + return -1; + } + + l = 1; + r = arena->ncig - 1; + while(l <= r){ + m = (r + l) / 2; + if(arena->cig[m].offset <= addr) + l = m + 1; + else + r = m - 1; + } + l--; + + *g = l; + *gstart = arena->cig[l].offset; + if(l+1 < arena->ncig) + *glimit = arena->cig[l+1].offset; + else + *glimit = arena->memstats.used; + qunlock(&arena->lock); + return 0; +} + +/* + * load the clump info for group g into the index entries. + */ +int +asumload(Arena *arena, int g, IEntry *entries, int nentries) +{ + int i, base, limit; + u64int addr; + ClumpInfo ci; + IEntry *ie; + + if(nentries < ArenaCIGSize){ + fprint(2, "asking for too few entries\n"); + return -1; + } + + qlock(&arena->lock); + if(arena->cig == nil) + loadcig(arena); + if(arena->cig == nil || arena->ncig == 0 || g >= arena->ncig){ + qunlock(&arena->lock); + return -1; + } + + addr = 0; + base = g*ArenaCIGSize; + limit = base + ArenaCIGSize; + if(base > arena->memstats.clumps) + base = arena->memstats.clumps; + ie = entries; + for(i=base; iscore, ci.score); + ie->ia.type = ci.type; + ie->ia.size = ci.uncsize; + ie->ia.blocks = (ci.size + ClumpSize + (1<> ABlockLog; + ie->ia.addr = addr; + ie++; + } + addr += ClumpSize + ci.size; + } + qunlock(&arena->lock); + return ie - entries; +} blob - ef0e4ec94a5e154b897578e11fa2a83406a2972c blob + ca95573024d3695075757d5496f3cc729865cff3 --- src/cmd/venti/srv/checkindex.c +++ src/cmd/venti/srv/checkindex.c @@ -8,7 +8,7 @@ static void phdr(DBlock *eb) { static int did; - + if(!did){ did = 1; print("# diff actual correct\n"); @@ -168,7 +168,7 @@ checkbloom(Bloom *b1, Bloom *b2, int fix) { u32int *a1, *a2; int i, n, extra, missing; - + if(b1==nil && b2==nil) return 0; if(b1==nil || b2==nil){ @@ -188,13 +188,14 @@ checkbloom(Bloom *b1, Bloom *b2, int fix) missing = 0; for(i=BloomHeadSize/4; iscore, p); p += VtScoreSize; - ie->wtime = U32GET(p); + /* ie->wtime = U32GET(p); */ p += U32Size; - ie->train = U16GET(p); + /* ie->train = U16GET(p); */ p += U16Size; if(p - buf != IEntryAddrOff) sysfatal("unpackentry bad IEntryAddrOff amount"); @@ -613,9 +613,9 @@ packientry(IEntry *ie, u8int *buf) scorecp(p, ie->score); p += VtScoreSize; - U32PUT(p, ie->wtime); + U32PUT(p, 0); /* wtime */ p += U32Size; - U16PUT(p, ie->train); + U16PUT(p, 0); /* train */ p += U16Size; U64PUT(p, ie->ia.addr, t32); p += U64Size; blob - 07be62f4c0f5d2249fbd4f42c55b4ec5043f903c blob + e0a9e18a538aa0abd449716267afed83c71128d3 --- src/cmd/venti/srv/dat.h +++ src/cmd/venti/srv/dat.h @@ -3,6 +3,7 @@ typedef struct AMap AMap; typedef struct AMapN AMapN; typedef struct Arena Arena; typedef struct AState AState; +typedef struct ArenaCIG ArenaCIG; typedef struct ArenaHead ArenaHead; typedef struct ArenaPart ArenaPart; typedef struct ArenaTail ArenaTail; @@ -28,8 +29,10 @@ typedef struct ZBlock ZBlock; typedef struct Round Round; typedef struct Bloom Bloom; -#define TWID32 ((u32int)~(u32int)0) -#define TWID64 ((u64int)~(u64int)0) +#pragma incomplete IEStream + +#define TWID32 ((u32int)~(u32int)0) +#define TWID64 ((u64int)~(u64int)0) #define TWID8 ((u8int)~(u8int)0) enum @@ -44,7 +47,6 @@ enum IndexBase = 1024*1024, /* initial address to use in an index */ MaxIo = 64*1024, /* max size of a single read or write operation */ ICacheBits = 16, /* default bits for indexing icache */ - ICacheDepth = 4, /* default depth of an icache hash chain */ MaxAMap = 2*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */ /* @@ -147,6 +149,8 @@ enum DirtyArenaCib, DirtyArenaTrailer, DirtyMax, + + ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry. VentiZZZZZZZZ }; @@ -371,8 +375,16 @@ struct Arena u32int ctime; /* first time a block was written */ u32int wtime; /* last time a block was written */ u32int clumpmagic; + + ArenaCIG *cig; + int ncig; }; +struct ArenaCIG +{ + u64int offset; // from arena base +}; + /* * redundant storage of some fields at the beginning of each arena */ @@ -505,15 +517,21 @@ struct IAddr */ struct IEntry { - u8int score[VtScoreSize]; - IEntry *next; /* next in hash chain */ - IEntry *nextdirty; /* next in dirty chain */ - u32int wtime; /* last write time */ - u16int train; /* relative train containing the most recent ref; 0 if no ref, 1 if in same car */ - u8int rac; /* read ahead count */ - u8int dirty; /* is dirty */ - IAddr ia; + /* on disk data - 32 bytes*/ + u8int score[VtScoreSize]; + IAddr ia; + + IEntry *nexthash; + IEntry *nextdirty; + IEntry *next; + IEntry *prev; + u8int state; }; +enum { + IEClean = 0, + IEDirty = 1, + IESummary = 2, +}; /* * buckets in the on disk index table @@ -607,6 +625,9 @@ enum StatIcacheFlush, StatIcacheStall, StatIcacheReadTime, + StatIcacheLookup, + StatScacheHit, + StatScachePrefetch, StatBloomHit, StatBloomMiss, @@ -628,6 +649,9 @@ enum StatSumRead, StatSumReadBytes, + + StatCigLoad, + StatCigLoadTime, NStat }; blob - cf5299531c3d720c5f3ba607a21f8d87fd731e79 blob + 05a3e93726697ba8f843c5cc4a87792cf1b7a795 --- src/cmd/venti/srv/fns.h +++ src/cmd/venti/srv/fns.h @@ -6,8 +6,11 @@ void addstat(int, int); void addstat2(int, int, int, int); ZBlock *alloczblock(u32int size, int zeroed, uint alignment); Arena *amapitoa(Index *index, u64int a, u64int *aa); +Arena *amapitoag(Index *index, u64int a, u64int *gstart, u64int *glimit, int *g); u64int arenadirsize(Arena *arena, u32int clumps); +int arenatog(Arena *arena, u64int aa, u64int *gstart, u64int *glimit, int *g); void arenaupdate(Arena *arena, u32int size, u8int *score); +int asumload(Arena *arena, int g, IEntry *entries, int maxentries); void backsumarena(Arena *arena); void binstats(long (*fn)(Stats *s0, Stats *s1, void*), void *arg, long t0, long t1, Statbin *bin, int nbin); int bloominit(Bloom*, vlong, uchar*); @@ -64,6 +67,7 @@ int iaddrcmp(IAddr *ia1, IAddr *ia2); IEntry* icachedirty(u32int, u32int, u64int); ulong icachedirtyfrac(void); void icacheclean(IEntry*); +int icachelookup(u8int *score, int type, IAddr *ia); int ientrycmp(const void *vie1, const void *vie2); char *ifileline(IFile *f); int ifilename(IFile *f, char *dst); @@ -76,7 +80,7 @@ ArenaPart *initarenapart(Part *part); int initarenasum(void); void initbloomfilter(Index*); void initdcache(u32int mem); -void initicache(int bits, int depth); +void initicache(u32int mem); void initicachewrite(void); IEStream *initiestream(Part *part, u64int off, u64int clumps, u32int size); ISect *initisect(Part *part); @@ -87,7 +91,7 @@ Part* initpart(char *name, int mode); void initround(Round*, char*, int); int initventi(char *config, Config *conf); void insertlump(Lump *lump, Packet *p); -int insertscore(u8int *score, IAddr *ia, int write); +int insertscore(u8int *score, IAddr *ia, int state); void kickdcache(void); void kickicache(void); void kickround(Round*, int wait); @@ -97,8 +101,7 @@ DBlock *loadibucket(Index *index, u8int *score, ISect int loadientry(Index *index, u8int *score, int type, IEntry *ie); void logerr(int severity, char *fmt, ...); Lump *lookuplump(u8int *score, int type); -int _lookupscore(u8int *score, int type, IAddr *ia, int *rac); -int lookupscore(u8int *score, int type, IAddr *ia, int *rac); +int lookupscore(u8int *score, int type, IAddr *ia); int maparenas(AMap *am, Arena **arenas, int n, char *what); void markbloomfilter(Bloom*, u8int*); uint msec(void); blob - 266218b19b9580ce1f593d1f3780de460163c28f blob + 8cf937d1b49ee6829f5e43041672bd1773322eda --- src/cmd/venti/srv/hdisk.c +++ src/cmd/venti/srv/hdisk.c @@ -547,7 +547,7 @@ debugread(HConnect *c, u8int *score) Lump *u; IAddr ia; IEntry ie; - int i, rac; + int i; Arena *arena; u64int aa; ZBlock *zb; @@ -561,7 +561,7 @@ debugread(HConnect *c, u8int *score) } hprint(&c->hout, "

index search %V

\n", score);
-	if(_lookupscore(score, -1, &ia, nil) < 0)
+	if(icachelookup(score, -1, &ia) < 0)
 		hprint(&c->hout, "  icache: not found\n");
 	else
 		hprint(&c->hout, "  icache: addr=%#llx size=%d type=%d blocks=%d\n",
@@ -585,12 +585,12 @@ debugread(HConnect *c, u8int *score)
 			hprint(&c->hout, " -cache");
 		putlump(u);
 		
-		if(lookupscore(score, type, &ia, &rac) < 0){
+		if(lookupscore(score, type, &ia) < 0){
 			hprint(&c->hout, " -lookup\n");
 			continue;
 		}
-		hprint(&c->hout, "\n  lookupscore: addr=%#llx size=%d blocks=%d rac=%d\n",
-			ia.addr, ia.size, ia.blocks, rac);
+		hprint(&c->hout, "\n  lookupscore: addr=%#llx size=%d blocks=%d\n",
+			ia.addr, ia.size, ia.blocks);
 		
 		arena = amapitoa(mainindex, ia.addr, &aa);
 		if(arena == nil){
blob - 6b7f20fab74a9807ea3c5808545bdfcaa4be37da
blob + 51d8b9a1fe18efb0d1e7aaea212d97da373f4e7b
--- src/cmd/venti/srv/httpd.c
+++ src/cmd/venti/srv/httpd.c
@@ -895,7 +895,7 @@ static char* graphname[] =
 
 	"icachehit",
 	"icachemiss",
-	"icachelookup",
+	"icacheread",
 	"icachewrite",
 	"icachefill",
 	"icacheprefetch",
@@ -904,6 +904,9 @@ static char* graphname[] =
 	"icacheflush",
 	"icachestall",
 	"icachelookuptime",
+	"icachelookup",
+	"scachehit",
+	"scacheprefetch",
 
 	"bloomhit",
 	"bloommiss",
@@ -925,6 +928,9 @@ static char* graphname[] =
 
 	"sumread",
 	"sumreadbyte",
+	
+	"cigload",
+	"cigloadtime",
 };
 
 static int
blob - 32cbb5d340dc4efc64069fffa0aea9fa6f6ce6bc
blob + 384fd2c1437a09aa33ab7c19bc53e9a252dbf583
--- src/cmd/venti/srv/icache.c
+++ src/cmd/venti/srv/icache.c
@@ -2,236 +2,421 @@
 #include "dat.h"
 #include "fns.h"
 
+int icacheprefetch = 1;
+
 typedef struct ICache ICache;
+typedef struct IHash IHash;
+typedef struct ISum ISum;
+
 struct ICache
 {
-	QLock	lock;			/* locks hash table & all associated data */
+	QLock	lock;
 	Rendez	full;
-	IEntry	**heads;		/* heads of all the hash chains */
-	int	bits;			/* bits to use for indexing heads */
-	u32int	size;			/* number of heads; == 1 << bits, should be < entries */
-	IEntry	*base;			/* all allocated hash table entries */
-	IEntry	*free;
-	u32int	entries;		/* elements in base */
-	IEntry	*dirty;		/* chain of dirty elements */
-	u32int	ndirty;
+	IHash	*hash;
+	IEntry	*entries;
+	int		nentries;
+	IEntry	free;
+	IEntry	clean;
+	IEntry	dirty;
 	u32int	maxdirty;
-	u32int	unused;			/* index of first unused element in base */
-	u32int	stolen;			/* last head from which an element was stolen */
+	u32int	ndirty;
 
-	Arena	*last[4];
-	Arena	*lastload;
-	int		nlast;
+	ISum	**sum;
+	int		nsum;
+	IHash	*shash;
+	IEntry	*sentries;
+	int		nsentries;
 };
 
-int icacheprefetch = 1;
-
 static ICache icache;
 
-static IEntry	*icachealloc(IAddr *ia, u8int *score);
-
 /*
- * bits is the number of bits in the icache hash table
- * depth is the average depth
- * memory usage is about (1<table[0]));
+	ih->table = (IEntry**)(ih+1);
+	ih->bits = bits;
+	ih->size = size;
+	return ih;
 }
 
-u32int
-hashbits(u8int *sc, int bits)
+static IEntry*
+ihashlookup(IHash *ih, u8int score[VtScoreSize], int type)
 {
-	u32int v;
-
-	v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3];
-	if(bits < 32)
-		 v >>= (32 - bits);
-	return v;
+	u32int h;
+	IEntry *ie;
+	
+	h = hashbits(score, ih->bits);
+	for(ie=ih->table[h]; ie; ie=ie->nexthash)
+		if((type == -1 || type == ie->ia.type) && scorecmp(score, ie->score) == 0)
+			return ie;
+	return nil;
 }
 
 static void
-loadarenaclumps(Arena *arena, u64int aa)
+ihashdelete(IHash *ih, IEntry *ie, char *what)
 {
-	ulong i;
-	ClumpInfo ci;
-	IAddr ia;
-
-	for(i=0; imemstats.clumps; i++){
-		if(readclumpinfo(arena, i, &ci) < 0)
-			break;
-		ia.type = ci.type;
-		ia.size = ci.uncsize;
-		ia.blocks = (ci.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-		ia.addr = aa;
-		aa += ClumpSize + ci.size;
-		if(ia.type != VtCorruptType)
-			insertscore(ci.score, &ia, 0);
-	}
+	u32int h;
+	IEntry **l;
+	
+	h = hashbits(ie->score, ih->bits);
+	for(l=&ih->table[h]; *l; l=&(*l)->nexthash)
+		if(*l == ie){
+			*l = ie->nexthash;
+			return;
+		}
+	fprint(2, "warning: %s %V not found in ihashdelete\n", what, ie->score);
 }
 
-int
-_lookupscore(u8int *score, int type, IAddr *ia, int *rac)
+static void
+ihashinsert(IHash *ih, IEntry *ie)
 {
 	u32int h;
-	IEntry *ie, *last;
+	
+	h = hashbits(ie->score, ih->bits);
+	ie->nexthash = ih->table[h];
+	ih->table[h] = ie;
+}
 
-	qlock(&icache.lock);
-	h = hashbits(score, icache.bits);
-	last = nil;
-	for(ie = icache.heads[h]; ie != nil; ie = ie->next){
-		if((ie->ia.type == type || type == -1) && scorecmp(ie->score, score)==0){
-			if(last != nil)
-				last->next = ie->next;
-			else
-				icache.heads[h] = ie->next;
-			addstat(StatIcacheHit, 1);
-			if(rac)
-				ie->rac = 1;
-			trace(TraceLump, "lookupscore incache");
-			ie->next = icache.heads[h];
-			icache.heads[h] = ie;
 
-			*ia = ie->ia;
-			if(rac)
-				*rac = ie->rac;
-			qunlock(&icache.lock);
-			return 0;
-		}
-		last = ie;
-	}
-	addstat(StatIcacheMiss, 1);
-	qunlock(&icache.lock);
-	return -1;
+/*
+ * IEntry lists.
+ */
+
+static IEntry*
+popout(IEntry *ie)
+{
+	if(ie->prev == nil && ie->next == nil)
+		return ie;
+	ie->prev->next = ie->next;
+	ie->next->prev = ie->prev;
+	ie->next = nil;
+	ie->prev = nil;
+	return ie;
 }
 
+static IEntry*
+poplast(IEntry *list)
+{
+	if(list->prev == list)
+		return nil;
+	return popout(list->prev);
+}
 
+static IEntry*
+pushfirst(IEntry *list, IEntry *ie)
+{
+	popout(ie);
+	ie->prev = list;
+	ie->next = list->next;
+	ie->prev->next = ie;
+	ie->next->prev = ie;
+	return ie;
+}
+
 /*
-ZZZ need to think about evicting the correct IEntry,
-and writing back the wtime.
- * look up data score in the index cache
- * if this fails, pull it in from the disk index table, if it exists.
- *
- * must be called with the lump for this score locked
+ * Arena summary cache.
  */
-int
-lookupscore(u8int *score, int type, IAddr *ia, int *rac)
+struct ISum
 {
-	IEntry d, *ie;
-	u32int h;
-	u64int aa;
-	Arena *load;
-	int i, ret;
-	uint ms;
+	QLock	lock;
+	IEntry	*entries;
+	int	nentries;
+	int	loaded;
+	u64int addr;
+	u64int limit;
+	Arena *arena;
+	int g;
+};
 
-	aa = 0;
-	ms = msec();
-	
-	trace(TraceLump, "lookupscore %V.%d", score, type);
+static ISum*
+scachelookup(u64int addr)
+{
+	int i;
+	ISum *s;
 
-	ret = 0;
-	if(_lookupscore(score, type, ia, rac) < 0){
-		if(loadientry(mainindex, score, type, &d) < 0){
-			ret = -1;
-			goto out;
+	for(i=0; iaddr <= addr && addr < s->limit){
+			if(i > 0){
+				memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+				icache.sum[0] = s;
+			}
+			return s;
 		}
+	}
+	return nil;
+}
 
-		/* failed in cache but found on disk - fill cache. */
-		trace(TraceLump, "lookupscore loaded");
-		addstat(StatIcacheFill, 1);
+static void
+sumclear(ISum *s)
+{
+	int i;
 
-		/*
-		 * no one else can load an entry for this score,
-		 * since we have this score's lump's lock.
-		 */
-		qlock(&icache.lock);
+	for(i=0; inentries; i++)
+		ihashdelete(icache.shash, &s->entries[i], "scache");
+	s->nentries = 0;
+	s->loaded = 0;
+	s->addr = 0;
+	s->limit = 0;
+	s->arena = nil;
+	s->g = 0;
+}
+
+static ISum*
+scacheevict(void)
+{
+	ISum *s;
+	int i;
 	
-		/*
-		 * If we notice that all the hits are coming from one arena,
-		 * load the table of contents for that arena into the cache.
-		 */
-		load = nil;
-		h = hashbits(score, icache.bits);
-		ie = icachealloc(&d.ia, score);
-		if(icacheprefetch){
-			icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
-			aa = ie->ia.addr - aa;	/* compute base addr of arena */
-			for(i=0; i=0; i--){
+		s = icache.sum[i];
+		if(canqlock(&s->lock)){
+			if(i > 0){
+				memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+				icache.sum[0] = s;
 			}
+			sumclear(s);
+			return s;
 		}
-	
-		ie->next = icache.heads[h];
-		icache.heads[h] = ie;
-	
-		*ia = ie->ia;
-		*rac = ie->rac;
-	
-		qunlock(&icache.lock);
-		if(load){
-			trace(TraceProc, "preload 0x%llux", aa);
-			loadarenaclumps(load, aa);
-		}
 	}
+	return nil;
+}
 
-out:
-	ms = msec() - ms;
-	addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
-
-	return ret;
+static void
+scachehit(u64int addr)
+{
+	scachelookup(addr);	/* for move-to-front */
 }
 
-/*
- * insert a new element in the hash table.
- */
-int
-insertscore(u8int *score, IAddr *ia, int write)
+static void
+scachesetup(ISum *s, u64int addr)
 {
-	IEntry *ie, se;
-	u32int h;
+	u64int addr0, limit;
+	int g;
 
-	trace(TraceLump, "insertscore enter");
-	if(write)
-		addstat(StatIcacheWrite, 1);
-	else
-		addstat(StatIcachePrefetch, 1);
+	s->arena = amapitoag(mainindex, addr, &addr0, &limit, &g);
+	s->addr = addr0;
+	s->limit = limit;
+	s->g = g;
+}
 
-	qlock(&icache.lock);
-	h = hashbits(score, icache.bits);
+static void
+scacheload(ISum *s)
+{
+	int i, n;
 
-	ie = icachealloc(ia, score);
-	if(write){
-		icache.ndirty++;
-		setstat(StatIcacheDirty, icache.ndirty);
-		delaykickicache();
-		ie->dirty = 1;
+	s->loaded = 1;
+	n = asumload(s->arena, s->g, s->entries, ArenaCIGSize);
+	/*
+	 * n can be less then ArenaCIGSize, either if the clump group
+	 * is the last in the arena and is only partially filled, or if there
+	 * are corrupt clumps in the group -- those are not returned.
+	 */
+	for(i=0; ientries[i].ia.addr += s->addr;
+		ihashinsert(icache.shash, &s->entries[i]);
 	}
-	ie->next = icache.heads[h];
-	icache.heads[h] = ie;
+//fprint(2, "%T scacheload %s %d - %d entries\n", s->arena->name, s->g, n);
+	addstat(StatScachePrefetch, n);
+	s->nentries = n;
+}
 
-	se = *ie;
+static ISum*
+scachemiss(u64int addr)
+{
+	ISum *s;
+
+	s = scachelookup(addr);
+	if(s == nil){
+		/* first time: make an entry in the cache but don't populate it yet */
+		s = scacheevict();
+		if(s == nil)
+			return nil;
+		scachesetup(s, addr);
+		qunlock(&s->lock);
+		return nil;
+	}
+
+	/* second time: load from disk */
+	qlock(&s->lock);
+	if(s->loaded || !icacheprefetch){
+		qunlock(&s->lock);
+		return nil;
+	}
+	
+	return s;	/* locked */
+}
+
+/*
+ * Index cache.
+ */
+
+void
+initicache(u32int mem0)
+{
+	u32int mem;
+	int i, entries, scache;
+	
+	icache.full.l = &icache.lock;
+
+	mem = mem0;
+	entries = mem / (sizeof(IEntry)+sizeof(IEntry*));
+	scache = (entries/8) / ArenaCIGSize;
+	entries -= entries/8;
+	if(scache < 4)
+		scache = 4;
+	if(scache > 16)
+		scache = 16;
+	if(entries < 1000)
+		entries = 1000;
+fprint(2, "icache %,d bytes = %,d entries; %d scache\n", mem0, entries, scache);
+
+	icache.clean.prev = icache.clean.next = &icache.clean;
+	icache.dirty.prev = icache.dirty.next = &icache.dirty;
+	icache.free.prev = icache.free.next = &icache.free;
+	
+	icache.hash = mkihash(entries);
+	icache.nentries = entries;
+	setstat(StatIcacheSize, entries);
+	icache.entries = vtmallocz(entries*sizeof icache.entries[0]);
+	icache.maxdirty = entries / 2;
+	for(i=0; ientries = icache.sentries + i*ArenaCIGSize;
+	}
+}
+
+
+static IEntry*
+evictlru(void)
+{
+	IEntry *ie;
+	
+	ie = poplast(&icache.clean);
+	if(ie == nil)
+		return nil;
+	ihashdelete(icache.hash, ie, "evictlru");
+	return ie;
+}
+
+static void
+icacheinsert(u8int score[VtScoreSize], IAddr *ia, int state)
+{
+	IEntry *ie;
+
+	if((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
+		addstat(StatIcacheStall, 1);
+		while((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
+			// Could safely return here if state == IEClean.
+			// But if state == IEDirty, have to wait to make
+			// sure we don't lose an index write.  
+			// Let's wait all the time.
+			flushdcache();
+			kickicache();
+			rsleep(&icache.full);
+		}
+		addstat(StatIcacheStall, -1);
+	}
+
+	memmove(ie->score, score, VtScoreSize);
+	ie->state = state;
+	ie->ia = *ia;
+	if(state == IEClean){
+		addstat(StatIcachePrefetch, 1);
+		pushfirst(&icache.clean, ie);
+	}else{
+		addstat(StatIcacheWrite, 1);
+		assert(state == IEDirty);
+		icache.ndirty++;
+		setstat(StatIcacheDirty, icache.ndirty);
+		delaykickicache();
+		pushfirst(&icache.dirty, ie);
+	}
+	ihashinsert(icache.hash, ie);
+}
+
+int
+icachelookup(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+	IEntry *ie;
+
+	qlock(&icache.lock);
+	addstat(StatIcacheLookup, 1);
+	if((ie = ihashlookup(icache.hash, score, type)) != nil){
+		*ia = ie->ia;
+		if(ie->state == IEClean)
+			pushfirst(&icache.clean, ie);
+		addstat(StatIcacheHit, 1);
+		qunlock(&icache.lock);
+		return 0;
+	}
+
+	if((ie = ihashlookup(icache.shash, score, type)) != nil){
+		*ia = ie->ia;
+		icacheinsert(score, &ie->ia, IEClean);
+		scachehit(ie->ia.addr);
+		addstat(StatScacheHit, 1);
+		qunlock(&icache.lock);
+		return 0;
+	}
+	addstat(StatIcacheMiss, 1);
 	qunlock(&icache.lock);
 
-	if(write && icache.ndirty >= icache.maxdirty)
+	return -1;
+}
+
+int
+insertscore(u8int score[VtScoreSize], IAddr *ia, int state)
+{
+	ISum *toload;
+
+	qlock(&icache.lock);
+	icacheinsert(score, ia, state);
+	if(state == IEClean)
+		toload = scachemiss(ia->addr);
+	else{
+		assert(state == IEDirty);
+		toload = nil;
+	}
+	qunlock(&icache.lock);
+	if(toload){
+		scacheload(toload);
+		qunlock(&toload->lock);
+	}
+	
+	if(icache.ndirty >= icache.maxdirty)
 		kickicache();
 
 	/*
@@ -240,125 +425,81 @@ insertscore(u8int *score, IAddr *ia, int write)
 	 * the lump, meaning any searches for this block
 	 * will hit in the lump cache until after we return.
 	 */
-	markbloomfilter(mainindex->bloom, score);
+	if(state == IEDirty)
+		markbloomfilter(mainindex->bloom, score);
 
 	return 0;
 }
 
-/*
- * allocate a index cache entry which hasn't been used in a while.
- * must be called with icache.lock locked
- * if the score is already in the table, update the entry.
- */
-static IEntry *
-icachealloc(IAddr *ia, u8int *score)
+static int
+lookupscore_untimed(u8int score[VtScoreSize], int type, IAddr *ia)
 {
-	int i;
-	IEntry *ie, *last, *clean, *lastclean;
-	u32int h;
+	IEntry d;
 
-	h = hashbits(score, icache.bits);
-	last = nil;
-	for(ie = icache.heads[h]; ie != nil; ie = ie->next){
-		if(ie->ia.type == ia->type && scorecmp(ie->score, score)==0){
-			if(last != nil)
-				last->next = ie->next;
-			else
-				icache.heads[h] = ie->next;
-			trace(TraceLump, "icachealloc hit");
-			ie->rac = 1;
-			return ie;
-		}
-		last = ie;
-	}
+	if(icachelookup(score, type, ia) >= 0)
+		return 0;
 
-	h = icache.unused;
-	if(h < icache.entries){
-		ie = &icache.base[h++];
-		icache.unused = h;
-		trace(TraceLump, "icachealloc unused");
-		goto Found;
-	}
+	addstat(StatIcacheFill, 1);
+	if(loadientry(mainindex, score, type, &d) < 0)
+		return -1;
 	
-	if((ie = icache.free) != nil){
-		icache.free = ie->next;
-		goto Found;
-	}
+	insertscore(score, &d.ia, IEClean);
+	*ia = d.ia;
+	return 0;
+}
 
-	h = icache.stolen;
-	for(i=0;; i++){
-		h++;
-		if(h >= icache.size)
-			h = 0;
-		if(i == icache.size){
-			trace(TraceLump, "icachealloc sleep");
-			addstat(StatIcacheStall, 1);
-			while(icache.ndirty == icache.entries){
-				/*
-				 * This is a bit suspect.  Kickicache will wake up the
-				 * icachewritecoord, but if all the index entries are for
-				 * unflushed disk blocks, icachewritecoord won't be
-				 * able to do much.  It always rewakes everyone when
-				 * it thinks it is done, though, so at least we'll go around
-				 * the while loop again.  Also, if icachewritecoord sees
-				 * that the disk state hasn't change at all since the last
-				 * time around, it kicks the disk.  This needs to be
-				 * rethought, but it shouldn't deadlock anymore.
-				 */
-				kickicache();
-				rsleep(&icache.full);
-			}
-			addstat(StatIcacheStall, -1);
-			i = 0;
-		}
-		lastclean = nil;
-		clean = nil;
-		last = nil;
-		for(ie=icache.heads[h]; ie; last=ie, ie=ie->next){
-			if(!ie->dirty){
-				clean = ie;
-				lastclean = last;
-			}
-		}
-		if(clean){
-			if(lastclean)
-				lastclean->next = clean->next;
-			else
-				icache.heads[h] = clean->next;
-			clean->next = nil;
-			icache.stolen = h;
-			ie = clean;
-			trace(TraceLump, "icachealloc steal");
-			goto Found;
-		}
-	}
+int
+lookupscore(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+	int ms, ret;
+	
+	ms = msec();
+	ret = lookupscore_untimed(score, type, ia);
+	ms = msec() - ms;
+	addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
+	return ret;
+}
+	
+u32int
+hashbits(u8int *sc, int bits)
+{
+	u32int v;
 
-Found:
-	ie->ia = *ia;
-	scorecp(ie->score, score);
-	ie->rac = 0;	
-	return ie;
+	v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3];
+	if(bits < 32)
+		 v >>= (32 - bits);
+	return v;
 }
 
+ulong
+icachedirtyfrac(void)
+{
+	return (vlong)icache.ndirty*IcacheFrac / icache.nentries;
+}
+
+/*
+ * Return a singly-linked list of dirty index entries.
+ * with 32-bit hash numbers between lo and hi
+ * and address < limit.
+ */
 IEntry*
 icachedirty(u32int lo, u32int hi, u64int limit)
 {
-	int i;
 	u32int h;
 	IEntry *ie, *dirty;
 
 	dirty = nil;
 	trace(TraceProc, "icachedirty enter");
 	qlock(&icache.lock);
-	for(i=0; inext)
-		if(ie->dirty && ie->ia.addr != 0 && ie->ia.addr < limit){
+	for(ie = icache.dirty.next; ie != &icache.dirty; ie=ie->next){
+		if(ie->state == IEDirty && ie->ia.addr < limit){
 			h = hashbits(ie->score, 32);
 			if(lo <= h && h <= hi){
 				ie->nextdirty = dirty;
 				dirty = ie;
 			}
 		}
+	}
 	qunlock(&icache.lock);
 	trace(TraceProc, "icachedirty exit");
 	if(dirty == nil)
@@ -366,36 +507,49 @@ icachedirty(u32int lo, u32int hi, u64int limit)
 	return dirty;
 }
 
+
+/*
+ * The singly-linked non-circular list of index entries ie
+ * has been written to disk.  Move them to the clean list.
+ */
 void
 icacheclean(IEntry *ie)
 {
-	trace(TraceProc, "icachedirty enter");
+	IEntry *next;
+	
+	trace(TraceProc, "icacheclean enter");
 	qlock(&icache.lock);
-	for(; ie; ie=ie->nextdirty){
+	for(; ie; ie=next){
+		assert(ie->state == IEDirty);
+		next = ie->nextdirty;
+		ie->nextdirty = nil;
+		popout(ie); /* from icache.dirty */
 		icache.ndirty--;
-		ie->dirty = 0;
+		ie->state = IEClean;
+		pushfirst(&icache.clean, ie);
 	}
 	setstat(StatIcacheDirty, icache.ndirty);
 	rwakeupall(&icache.full);
 	qunlock(&icache.lock);
-	trace(TraceProc, "icachedirty exit");
+	trace(TraceProc, "icacheclean exit");
 }
 
 void
 emptyicache(void)
 {
 	int i;
-	IEntry *ie, **lie;
+	IEntry *ie;
+	ISum *s;
 	
 	qlock(&icache.lock);
-	for(i=0; idirty == 0){
-			*lie = ie->next;
-			ie->next = icache.free;
-			icache.free = ie;
-		}else
-			lie = &ie->next;
+	while((ie = evictlru()) != nil)
+		pushfirst(&icache.free, ie);
+	for(i=0; ilock);
+		sumclear(s);
+		qunlock(&s->lock);
 	}
 	qunlock(&icache.lock);
 }
+
blob - 49344c9f1de978cbad3c5181952e2a5e320c46f2
blob + 0805001c65aa70f92dffb8fc197778ff13d1e925
--- src/cmd/venti/srv/icachewrite.c
+++ src/cmd/venti/srv/icachewrite.c
@@ -45,6 +45,16 @@ initicachewrite(void)
 	vtproc(delaykickroundproc, &iwrite.round);
 }
 
+static u64int
+ie2diskaddr(Index *ix, ISect *is, IEntry *ie)
+{
+	u64int bucket, addr;
+
+	bucket = hashbits(ie->score, 32)/ix->div;
+	addr = is->blockbase + ((bucket - is->start) << is->blocklog);
+	return addr;
+}
+
 static IEntry*
 nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *paddr, uint *pnbuf)
 {
@@ -55,13 +65,13 @@ nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *
 
 	bsize = 1<blocklog;
 	iefirst = *pie;
-	addr = is->blockbase + ((u64int)(hashbits(iefirst->score, 32) / ix->div - is->start) << is->blocklog);
+	addr = ie2diskaddr(ix, is, iefirst);
 	nbuf = 0;
-	for(l=&iefirst->nextdirty; (ie=*l)!=nil; l=&(*l)->nextdirty){
-		naddr = is->blockbase + ((u64int)(hashbits(ie->score, 32) / ix->div - is->start) << is->blocklog);
+	for(l = &iefirst->nextdirty; (ie = *l) != nil; l = &(*l)->nextdirty){
+		naddr = ie2diskaddr(ix, is, ie);
 		if(naddr - addr >= Bufsize)
 			break;
-		nbuf = naddr-addr;
+		nbuf = naddr - addr;
 	}
 	nbuf += bsize;
 
@@ -75,7 +85,7 @@ nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *
 static int
 icachewritesect(Index *ix, ISect *is, u8int *buf)
 {
-	int err, h, bsize, t;
+	int err, i, werr, h, bsize, t;
 	u32int lo, hi;
 	u64int addr, naddr;
 	uint nbuf, off;
@@ -89,29 +99,32 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
 	else
 		hi = is->stop * ix->div - 1;
 
-	trace(TraceProc, "icachewritesect enter %ud %ud %llud", lo, hi, iwrite.as.aa);
+	trace(TraceProc, "icachewritesect enter %ud %ud %llud",
+		lo, hi, iwrite.as.aa);
 
 	iedirty = icachedirty(lo, hi, iwrite.as.aa);
 	iedirty = iesort(iedirty);
-	bsize = 1<blocklog;
+	bsize = 1 << is->blocklog;
 	err = 0;
 
 	while(iedirty){
 		disksched();
-		while((t=icachesleeptime) == SleepForever){
+		while((t = icachesleeptime) == SleepForever){
 			sleep(1000);
 			disksched();
 		}
 		if(t < minicachesleeptime)
 			t = minicachesleeptime;
-		sleep(t);
+		if(t > 0)
+			sleep(t);
 		trace(TraceProc, "icachewritesect nextchunk");
 		chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf);
 
-		trace(TraceProc, "icachewritesect readpart 0x%llux+0x%ux", addr, nbuf);
+		trace(TraceProc, "icachewritesect readpart 0x%llux+0x%ux",
+			addr, nbuf);
 		if(readpart(is->part, addr, buf, nbuf) < 0){
-			/* XXX more details here */
-			fprint(2, "icachewriteproc readpart: %r\n");
+			fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
+				"readpart: %r\n", argv0, is->part->name, addr);
 			err  = -1;
 			continue;
 		}
@@ -120,31 +133,34 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
 		addstat(StatIsectRead, 1);
 
 		for(l=&chunk; (ie=*l)!=nil; l=&ie->nextdirty){
-		again:
-			naddr = is->blockbase + ((u64int)(hashbits(ie->score, 32) / ix->div - is->start) << is->blocklog);
+again:
+			naddr = ie2diskaddr(ix, is, ie);
 			off = naddr - addr;
 			if(off+bsize > nbuf){
-				fprint(2, "whoops! addr=0x%llux nbuf=%ud addr+nbuf=0x%llux naddr=0x%llux\n",
-					addr, nbuf, addr+nbuf, naddr);
+				fprint(2, "%s: whoops! addr=0x%llux nbuf=%ud "
+					"addr+nbuf=0x%llux naddr=0x%llux\n",
+					argv0, addr, nbuf, addr+nbuf, naddr);
 				assert(off+bsize <= nbuf);
 			}
 			unpackibucket(&ib, buf+off, is->bucketmagic);
 			if(okibucket(&ib, is) < 0){
-				fprint(2, "bad bucket XXX\n");
+				fprint(2, "%s: bad bucket XXX\n", argv0);
 				goto skipit;
 			}
-			trace(TraceProc, "icachewritesect add %V at 0x%llux", ie->score, naddr);
+			trace(TraceProc, "icachewritesect add %V at 0x%llux",
+				ie->score, naddr);
 			h = bucklook(ie->score, ie->ia.type, ib.data, ib.n);
 			if(h & 1){
 				h ^= 1;
 				packientry(ie, &ib.data[h]);
 			}else if(ib.n < is->buckmax){
-				memmove(&ib.data[h+IEntrySize], &ib.data[h], ib.n*IEntrySize - h);
+				memmove(&ib.data[h + IEntrySize], &ib.data[h],
+					ib.n*IEntrySize - h);
 				ib.n++;
 				packientry(ie, &ib.data[h]);
 			}else{
-				fprint(2, "bucket overflow XXX\n");
-			skipit:
+				fprint(2, "%s: bucket overflow XXX\n", argv0);
+skipit:
 				err = -1;
 				*l = ie->nextdirty;
 				ie = *l;
@@ -154,33 +170,29 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
 					break;
 			}
 			packibucket(&ib, buf+off, is->bucketmagic);
-			/* XXX
-			 * This is not quite right - it's good that we 
-			 * update the cached block (if any) here, but
-			 * since the block doesn't get written until writepart
-			 * below, we also need to make sure that the cache 
-			 * doesn't load the stale block before we write it to
-			 * disk below.  We could lock the disk cache during
-			 * the writepart, but that's pretty annoying.
-			 * Another possibility would be never to cache
-			 * index partition blocks.  The hit rate on those is
-			 * miniscule anyway.
-			 */
-			if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){
-				memmove(b->data, buf+off, bsize);
-				putdblock(b);
-			}
 		}
 
 		diskaccess(1);
 
 		trace(TraceProc, "icachewritesect writepart", addr, nbuf);
-		if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0){
-			/* XXX more details here */
-			fprint(2, "icachewriteproc writepart: %r\n");
+		werr = 0;
+		if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0)
+			werr = -1;
+
+		for(i=0; ipart, addr+i, ORDWR, 0)) != nil){
+				memmove(b->data, buf+i, bsize);
+				putdblock(b);
+			}
+		}
+
+		if(werr < 0){
+			fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
+				"writepart: %r\n", argv0, is->part->name, addr);
 			err = -1;
 			continue;
 		}
+		
 		addstat(StatIsectWriteBytes, nbuf);
 		addstat(StatIsectWrite, 1);
 		icacheclean(chunk);
blob - e5d5460aea59905ea4c6c36a83d86beb225a31e0
blob + 36d96b941608bebbb22f4e1eecab5475abe22976
--- src/cmd/venti/srv/ifile.c
+++ src/cmd/venti/srv/ifile.c
@@ -2,46 +2,57 @@
 #include "dat.h"
 #include "fns.h"
 
+static char vcmagic[] = "venti config\n";
+
+enum {
+	Maxconfig = 8 * 1024,
+	Maglen = sizeof vcmagic - 1,
+};
+
 int
 readifile(IFile *f, char *name)
 {
-	int m;
 	Part *p;
 	ZBlock *b;
 	u8int *z;
-	
+
 	p = initpart(name, OREAD);
 	if(p == nil)
 		return -1;
-	b = alloczblock(8192, 1, 0);
+	b = alloczblock(Maxconfig+1, 1, 0);
 	if(b == nil){
 		seterr(EOk, "can't alloc for %s: %R", name);
 		return -1;
 	}
 	if(p->size > PartBlank){
 		/*
-		 * this is likely a real venti partition, in which case
-		 * we're looking for the config file stored as 8k at end of PartBlank.
+		 * this is likely a real venti partition, in which case we're
+		 * looking for the config file stored as 8k at end of PartBlank.
 		 */
-		if(readpart(p, PartBlank-8192, b->data, 8192) < 0){
+		if(readpart(p, PartBlank-Maxconfig, b->data, Maxconfig) < 0){
 			seterr(EOk, "can't read %s: %r", name);
 			freezblock(b);
 			freepart(p);
 			return -1;
 		}
-		m = 5+1+6+1;
-		if(memcmp(b->data, "venti config\n", m) != 0){
+		b->data[Maxconfig] = '\0';
+		if(memcmp(b->data, vcmagic, Maglen) != 0){
 			seterr(EOk, "bad venti config magic in %s", name);
 			freezblock(b);
 			freepart(p);
 			return -1;
 		}
-		b->data += m;
-		b->len -= m;
-		z = memchr(b->data, 0, b->len);
+		/*
+		 * if we change b->data+b->_size, freezblock
+		 * will blow an assertion, so don't.
+		 */
+		b->data  += Maglen;
+		b->_size -= Maglen;
+		b->len   -= Maglen;
+		z = memchr(b->data, '\0', b->len);
 		if(z)
 			b->len = z - b->data;
-	}else if(p->size > 8192){
+	}else if(p->size > Maxconfig){
 		seterr(EOk, "config file is too large");
 		freepart(p);
 		freezblock(b);
blob - a5ffeef9cab59fd8a4d5551bf29c7cf3fdc1605c
blob + dd49e05505945c6cbe456ca79eadd7c2a134869e
--- src/cmd/venti/srv/index.c
+++ src/cmd/venti/srv/index.c
@@ -596,6 +596,25 @@ print("want arena %d for %llux\n", l, a);
 	return ix->arenas[l];
 }
 
+/*
+ * convert an arena index to the bounds of the containing arena group.
+ */
+Arena*
+amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g)
+{
+	u64int aa;
+	Arena *arena;
+	
+	arena = amapitoa(ix, a, &aa);
+	if(arena == nil)
+		return nil;
+	if(arenatog(arena, aa, gstart, glimit, g) < 0)
+		return nil;
+	*gstart += a - aa;
+	*glimit += a - aa;
+	return arena;
+}
+
 int
 iaddrcmp(IAddr *ia1, IAddr *ia2)
 {
blob - 1db62717194ec1ccfc178ce703c1a007b8775f99
blob + 206d5e064352953fc097a3c95e2440e9d440ed0f
--- src/cmd/venti/srv/lump.c
+++ src/cmd/venti/srv/lump.c
@@ -7,7 +7,7 @@ int			queuewrites = 0;
 int			writestodevnull = 0;
 int			verifywrites = 0;
 
-static Packet		*readilump(Lump *u, IAddr *ia, u8int *score, int rac);
+static Packet		*readilump(Lump *u, IAddr *ia, u8int *score);
 
 /*
  * Some of this logic is duplicated in hdisk.c
@@ -19,7 +19,6 @@ readlump(u8int *score, int type, u32int size, int *cac
 	Packet *p;
 	IAddr ia;
 	u32int n;
-	int rac;
 
 	trace(TraceLump, "readlump enter");
 /*
@@ -49,7 +48,7 @@ readlump(u8int *score, int type, u32int size, int *cac
 	if(cached)
 		*cached = 0;
 
-	if(lookupscore(score, type, &ia, &rac) < 0){
+	if(lookupscore(score, type, &ia) < 0){
 		/* ZZZ place to check for someone trying to guess scores */
 		seterr(EOk, "no block with score %V/%d exists", score, type);
 
@@ -64,7 +63,7 @@ readlump(u8int *score, int type, u32int size, int *cac
 	}
 
 	trace(TraceLump, "readlump readilump");
-	p = readilump(u, &ia, score, rac);
+	p = readilump(u, &ia, score);
 	putlump(u);
 
 	trace(TraceLump, "readlump exit");
@@ -134,9 +133,8 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 	Packet *old;
 	IAddr ia;
 	int ok;
-	int rac;
 
-	if(lookupscore(u->score, u->type, &ia, &rac) == 0){
+	if(lookupscore(u->score, u->type, &ia) == 0){
 		if(verifywrites == 0){
 			/* assume the data is here! */
 			packetfree(p);
@@ -149,7 +147,7 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 		 * if the read fails,
 		 * assume it was corrupted data and store the block again
 		 */
-		old = readilump(u, &ia, u->score, rac);
+		old = readilump(u, &ia, u->score);
 		if(old != nil){
 			ok = 0;
 			if(packetcmp(p, old) != 0){
@@ -176,7 +174,7 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 	ok = storeclump(mainindex, flat, u->score, u->type, creator, &ia);
 	freezblock(flat);
 	if(ok == 0)
-		ok = insertscore(u->score, &ia, 1);
+		ok = insertscore(u->score, &ia, IEDirty);
 	if(ok == 0)
 		insertlump(u, p);
 	else
@@ -191,41 +189,16 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
 	ms = msec() - ms;
 	addstat2(StatRpcWriteNew, 1, StatRpcWriteNewTime, ms);
 	return ok;
-}
-
-static void
-lreadahead(u64int a, Arena *arena, u64int aa, int n)
-{	
-	u8int buf[ClumpSize];
-	Clump cl;
-	IAddr ia;
-
-	while(n > 0) {
-		if (aa >= arena->memstats.used)
-			break;
-		if(readarena(arena, aa, buf, ClumpSize) < ClumpSize)
-			break;
-		if(unpackclump(&cl, buf, arena->clumpmagic) < 0)
-			break;
-		ia.addr = a;
-		ia.type = cl.info.type;
-		ia.size = cl.info.uncsize;
-		ia.blocks = (cl.info.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-		insertscore(cl.info.score, &ia, 0);
-		a += ClumpSize + cl.info.size;
-		aa += ClumpSize + cl.info.size;
-		n--;
-	}
 }
 
 static Packet*
-readilump(Lump *u, IAddr *ia, u8int *score, int rac)
+readilump(Lump *u, IAddr *ia, u8int *score)
 {
 	Arena *arena;
 	ZBlock *zb;
 	Packet *p, *pp;
 	Clump cl;
-	u64int a, aa;
+	u64int aa;
 	u8int sc[VtScoreSize];
 
 	trace(TraceLump, "readilump enter");
@@ -258,13 +231,6 @@ readilump(Lump *u, IAddr *ia, u8int *score, int rac)
 		return nil;
 	}
 
-	if(rac == 0) {
-		trace(TraceLump, "readilump readahead");
-		a = ia->addr + ClumpSize + cl.info.size;
-		aa += ClumpSize + cl.info.size;
-		lreadahead(a, arena, aa, 20);
-	}
-
 	trace(TraceLump, "readilump success");
 	p = zblock2packet(zb, cl.info.uncsize);
 	freezblock(zb);
blob - 3a66bf6dea1ab67363c6e819cad4e82f4d9a7fa3
blob + 5ee4d91fc9802c1d237df60ecbb8afa0e20f523c
--- src/cmd/venti/srv/stats.c
+++ src/cmd/venti/srv/stats.c
@@ -60,6 +60,9 @@ Statdesc statdesc[NStat] =
 	{ "index cache flushes", },
 	{ "index cache stalls", },
 	{ "index cache read time", },
+	{ "index cache lookups" },
+	{ "index cache summary hits" },
+	{ "index cache summary prefetches" },
 
 	{ "bloom filter hits", },
 	{ "bloom filter misses", },
@@ -81,6 +84,9 @@ Statdesc statdesc[NStat] =
 
 	{ "sum reads", },
 	{ "sum read bytes", },
+	
+	{ "cig loads" },
+	{ "cig load time" },
 };
 
 QLock statslock;
blob - 72d45f18fc96a59c97c9fa2dc4256d0ae59c97b9
blob + fb3c4ce284077e58148da0916b0dbc3adf501e01
--- src/cmd/venti/srv/syncindex.c
+++ src/cmd/venti/srv/syncindex.c
@@ -56,13 +56,7 @@ threadmain(int argc, char *argv[])
 	if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
 	initdcache(bcmem);
 	initlumpcache(1*1024*1024, 1024/8);
-	icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
-	if(icmem < 4)
-		icmem = 4;
-	if(1) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
-		(sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
-		(1 << icmem) * ICacheDepth);
-	initicache(icmem, ICacheDepth);
+	initicache(icmem);
 	initicachewrite();
 	if(mainindex->bloom)
 		startbloomproc(mainindex->bloom);
blob - 6a008bad6e740e3dabdbfd537e141c28b10a5611
blob + 98f6adf11db9d584fc0fc74a81c3775e3230c663
--- src/cmd/venti/srv/syncindex0.c
+++ src/cmd/venti/srv/syncindex0.c
@@ -101,7 +101,7 @@ syncarenaindex(Index *ix, Arena *arena, u32int clump, 
 			}
 			flush = 1;
 			trace(TraceProc, "syncarenaindex insert %V", ci->score);
-			insertscore(ci->score, &ia, 1);
+			insertscore(ci->score, &ia, IEDirty);
 		}
 
 		if(0 && clump / 1000 != (clump + n) / 1000)
blob - fab837824aa568c552f4fb9f5332127948fac8b5
blob + 4d59dfc6d483735ab4be20600894ec472e43a021
--- src/cmd/venti/srv/venti.c
+++ src/cmd/venti/srv/venti.c
@@ -18,7 +18,7 @@ static void	ventiserver(void*);
 void
 usage(void)
 {
-	fprint(2, "usage: venti [-Ldrs] [-a ventiaddr] [-c config] "
+	fprint(2, "usage: venti [-Ldrsw] [-a ventiaddr] [-c config] "
 "[-h httpaddr] [-B blockcachesize] [-C cachesize] [-I icachesize] [-W webroot]\n");
 	threadexitsall("usage");
 }
@@ -73,6 +73,9 @@ threadmain(int argc, char *argv[])
 	case 's':
 		nofork = 1;
 		break;
+	case 'w':			/* compatibility with old venti */
+		queuewrites = 1;
+		break;
 	case 'W':
 		webroot = EARGF(usage());
 		break;
@@ -103,9 +106,6 @@ threadmain(int argc, char *argv[])
 	if(configfile == nil)
 		configfile = "venti.conf";
 
-	if(initarenasum() < 0)
-		fprint(2, "warning: can't initialize arena summing process: %r");
-
 	fprint(2, "conf...");
 	if(initventi(configfile, &config) < 0)
 		sysfatal("can't init server: %r");
@@ -143,13 +143,7 @@ threadmain(int argc, char *argv[])
 		mem, mem / (8 * 1024));
 	initlumpcache(mem, mem / (8 * 1024));
 
-	icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
-	if(icmem < 4)
-		icmem = 4;
-	if(0) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
-		(sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
-		(1 << icmem) * ICacheDepth);
-	initicache(icmem, ICacheDepth);
+	initicache(icmem);
 	initicachewrite();
 
 	/*
@@ -179,6 +173,9 @@ threadmain(int argc, char *argv[])
 		}
 	}
 
+	if(initarenasum() < 0)
+		fprint(2, "warning: can't initialize arena summing process: %r");
+
 	fprint(2, "announce %s...", vaddr);
 	ventisrv = vtlisten(vaddr);
 	if(ventisrv == nil)
@@ -272,5 +269,3 @@ ventiserver(void *v)
 	flushicache();
 	threadexitsall(0);
 }
-
-
blob - 76e9f276ac4ff9ce0c9b5ebfb8aad10f44537833
blob + 64de5bbb5846277af173c6acf3b5765c332e88ab
--- src/cmd/venti/srv/www/stats.js
+++ src/cmd/venti/srv/www/stats.js
@@ -38,6 +38,10 @@ graphname = new Array(
 		"icache dirty %",
 	"arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100",
 		"icache hit %",
+	"arg=scachehit&graph=pctdiff&arg2=icachelookup&max=100",
+		"scache hit %",
+	"arg=icachemiss&graph=pctdiff&arg2=icachelookup&max=100",
+		"icache miss %",
 	"arg=icachelookuptime&graph=divdiff&arg2=icachelookup",
 		"icache lookup time",
 	"arg=icacheprefetch&graph=diff",
@@ -75,6 +79,8 @@ graphname = new Array(
 		"fresh write RPC time",
 	"arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold",
 		"dup write RPC time",
+	"arg=cigloadtime&graph=divdiff&arg2=cigload",
+		"cig load time",
 
 	"arg=sumreadbyte&graph=diff",
 		"checksum bytes/second",
@@ -118,8 +124,11 @@ column1 = new Array(
 	"!icache",
 	"arg=icachedirty&graph=pct&arg2=icachesize&max=100",
 	"arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100",
+	"arg=scachehit&graph=pctdiff&arg2=icachelookup&max=100",
+	"arg=icachemiss&graph=pctdiff&arg2=icachelookup&max=100",
 	"arg=icachewrite&graph=diff",
 	"arg=icacheprefetch&graph=diff",
+	"arg=scacheprefetch&graph=diff",
 	
 	"!dcache",
 	"arg=dcachedirty&graph=pct&arg2=dcachesize&max=100",
@@ -154,6 +163,7 @@ column2 = new Array(
 	"arg=rpcreaduncachedtime&graph=divdiff&arg2=rpcreaduncached",
 	"arg=rpcwritenewtime&graph=divdiff&arg2=rpcwritenew",
 	"arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold",
+	"arg=cigloadtime&graph=divdiff&arg2=cigload",
 	
 	"END"
 )