commit 28b49df3542a635cca788f3de213385f3fcb6334 from: rsc date: Tue Jul 18 15:26:33 2006 UTC assorted changes from Plan 9 commit - 686bd37d9d8db5e3b969a3aa2d5b455e0976b262 commit + 28b49df3542a635cca788f3de213385f3fcb6334 blob - 8cfe3e5d886798f6a924dc5a286f35a63306be63 blob + 0121d4512d7bca8a694f226c10db52baca86068e --- src/cmd/venti/srv/arena.c +++ src/cmd/venti/srv/arena.c @@ -20,6 +20,7 @@ static void sumproc(void *); static QLock sumlock; static Rendez sumwait; static ASum *sumq; +static ASum *sumqtail; static uchar zero[8192]; int arenasumsleeptime; @@ -257,7 +258,6 @@ writearena(Arena *arena, u64int aa, u8int *clbuf, u32i if(m > n - nn) m = n - nn; memmove(&b->data[off], &clbuf[nn], m); - /* ok = writepart(arena->part, a, b->data, blocksize); */ ok = 0; putdblock(b); if(ok < 0){ @@ -329,7 +329,6 @@ writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64i if(m > n - nn) m = n - nn; memmove(&b->data[off], &clbuf[nn], m); - /* ok = writepart(arena->part, a, b->data, blocksize); */ ok = 0; putdblock(b); if(ok < 0){ @@ -356,6 +355,7 @@ writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64i arena->ctime = arena->wtime; writeclumpinfo(arena, clump, &c->info); + wbarena(arena); /* set up for call to setdcachestate */ as.arena = arena; @@ -410,6 +410,9 @@ setatailstate(AState *as) trace(0, "setatailstate %s 0x%llux clumps %d", as->arena->name, as->aa, as->stats.clumps); + /* + * Look up as->arena to find index. + */ ix = mainindex; for(i=0; inarenas; i++) if(ix->arenas[i] == as->arena) @@ -419,6 +422,9 @@ setatailstate(AState *as) return; } + /* + * Walk backward until we find the last time these were in sync. + */ for(j=i; --j>=0; ){ a = ix->arenas[j]; if(atailcmp(&a->diskstats, &a->memstats) == 0) @@ -464,8 +470,12 @@ backsumarena(Arena *arena) return; qlock(&sumlock); as->arena = arena; - as->next = sumq; - sumq = as; + as->next = nil; + if(sumq) + sumqtail->next = as; + else + sumq = as; + sumqtail = as; rwakeup(&sumwait); qunlock(&sumlock); } @@ -499,6 +509,7 @@ sumarena(Arena *arena) DigestState s; u64int a, e; u32int bs; + int t; u8int score[VtScoreSize]; bs = MaxIoSize; @@ -512,7 +523,12 @@ sumarena(Arena *arena) b = alloczblock(bs, 0, arena->part->blocksize); e = arena->base + arena->size; for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){ - sleep(arenasumsleeptime); + disksched(); + while((t=arenasumsleeptime) == SleepForever){ + sleep(1000); + disksched(); + } + sleep(t); if(a + bs > e) bs = arena->blocksize; if(readpart(arena->part, a, b->data, bs) < 0) @@ -595,7 +611,7 @@ wbarenahead(Arena *arena) b = alloczblock(arena->blocksize, 1, arena->part->blocksize); if(b == nil){ logerr(EAdmin, "can't write arena header: %r"); -/*/ZZZ add error message? */ +/* ZZZ add error message? */ return -1; } /* @@ -681,18 +697,22 @@ okarena(Arena *arena) ok = 0; dsize = arenadirsize(arena, arena->diskstats.clumps); if(arena->diskstats.used + dsize > arena->size){ - seterr(ECorrupt, "arena used > size"); + seterr(ECorrupt, "arena %s used > size", arena->name); ok = -1; } if(arena->diskstats.cclumps > arena->diskstats.clumps) - logerr(ECorrupt, "arena has more compressed clumps than total clumps"); + logerr(ECorrupt, "arena %s has more compressed clumps than total clumps", arena->name); + /* + * This need not be true if some of the disk is corrupted. + * if(arena->diskstats.uncsize + arena->diskstats.clumps * ClumpSize + arena->blocksize < arena->diskstats.used) - logerr(ECorrupt, "arena uncompressed size inconsistent with used space %lld %d %lld", arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used); + logerr(ECorrupt, "arena %s uncompressed size inconsistent with used space %lld %d %lld", arena->name, arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used); + */ if(arena->ctime > arena->wtime) - logerr(ECorrupt, "arena creation time after last write time"); + logerr(ECorrupt, "arena %s creation time after last write time", arena->name); return ok; } blob - 0bffd3cfcba7fcc89901c0ab407e20dc9a2b191b blob + 05cb396b7736f34c911e120e25a8609b167dcb17 --- src/cmd/venti/srv/arenas.c +++ src/cmd/venti/srv/arenas.c @@ -214,7 +214,7 @@ wbarenapart(ArenaPart *ap) return -1; b = alloczblock(HeadSize, 1, 0); if(b == nil) -/*ZZZ set error message? */ +/* ZZZ set error message? */ return -1; if(packarenapart(ap, b->data) < 0){ @@ -337,8 +337,8 @@ wbarenamap(AMap *am, int n, Part *part, u64int base, u /* * amap: n '\n' amapelem * n * n: u32int - * amapelem: name '\t' astart '\t' asize '\n' - * astart, asize: u64int + * amapelem: name '\t' astart '\t' astop '\n' + * astart, astop: u64int */ int parseamap(IFile *f, AMapN *amn) blob - e54e3885f34362a53b2e55af5daa0b5c70738f3b blob + 7ea5f640e011c8dbd951610c2ce2325d247967d3 --- src/cmd/venti/srv/bloom.c +++ src/cmd/venti/srv/bloom.c @@ -7,6 +7,8 @@ #include "dat.h" #include "fns.h" +int ignorebloom; + int bloominit(Bloom *b, vlong vsize, u8int *data) { @@ -24,6 +26,7 @@ bloominit(Bloom *b, vlong vsize, u8int *data) if(unpackbloomhead(b, data) < 0) return -1; +fprint(2, "bloom size %lud nhash %d\n", b->size, b->nhash); b->mask = b->size-1; b->data = data; return 0; @@ -38,11 +41,7 @@ wbbloomhead(Bloom *b) Bloom* readbloom(Part *p) { - int i, n; - uint ones; uchar buf[512]; - uchar *data; - u32int *a; Bloom *b; b = vtmallocz(sizeof *b); @@ -52,14 +51,40 @@ readbloom(Part *p) vtfree(b); return nil; } + b->part = p; + return b; +} + +int +resetbloom(Bloom *b) +{ + uchar *data; + data = vtmallocz(b->size); - if(readpart(p, 0, data, b->size) < 0){ +fprint(2, "bloom data %lud\n", b->size); + b->data = data; + if(b->size == MaxBloomSize) /* 2^32 overflows ulong */ + addstat(StatBloomBits, b->size*8-1); + else + addstat(StatBloomBits, b->size*8); + return 0; +} + +int +loadbloom(Bloom *b) +{ + int i, n; + uint ones; + uchar *data; + u32int *a; + + data = vtmallocz(b->size); + if(readpart(b->part, 0, data, b->size) < 0){ vtfree(b); vtfree(data); - return nil; + return -1; } b->data = data; - b->part = p; a = (u32int*)b->data; n = b->size/4; @@ -73,7 +98,7 @@ readbloom(Part *p) else addstat(StatBloomBits, b->size*8); - return b; + return 0; } int @@ -101,6 +126,8 @@ gethashes(u8int *score, ulong *h) a ^= *(u32int*)(score+i); b ^= *(u32int*)(score+i+4); } + if(i+4 <= VtScoreSize) /* 20 is not 4-aligned */ + a ^= *(u32int*)(score+i); for(i=0; idata == nil) return 1; + if(ignorebloom) + return 1; + ms = msec(); rlock(&b->lk); r = _inbloomfilter(b, score); runlock(&b->lk); - ms = msec() - ms; + ms = ms - msec(); addstat2(StatBloomLookup, 1, StatBloomLookupTime, ms); if(r) addstat(StatBloomMiss, 1); @@ -173,7 +203,7 @@ inbloomfilter(Bloom *b, u8int *score) void markbloomfilter(Bloom *b, u8int *score) { - if(b == nil) + if(b == nil || b->data == nil) return; rlock(&b->lk); @@ -186,14 +216,18 @@ markbloomfilter(Bloom *b, u8int *score) static void bloomwriteproc(void *v) { + int ret; Bloom *b; - + + threadsetname("bloomwriteproc"); b = v; for(;;){ recv(b->writechan, 0); - if(writebloom(b) < 0) + if((ret=writebloom(b)) < 0) fprint(2, "oops! writing bloom: %r\n"); - send(b->writedonechan, 0); + else + ret = 0; + sendul(b->writedonechan, ret); } } blob - 225bdc43cdf2ec7b8f85a442d8a00f51818f2518 blob + 73f8056beabd693a030efc722aa0bca94fab61b1 --- src/cmd/venti/srv/buildbuck.c +++ src/cmd/venti/srv/buildbuck.c @@ -21,7 +21,7 @@ initiestream(Part *part, u64int off, u64int clumps, u3 { IEStream *ies; -/*ZZZ out of memory? */ +/* out of memory? */ ies = MKZ(IEStream); ies->buf = MKN(u8int, size); ies->epos = ies->buf; @@ -61,7 +61,7 @@ peekientry(IEStream *ies) nn -= n; if(nn == 0) return nil; -/*fprint(2, "peek %d from %llud into %p\n", nn, ies->off, ies->epos); */ +//fprint(2, "peek %d from %llud into %p\n", nn, ies->off, ies->epos); if(readpart(ies->part, ies->off, ies->epos, nn) < 0){ seterr(EOk, "can't read sorted index entries: %r"); return nil; @@ -101,7 +101,7 @@ buildbucket(Index *ix, IEStream *ies, IBucket *ib, uin b = peekientry(ies); if(b == nil) return TWID32; -/*fprint(2, "b=%p ies->n=%lld ib.n=%d buck=%d score=%V\n", b, ies->n, ib->n, iebuck(ix, b, ib, ies), b); */ +/* fprint(2, "b=%p ies->n=%lld ib.n=%d buck=%d score=%V\n", b, ies->n, ib->n, iebuck(ix, b, ib, ies), b); */ if(ib->n == 0) buck = iebuck(ix, b, ib, ies); else{ blob - e70a830d340e0ee2ccd29b04875e7b2ac665d4b1 blob + b6866daf3072ebbd69681b24ccf30326bdd90d03 --- src/cmd/venti/srv/buildindex.c +++ src/cmd/venti/srv/buildindex.c @@ -1,164 +1,936 @@ /* - * Rebuild the Venti index from scratch. + * Rebuild the index from scratch, in place. */ - #include "stdinc.h" #include "dat.h" #include "fns.h" -/* - * Write a single bucket. Could profit from a big buffer here - * so that we can absorb sporadic runs of blocks into one write, - * avoiding disk seeks. - */ -static int -writebucket(Index *ix, u32int buck, IBucket *ib, ZBlock *b) +enum { - ISect *is; + MinBufSize = 64*1024, + MaxBufSize = 4*1024*1024, +}; - is = ix->sects[indexsect0(ix, buck)]; - if(buck < is->start || buck >= is->stop){ - seterr(EAdmin, "cannot find index section for bucket %lud\n", (ulong)buck); - return -1; - } - buck -= is->start; +int dumb; +int errors; +char **isect; +int nisect; +int bloom; +int zero; -/* - qlock(&stats.lock); - stats.indexwrites++; - qunlock(&stats.lock); -*/ - packibucket(ib, b->data, is->bucketmagic); - return writepart(is->part, is->blockbase + ((u64int)buck << is->blocklog), b->data, is->blocksize); -} +u32int isectmem; +u64int totalbuckets; +u64int totalclumps; +Channel *arenadonechan; +Channel *isectdonechan; +Index *ix; -static int -buildindex(Index *ix, Part *part, u64int off, u64int clumps, int zero) -{ - IEStream *ies; - IBucket ib, zib; - ZBlock *z, *b; - u32int next, buck; - int ok; - uint nbuck; - u64int found = 0; +u64int arenaentries; +u64int skipentries; +u64int indexentries; -/*ZZZ make buffer size configurable */ - b = alloczblock(ix->blocksize, 0, ix->blocksize); - z = alloczblock(ix->blocksize, 1, ix->blocksize); - ies = initiestream(part, off, clumps, 64*1024); - if(b == nil || z == nil || ies == nil){ - ok = 0; - goto breakout; - return -1; - } - ok = 0; - next = 0; - memset(&ib, 0, sizeof ib); - ib.data = b->data + IBucketSize; - zib.data = z->data + IBucketSize; - zib.n = 0; - nbuck = 0; - for(;;){ - buck = buildbucket(ix, ies, &ib, ix->blocksize-IBucketSize); - found += ib.n; - if(zero){ - for(; next != buck; next++){ - if(next == ix->buckets){ - if(buck != TWID32){ - fprint(2, "bucket out of range\n"); - ok = -1; - } - goto breakout; - } - if(writebucket(ix, next, &zib, z) < 0){ - fprint(2, "can't write zero bucket to buck=%d: %r", next); - ok = -1; - } - } - } - if(buck >= ix->buckets){ - if(buck == TWID32) - break; - fprint(2, "bucket out of range\n"); - ok = -1; - goto breakout; - } - if(writebucket(ix, buck, &ib, b) < 0){ - fprint(2, "bad bucket found=%lld: %r\n", found); - ok = -1; - } - next = buck + 1; - if(++nbuck%10000 == 0) - fprint(2, "\t%,d buckets written...\n", nbuck); - } -breakout:; - fprint(2, "wrote index with %lld entries\n", found); - freeiestream(ies); - freezblock(z); - freezblock(b); - return ok; -} +static int shouldprocess(ISect*); +static void isectproc(void*); +static void arenapartproc(void*); void usage(void) { - fprint(2, "usage: buildindex [-Z] [-B blockcachesize] config tmppart\n"); - threadexitsall(0); + fprint(2, "usage: buildindex [-b] [-i isect]... [-M imem] venti.conf\n"); + threadexitsall("usage"); } -Config conf; - void threadmain(int argc, char *argv[]) { - Part *part; - u64int clumps, base; - u32int bcmem; - int zero; - - zero = 1; - bcmem = 0; + int fd, i, napart; + u32int bcmem, imem; + Config conf; + Part *p; + ventifmtinstall(); + imem = 256*1024*1024; ARGBEGIN{ - case 'B': - bcmem = unittoull(ARGF()); + case 'b': + bloom = 1; break; - case 'Z': - zero = 0; + case 'i': + isect = vtrealloc(isect, (nisect+1)*sizeof(isect[0])); + isect[nisect++] = EARGF(usage()); break; + case 'd': /* debugging - make sure to run all 3 passes */ + dumb = 1; + break; + case 'M': + imem = unittoull(EARGF(usage())); + break; default: usage(); break; }ARGEND - - if(argc != 2) + + if(argc != 1) usage(); if(initventi(argv[0], &conf) < 0) sysfatal("can't init venti: %r"); + ix = mainindex; + if(nisect == 0 && ix->bloom) + bloom = 1; + if(bloom && ix->bloom && resetbloom(ix->bloom) < 0) + sysfatal("loadbloom: %r"); + if(bloom && !ix->bloom) + sysfatal("-b specified but no bloom filter"); + if(!bloom) + ix->bloom = nil; + isectmem = imem/ix->nsects; - if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) - bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + /* + * safety first - only need read access to arenas + */ + p = nil; + for(i=0; inarenas; i++){ + if(ix->arenas[i]->part != p){ + p = ix->arenas[i]->part; + if((fd = open(p->filename, OREAD)) < 0) + sysfatal("cannot reopen %s: %r", p->filename); + dup(fd, p->fd); + close(fd); + } + } + + /* + * need a block for every arena + */ + bcmem = maxblocksize * (mainindex->narenas + 16); if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); initdcache(bcmem); + + totalclumps = 0; + for(i=0; inarenas; i++) + totalclumps += ix->arenas[i]->diskstats.clumps; + + totalbuckets = 0; + for(i=0; insects; i++) + totalbuckets += ix->sects[i]->blocks; + fprint(2, "%,lld clumps, %,lld buckets\n", totalclumps, totalbuckets); - fprint(2, "building a new index %s using %s for temporary storage\n", mainindex->name, argv[1]); + /* start index procs */ + fprint(2, "%T read index\n"); + isectdonechan = chancreate(sizeof(void*), 0); + for(i=0; insects; i++){ + if(shouldprocess(ix->sects[i])) + ix->sects[i]->writechan = chancreate(sizeof(IEntry), 0); + vtproc(isectproc, ix->sects[i]); + } + + for(i=0; inarenas; i++){ + if(ix->arenas[i]->part != p){ + p = ix->arenas[i]->part; + vtproc(arenapartproc, p); + napart++; + } + } - clumps = sortrawientries(mainindex, part, &base, mainindex->bloom); - if(clumps == TWID64) - sysfatal("can't build sorted index: %r"); - fprint(2, "found and sorted index entries for clumps=%lld at %lld\n", clumps, base); + /* wait for arena procs to finish */ + for(i=0; insects; i++) + if(ix->sects[i]->writechan) + send(ix->sects[i]->writechan, nil); + + /* wait for index procs to finish */ + for(i=0; insects; i++) + if(ix->sects[i]->writechan) + recvp(isectdonechan); + + if(ix->bloom && writebloom(ix->bloom) < 0) + fprint(2, "writing bloom filter: %r\n"); + + fprint(2, "%T done arenaentries=%,lld indexed=%,lld (nskip=%,lld)\n", + arenaentries, indexentries, skipentries); + threadexitsall(nil); +} + +static int +shouldprocess(ISect *is) +{ + int i; - if(mainindex->bloom) - writebloom(mainindex->bloom); + if(nisect == 0) + return 1; - threadexitsall(0); + for(i=0; iname) == 0){ + isect[i] = nil; + return 1; + } + return 0; +} + +static void +add(u64int *a, u64int n) +{ + static Lock l; + + lock(&l); + *a += n; + unlock(&l); +} + +/* + * Read through an arena partition and send each of its IEntries + * to the appropriate index section. When finished, send on + * arenadonechan. + */ +enum +{ + ClumpChunks = 32*1024, +}; +static void +arenapartproc(void *v) +{ + int i, j, n, nskip, x; + u32int clump; + u64int addr, tot; + Arena *a; + ClumpInfo *ci, *cis; + IEntry ie; + Part *p; + + p = v; + threadsetname("arenaproc %s", p->name); + + nskip = 0; + tot = 0; + cis = MKN(ClumpInfo, ClumpChunks); + for(i=0; inarenas; i++){ + a = ix->arenas[i]; + if(a->part != p) + continue; + if(a->memstats.clumps) + fprint(2, "%T arena %s: %d entries\n", + a->name, a->memstats.clumps); + addr = ix->amap[i].start; + for(clump=0; clumpmemstats.clumps; clump+=n){ + n = ClumpChunks; + if(n > a->memstats.clumps - clump) + n = a->memstats.clumps - clump; + if(readclumpinfos(a, clump, cis, n) != n){ + fprint(2, "%T arena %s: directory read: %r\n", a->name); + errors = 1; + break; + } + for(j=0; jtype; + ie.ia.size = ci->uncsize; + ie.ia.addr = addr; + addr += ci->size + ClumpSize; + ie.ia.blocks = (ci->size + ClumpSize + (1<> ABlockLog; + scorecp(ie.score, ci->score); + if(ci->type == VtCorruptType) + nskip++; + else{ + tot++; + x = indexsect(ix, ie.score); + assert(0 <= x && x < ix->nsects); + if(ix->sects[x]->writechan) + send(ix->sects[x]->writechan, &ie); + if(ix->bloom) + markbloomfilter(ix->bloom, ie.score); + } + } + } + } + add(&arenaentries, tot); + add(&skipentries, nskip); + sendp(arenadonechan, p); +} + +/* + * Convert score into relative bucket number in isect. + * Can pass a packed ientry instead of score - score is first. + */ +static u32int +score2bucket(ISect *is, uchar *score) +{ + u32int b; + + b = hashbits(score, 32)/ix->div; + assert(is->start <= b && b < is->stop); + return b - is->start; +} + +/* + * Convert offset in index section to bucket number. + */ +static u32int +offset2bucket(ISect *is, u64int offset) +{ + u32int b; + + assert(is->blockbase <= offset); + offset -= is->blockbase; + b = offset/is->blocksize; + assert(b < is->stop-is->start); + return b; +} + +/* + * Convert bucket number to offset. + */ +static u64int +bucket2offset(ISect *is, u32int b) +{ + assert(b <= is->stop-is->start); + return is->blockbase + (u64int)b*is->blocksize; +} + +/* + * IEntry buffers to hold initial round of spraying. + */ +typedef struct Buf Buf; +struct Buf +{ + Part *part; /* partition being written */ + uchar *bp; /* current block */ + uchar *ep; /* end of block */ + uchar *wp; /* write position in block */ + u64int boffset; /* start offset */ + u64int woffset; /* next write offset */ + u64int eoffset; /* end offset */ + u32int nentry; /* number of entries written */ +}; + +static void +bflush(Buf *buf) +{ + u32int bufsize; + + if(buf->woffset >= buf->eoffset) + sysfatal("buf index chunk overflow - need bufger index"); + bufsize = buf->ep - buf->bp; + if(writepart(buf->part, buf->woffset, buf->bp, bufsize) < 0){ + fprint(2, "write %s: %r\n", buf->part->name); + errors = 1; + } + buf->woffset += bufsize; + memset(buf->bp, 0, bufsize); + buf->wp = buf->bp; +} + +static void +bwrite(Buf *buf, IEntry *ie) +{ + if(buf->wp+IEntrySize > buf->ep) + bflush(buf); + assert(buf->bp <= buf->wp && buf->wp < buf->ep); + packientry(ie, buf->wp); + buf->wp += IEntrySize; + assert(buf->bp <= buf->wp && buf->wp <= buf->ep); + buf->nentry++; +} + +/* + * Minibuffer. In-memory data structure holds our place + * in the buffer but has no block data. We are writing and + * reading the minibuffers at the same time. (Careful!) + */ +typedef struct Minibuf Minibuf; +struct Minibuf +{ + u64int boffset; /* start offset */ + u64int roffset; /* read offset */ + u64int woffset; /* write offset */ + u64int eoffset; /* end offset */ + u32int nentry; /* # entries left to read */ + u32int nwentry; /* # entries written */ +}; + +/* + * Index entry pool. Used when trying to shuffle around + * the entries in a big buffer into the corresponding M minibuffers. + * Sized to hold M*EntriesPerBlock entries, so that there will always + * either be room in the pool for another block worth of entries + * or there will be an entire block worth of sorted entries to + * write out. + */ +typedef struct IEntryLink IEntryLink; +typedef struct IPool IPool; + +struct IEntryLink +{ + uchar ie[IEntrySize]; /* raw IEntry */ + IEntryLink *next; /* next in chain */ +}; + +struct IPool +{ + ISect *isect; + u32int buck0; /* first bucket in pool */ + u32int mbufbuckets; /* buckets per minibuf */ + IEntryLink *entry; /* all IEntryLinks */ + u32int nentry; /* # of IEntryLinks */ + IEntryLink *free; /* free list */ + u32int nfree; /* # on free list */ + Minibuf *mbuf; /* all minibufs */ + u32int nmbuf; /* # of minibufs */ + IEntryLink **mlist; /* lists for each minibuf */ + u32int *mcount; /* # on each mlist[i] */ + u32int bufsize; /* block buffer size */ + uchar *rbuf; /* read buffer */ + uchar *wbuf; /* write buffer */ + u32int epbuf; /* entries per block buffer */ +}; + +/* +static int +countsokay(IPool *p) +{ + int i; + u64int n; + + n = 0; + for(i=0; inmbuf; i++) + n += p->mcount[i]; + n += p->nfree; + if(n != p->nentry){ + print("free %ud:", p->nfree); + for(i=0; inmbuf; i++) + print(" %ud", p->mcount[i]); + print(" = %lld nentry: %ud\n", n, p->nentry); + } + return n == p->nentry; +} +*/ + +static IPool* +mkipool(ISect *isect, Minibuf *mbuf, u32int nmbuf, + u32int mbufbuckets, u32int bufsize) +{ + u32int i, nentry; + uchar *data; + IPool *p; + IEntryLink *l; + + nentry = (nmbuf+1)*bufsize / IEntrySize; + p = ezmalloc(sizeof(IPool) + +nentry*sizeof(IEntry) + +nmbuf*sizeof(IEntryLink*) + +nmbuf*sizeof(u32int) + +3*bufsize); + + p->isect = isect; + p->mbufbuckets = mbufbuckets; + p->bufsize = bufsize; + p->entry = (IEntryLink*)(p+1); + p->nentry = nentry; + p->mlist = (IEntryLink**)(p->entry+nentry); + p->mcount = (u32int*)(p->mlist+nmbuf); + p->nmbuf = nmbuf; + p->mbuf = mbuf; + data = (uchar*)(p->mcount+nmbuf); + data += bufsize - (u32int)data%bufsize; + p->rbuf = data; + p->wbuf = data+bufsize; + p->epbuf = bufsize/IEntrySize; + + for(i=0; inentry; i++){ + l = &p->entry[i]; + l->next = p->free; + p->free = l; + p->nfree++; + } + return p; } + +/* + * Add the index entry ie to the pool p. + * Caller must know there is room. + */ +static void +ipoolinsert(IPool *p, uchar *ie) +{ + u32int buck, x; + IEntryLink *l; + + assert(p->free != nil); + + buck = score2bucket(p->isect, ie); + x = (buck-p->buck0) / p->mbufbuckets; + if(x >= p->nmbuf){ + fprint(2, "buck=%ud mbufbucket=%ud x=%ud\n", + buck, p->mbufbuckets, x); + } + assert(x < p->nmbuf); + + l = p->free; + p->free = l->next; + p->nfree--; + memmove(l->ie, ie, IEntrySize); + l->next = p->mlist[x]; + p->mlist[x] = l; + p->mcount[x]++; +} + +/* + * Pull out a block containing as many + * entries as possible for minibuffer x. + */ +static u32int +ipoolgetbuf(IPool *p, u32int x) +{ + uchar *bp, *ep, *wp; + IEntryLink *l; + u32int n; + + bp = p->wbuf; + ep = p->wbuf + p->bufsize; + n = 0; + assert(x < p->nmbuf); + for(wp=bp; wp+IEntrySize<=ep && p->mlist[x]; wp+=IEntrySize){ + l = p->mlist[x]; + p->mlist[x] = l->next; + p->mcount[x]--; + memmove(wp, l->ie, IEntrySize); + l->next = p->free; + p->free = l; + p->nfree++; + n++; + } + memset(wp, 0, ep-wp); + return n; +} + +/* + * Read a block worth of entries from the minibuf + * into the pool. Caller must know there is room. + */ +static void +ipoolloadblock(IPool *p, Minibuf *mb) +{ + u32int i, n; + + assert(mb->nentry > 0); + assert(mb->roffset >= mb->woffset); + assert(mb->roffset < mb->eoffset); + + n = p->bufsize/IEntrySize; + if(n > mb->nentry) + n = mb->nentry; + if(readpart(p->isect->part, mb->roffset, p->rbuf, p->bufsize) < 0) + fprint(2, "readpart %s: %r\n", p->isect->part->name); + else{ + for(i=0; irbuf+i*IEntrySize); + } + mb->nentry -= n; + mb->roffset += p->bufsize; +} + +/* + * Write out a block worth of entries to minibuffer x. + * If necessary, pick up the data there before overwriting it. + */ +static void +ipoolflush0(IPool *pool, u32int x) +{ + u32int bufsize; + Minibuf *mb; + + mb = pool->mbuf+x; + bufsize = pool->bufsize; + mb->nwentry += ipoolgetbuf(pool, x); + if(mb->nentry > 0 && mb->roffset == mb->woffset){ + assert(pool->nfree >= pool->bufsize/IEntrySize); + /* + * There will be room in the pool -- we just + * removed a block worth. + */ + ipoolloadblock(pool, mb); + } + if(writepart(pool->isect->part, mb->woffset, pool->wbuf, bufsize) < 0) + fprint(2, "writepart %s: %r\n", pool->isect->part->name); + mb->woffset += bufsize; +} + +/* + * Write out some full block of entries. + * (There must be one -- the pool is almost full!) + */ +static void +ipoolflush1(IPool *pool) +{ + u32int i; + + assert(pool->nfree <= pool->epbuf); + + for(i=0; inmbuf; i++){ + if(pool->mcount[i] >= pool->epbuf){ + ipoolflush0(pool, i); + return; + } + } + /* can't be reached - someone must be full */ + sysfatal("ipoolflush1"); +} + +/* + * Flush all the entries in the pool out to disk. + * Nothing more to read from disk. + */ +static void +ipoolflush(IPool *pool) +{ + u32int i; + + for(i=0; inmbuf; i++) + while(pool->mlist[i]) + ipoolflush0(pool, i); + assert(pool->nfree == pool->nentry); +} + +/* + * Third pass. Pick up each minibuffer from disk into + * memory and then write out the buckets. + */ + +/* + * Compare two packed index entries. + * Usual ordering except break ties by putting higher + * index addresses first (assumes have duplicates + * due to corruption in the lower addresses). + */ +static int +ientrycmpaddr(const void *va, const void *vb) +{ + int i; + uchar *a, *b; + + a = (uchar*)va; + b = (uchar*)vb; + i = ientrycmp(a, b); + if(i) + return i; + return -memcmp(a+IEntryAddrOff, b+IEntryAddrOff, 8); +} + +static void +zerorange(Part *p, u64int o, u64int e) +{ + static uchar zero[MaxIoSize]; + u32int n; + + for(; o e) + n = e-o; + if(writepart(p, o, zero, n) < 0) + fprint(2, "writepart %s: %r\n", p->name); + } +} + +/* + * Load a minibuffer into memory and write out the + * corresponding buckets. + */ +static void +sortminibuffer(ISect *is, Minibuf *mb, uchar *buf, u32int nbuf, u32int bufsize) +{ + uchar *buckdata, *p, *q, *ep; + u32int b, lastb, memsize, n; + u64int o; + IBucket ib; + Part *part; + + part = is->part; + buckdata = emalloc(is->blocksize); + + if(mb->nwentry == 0) + return; + + /* + * read entire buffer. + */ + assert(mb->nwentry*IEntrySize <= mb->woffset-mb->boffset); + assert(mb->woffset-mb->boffset <= nbuf); + if(readpart(part, mb->boffset, buf, mb->woffset-mb->boffset) < 0){ + fprint(2, "readpart %s: %r\n", part->name); + errors = 1; + return; + } + assert(*(uint*)buf != 0xa5a5a5a5); + + /* + * remove fragmentation due to IEntrySize + * not evenly dividing Bufsize + */ + memsize = (bufsize/IEntrySize)*IEntrySize; + for(o=mb->boffset, p=q=buf; owoffset; o+=bufsize){ + memmove(p, q, memsize); + p += memsize; + q += bufsize; + } + ep = buf + mb->nwentry*IEntrySize; + assert(ep <= buf+nbuf); + + /* + * sort entries + */ + qsort(buf, mb->nwentry, IEntrySize, ientrycmpaddr); + + /* + * write buckets out + */ + n = 0; + lastb = offset2bucket(is, mb->boffset); + for(p=buf; p is->blocksize) + sysfatal("bucket overflow - make index bigger"); + memmove(buckdata+IBucketSize, p, q-p); + ib.n = (q-p)/IEntrySize; + n += ib.n; + packibucket(&ib, buckdata, is->bucketmagic); + if(writepart(part, bucket2offset(is, b), buckdata, is->blocksize) < 0) + fprint(2, "write %s: %r\n", part->name); + lastb = b; + } + if(lastb+1 < is->stop-is->start && zero) + zerorange(part, bucket2offset(is, lastb+1), bucket2offset(is, is->stop - is->start)); + + if(n != mb->nwentry) + fprint(2, "sortminibuffer bug: n=%ud nwentry=%ud have=%ld\n", n, mb->nwentry, (ep-buf)/IEntrySize); + + free(buckdata); +} + +static void +isectproc(void *v) +{ + u32int buck, bufbuckets, bufsize, epbuf, i, j; + u32int mbufbuckets, n, nbucket, nn, space; + u32int nbuf, nminibuf, xminiclump, prod; + u64int blocksize, offset, xclump; + uchar *data, *p; + Buf *buf; + IEntry ie; + IPool *ipool; + ISect *is; + Minibuf *mbuf, *mb; + + is = v; + blocksize = is->blocksize; + nbucket = is->stop - is->start; + + /* + * Three passes: + * pass 1 - write index entries from arenas into + * large sequential sections on index disk. + * requires nbuf * bufsize memory. + * + * pass 2 - split each section into minibufs. + * requires nminibuf * bufsize memory. + * + * pass 3 - read each minibuf into memory and + * write buckets out. + * requires entries/minibuf * IEntrySize memory. + * + * The larger we set bufsize the less seeking hurts us. + * + * The fewer sections and minibufs we have, the less + * seeking hurts us. + * + * The fewer sections and minibufs we have, the + * more entries we end up with in each minibuf + * at the end. + * + * Shoot for using half our memory to hold each + * minibuf. The chance of a random distribution + * getting off by 2x is quite low. + * + * Once that is decided, figure out the smallest + * nminibuf and nsection/biggest bufsize we can use + * and still fit in the memory constraints. + */ + + /* expected number of clump index entries we'll see */ + xclump = nbucket * (double)totalclumps/totalbuckets; + + /* number of clumps we want to see in a minibuf */ + xminiclump = isectmem/2/IEntrySize; + + /* total number of minibufs we need */ + prod = xclump / xminiclump; + + /* if possible, skip second pass */ + if(!dumb && prod*MinBufSize < isectmem){ + nbuf = prod; + nminibuf = 1; + }else{ + /* otherwise use nsection = sqrt(nmini) */ + for(nbuf=1; nbuf*nbuf isectmem) + sysfatal("not enough memory"); + nminibuf = nbuf; + } + /* size buffer to use extra memory */ + bufsize = MinBufSize; + while(bufsize*2*nbuf <= isectmem && bufsize < MaxBufSize) + bufsize *= 2; + data = emalloc(nbuf*bufsize); + epbuf = bufsize/IEntrySize; + + fprint(2, "%T %s: %,ud buckets, %,ud groups, %,ud minigroups, %,ud buffer\n", + is->part->name, nbucket, nbuf, nminibuf, bufsize); + /* + * Accept index entries from arena procs. + */ + buf = MKNZ(Buf, nbuf); + p = data; + offset = is->blockbase; + bufbuckets = (nbucket+nbuf-1)/nbuf; + for(i=0; ipart; + buf[i].bp = p; + buf[i].wp = p; + p += bufsize; + buf[i].ep = p; + buf[i].boffset = offset; + buf[i].woffset = offset; + if(i < nbuf-1){ + offset += bufbuckets*blocksize; + buf[i].eoffset = offset; + }else{ + offset = is->blockbase + nbucket*blocksize; + buf[i].eoffset = offset; + } + } + assert(p == data+nbuf*bufsize); + + n = 0; + while(recv(is->writechan, &ie) == 1){ + if(ie.ia.addr == 0) + break; + buck = score2bucket(is, ie.score); + i = buck/bufbuckets; + assert(i < nbuf); + bwrite(&buf[i], &ie); + n++; + } + add(&indexentries, n); + + nn = 0; + for(i=0; ipart->name); + + /* + * Rearrange entries into minibuffers and then + * split each minibuffer into buckets. + */ + mbuf = MKN(Minibuf, nminibuf); + mbufbuckets = (bufbuckets+nminibuf-1)/nminibuf; + for(i=0; iboffset = offset; + if(j < nminibuf-1){ + offset += mbufbuckets*blocksize; + mb->eoffset = offset; + }else + mb->eoffset = buf[i].eoffset; + mb->roffset = mb->boffset; + mb->woffset = mb->boffset; + mb->nentry = epbuf * (mb->eoffset - mb->boffset)/bufsize; + if(mb->nentry > buf[i].nentry) + mb->nentry = buf[i].nentry; + buf[i].nentry -= mb->nentry; + nn += mb->nentry; + } + if(n != nn) + fprint(2, "isectproc bug2: n=%ud nn=%ud (i=%d)\n", n, nn, i);; + /* + * Rearrange. + */ + if(!dumb && nminibuf == 1){ + mbuf[0].nwentry = mbuf[0].nentry; + mbuf[0].woffset = buf[i].woffset; + }else{ + ipool = mkipool(is, mbuf, nminibuf, mbufbuckets, bufsize); + ipool->buck0 = bufbuckets*i; + for(j=0; jnentry > 0){ + if(ipool->nfree < epbuf){ + ipoolflush1(ipool); + /* ipoolflush1 might change mb->nentry */ + continue; + } + assert(ipool->nfree >= epbuf); + ipoolloadblock(ipool, mb); + } + } + ipoolflush(ipool); + nn = 0; + for(j=0; jblocksize, 0, ix->blocksize); z = alloczblock(ix->blocksize, 1, ix->blocksize); ies = initiestream(part, off, clumps, 64*1024); @@ -260,6 +260,8 @@ threadmain(int argc, char *argv[]) if(initventi(argv[0], &conf) < 0) sysfatal("can't init venti: %r"); + if(mainindex->bloom && loadbloom(mainindex->bloom) < 0) + sysfatal("can't load bloom filter: %r"); oldbloom = mainindex->bloom; newbloom = nil; if(oldbloom){ blob - 88ebdb50ab0740ddce38bdcd21a239fdd86ed0b7 blob + ec277864c056eec8bcf6d6e89c672bfb91e60d80 --- src/cmd/venti/srv/clump.c +++ src/cmd/venti/srv/clump.c @@ -91,7 +91,7 @@ clumpmagic(Arena *arena, u64int aa) { u8int buf[U32Size]; - if(readarena(arena, aa, buf, U32Size) < 0) + if(readarena(arena, aa, buf, U32Size) == TWID32) return TWID32; return unpackmagic(buf); } @@ -138,6 +138,11 @@ loadclump(Arena *arena, u64int aa, int blocks, Clump * freezblock(cb); return nil; } + if(cl->info.type == VtCorruptType){ + seterr(EOk, "clump is marked corrupt"); + freezblock(cb); + return nil; + } n -= ClumpSize; if(n < cl->info.size){ freezblock(cb); blob - 83f51df0f990c7a6214e2fbb440a8212f8ed7665 blob + 58b3d25c0bc48c8f623b06ecf34aec035857b2d2 --- src/cmd/venti/srv/conv.c +++ src/cmd/venti/srv/conv.c @@ -23,7 +23,7 @@ static struct { ArenaHeadMagic, "ArenaHeadMagic", ArenaMagic, "ArenaMagic", ISectMagic, "ISectMagic", - BloomMagic, "BloomMagic" + BloomMagic, "BloomMagic", }; static char* @@ -138,9 +138,6 @@ unpackarena(Arena *arena, u8int *buf) p += U64Size; arena->diskstats.sealed = U8GET(p); p += U8Size; - - arena->memstats = arena->diskstats; - switch(arena->version){ case ArenaVersion4: sz = ArenaSize4; @@ -153,6 +150,35 @@ unpackarena(Arena *arena, u8int *buf) seterr(ECorrupt, "arena has bad version number %d", arena->version); return -1; } + /* + * Additional fields for the memstats version of the stats. + * Diskstats reflects what is committed to the index. + * Memstats reflects what is in the arena. Originally intended + * this to be a version 5 extension, but might as well use for + * all the existing version 4 arenas too. + * + * To maintain backwards compatibility with existing venti + * installations using the older format, we define that if + * memstats == diskstats, then the extension fields are not + * included (see packarena below). That is, only partially + * indexed arenas have these fields. Fully indexed arenas + * (in particular, sealed arenas) do not. + */ + if(U8GET(p) == 1){ + sz += ArenaSize5a-ArenaSize5; + p += U8Size; + arena->memstats.clumps = U32GET(p); + p += U32Size; + arena->memstats.cclumps = U32GET(p); + p += U32Size; + arena->memstats.used = U64GET(p); + p += U64Size; + arena->memstats.uncsize = U64GET(p); + p += U64Size; + arena->memstats.sealed = U8GET(p); + p += U8Size; + }else + arena->memstats = arena->diskstats; if(buf + sz != p) sysfatal("unpackarena unpacked wrong amount"); @@ -162,6 +188,12 @@ unpackarena(Arena *arena, u8int *buf) int packarena(Arena *arena, u8int *buf) { + return _packarena(arena, buf, 0); +} + +int +_packarena(Arena *arena, u8int *buf, int forceext) +{ int sz; u8int *p; u32int t32; @@ -207,6 +239,30 @@ packarena(Arena *arena, u8int *buf) p += U64Size; U8PUT(p, arena->diskstats.sealed); p += U8Size; + + /* + * Extension fields; see above. + */ + if(forceext + || arena->memstats.clumps != arena->diskstats.clumps + || arena->memstats.cclumps != arena->diskstats.cclumps + || arena->memstats.used != arena->diskstats.used + || arena->memstats.uncsize != arena->diskstats.uncsize + || arena->memstats.sealed != arena->diskstats.sealed){ + sz += ArenaSize5a - ArenaSize5; + U8PUT(p, 1); + p += U8Size; + U32PUT(p, arena->memstats.clumps); + p += U32Size; + U32PUT(p, arena->memstats.cclumps); + p += U32Size; + U64PUT(p, arena->memstats.used, t32); + p += U64Size; + U64PUT(p, arena->memstats.uncsize, t32); + p += U64Size; + U8PUT(p, arena->memstats.sealed); + p += U8Size; + } if(buf + sz != p) sysfatal("packarena packed wrong amount"); @@ -525,6 +581,8 @@ unpackientry(IEntry *ie, u8int *buf) p += U32Size; ie->train = U16GET(p); p += U16Size; + if(p - buf != IEntryAddrOff) + sysfatal("unpackentry bad IEntryAddrOff amount"); ie->ia.addr = U64GET(p); if(ie->ia.addr>>56) print("%.8H => %llux\n", p, ie->ia.addr); p += U64Size; blob - 5101ff88908d66d573628b9c13a15b9cd6d51643 blob + 4801204faa85488f2d1de4185838dd1fc5e7fb46 --- src/cmd/venti/srv/dat.h +++ src/cmd/venti/srv/dat.h @@ -75,23 +75,17 @@ enum /* * magic numbers on disk */ -/* _ClumpMagic = 0xd15cb10cU, / * clump header, deprecated */ -#define _ClumpMagic 0xd15cb10cU + _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */ ClumpFreeMagic = 0, /* free clump; terminates active clump log */ -/* ArenaPartMagic = 0xa9e4a5e7U, / * arena partition header */ -/* ArenaMagic = 0xf2a14eadU, / * arena trailer */ -/* ArenaHeadMagic = 0xd15c4eadU, / * arena header */ -#define ArenaPartMagic 0xa9e4a5e7U -#define ArenaMagic 0xf2a14eadU -#define ArenaHeadMagic 0xd15c4eadU - -/* BloomMagic = 0xb1004eadU, / * bloom filter header */ -#define BloomMagic 0xb1004eadU + ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */ + ArenaMagic = 0xf2a14eadU, /* arena trailer */ + ArenaHeadMagic = 0xd15c4eadU, /* arena header */ + + BloomMagic = 0xb1004eadU, /* bloom filter header */ BloomMaxHash = 32, -/* ISectMagic = 0xd15c5ec7U, / * index header */ -#define ISectMagic 0xd15c5ec7U + ISectMagic = 0xd15c5ec7U, /* index header */ ArenaPartVersion = 3, ArenaVersion4 = 4, @@ -120,6 +114,7 @@ enum ArenaPartSize = 4 * U32Size, ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size, ArenaSize5 = ArenaSize4 + U32Size, + ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size, ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize, ArenaHeadSize5 = ArenaHeadSize4 + U32Size, BloomHeadSize = 4 * U32Size, @@ -137,10 +132,14 @@ enum */ IBucketSize = U32Size + U16Size, IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize, - IEntryTypeOff = VtScoreSize + U64Size + U32Size + 2 * U16Size, + IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size, + IEntryAddrOff = VtScoreSize + U32Size + U16Size, MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog, + + IcacheFrac = 1000000, /* denominator */ + SleepForever = 1000000000, /* magic value for sleep time */ /* * dirty flags - order controls disk write order */ @@ -356,13 +355,11 @@ struct Arena int blocksize; /* size of block to read or write */ u64int base; /* base address on disk */ u64int size; /* total space in the arena */ - u64int limit; /* storage limit for clumps */ u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */ int clumpmax; /* ClumpInfos per block */ AState mem; int inqueue; - DigestState sha1; /* * fields stored on disk @@ -477,6 +474,8 @@ struct ISect u32int tabsize; /* max. bytes in index config */ Channel *writechan; Channel *writedonechan; + void *ig; /* used by buildindex only */ + int ng; /* * fields stored on disk @@ -716,8 +715,19 @@ extern int writestodevnull; /* dangerous - for perfor extern int collectstats; extern QLock memdrawlock; extern int icachesleeptime; +extern int minicachesleeptime; extern int arenasumsleeptime; +extern int manualscheduling; +extern int l0quantum; +extern int l1quantum; +extern int ignorebloom; +extern int icacheprefetch; +extern int syncwrites; +extern Stats *stathist; +extern int nstathist; +extern ulong stattime; + #ifndef PLAN9PORT #pragma varargck type "V" uchar* #define ODIRECT 0 blob - f5cc8e64057554836e2166ead7dfdfc2cefcf1c7 blob + 4d6d0865ca4555ec9b3229c55b42be76b861b2b4 --- src/cmd/venti/srv/dcache.c +++ src/cmd/venti/srv/dcache.c @@ -34,7 +34,7 @@ enum { HashLog = 9, HashSize = 1<name, addr, miss ? "miss" : "hit"); */ } int @@ -230,6 +228,7 @@ rareadpart(Part *part, u64int addr, u8int *buf, uint n } if(load != 2 || addr >= part->size){ /* addr >= part->size: let readpart do the error */ runlock(&ralock); + diskaccess(0); return readpart(part, addr, buf, n); } @@ -239,6 +238,7 @@ fprint(2, "raread %s %llx\n", part->name, addr); nn = dcache.ramax; if(addr+nn > part->size) nn = part->size - addr; + diskaccess(0); if(readpart(part, addr, dcache.rabuf, nn) < 0){ wunlock(&ralock); return -1; @@ -297,7 +297,6 @@ _getdblock(Part *part, u64int addr, int mode, int load /* * look for the block in the cache */ -/*checkdcache(); */ qlock(&dcache.lock); again: for(b = dcache.heads[h]; b != nil; b = b->next){ @@ -367,7 +366,6 @@ found: fixheap(b->heap, b); qunlock(&dcache.lock); -/*checkdcache(); */ trace(TraceBlock, "getdblock lock"); addstat(StatDblockStall, 1); @@ -427,7 +425,6 @@ putdblock(DBlock *b) else wunlock(&b->lock); -/*checkdcache(); */ qlock(&dcache.lock); if(--b->ref == 0 && !b->dirty){ if(b->heap == TWID32) @@ -435,7 +432,6 @@ putdblock(DBlock *b) rwakeupall(&dcache.full); } qunlock(&dcache.lock); -/*checkdcache(); */ } void @@ -474,6 +470,25 @@ dirtydblock(DBlock *b, int dirty) qunlock(&dcache.lock); } +static void +unchain(DBlock *b) +{ + ulong h; + + /* + * unchain the block + */ + if(b->prev == nil){ + h = pbhash(b->addr); + if(dcache.heads[h] != b) + sysfatal("bad hash chains in disk cache"); + dcache.heads[h] = b->next; + }else + b->prev->next = b->next; + if(b->next != nil) + b->next->prev = b->prev; +} + /* * remove some block from use and update the free list and counters */ @@ -481,7 +496,6 @@ static DBlock* bumpdblock(void) { DBlock *b; - ulong h; trace(TraceBlock, "bumpdblock enter"); b = dcache.free; @@ -512,20 +526,26 @@ bumpdblock(void) trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr); - /* - * unchain the block - */ - if(b->prev == nil){ - h = pbhash(b->addr); - if(dcache.heads[h] != b) - sysfatal("bad hash chains in disk cache"); - dcache.heads[h] = b->next; - }else - b->prev->next = b->next; - if(b->next != nil) - b->next->prev = b->prev; - + unchain(b); return b; +} + +void +emptydcache(void) +{ + DBlock *b; + + qlock(&dcache.lock); + while(dcache.nheap > 0){ + b = dcache.heap[0]; + delheap(b); + if(!b->ref && !b->dirty){ + unchain(b); + b->next = dcache.free; + dcache.free = b; + } + } + qunlock(&dcache.lock); } /* @@ -683,6 +703,7 @@ static int parallelwrites(DBlock **b, DBlock **eb, int dirty) { DBlock **p, **q; + for(p=b; pdirty == dirty; p++){ assert(b<=p && ppart->writechan, *p); @@ -803,6 +824,7 @@ writeproc(void *v) trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr); wlock(&b->lock); trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr); + diskaccess(0); if(writepart(p, b->addr, b->data, b->size) < 0) fprint(2, "write error: %r\n"); /* XXX details! */ addstat(StatApartWrite, 1); blob - /dev/null blob + 687616e1772838fade8380d392ef21e45299c7b6 (mode 644) --- /dev/null +++ src/cmd/venti/srv/disksched.c @@ -0,0 +1,88 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +ulong lasttime[2]; +int manualscheduling; +int l0quantum = 120; +int l1quantum = 120; +ulong lasticachechange; + +void +disksched(void) +{ + int p, nwrite, nflush, ndirty, tdirty, toflush; + ulong t; + vlong cflush; + Stats *prev; + + /* + * no locks because all the data accesses are atomic. + */ + t = time(0); + if(manualscheduling){ + lasticachechange = t; + return; + } + + if(t-lasttime[0] < l0quantum){ + /* level-0 disk access going on */ + p = icachedirtyfrac(); + if(p < IcacheFrac*5/10){ /* can wait */ + icachesleeptime = SleepForever; + lasticachechange = t; + }else if(p > IcacheFrac*9/10){ /* can't wait */ + icachesleeptime = 0; + lasticachechange = t; + }else if(t-lasticachechange > 60){ + /* have minute worth of data for current rate */ + prev = &stathist[(stattime-60+nstathist)%nstathist]; + + /* # entries written to index cache */ + nwrite = stats.n[StatIcacheWrite] - prev->n[StatIcacheWrite]; + + /* # dirty entries in index cache */ + ndirty = stats.n[StatIcacheDirty] - prev->n[StatIcacheDirty]; + + /* # entries flushed to disk */ + nflush = nwrite - ndirty; + + /* want to stay around 70% dirty */ + tdirty = (vlong)stats.n[StatIcacheSize]*700/1000; + + /* assume nflush*icachesleeptime is a constant */ + cflush = (vlong)nflush*(icachesleeptime+1); + + /* computer number entries to write in next minute */ + toflush = nwrite + (stats.n[StatIcacheDirty] - tdirty); + + /* schedule for that many */ + if(toflush <= 0 || cflush/toflush > 100000) + icachesleeptime = SleepForever; + else + icachesleeptime = cflush/toflush; + } + arenasumsleeptime = SleepForever; + return; + } + if(t-lasttime[1] < l1quantum){ + /* level-1 disk access (icache flush) going on */ + icachesleeptime = 0; + arenasumsleeptime = SleepForever; + return; + } + /* no disk access going on - no holds barred*/ + icachesleeptime = 0; + arenasumsleeptime = 0; +} + +void +diskaccess(int level) +{ + if(level < 0 || level >= nelem(lasttime)){ + fprint(2, "bad level in diskaccess; caller=%lux\n", getcallerpc(&level)); + return; + } + lasttime[level] = time(0); +} + blob - 6681503dfdc1a04c5f2e32e1960ba1887234afbe blob + 226d97aef20e794ce196f496f89e764c9245ddd4 --- src/cmd/venti/srv/findscore.c +++ src/cmd/venti/srv/findscore.c @@ -27,7 +27,7 @@ findscore(Arena *arena, uchar *score) u32int clump; int i, n, found; -/*ZZZ remove fprint? */ +//ZZZ remove fprint? if(arena->memstats.clumps) fprint(2, "reading directory for arena=%s with %d entries\n", arena->name, arena->memstats.clumps); blob - /dev/null blob + 955159424e7bd416581bb9181484f7c05487fe03 (mode 644) --- /dev/null +++ src/cmd/venti/srv/fixarenas.c @@ -0,0 +1,1894 @@ +/* + * Check and fix an arena partition. + * + * This is a lot grittier than the rest of Venti because + * it can't just give up if a byte here or there is wrong. + * + * The rule here (hopefully followed!) is that block corruption + * only ever has a local effect -- there are no blocks that you + * can wipe out that will cause large portions of + * uncorrupted data blocks to be useless. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include "whack.h" + +#pragma varargck type "z" uvlong +#pragma varargck type "z" vlong +#pragma varargck type "t" uint + +enum +{ + K = 1024, + M = 1024*1024, + G = 1024*1024*1024, + + Block = 4096, +}; + +int debugsha1; + +int verbose; +Part *part; +char *file; +char *basename; +char *dumpbase; +int fix; +int badreads; +int unseal; +uchar zero[MaxDiskBlock]; + +Arena lastarena; +ArenaPart ap; +uvlong arenasize; +int nbadread; +int nbad; +uvlong partend; +void checkarena(vlong, int); + +void +usage(void) +{ + fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n"); + threadexitsall(0); +} + +/* + * Format number in simplest way that is okay with unittoull. + */ +static int +zfmt(Fmt *fmt) +{ + vlong x; + + x = va_arg(fmt->args, vlong); + if(x == 0) + return fmtstrcpy(fmt, "0"); + if(x%G == 0) + return fmtprint(fmt, "%lldG", x/G); + if(x%M == 0) + return fmtprint(fmt, "%lldM", x/M); + if(x%K == 0) + return fmtprint(fmt, "%lldK", x/K); + return fmtprint(fmt, "%lld", x); +} + +/* + * Format time like ctime without newline. + */ +static int +tfmt(Fmt *fmt) +{ + uint t; + char buf[30]; + + t = va_arg(fmt->args, uint); + strcpy(buf, ctime(t)); + buf[28] = 0; + return fmtstrcpy(fmt, buf); +} + +/* + * Coalesce messages about unreadable sectors into larger ranges. + * bad(0, 0) flushes the buffer. + */ +static void +bad(char *msg, vlong o, int len) +{ + static vlong lb0, lb1; + static char *lmsg; + + if(msg == nil) + msg = lmsg; + if(o == -1){ + lmsg = nil; + lb0 = 0; + lb1 = 0; + return; + } + if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){ + if(lb0 != lb1) + print("%s %#llux+%#llux (%,lld+%,lld)\n", + lmsg, lb0, lb1-lb0, lb0, lb1-lb0); + lb0 = o; + } + lmsg = msg; + lb1 = o+len; +} + +/* + * Read in the len bytes of data at the offset. If can't for whatever reason, + * fill it with garbage but print an error. + */ +static uchar* +readdisk(uchar *buf, vlong offset, int len) +{ + int i, j, k, n; + + if(offset >= partend){ + memset(buf, 0xFB, sizeof buf); + return buf; + } + + if(offset+len > partend){ + memset(buf, 0xFB, sizeof buf); + len = partend - offset; + } + + if(readpart(part, offset, buf, len) >= 0) + return buf; + + /* + * The read failed. Clear the buffer to nonsense, and + * then try reading in smaller pieces. If that fails, + * read in even smaller pieces. And so on down to sectors. + */ + memset(buf, 0xFD, len); + for(i=0; i len) + n = len-i; + if(readpart(part, offset+i, buf+i, n) >= 0) + continue; + for(j=i; j len) + n = len-j; + if(readpart(part, offset+j, buf+j, n) >= 0) + continue; + for(k=j; k= 0) + continue; + bad("disk read failed at", k, 512); + badreads++; + } + } + } + bad(nil, 0, 0); + return buf; +} + +/* + * Buffer to support running SHA1 hash of the disk. + */ +typedef struct Shabuf Shabuf; +struct Shabuf +{ + int fd; + vlong offset; + DigestState state; + int rollback; + vlong r0; + DigestState *hist; + int nhist; +}; + +void +sbdebug(Shabuf *sb, char *file) +{ + int fd; + + if(sb->fd > 0){ + close(sb->fd); + sb->fd = 0; + } + if((fd = create(file, OWRITE, 0666)) < 0) + return; + if(fd == 0){ + fd = dup(fd, -1); + close(0); + } + sb->fd = fd; +} + +void +sbupdate(Shabuf *sb, uchar *p, vlong offset, int len) +{ + int n, x; + vlong o; + + if(sb->rollback && !sb->hist){ + sb->r0 = offset; + sb->nhist = 1; + sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist); + memset(sb->hist, 0, sizeof sb->hist[0]); + } + if(sb->r0 == 0) + sb->r0 = offset; + + if(sb->offset < offset || sb->offset >= offset+len){ + if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n", + p, offset, len, sb->offset); + return; + } + x = sb->offset - offset; + if(0) print("sbupdate %p %#llux+%d skip %d\n", + sb, offset, len, x); + if(x){ + p += x; + offset += x; + len -= x; + } + assert(sb->offset == offset); + + if(sb->fd > 0) + pwrite(sb->fd, p, len, offset - sb->r0); + + if(!sb->rollback){ + sha1(p, len, nil, &sb->state); + sb->offset += len; + return; + } + + /* save state every 4M so we can roll back quickly */ + o = offset - sb->r0; + while(len > 0){ + n = 4*M - o%(4*M); + if(n > len) + n = len; + sha1(p, n, nil, &sb->state); + sb->offset += n; + o += n; + p += n; + len -= n; + if(o%(4*M) == 0){ + x = o/(4*M); + if(x >= sb->nhist){ + if(x != sb->nhist) + print("oops! x=%d nhist=%d\n", x, sb->nhist); + sb->nhist += 32; + sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist); + } + sb->hist[x] = sb->state; + } + } +} + +void +sbdiskhash(Shabuf *sb, vlong eoffset) +{ + static uchar dbuf[4*M]; + int n; + + while(sb->offset < eoffset){ + n = sizeof dbuf; + if(sb->offset+n > eoffset) + n = eoffset - sb->offset; + readdisk(dbuf, sb->offset, n); + sbupdate(sb, dbuf, sb->offset, n); + } +} + +void +sbrollback(Shabuf *sb, vlong offset) +{ + int x; + vlong o; + Dir d; + + if(!sb->rollback || !sb->r0){ + print("cannot rollback sha\n"); + return; + } + if(offset >= sb->offset) + return; + o = offset - sb->r0; + x = o/(4*M); + if(x >= sb->nhist){ + print("cannot rollback sha\n"); + return; + } + sb->state = sb->hist[x]; + sb->offset = sb->r0 + x*4*M; + assert(sb->offset <= offset); + + if(sb->fd > 0){ + nulldir(&d); + d.length = sb->offset - sb->r0; + dirfwstat(sb->fd, &d); + } +} + +void +sbscore(Shabuf *sb, uchar *score) +{ + if(sb->hist){ + free(sb->hist); + sb->hist = nil; + } + sha1(nil, 0, score, &sb->state); +} + +/* + * If we're fixing arenas, then editing this memory edits the disk! + * It will be written back out as new data is paged in. + */ +uchar buf[4*M]; +uchar sbuf[4*M]; +vlong bufoffset; +int buflen; + +static void pageout(void); +static uchar* +pagein(vlong offset, int len) +{ + pageout(); + if(offset >= partend){ + memset(buf, 0xFB, sizeof buf); + return buf; + } + + if(offset+len > partend){ + memset(buf, 0xFB, sizeof buf); + len = partend - offset; + } + bufoffset = offset; + buflen = len; + readdisk(buf, offset, len); + memmove(sbuf, buf, len); + return buf; +} + +static void +pageout(void) +{ + if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){ + buflen = 0; + return; + } + if(writepart(part, bufoffset, buf, buflen) < 0) + print("disk write failed at %#llux+%#ux (%,lld+%,d)\n", + bufoffset, buflen, bufoffset, buflen); + buflen = 0; +} + +static void +zerorange(vlong offset, int len) +{ + int i; + vlong ooff; + int olen; + enum { MinBlock = 4*K, MaxBlock = 8*K }; + + if(0) + if(bufoffset <= offset && offset+len <= bufoffset+buflen){ + memset(buf+(offset-bufoffset), 0, len); + return; + } + + ooff = bufoffset; + olen = buflen; + + i = offset%MinBlock; + if(i+len < MaxBlock){ + pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1)); + memset(buf+i, 0, len); + }else{ + pagein(offset-i, MaxBlock); + memset(buf+i, 0, MaxBlock-i); + offset += MaxBlock-i; + len -= MaxBlock-i; + while(len >= MaxBlock){ + pagein(offset, MaxBlock); + memset(buf, 0, MaxBlock); + offset += MaxBlock; + len -= MaxBlock; + } + pagein(offset, (len+MinBlock-1)&~(MinBlock-1)); + memset(buf, 0, len); + } + pagein(ooff, olen); +} + +/* + * read/write integers + * +static void +p16(uchar *p, u16int u) +{ + p[0] = (u>>8) & 0xFF; + p[1] = u & 0xFF; +} +*/ + +static u16int +u16(uchar *p) +{ + return (p[0]<<8)|p[1]; +} + +static void +p32(uchar *p, u32int u) +{ + p[0] = (u>>24) & 0xFF; + p[1] = (u>>16) & 0xFF; + p[2] = (u>>8) & 0xFF; + p[3] = u & 0xFF; +} + +static u32int +u32(uchar *p) +{ + return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3]; +} + +/* +static void +p64(uchar *p, u64int u) +{ + p32(p, u>>32); + p32(p, u); +} +*/ + +static u64int +u64(uchar *p) +{ + return ((u64int)u32(p)<<32) | u32(p+4); +} + +static int +vlongcmp(const void *va, const void *vb) +{ + vlong a, b; + + a = *(vlong*)va; + b = *(vlong*)vb; + if(a < b) + return -1; + if(b > a) + return 1; + return 0; +} + +/* D and S are in draw.h */ +#define D VD +#define S VS + +enum +{ + D = 0x10000, + Z = 0x20000, + S = 0x30000, + T = 0x40000, + N = 0xFFFF +}; +typedef struct Info Info; +struct Info +{ + int len; + char *name; +}; + +Info partinfo[] = { + 4, "magic", + D|4, "version", + Z|4, "blocksize", + 4, "arenabase", + 0 +}; + +Info headinfo4[] = { + 4, "magic", + D|4, "version", + S|ANameSize, "name", + Z|4, "blocksize", + Z|8, "size", + 0 +}; + +Info headinfo5[] = { + 4, "magic", + D|4, "version", + S|ANameSize, "name", + Z|4, "blocksize", + Z|8, "size", + 4, "clumpmagic", + 0 +}; + +Info tailinfo4[] = { + 4, "magic", + D|4, "version", + S|ANameSize, "name", + D|4, "clumps", + D|4, "cclumps", + T|4, "ctime", + T|4, "wtime", + D|8, "used", + D|8, "uncsize", + 1, "sealed", + 0 +}; + +Info tailinfo4a[] = { + /* tailinfo 4 */ + 4, "magic", + D|4, "version", + S|ANameSize, "name", + D|4, "clumps", + D|4, "cclumps", + T|4, "ctime", + T|4, "wtime", + D|8, "used", + D|8, "uncsize", + 1, "sealed", + + /* mem stats */ + 1, "extension", + D|4, "mem.clumps", + D|4, "mem.cclumps", + D|8, "mem.used", + D|8, "mem.uncsize", + 1, "mem.sealed", + 0 +}; + +Info tailinfo5[] = { + 4, "magic", + D|4, "version", + S|ANameSize, "name", + D|4, "clumps", + D|4, "cclumps", + T|4, "ctime", + T|4, "wtime", + 4, "clumpmagic", + D|8, "used", + D|8, "uncsize", + 1, "sealed", + 0 +}; + +Info tailinfo5a[] = { + /* tailinfo 5 */ + 4, "magic", + D|4, "version", + S|ANameSize, "name", + D|4, "clumps", + D|4, "cclumps", + T|4, "ctime", + T|4, "wtime", + 4, "clumpmagic", + D|8, "used", + D|8, "uncsize", + 1, "sealed", + + /* mem stats */ + 1, "extension", + D|4, "mem.clumps", + D|4, "mem.cclumps", + D|8, "mem.used", + D|8, "mem.uncsize", + 1, "mem.sealed", + 0 +}; + +void +showdiffs(uchar *want, uchar *have, int len, Info *info) +{ + int n; + + while(len > 0 && (n=info->len&N) > 0){ + if(memcmp(have, want, n) != 0){ + switch(info->len){ + case 1: + print("\t%s: correct=%d disk=%d\n", + info->name, *want, *have); + break; + case 4: + print("\t%s: correct=%#ux disk=%#ux\n", + info->name, u32(want), u32(have)); + break; + case D|4: + print("\t%s: correct=%,ud disk=%,ud\n", + info->name, u32(want), u32(have)); + break; + case T|4: + print("\t%s: correct=%t\n\t\tdisk=%t\n", + info->name, u32(want), u32(have)); + break; + case Z|4: + print("\t%s: correct=%z disk=%z\n", + info->name, (uvlong)u32(want), (uvlong)u32(have)); + break; + case D|8: + print("\t%s: correct=%,lld disk=%,lld\n", + info->name, u64(want), u64(have)); + break; + case Z|8: + print("\t%s: correct=%z disk=%z\n", + info->name, u64(want), u64(have)); + break; + case S|ANameSize: + print("\t%s: correct=%s disk=%.*s\n", + info->name, (char*)want, + utfnlen((char*)have, ANameSize-1), + (char*)have); + break; + default: + print("\t%s: correct=%.*H disk=%.*H\n", + info->name, n, want, n, have); + break; + } + } + have += n; + want += n; + len -= n; + info++; + } + if(len > 0 && memcmp(have, want, len) != 0){ + if(memcmp(want, zero, len) != 0) + print("!!\textra want data in showdiffs (bug in fixarenas)\n"); + else + print("\tnon-zero data on disk after structure\n"); + if(verbose > 1){ + print("want: %.*H\n", len, want); + print("have: %.*H\n", len, have); + } + } +} + +static int tabsizes[] = { 64*1024, 512*1024, }; +/* + * Poke around on the disk to guess what the ArenaPart numbers are. + */ +void +guessgeometry(void) +{ + int i, j, n, bestn, ndiff, nhead, ntail; + uchar *p, *ep, *sp; + u64int diff[100], head[20], tail[20]; + u64int offset, bestdiff; + + ap.version = ArenaPartVersion; + + if(arenasize == 0 || ap.blocksize == 0){ + /* + * The ArenaPart block at offset PartBlank may be corrupt or just wrong. + * Instead, look for the individual arena headers and tails, which there + * are many of, and once we've seen enough, infer the spacing. + * + * Of course, nothing in the file format requires that arenas be evenly + * spaced, but fmtarenas always does that for us. + */ + nhead = 0; + ntail = 0; + for(offset=PartBlank; offset bestn){ + bestn = n; + bestdiff = diff[i-1]; + } + n = 0; + } + } + print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff); + if(arenasize != 0 && arenasize != bestdiff) + print("using user-specified size %z instead\n", arenasize); + else + arenasize = bestdiff; + + /* + * The arena tail for an arena is arenasize-blocksize from the head. + */ + ndiff = 0; + for(i=j=0; i bestn){ + bestn = n; + bestdiff = diff[i-1]; + } + n = 0; + } + } + print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff); + if(ap.blocksize != 0 && ap.blocksize != bestdiff) + print("using user-specified size %z instead\n", (vlong)ap.blocksize); + else + ap.blocksize = bestdiff; + if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1)) + sysfatal("block size not a power of two"); + if(ap.blocksize > MaxDiskBlock) + sysfatal("block size too big (max=%d)", MaxDiskBlock); + + /* + * Use head/tail information to deduce arena base. + */ + ndiff = 0; + for(i=0; i bestn){ + bestn = n; + bestdiff = diff[i-1]; + } + n = 0; + } + } + ap.arenabase = bestdiff; + } + + ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1); + /* + * XXX pick up table, check arenabase. + * XXX pick up table, record base name. + */ + + /* + * Somewhat standard computation. + * Fmtarenas used to use 64k tab, now uses 512k tab. + */ + if(ap.arenabase == 0){ + for(i=0; iinfo.type = vtfromdisktype(*p); + if(cl->info.type == 0xFF) + return 0; + p++; + cl->info.size = u16(p); + p += U16Size; + cl->info.uncsize = u16(p); + if(cl->info.size > cl->info.uncsize) + return 0; + p += U16Size; + scorecp(cl->info.score, p); + p += VtScoreSize; + cl->encoding = *p; + p++; + cl->creator = u32(p); + p += U32Size; + cl->time = u32(p); + p += U32Size; + + switch(cl->encoding){ + case ClumpENone: + if(cl->info.size != cl->info.uncsize) + return 0; + scoremem(score, p, cl->info.size); + if(scorecmp(score, cl->info.score) != 0) + return 0; + break; + case ClumpECompress: + if(cl->info.size >= cl->info.uncsize) + return 0; + unwhackinit(&uw); + n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size); + if(n != cl->info.uncsize) + return 0; + scoremem(score, ubuf, cl->info.uncsize); + if(scorecmp(score, cl->info.score) != 0) + return 0; + break; + default: + return 0; + } + p += cl->info.size; + + /* it all worked out in the end */ + *pmagic = magic; + return p - bp; +} + +/* + * All ClumpInfos seen in this arena. + * Kept in binary tree so we can look up by score. + */ +typedef struct Cit Cit; +struct Cit +{ + int left; + int right; + vlong corrupt; + ClumpInfo ci; +}; +Cit *cibuf; +int ciroot; +int ncibuf, mcibuf; + +void +resetcibuf(void) +{ + ncibuf = 0; + ciroot = -1; +} + +int* +ltreewalk(int *p, uchar *score) +{ + int i; + + for(;;){ + if(*p == -1) + return p; + i = scorecmp(cibuf[*p].ci.score, score); + if(i == 0) + return p; + if(i < 0) + p = &cibuf[*p].right; + else + p = &cibuf[*p].left; + } + return nil; /* stupid 8c */ +} + +void +addcibuf(ClumpInfo *ci, vlong corrupt) +{ + Cit *cit; + + if(ncibuf == mcibuf){ + mcibuf += 131072; + cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]); + } + cit = &cibuf[ncibuf]; + cit->ci = *ci; + cit->left = -1; + cit->right = -1; + cit->corrupt = corrupt; + if(!corrupt) + *ltreewalk(&ciroot, ci->score) = ncibuf; + ncibuf++; +} + +void +addcicorrupt(vlong len) +{ + static ClumpInfo zci; + + addcibuf(&zci, len); +} + +int +haveclump(uchar *score) +{ + int i; + int p; + + p = ciroot; + for(;;){ + if(p == -1) + return 0; + i = scorecmp(cibuf[p].ci.score, score); + if(i == 0) + return 1; + if(i < 0) + p = cibuf[p].right; + else + p = cibuf[p].left; + } + return 0; /* stupid 8c */ +} + +int +matchci(ClumpInfo *ci, uchar *p) +{ + if(ci->type != vtfromdisktype(p[0])) + return 0; + if(ci->size != u16(p+1)) + return 0; + if(ci->uncsize != u16(p+3)) + return 0; + if(scorecmp(ci->score, p+5) != 0) + return 0; + return 1; +} + +int +sealedarena(uchar *p, int blocksize) +{ + int v, n; + + v = u32(p+4); + switch(v){ + default: + return 0; + case ArenaVersion4: + n = ArenaSize4; + break; + case ArenaVersion5: + n = ArenaSize5; + break; + } + if(p[n-1] != 1){ + print("arena tail says not sealed\n"); + return 0; + } + if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){ + print("arena tail followed by non-zero data\n"); + return 0; + } + if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){ + print("arena score zero\n"); + return 0; + } + return 1; +} + +int +okayname(char *name, int n) +{ + char buf[20]; + + if(nameok(name) < 0) + return 0; + sprint(buf, "%d", n); + if(strlen(name) < strlen(buf) + || strcmp(name+strlen(name)-strlen(buf), buf) != 0) + return 0; + return 1; +} + +int +clumpinfocmp(ClumpInfo *a, ClumpInfo *b) +{ + if(a->type != b->type) + return a->type - b->type; + if(a->size != b->size) + return a->size - b->size; + if(a->uncsize != b->uncsize) + return a->uncsize - b->uncsize; + return scorecmp(a->score, b->score); +} + +ClumpInfo* +loadci(vlong offset, Arena *arena, int nci) +{ + int i, j, per; + uchar *p, *sp; + ClumpInfo *bci, *ci; + + per = arena->blocksize/ClumpInfoSize; + bci = vtmalloc(nci*sizeof bci[0]); + ci = bci; + offset += arena->size - arena->blocksize; + p = sp = nil; + for(i=0; iblocksize; + offset -= arena->blocksize; + for(j=0; jblocksize/ClumpInfoSize; + offset += arena->size - arena->blocksize; + p = sp = nil; + for(i=0; iblocksize; + offset -= arena->blocksize; + memset(p, 0, arena->blocksize); + for(j=0; jsize = arenasize; + if(offset0+arena->size > partend) + arena->size = partend - offset0; + head->size = arena->size; + + arena->blocksize = ap.blocksize; + head->blocksize = arena->blocksize; + + /* + * Look for clump magic and name in head/tail blocks. + * All the other info we will reconstruct just in case. + */ + p = pagein(offset0, arena->blocksize); + memset(&ohead, 0, sizeof ohead); + if(unpackarenahead(&ohead, p) >= 0){ + head->version = ohead.version; + head->clumpmagic = ohead.clumpmagic; + if(okayname(ohead.name, anum)) + strcpy(head->name, ohead.name); + } + + p = pagein(offset0+arena->size-arena->blocksize, + arena->blocksize); + memset(&oarena, 0, sizeof oarena); + if(unpackarena(&oarena, p) >= 0){ + arena->version = oarena.version; + arena->clumpmagic = oarena.clumpmagic; + if(okayname(oarena.name, anum)) + strcpy(arena->name, oarena.name); + arena->diskstats.clumps = oarena.diskstats.clumps; +print("old arena: sealed=%d\n", oarena.diskstats.sealed); + arena->diskstats.sealed = oarena.diskstats.sealed; + } + + /* Head trumps arena. */ + if(head->version){ + arena->version = head->version; + arena->clumpmagic = head->clumpmagic; + } + if(arena->version == 0) + arena->version = ArenaVersion5; + if(basename) + snprint(arena->name, ANameSize, "%s%d", basename, anum); + else if(lastbase[0]) + snprint(arena->name, ANameSize, "%s%d", lastbase, anum); + else if(head->name[0]) + strcpy(arena->name, head->name); + else if(arena->name[0] == 0) + sysfatal("cannot determine base name for arena; use -n"); + strcpy(lastbase, arena->name); + sprint(dname, "%d", anum); + lastbase[strlen(lastbase)-strlen(dname)] = 0; + + /* Was working in arena, now copy to head. */ + head->version = arena->version; + memmove(head->name, arena->name, sizeof head->name); + head->blocksize = arena->blocksize; + head->size = arena->size; +} + +void +shahead(Shabuf *sb, vlong offset0, ArenaHead *head) +{ + uchar headbuf[MaxDiskBlock]; + + sb->offset = offset0; + memset(headbuf, 0, sizeof headbuf); + packarenahead(head, headbuf); + sbupdate(sb, headbuf, offset0, head->blocksize); +} + +u32int +newclumpmagic(int version) +{ + u32int m; + + if(version == ArenaVersion4) + return _ClumpMagic; + do{ + m = fastrand(); + }while(m==0 || m == _ClumpMagic); + return m; +} + +/* + * Poke around in the arena to find the clump data + * and compute the relevant statistics. + */ +void +guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena, + uchar *oldscore, uchar *score) +{ + uchar dbuf[MaxDiskBlock]; + int needtozero, clumps, nb1, nb2, minclumps; + int inbad, n, ncib, printed, sealing, smart; + u32int magic; + uchar *sp, *ep, *p; + vlong boffset, eoffset, lastclumpend, leaked; + vlong offset, toffset, totalcorrupt, v; + Clump cl; + ClumpInfo *bci, *ci, *eci, *xci; + Cit *bcit, *cit, *ecit; + Shabuf oldsha, newsha; + + /* + * We expect to find an arena, with data, between offset + * and offset+arenasize. With any luck, the data starts at + * offset+ap.blocksize. The blocks have variable size and + * aren't padded at all, which doesn't give us any alignment + * constraints. The blocks are compressed or high entropy, + * but the headers are pretty low entropy (except the score): + * + * type[1] (range 0 thru 9, 13) + * size[2] + * uncsize[2] (<= size) + * + * so we can look for these. We check the scores as we go, + * so we can't make any wrong turns. If we find ourselves + * in a dead end, scan forward looking for a new start. + */ + + resetcibuf(); + memset(head, 0, sizeof *head); + memset(arena, 0, sizeof *arena); + memset(oldscore, 0, VtScoreSize); + memset(score, 0, VtScoreSize); + memset(&oldsha, 0, sizeof oldsha); + memset(&newsha, 0, sizeof newsha); + newsha.rollback = 1; + + if(0){ + sbdebug(&oldsha, "old.sha"); + sbdebug(&newsha, "new.sha"); + } + + loadarenabasics(offset0, anum, head, arena); + + /* start the clump hunt */ + + clumps = 0; + totalcorrupt = 0; + sealing = 1; + boffset = offset0 + arena->blocksize; + offset = boffset; + eoffset = offset0+arena->size - arena->blocksize; + toffset = eoffset; + sp = pagein(offset0, 4*M); + + if(arena->diskstats.sealed){ + oldsha.offset = offset0; + sbupdate(&oldsha, sp, offset0, 4*M); + } + ep = sp+4*M; + p = sp + (boffset - offset0); + ncib = arena->blocksize / ClumpInfoSize; /* ci per block in index */ + lastclumpend = offset; + nbad = 0; + inbad = 0; + needtozero = 0; + minclumps = 0; + while(offset < eoffset){ + /* + * Shift buffer if we're running out of room. + */ + if(p+70*K >= ep){ + /* + * Start the post SHA1 buffer. By now we should know the + * clumpmagic and arena version, so we can create a + * correct head block to get things going. + */ + if(sealing && fix && newsha.offset == 0){ + newsha.offset = offset0; + if(arena->clumpmagic == 0){ + if(arena->version == 0) + arena->version = ArenaVersion5; + arena->clumpmagic = newclumpmagic(arena->version); + } + head->clumpmagic = arena->clumpmagic; + shahead(&newsha, offset0, head); + } + n = 4*M-256*K; + if(sealing && fix){ + sbdiskhash(&newsha, bufoffset); + sbupdate(&newsha, buf, bufoffset, 4*M-256*K); + } + pagein(bufoffset+n, 4*M); + p -= n; + if(arena->diskstats.sealed) + sbupdate(&oldsha, buf, bufoffset, 4*M); + } + + /* + * Check for a clump at p, which is at offset in the disk. + * Duplicate clumps happen in corrupted disks + * (the same pattern gets written many times in a row) + * and should never happen during regular use. + */ + if((n = isclump(p, &cl, &magic)) > 0){ + /* + * If we were in the middle of some corrupted data, + * flush a warning about it and then add any clump + * info blocks as necessary. + */ + if(inbad){ + inbad = 0; + v = offset-lastclumpend; + if(needtozero){ + zerorange(lastclumpend, v); + sbrollback(&newsha, lastclumpend); + print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n", + lastclumpend, v, v); + } + addcicorrupt(v); + totalcorrupt += v; + nb1 = (minclumps+ncib-1)/ncib; + minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize); + nb2 = (minclumps+ncib-1)/ncib; + eoffset -= (nb2-nb1)*arena->blocksize; + } + + if(haveclump(cl.info.score)) + print("warning: duplicate clump %d %V\n", cl.info.type, cl.info.score); + + /* + * If clumps use different magic numbers, we don't care. + * We'll just use the first one we find and make the others + * follow suit. + */ + if(arena->clumpmagic == 0){ + print("clump type %d size %d score %V magic %x\n", + cl.info.type, cl.info.size, cl.info.score, magic); + arena->clumpmagic = magic; + if(magic == _ClumpMagic) + arena->version = ArenaVersion4; + else + arena->version = ArenaVersion5; + } + if(magic != arena->clumpmagic) + p32(p, arena->clumpmagic); + if(clumps == 0) + arena->ctime = cl.time; + + /* + * Record the clump, update arena stats, + * grow clump info blocks if needed. + */ + if(verbose > 1) + print("\tclump %d: %d %V at %#llux+%#ux (%d)\n", + clumps, cl.info.type, cl.info.score, offset, n, n); + addcibuf(&cl.info, 0); + if(minclumps%ncib == 0) + eoffset -= arena->blocksize; + minclumps++; + clumps++; + if(cl.encoding != ClumpENone) + arena->diskstats.cclumps++; + arena->diskstats.uncsize += cl.info.uncsize; + arena->wtime = cl.time; + + /* + * Move to next clump. + */ + offset += n; + p += n; + lastclumpend = offset; + }else{ + /* + * Overwrite malformed clump data with zeros later. + * For now, just record whether it needs to be overwritten. + * Bad regions must be of size at least ClumpSize. + * Postponing the overwriting keeps us from writing past + * the end of the arena data (which might be directory data) + * with zeros. + */ + if(!inbad){ + inbad = 1; + needtozero = 0; + if(memcmp(p, zero, ClumpSize) != 0) + needtozero = 1; + p += ClumpSize; + offset += ClumpSize; + nbad++; + }else{ + if(*p != 0) + needtozero = 1; + p++; + offset++; + } + } + } + pageout(); + + if(verbose) + print("readable clumps: %d; min. directory entries: %d\n", + clumps, minclumps); + arena->diskstats.used = lastclumpend - boffset; + leaked = eoffset - lastclumpend; + if(verbose) + print("used from %#llux to %#llux = %,lld (%,lld unused)\n", + boffset, lastclumpend, arena->diskstats.used, leaked); + + /* + * Finish the SHA1 of the old data. + */ + if(arena->diskstats.sealed){ + sbdiskhash(&oldsha, toffset); + readdisk(dbuf, toffset, arena->blocksize); + scorecp(dbuf+arena->blocksize-VtScoreSize, zero); + sbupdate(&oldsha, dbuf, toffset, arena->blocksize); + sbscore(&oldsha, oldscore); + } + + /* + * If we still don't know the clump magic, the arena + * must be empty. It still needs a value, so make + * something up. + */ + if(arena->version == 0) + arena->version = ArenaVersion5; + if(arena->clumpmagic == 0){ + if(arena->version == ArenaVersion4) + arena->clumpmagic = _ClumpMagic; + else{ + do + arena->clumpmagic = fastrand(); + while(arena->clumpmagic==_ClumpMagic + ||arena->clumpmagic==0); + } + head->clumpmagic = arena->clumpmagic; + } + + /* + * Guess at number of clumpinfo blocks to load. + * If we guess high, it's no big deal. If we guess low, + * we'll be forced into rewriting the whole directory. + * Still not such a big deal. + */ + if(clumps == 0 || arena->diskstats.used == totalcorrupt) + goto Nocib; + if(clumps < arena->diskstats.clumps) + clumps = arena->diskstats.clumps; + if(clumps < ncibuf) + clumps = ncibuf; + clumps += totalcorrupt/ + ((arena->diskstats.used - totalcorrupt)/clumps); + clumps += totalcorrupt/2000; + if(clumps < minclumps) + clumps = minclumps; + clumps += ncib-1; + clumps -= clumps%ncib; + + /* + * Can't write into the actual data. + */ + v = offset0 + arena->size - arena->blocksize; + v -= (clumps+ncib-1)/ncib * arena->blocksize; + if(v < lastclumpend){ + v = offset0 + arena->size - arena->blocksize; + clumps = (v-lastclumpend)/arena->blocksize * ncib; + } + + if(clumps < minclumps) + print("cannot happen?\n"); + + /* + * Check clumpinfo blocks against directory we created. + * The tricky part is handling the corrupt sections of arena. + * If possible, we remark just the affected directory entries + * rather than slide everything down. + * + * Allocate clumps+1 blocks and check that we don't need + * the last one at the end. + */ + bci = loadci(offset0, arena, clumps+1); + eci = bci+clumps+1; + bcit = cibuf; + ecit = cibuf+ncibuf; + smart = 1; +Again: + nbad = 0; + ci = bci; + for(cit=bcit; citcorrupt){ + vlong n, m; + if(smart){ + /* + * If we can, just mark existing entries as corrupt. + */ + n = cit->corrupt; + for(xci=ci; n>0 && xcisize; + if(n > 0 || xci >= eci) + goto Dumb; + printed = 0; + for(; citype != VtCorruptType){ + if(!printed){ + print("marking directory %d-%d as corrupt\n", + (int)(ci-bci), (int)(xci-bci)); + printed = 1; + } + print("\ttype=%d size=%d uncsize=%d score=%V\n", + ci->type, ci->size, ci->uncsize, ci->score); + } + ci->type = VtCorruptType; + } + }else{ + Dumb: + print("\trewriting clump directory\n"); + /* + * Otherwise, blaze a new trail. + */ + n = cit->corrupt; + while(n > 0 && ci < eci){ + if(n < ClumpSize) + sysfatal("bad math in clump corrupt"); + if(n <= VtMaxLumpSize+ClumpSize) + m = n; + else{ + m = VtMaxLumpSize+ClumpSize; + if(n-m < ClumpSize) + m -= ClumpSize; + } + ci->type = VtCorruptType; + ci->size = m-ClumpSize; + ci->uncsize = m-ClumpSize; + memset(ci->score, 0, VtScoreSize); + ci++; + n -= m; + } + } + continue; + } + if(clumpinfocmp(&cit->ci, ci) != 0){ + if(verbose && (smart || verbose>1)){ + print("clumpinfo %d\n", (int)(ci-bci)); + print("\twant: %d %d %d %V\n", + cit->ci.type, cit->ci.size, + cit->ci.uncsize, cit->ci.score); + print("\thave: %d %d %d %V\n", + ci->type, ci->size, + ci->uncsize, ci->score); + } + *ci = cit->ci; + nbad++; + } + ci++; + } + if(ci >= eci || cit < ecit){ + print("ran out of space editing existing directory; rewriting\n"); + print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit); + assert(smart); /* can't happen second time thru */ + smart = 0; + goto Again; + } + + assert(ci <= eci); + arena->diskstats.clumps = ci-bci; + eoffset = writeci(offset0, arena, bci, ci-bci); + if(sealing && fix) + sbrollback(&newsha, v); +print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal); + if(lastclumpend > eoffset) + print("arena directory overwrote blocks! cannot happen!\n"); + free(bci); + if(smart && nbad) + print("arena directory has %d bad or missing entries\n", nbad); +Nocib: + if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){ + if(arena->diskstats.sealed) + print("unsealing arena\n"); + sealing = 0; + memset(oldscore, 0, VtScoreSize); + } + + /* + * Finish the SHA1 of the new data - only meaningful + * if we've been writing to disk (`fix'). + */ + arena->diskstats.sealed = sealing; + arena->memstats = arena->diskstats; + if(sealing && fix){ + uchar tbuf[MaxDiskBlock]; + + sbdiskhash(&newsha, toffset); + memset(tbuf, 0, sizeof tbuf); + packarena(arena, tbuf); + sbupdate(&newsha, tbuf, toffset, arena->blocksize); + sbscore(&newsha, score); + } +} + +void +dumparena(vlong offset, int anum, Arena *arena) +{ + char buf[1000]; + vlong o, e; + int fd, n; + + snprint(buf, sizeof buf, "%s.%d", dumpbase, anum); + if((fd = create(buf, OWRITE, 0666)) < 0){ + fprint(2, "create %s: %r\n", buf); + return; + } + e = offset+arena->size; + for(o=offset; o e) + n = e-o; + if(pwrite(fd, pagein(o, n), n, o-offset) != n){ + fprint(2, "write %s at %#llux: %r\n", buf, o-offset); + return; + } + } +} + +void +checkarena(vlong offset, int anum) +{ + uchar dbuf[MaxDiskBlock]; + uchar *p, oldscore[VtScoreSize], score[VtScoreSize]; + Arena arena, oarena; + ArenaHead head; + Info *fmt, *fmta; + int sz; + + print("# arena %d: offset %#llux\n", anum, offset); + + if(offset >= partend){ + print("arena offset out of bounds\n"); + return; + } + + guessarena(offset, anum, &head, &arena, oldscore, score); + + if(verbose){ + print("#\tversion=%d name=%s blocksize=%d size=%z", + head.version, head.name, head.blocksize, head.size); + if(head.clumpmagic) + print(" clumpmagic=%#.8ux", head.clumpmagic); + print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n", + arena.diskstats.clumps, arena.diskstats.cclumps, + arena.diskstats.used, arena.diskstats.uncsize); + print("#\tctime=%t\n", arena.ctime); + print("#\twtime=%t\n", arena.wtime); + if(arena.diskstats.sealed) + print("#\tsealed score=%V\n", score); + } + + if(dumpbase){ + dumparena(offset, anum, &arena); + return; + } + + memset(dbuf, 0, sizeof dbuf); + packarenahead(&head, dbuf); + p = pagein(offset, arena.blocksize); + if(memcmp(dbuf, p, arena.blocksize) != 0){ + print("on-disk arena header incorrect\n"); + showdiffs(dbuf, p, arena.blocksize, + arena.version==ArenaVersion4 ? headinfo4 : headinfo5); + } + memmove(p, dbuf, arena.blocksize); + + memset(dbuf, 0, sizeof dbuf); + packarena(&arena, dbuf); + if(arena.diskstats.sealed) + scorecp(dbuf+arena.blocksize-VtScoreSize, score); + p = pagein(offset+arena.size-arena.blocksize, arena.blocksize); + memset(&oarena, 0, sizeof oarena); + unpackarena(&oarena, p); + if(arena.version == ArenaVersion4){ + sz = ArenaSize4; + fmt = tailinfo4; + fmta = tailinfo4a; + }else{ + sz = ArenaSize5; + fmt = tailinfo5; + fmta = tailinfo5a; + } + if(p[sz] == 1){ + fmt = fmta; + if(oarena.diskstats.sealed){ + /* + * some arenas were sealed with the extension + * before we adopted the convention that if it didn't + * add new information it gets dropped. + */ + _packarena(&arena, dbuf, 1); + } + } + if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){ + print("on-disk arena tail incorrect\n"); + showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt); + } + if(arena.diskstats.sealed){ + if(oarena.diskstats.sealed) + if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){ + print("on-disk arena seal score incorrect\n"); + print("\tcorrect=%V\n", oldscore); + print("\t disk=%V\n", p+arena.blocksize-VtScoreSize); + } + if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){ + print("%ssealing arena%s: %V\n", + oarena.diskstats.sealed ? "re" : "", + scorecmp(oldscore, score) == 0 ? + "" : " after changes", score); + } + } + memmove(p, dbuf, arena.blocksize); + + pageout(); +} + +AMapN* +buildamap(void) +{ + uchar *p; + vlong o; + ArenaHead h; + AMapN *an; + AMap *m; + + an = vtmallocz(sizeof *an); + for(o=ap.arenabase; o= 0){ + an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]); + m = &an->map[an->n++]; + m->start = o; + m->stop = o+h.size; + strcpy(m->name, h.name); + } + } + return an; +} + +void +checkmap(void) +{ + char *s; + uchar *p; + int i, len; + AMapN *an; + Fmt fmt; + + an = buildamap(); + fmtstrinit(&fmt); + fmtprint(&fmt, "%ud\n", an->n); + for(i=0; in; i++) + fmtprint(&fmt, "%s\t%lld\t%lld\n", + an->map[i].name, an->map[i].start, an->map[i].stop); + s = fmtstrflush(&fmt); + len = strlen(s); + if(len > ap.tabsize){ + print("arena partition map too long: need %z bytes have %z\n", + (vlong)len, (vlong)ap.tabsize); + len = ap.tabsize; + } + + if(ap.tabsize >= 4*M){ /* can't happen - max arenas is 2000 */ + print("arena partition map *way* too long\n"); + return; + } + + p = pagein(ap.tabbase, ap.tabsize); + if(memcmp(p, s, len) != 0){ + print("arena partition map incorrect; rewriting.\n"); + memmove(p, s, len); + } + pageout(); +} + +int mainstacksize = 512*1024; + +void +threadmain(int argc, char **argv) +{ + int mode; + + mode = OREAD; + readonly = 1; + ARGBEGIN{ + case 'U': + unseal = 1; + break; + case 'a': + arenasize = unittoull(EARGF(usage())); + break; + case 'b': + ap.blocksize = unittoull(EARGF(usage())); + break; + case 'f': + fix = 1; + mode = ORDWR; + readonly = 0; + break; + case 'n': + basename = EARGF(usage()); + break; + case 'v': + verbose++; + break; + case 'x': + dumpbase = EARGF(usage()); + break; + default: + usage(); + }ARGEND + + if(argc != 1 && argc != 2) + usage(); + + file = argv[0]; + + ventifmtinstall(); + fmtinstall('z', zfmt); + fmtinstall('t', tfmt); + quotefmtinstall(); + + part = initpart(file, mode|ODIRECT); + if(part == nil) + sysfatal("can't open %s: %r", file); + partend = part->size; + + checkarenas(argc > 1 ? argv[1] : nil); + checkmap(); + threadexitsall(nil); +} + blob - f35580ed48467e6f9d1cffa8210a82d491b0d731 blob + 1a6f1e4ba352b23e52f23aeed9dd5e01fb790d51 --- src/cmd/venti/srv/fns.h +++ src/cmd/venti/srv/fns.h @@ -24,8 +24,13 @@ void delaykickicache(void); void delaykickround(Round*); void delaykickroundproc(void*); void dirtydblock(DBlock*, int); +void diskaccess(int); +void disksched(void); AState diskstate(void); void *emalloc(ulong); +void emptydcache(void); +void emptyicache(void); +void emptylumpcache(void); void *erealloc(void *, ulong); char *estrdup(char*); void *ezmalloc(ulong); @@ -49,6 +54,7 @@ u32int hashbits(u8int *score, int nbits); int httpdinit(char *address, char *webroot); int iaddrcmp(IAddr *ia1, IAddr *ia2); IEntry* icachedirty(u32int, u32int, u64int); +ulong icachedirtyfrac(void); void icacheclean(IEntry*); int ientrycmp(const void *vie1, const void *vie2); char *ifileline(IFile *f); @@ -77,6 +83,7 @@ int insertscore(u8int *score, IAddr *ia, int write); void kickdcache(void); void kickicache(void); void kickround(Round*, int wait); +int loadbloom(Bloom*); ZBlock *loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify); DBlock *loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib); int loadientry(Index *index, u8int *score, int type, IEntry *ie); @@ -98,6 +105,7 @@ int okamap(AMap *am, int n, u64int start, u64int stop int okibucket(IBucket*, ISect*); int outputamap(Fmt *f, AMap *am, int n); int outputindex(Fmt *f, Index *ix); +int _packarena(Arena *arena, u8int *buf, int); int packarena(Arena *arena, u8int *buf); int packarenahead(ArenaHead *head, u8int *buf); int packarenapart(ArenaPart *as, u8int *buf); @@ -129,6 +137,7 @@ ZBlock *readfile(char *name); int readifile(IFile *f, char *name); Packet *readlump(u8int *score, int type, u32int size, int *cached); int readpart(Part *part, u64int addr, u8int *buf, u32int n); +int resetbloom(Bloom*); int runconfig(char *config, Config*); int scorecmp(u8int *, u8int *); void scoremem(u8int *score, u8int *buf, int size); blob - 647c74b2d23405dfc821db4ce8306fa54dc833e2 blob + 9c906ad739af5babbd43d93e8587a6211a2fb0d1 --- src/cmd/venti/srv/graph.c +++ src/cmd/venti/srv/graph.c @@ -55,7 +55,11 @@ ginit(void) first = 0; memimageinit(); +#ifdef PLAN9PORT smallfont = openmemsubfont(unsharp("#9/font/lucsans/lstr.10")); +#else + smallfont = openmemsubfont("/lib/font/bit/lucidasans/lstr.10"); +#endif black = memblack; blue = allocrepl(DBlue); red = allocrepl(DRed); @@ -121,7 +125,7 @@ statgraph(Graph *g) if(g->wid > nelem(bin)) g->wid = nelem(bin); if(g->fill < 0) - g->fill = ((uint)(uintptr)g->arg>>8)%nelem(lofill); + g->fill = ((uint)g->arg>>8)%nelem(lofill); if(g->fill > nelem(lofill)) g->fill %= nelem(lofill); @@ -151,7 +155,7 @@ statgraph(Graph *g) qlock(&memdrawlock); ginit(); if(smallfont==nil || black==nil || blue==nil || red==nil || hifill==nil || lofill==nil){ - werrstr("graphics initialization failed"); + werrstr("graphics initialization failed: %r"); qunlock(&memdrawlock); return nil; } @@ -186,12 +190,12 @@ statgraph(Graph *g) if(0) if(lastlo != -1){ if(lastlo < lo) - memimagedraw(m, Rect(x-1, lastlo, x, lo), hifill[g->fill], ZP, memopaque, ZP, S); + memimagedraw(m, Rect(x-1, lastlo, x, lo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S); else if(lastlo > lo) - memimagedraw(m, Rect(x-1, lo, x, lastlo), hifill[g->fill], ZP, memopaque, ZP, S); + memimagedraw(m, Rect(x-1, lo, x, lastlo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S); } - memimagedraw(m, Rect(x, hi, x+1,lo), hifill[g->fill], ZP, memopaque, ZP, S); - memimagedraw(m, Rect(x, lo, x+1, r.max.y), lofill[g->fill], ZP, memopaque, ZP, S); + memimagedraw(m, Rect(x, hi, x+1,lo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S); + memimagedraw(m, Rect(x, lo, x+1, r.max.y), lofill[g->fill%nelem(lofill)], ZP, memopaque, ZP, S); lastlo = lo; } blob - ad7222dd8c21ad77458cf336add45f524fc65788 blob + 04d19d9d325bb84b429615ad10aae5a93fbb254f --- src/cmd/venti/srv/httpd.c +++ src/cmd/venti/srv/httpd.c @@ -9,7 +9,7 @@ extern QLock memdrawlock; enum { ObjNameSize = 64, - MaxObjs = 16 + MaxObjs = 64 }; struct HttpObj @@ -28,6 +28,12 @@ static int dindex(HConnect *c); static int xindex(HConnect *c); static int xlog(HConnect *c); static int sindex(HConnect *c); +static int hempty(HConnect *c); +static int hlcacheempty(HConnect *c); +static int hdcacheempty(HConnect *c); +static int hicacheempty(HConnect *c); +static int hicachekick(HConnect *c); +static int hdcachekick(HConnect *c); static int hicacheflush(HConnect *c); static int hdcacheflush(HConnect *c); static int notfound(HConnect *c); @@ -53,10 +59,17 @@ httpdinit(char *address, char *dir) httpdobj("/xindex", xindex); httpdobj("/flushicache", hicacheflush); httpdobj("/flushdcache", hdcacheflush); + httpdobj("/kickicache", hicachekick); + httpdobj("/kickdcache", hdcachekick); httpdobj("/graph/", xgraph); + httpdobj("/set", xset); httpdobj("/set/", xset); httpdobj("/log", xlog); httpdobj("/log/", xlog); + httpdobj("/empty", hempty); + httpdobj("/emptyicache", hicacheempty); + httpdobj("/emptylumpcache", hlcacheempty); + httpdobj("/emptydcache", hdcacheempty); if(vtproc(listenproc, address) < 0) return -1; @@ -105,8 +118,6 @@ listenproc(void *vaddress) char *address, ndir[NETPATHLEN], dir[NETPATHLEN]; int ctl, nctl, data; -/*sleep(1000); // let strace find us */ - address = vaddress; ctl = announce(address, dir); if(ctl < 0){ @@ -148,7 +159,6 @@ httpproc(void *v) HConnect *c; int ok, i, n; -/*sleep(1000); // let strace find us */ c = v; for(;;){ @@ -182,7 +192,7 @@ httpproc(void *v) } static int -percent(long v, long total) +percent(ulong v, ulong total) { if(total == 0) total = 1; @@ -240,6 +250,31 @@ preqtext(HConnect *c) } static int +herror(HConnect *c) +{ + int n; + Hio *hout; + + hout = &c->hout; + n = snprint(c->xferbuf, HBufSize, "Error\n

Error

\n
%r
\n"); + hprint(hout, "%s %s\r\n", hversion, "400 Bad Request"); + hprint(hout, "Date: %D\r\n", time(nil)); + hprint(hout, "Server: Venti\r\n"); + hprint(hout, "Content-Type: text/html\r\n"); + hprint(hout, "Content-Length: %d\r\n", n); + if(c->head.closeit) + hprint(hout, "Connection: close\r\n"); + else if(!http11(c)) + hprint(hout, "Connection: Keep-Alive\r\n"); + hprint(hout, "\r\n"); + + if(c->req.meth == nil || strcmp(c->req.meth, "HEAD") != 0) + hwrite(hout, c->xferbuf, n); + + return hflush(hout); +} + +static int notfound(HConnect *c) { int r; @@ -325,21 +360,53 @@ static struct "logging", &ventilogging, "stats", &collectstats, "icachesleeptime", &icachesleeptime, + "minicachesleeptime", &minicachesleeptime, "arenasumsleeptime", &arenasumsleeptime, + "l0quantum", &l0quantum, + "l1quantum", &l1quantum, + "manualscheduling", &manualscheduling, + "ignorebloom", &ignorebloom, + "syncwrites", &syncwrites, + "icacheprefetch", &icacheprefetch, 0 }; + +static int +xsetlist(HConnect *c) +{ + int i; + + if(preqtype(c, "text/plain") < 0) + return -1; + for(i=0; namedints[i].name; i++) + print("%s = %d\n", namedints[i].name, *namedints[i].p); + hflush(&c->hout); + return 0; +} + + static int xset(HConnect *c) { int i, nf, r; char *f[10], *s; + if(strcmp(c->req.uri, "/set") == 0 || strcmp(c->req.uri, "/set/") == 0) + return xsetlist(c); + s = estrdup(c->req.uri); nf = getfields(s+strlen("/set/"), f, nelem(f), 1, "/"); - if(nf < 1) - return notfound(c); + if(nf < 1){ + r = preqtext(c); + if(r < 0) + return r; + for(i=0; namedints[i].name; i++) + hprint(&c->hout, "%s = %d\n", namedints[i].name, *namedints[i].p); + hflush(&c->hout); + return 0; + } for(i=0; namedints[i].name; i++){ if(strcmp(f[0], namedints[i].name) == 0){ if(nf >= 2) @@ -495,6 +562,108 @@ darena(Hio *hout, Arena *arena) } static int +hempty(HConnect *c) +{ + Hio *hout; + int r; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + emptylumpcache(); + emptydcache(); + emptyicache(); + hprint(hout, "emptied all caches\n"); + hflush(hout); + return 0; +} + +static int +hlcacheempty(HConnect *c) +{ + Hio *hout; + int r; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + emptylumpcache(); + hprint(hout, "emptied lumpcache\n"); + hflush(hout); + return 0; +} + +static int +hicacheempty(HConnect *c) +{ + Hio *hout; + int r; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + emptyicache(); + hprint(hout, "emptied icache\n"); + hflush(hout); + return 0; +} + +static int +hdcacheempty(HConnect *c) +{ + Hio *hout; + int r; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + emptydcache(); + hprint(hout, "emptied dcache\n"); + hflush(hout); + return 0; +} +static int +hicachekick(HConnect *c) +{ + Hio *hout; + int r; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + kickicache(); + hprint(hout, "kicked icache\n"); + hflush(hout); + return 0; +} + +static int +hdcachekick(HConnect *c) +{ + Hio *hout; + int r; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + kickdcache(); + hprint(hout, "kicked dcache\n"); + hflush(hout); + return 0; +} +static int hicacheflush(HConnect *c) { Hio *hout; @@ -569,6 +738,7 @@ rawgraph(Stats *s, Stats *t, void *va) { Arg *a; + USED(s); a = va; return t->n[a->index]; } @@ -587,6 +757,7 @@ pctgraph(Stats *s, Stats *t, void *va) { Arg *a; + USED(s); a = va; return percent(t->n[a->index], t->n[a->index2]); } @@ -722,7 +893,7 @@ static char* graphname[] = "isectwritebyte", "sumread", - "sumreadbyte" + "sumreadbyte", }; static int @@ -733,7 +904,6 @@ findname(char *s) for(i=0; iname : "<nil>"; -fprint(2, "hdump xfer %d\n", h->xferenc); hprint(h, "\n"); hprint(h, "Venti Server Log: %s\n", name); hprint(h, "\n"); blob - 46d411e584473e1f20cf665ca5751b2fa032cf51 blob + 49f741e7911cbb0b2df3b69101cc5657b5afb143 --- src/cmd/venti/srv/icache.c +++ src/cmd/venti/srv/icache.c @@ -11,6 +11,7 @@ struct ICache int bits; /* bits to use for indexing heads */ u32int size; /* number of heads; == 1 << bits, should be < entries */ IEntry *base; /* all allocated hash table entries */ + IEntry *free; u32int entries; /* elements in base */ IEntry *dirty; /* chain of dirty elements */ u32int ndirty; @@ -23,6 +24,8 @@ struct ICache int nlast; }; +int icacheprefetch = 1; + static ICache icache; static IEntry *icachealloc(IAddr *ia, u8int *score); @@ -45,6 +48,12 @@ initicache(int bits, int depth) setstat(StatIcacheSize, icache.entries); } +ulong +icachedirtyfrac(void) +{ + return (vlong)icache.ndirty*IcacheFrac / icache.entries; +} + u32int hashbits(u8int *sc, int bits) { @@ -141,14 +150,16 @@ lookupscore(u8int *score, int type, IAddr *ia, int *ra * load the table of contents for that arena into the cache. */ ie = icachealloc(&d.ia, score); - icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa); - aa = ie->ia.addr - aa; /* compute base addr of arena */ - for(i=0; iia.addr, &aa); + aa = ie->ia.addr - aa; /* compute base addr of arena */ + for(i=0; inext; + goto Found; + } h = icache.stolen; for(i=0;; i++){ @@ -346,3 +362,21 @@ icacheclean(IEntry *ie) trace(TraceProc, "icachedirty exit"); } +void +emptyicache(void) +{ + int i; + IEntry *ie, **lie; + + qlock(&icache.lock); + for(i=0; idirty == 0){ + *lie = ie->next; + ie->next = icache.free; + icache.free = ie; + }else + lie = &ie->next; + } + qunlock(&icache.lock); +} blob - 9c36ba2ca54e48f55acc645d01f79b1b2d487418 blob + 003abb1856ba386f5402cc94860febb24ef99985 --- src/cmd/venti/srv/icachewrite.c +++ src/cmd/venti/srv/icachewrite.c @@ -12,6 +12,7 @@ static void icachewritecoord(void*); static IEntry *iesort(IEntry*); int icachesleeptime = 1000; /* milliseconds */ +int minicachesleeptime = 50; enum { @@ -74,7 +75,7 @@ nextchunk(Index *ix, ISect *is, IEntry **pie, u64int * static int icachewritesect(Index *ix, ISect *is, u8int *buf) { - int err, h, bsize; + int err, h, bsize, t; u32int lo, hi; u64int addr, naddr; uint nbuf, off; @@ -96,7 +97,14 @@ icachewritesect(Index *ix, ISect *is, u8int *buf) err = 0; while(iedirty){ - sleep(icachesleeptime); + disksched(); + while((t=icachesleeptime) == SleepForever){ + sleep(1000); + disksched(); + } + if(t < minicachesleeptime) + t = minicachesleeptime; + sleep(t); trace(TraceProc, "icachewritesect nextchunk"); chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf); @@ -146,12 +154,15 @@ icachewritesect(Index *ix, ISect *is, u8int *buf) break; } packibucket(&ib, buf+off, is->bucketmagic); + /* XXX not right - must update cache after writepart */ if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){ memmove(b->data, buf+off, bsize); putdblock(b); } } + diskaccess(1); + trace(TraceProc, "icachewritesect writepart", addr, nbuf); if(writepart(is->part, addr, buf, nbuf) < 0){ /* XXX */ @@ -171,6 +182,7 @@ icachewritesect(Index *ix, ISect *is, u8int *buf) static void icachewriteproc(void *v) { + int ret; uint bsize; ISect *is; Index *ix; @@ -188,17 +200,17 @@ icachewriteproc(void *v) trace(TraceProc, "icachewriteproc recv"); recv(is->writechan, 0); trace(TraceWork, "start"); - icachewritesect(ix, is, buf); + ret = icachewritesect(ix, is, buf); trace(TraceProc, "icachewriteproc send"); trace(TraceWork, "finish"); - send(is->writedonechan, 0); + sendul(is->writedonechan, ret); } } static void icachewritecoord(void *v) { - int i; + int i, err; Index *ix; AState as; @@ -216,9 +228,9 @@ icachewritecoord(void *v) as = diskstate(); if(as.arena==iwrite.as.arena && as.aa==iwrite.as.aa){ /* will not be able to do anything more than last flush - kick disk */ - trace(TraceProc, "icachewritecoord flush dcache"); + trace(TraceProc, "icachewritecoord kick dcache"); kickdcache(); - trace(TraceProc, "icachewritecoord flushed dcache"); + trace(TraceProc, "icachewritecoord kicked dcache"); } iwrite.as = as; @@ -229,13 +241,15 @@ icachewritecoord(void *v) if(ix->bloom) send(ix->bloom->writechan, 0); + err = 0; for(i=0; insects; i++) - recv(ix->sects[i]->writedonechan, 0); + err |= recvul(ix->sects[i]->writedonechan); if(ix->bloom) - recv(ix->bloom->writedonechan, 0); + err |= recvul(ix->bloom->writedonechan); - trace(TraceProc, "icachewritecoord donewrite"); - setatailstate(&iwrite.as); + trace(TraceProc, "icachewritecoord donewrite err=%d", err); + if(err == 0) + setatailstate(&iwrite.as); } icacheclean(nil); /* wake up anyone waiting */ trace(TraceWork, "finish"); blob - 8cff4180ee3f5e5d4ed318fec42ee11983687fb4 blob + c69192a7e1f8b0ee3d2eeeea7161affe6687e3c7 --- src/cmd/venti/srv/index.c +++ src/cmd/venti/srv/index.c @@ -23,16 +23,10 @@ #include "dat.h" #include "fns.h" -/*static int bucklook(u8int *score, int type, u8int *data, int n); */ -/*static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b); */ -/*static int okibucket(IBucket *ib, ISect *is); */ static int initindex1(Index*); static ISect *initisect1(ISect *is); -/*static int splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib); */ #define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0) - -/*static QLock indexlock; //ZZZ */ static char IndexMagic[] = "venti index configuration"; @@ -375,6 +369,8 @@ initisect(Part *part) seterr(EAdmin, "can't read index section header: %r"); return nil; } +print("read %s at %d: %.2ux %.2ux %.2ux %.2ux\n", + part->name, PartBlank, b->data[0], b->data[1], b->data[2], b->data[3]); is = MKZ(ISect); if(is == nil){ @@ -457,9 +453,10 @@ initisect1(ISect *is) v = is->part->size & ~(u64int)(is->blocksize - 1); if(is->blockbase + (u64int)is->blocks * is->blocksize != v){ seterr(ECorrupt, "invalid blocks in index section %s", is->name); -/*ZZZZZZZZZ */ -/* freeisect(is); */ -/* return nil; */ + /* ZZZ what to do? + freeisect(is); + return nil; + */ } if(is->stop - is->start > is->blocks){ @@ -482,9 +479,10 @@ wbisect(ISect *is) ZBlock *b; b = alloczblock(HeadSize, 1, 0); - if(b == nil) -/*ZZZ set error? */ + if(b == nil){ + /* ZZZ set error? */ return -1; + } if(packisect(is, b->data) < 0){ seterr(ECorrupt, "can't make index section header: %r"); @@ -789,7 +787,7 @@ loadibucket0(Index *ix, u32int buck, ISect **pis, u32i /* * find the number of the index section holding score */ -static int +int indexsect1(Index *ix, u8int *score) { return indexsect0(ix, hashbits(score, 32) / ix->div); blob - 1fe3cf5c375cdf7e38cff67bfc951a8c7dcd96a4 blob + 13e6fe6aa12f0d8e52c9695a79f38df9bcf12383 --- src/cmd/venti/srv/lump.c +++ src/cmd/venti/srv/lump.c @@ -2,6 +2,7 @@ #include "dat.h" #include "fns.h" +int syncwrites = 0; int queuewrites = 0; int writestodevnull = 0; @@ -45,7 +46,7 @@ readlump(u8int *score, int type, u32int size, int *cac *cached = 0; if(lookupscore(score, type, &ia, &rac) < 0){ - /*ZZZ place to check for someone trying to guess scores */ + /* ZZZ place to check for someone trying to guess scores */ seterr(EOk, "no block with score %V/%d exists", score, type); putlump(u); @@ -92,7 +93,15 @@ writelump(Packet *p, u8int *score, int type, u32int cr if(u->data != nil){ ok = 0; if(packetcmp(p, u->data) != 0){ - seterr(EStrange, "score collision"); + uchar nscore[VtScoreSize]; + + packetsha1(u->data, nscore); + if(scorecmp(u->score, score) != 0) + seterr(EStrange, "lookuplump returned bad score %V not %V", u->score, score); + else if(scorecmp(u->score, nscore) != 0) + seterr(EStrange, "lookuplump returned bad data %V not %V", nscore, u->score); + else + seterr(EStrange, "score collision %V", score); ok = -1; } packetfree(p); @@ -138,7 +147,13 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms) if(old != nil){ ok = 0; if(packetcmp(p, old) != 0){ - seterr(EStrange, "score collision"); + uchar nscore[VtScoreSize]; + + packetsha1(old, nscore); + if(scorecmp(u->score, nscore) != 0) + seterr(EStrange, "readilump returned bad data %V not %V", nscore, u->score); + else + seterr(EStrange, "score collision %V", u->score); ok = -1; } packetfree(p); @@ -160,6 +175,12 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms) insertlump(u, p); else packetfree(p); + + if(syncwrites){ + flushdcache(); + flushicache(); + flushdcache(); + } ms = msec() - ms; addstat2(StatRpcWriteNew, 1, StatRpcWriteNewTime, ms); blob - f183e128d06ebe8bf7840a4590a4f759a06e7dc7 blob + b989c3cbfd45658a94a3df0283ccc03b449d8696 --- src/cmd/venti/srv/lumpcache.c +++ src/cmd/venti/srv/lumpcache.c @@ -11,7 +11,7 @@ enum { HashLog = 9, HashSize = 1<length; part->size = hi - part->offset; -fprint(2, "part %s: file %s offset %,lld size %,lld\n", - name, file, part->offset, part->size); #ifdef CANBLOCKSIZE { struct statfs sfs; @@ -203,10 +201,32 @@ prwb(char *name, int fd, int isread, u64int offset, vo u32int c, delta, icount, opsize; int r; + icount = count; buf = vbuf; + +#ifndef PLAN9PORT + op = isread ? "read" : "write"; + dst = buf; + freetmp = nil; + while(count > 0){ + opsize = min(count, 131072 /* blocksize */); + if(isread) + r = pread(fd, dst, opsize, offset); + else + r = pwrite(fd, dst, opsize, offset); + if(r <= 0) + goto Error; + offset += r; + count -= r; + dst += r; + if(r != opsize) + goto Error; + } + return icount; +#endif + tmp = nil; freetmp = nil; - icount = count; opsize = blocksize; if(count == 0){ @@ -313,7 +333,7 @@ print("FAILED isread=%d r=%d count=%d blocksize=%d\n", memmove(buf, tmp, count); else{ memmove(tmp, buf, count); - if(pwrite(fd, tmp, blocksize, offset) != blocksize){ + if(pwrite(fd, tmp, opsize, offset) != blocksize){ dst = tmp; op = "write"; goto Error; @@ -332,9 +352,16 @@ Error: return -1; } +#ifndef PLAN9PORT +static int sdreset(Part*); +static int reopen(Part*); +static int threadspawnl(int[3], char*, char*, ...); +#endif + int rwpart(Part *part, int isread, u64int offset, u8int *buf, u32int count) { + int n, try; u32int blocksize; trace(TraceDisk, "%s %s %ud at 0x%llx", @@ -351,9 +378,33 @@ rwpart(Part *part, int isread, u64int offset, u8int *b if(blocksize == 0) blocksize = 4096; - return prwb(part->filename, part->fd, isread, part->offset+offset, buf, count, blocksize); -} + for(try=0;; try++){ + n = prwb(part->filename, part->fd, isread, part->offset+offset, buf, count, blocksize); + if(n >= 0 || try > 10) + break; +#ifndef PLAN9PORT + { + char err[ERRMAX]; + /* + * This happens with the sdmv disks frustratingly often. + * Try to fix things up and continue. + */ + rerrstr(err, sizeof err); + if(strstr(err, "i/o timeout") || strstr(err, "i/o error")){ + if(sdreset(part) >= 0) + reopen(part); + continue; + }else if(strstr(err, "partition has changed")){ + reopen(part); + continue; + } + } +#endif + break; + } + return n; +} int readpart(Part *part, u64int offset, u8int *buf, u32int count) { @@ -389,5 +440,202 @@ readfile(char *name) } freepart(p); return b; +} + + + + + + + + +#ifndef PLAN9PORT +static int +sdreset(Part *part) +{ + char *name, *p; + int i, fd, xfd[3], rv; + static QLock resetlk; + Dir *d, *dd; + + fprint(2, "sdreset %s\n", part->name); + name = emalloc(strlen(part->filename)+20); + strcpy(name, part->filename); + p = strrchr(name, '/'); + if(p) + p++; + else + p = name; + + strcpy(p, "ctl"); + d = dirstat(name); + if(d == nil){ + free(name); + return -1; + } + + /* + * We don't need multiple people resetting the disk. + */ + qlock(&resetlk); + if((fd = open(name, OWRITE)) < 0) + goto error; + dd = dirfstat(fd); + if(d && dd && d->qid.vers != dd->qid.vers){ + fprint(2, "sdreset %s: got scooped\n", part->name); + /* Someone else got here first. */ + if(access(part->filename, AEXIST) >= 0) + goto ok; + goto error; + } + + /* + * Write "reset" to the ctl file to cause the chipset + * to reinitialize itself (specific to sdmv driver). + * Ignore error in case using other disk. + */ + fprint(2, "sdreset %s: reset ctl\n", part->name); + write(fd, "reset", 5); + + if(access(part->filename, AEXIST) >= 0) + goto ok; + + /* + * Re-run fdisk and prep. Don't use threadwaitchan + * to avoid coordinating for it. Reopen ctl because + * we reset the disk. + */ + strcpy(p, "ctl"); + close(fd); + if((fd = open(name, OWRITE)) < 0) + goto error; + strcpy(p, "data"); + xfd[0] = open("/dev/null", OREAD); + xfd[1] = dup(fd, -1); + xfd[2] = dup(2, -1); + fprint(2, "sdreset %s: run fdisk %s\n", part->name, name); + if(threadspawnl(xfd, "/bin/disk/fdisk", "disk/fdisk", "-p", name, nil) < 0){ + close(xfd[0]); + close(xfd[1]); + close(xfd[2]); + goto error; + } + strcpy(p, "plan9"); + for(i=0; i<=20; i++){ + sleep(i*100); + if(access(part->filename, AEXIST) >= 0) + goto ok; + if(access(name, AEXIST) >= 0) + goto prep; + } + goto error; + +prep: + strcpy(p, "ctl"); + close(fd); + if((fd = open(name, OWRITE)) < 0) + goto error; + strcpy(p, "plan9"); + xfd[0] = open("/dev/null", OREAD); + xfd[1] = dup(fd, -1); + xfd[2] = dup(2, -1); + fprint(2, "sdreset %s: run prep\n", part->name); + if(threadspawnl(xfd, "/bin/disk/prep", "disk/prep", "-p", name, nil) < 0){ + close(xfd[0]); + close(xfd[1]); + close(xfd[2]); + goto error; + } + for(i=0; i<=20; i++){ + sleep(i*100); + if(access(part->filename, AEXIST) >= 0) + goto ok; + } + +error: + fprint(2, "sdreset %s: error: %r\n", part->name); + rv = -1; + if(fd >= 0) + close(fd); + goto out; + +ok: + fprint(2, "sdreset %s: all okay\n", part->name); + rv = 0; + goto out; + +out: + free(name); + qunlock(&resetlk); + return rv; } +static int +reopen(Part *part) +{ + int fd; + + fprint(2, "reopen %s\n", part->filename); + if((fd = open(part->filename, ORDWR)) < 0){ + fprint(2, "reopen %s: %r\n", part->filename); + return -1; + } + if(fd != part->fd){ + dup(fd, part->fd); + close(fd); + } + return 0; +} + +typedef struct Spawn Spawn; +struct Spawn +{ + Channel *c; + int fd[3]; + char *file; + char **argv; +}; + +static void +spawnproc(void *v) +{ + int i, *fd; + Spawn *s; + + rfork(RFFDG); + s = v; + fd = s->fd; + for(i=0; i<3; i++) + dup(fd[i], i); + if(fd[0] > 2) + close(fd[0]); + if(fd[1] > 2 && fd[1] != fd[0]) + close(fd[1]); + if(fd[2] > 2 && fd[2] != fd[1] && fd[2] != fd[0]) + close(fd[2]); + procexec(s->c, s->file, s->argv); +} + +static int +threadspawnl(int fd[3], char *file, char *argv0, ...) +{ + int pid; + Spawn s; + + s.c = chancreate(sizeof(void*), 0); + memmove(s.fd, fd, sizeof(s.fd)); + s.file = file; + s.argv = &argv0; + vtproc(spawnproc, &s); + pid = recvul(s.c); + if(pid < 0) + return -1; + close(fd[0]); + if(fd[1] != fd[0]) + close(fd[1]); + if(fd[2] != fd[1] && fd[2] != fd[0]) + close(fd[2]); + return pid; +} + +#endif blob - /dev/null blob + 253b4edbc5dc8591265f2abfa744fd77f44895cc (mode 644) --- /dev/null +++ src/cmd/venti/srv/mirrorarenas.c @@ -0,0 +1,464 @@ +/* + * Mirror one arena partition onto another. + * Be careful to copy only new data. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +Channel *writechan; + +typedef struct Write Write; +struct Write +{ + uchar *p; + int n; + uvlong o; + int error; +}; + +Part *src; +Part *dst; +int force; +int verbose; +char *status; +uvlong astart, aend; + +void +usage(void) +{ + fprint(2, "usage: mirrorarenas [-v] src dst [ranges]\n"); + threadexitsall("usage"); +} + +int +ereadpart(Part *p, u64int offset, u8int *buf, u32int count) +{ + if(readpart(p, offset, buf, count) != count){ + print("%T readpart %s at %#llux+%ud: %r\n", p->name, offset, count); + return -1; + } + return 0; +} + +int +ewritepart(Part *p, u64int offset, u8int *buf, u32int count) +{ + if(writepart(p, offset, buf, count) != count){ + print("%T writepart %s at %#llux+%ud: %r\n", p->name, offset, count); + return -1; + } + return 0; +} + +/* + * Extra proc to do writes to dst, so that we can overlap reading + * src with writing dst during copy. This is an easy factor of two + * (almost) in performance. + */ +static void +writeproc(void *v) +{ + Write *w; + + USED(v); + while((w = recvp(writechan)) != nil){ + if(w->n == 0) + continue; + if(ewritepart(dst, w->o, w->p, w->n) < 0) + w->error = 1; + } +} + +int +copy(uvlong start, uvlong end, char *what, DigestState *ds) +{ + int i, n; + uvlong o; + static uchar tmp[2][1024*1024]; + Write w[2]; + + assert(start <= end); + assert(astart <= start && start < aend); + assert(astart <= end && end <= aend); + + if(verbose && start != end) + print("%T copy %,llud-%,llud %s\n", start, end, what); + + i = 0; + memset(w, 0, sizeof w); + for(o=start; o end) + n = end - o; + if(ereadpart(src, o, tmp[i], n) < 0) + goto error; + w[i].p = tmp[i]; + w[i].o = o; + w[i].n = n; + w[i].error = 0; + sendp(writechan, &w[i]); + if(ds) + sha1(tmp[i], n, nil, ds); + i = 1-i; + } + if(w[i].error) + goto error; + + /* + * wait for queued write to finish + */ + w[i].p = nil; + w[i].o = 0; + w[i].n = 0; + w[i].error = 0; + sendp(writechan, &w[i]); + i = 1-i; + if(w[i].error) + return -1; + return 0; + +error: + /* + * sync with write proc + */ + w[i].p = nil; + w[i].o = 0; + w[i].n = 0; + w[i].error = 0; + sendp(writechan, &w[i]); + return -1; +} + +/* single-threaded, for reference */ +int +copy1(uvlong start, uvlong end, char *what, DigestState *ds) +{ + int n; + uvlong o; + static uchar tmp[1024*1024]; + + assert(start <= end); + assert(astart <= start && start < aend); + assert(astart <= end && end <= aend); + + if(verbose && start != end) + print("%T copy %,llud-%,llud %s\n", start, end, what); + + for(o=start; o end) + n = end - o; + if(ereadpart(src, o, tmp, n) < 0) + return -1; + if(ds) + sha1(tmp, n, nil, ds); + if(ewritepart(dst, o, tmp, n) < 0) + return -1; + } + return 0; +} + +int +asha1(Part *p, uvlong start, uvlong end, DigestState *ds) +{ + int n; + uvlong o; + static uchar tmp[1024*1024]; + + if(start == end) + return 0; + assert(start < end); + + if(verbose) + print("%T sha1 %,llud-%,llud\n", start, end); + + for(o=start; o end) + n = end - o; + if(ereadpart(p, o, tmp, n) < 0) + return -1; + sha1(tmp, n, nil, ds); + } + return 0; +} + +uvlong +rdown(uvlong a, int b) +{ + return a-a%b; +} + +uvlong +rup(uvlong a, int b) +{ + if(a%b == 0) + return a; + return a+b-a%b; +} + +void +mirror(Arena *sa, Arena *da) +{ + vlong v, si, di, end; + int clumpmax, blocksize; + static uchar buf[MaxIoSize]; + ArenaHead h; + DigestState xds, *ds; + vlong shaoff, base; + + base = sa->base; + blocksize = sa->blocksize; + end = sa->base + sa->size; + + astart = base - blocksize; + aend = end + blocksize; + + shaoff = 0; + + if(force){ + copy(astart, aend, "all", nil); + return; + } + + if(verbose) + print("%T %s (%,llud-%,llud)\n", sa->name, astart, aend); + + if(sa->diskstats.sealed && da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){ + if(scorecmp(sa->score, da->score) == 0) + return; + print("%T arena %s: sealed score mismatch %V vs %V\n", sa->name, sa->score, da->score); + status = "errors"; + return; + } + if(da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){ + print("%T arena %s: dst is sealed, src is not\n", sa->name); + status = "errors"; + return; + } + if(sa->diskstats.used < da->diskstats.used){ + print("%T arena %s: src used %,lld < dst used %,lld\n", sa->name, sa->diskstats.used, da->diskstats.used); + status = "errors"; + return; + } + + if(da->clumpmagic != sa->clumpmagic){ + /* + * Write this now to reduce the window in which + * the head and tail disagree about clumpmagic. + */ + da->clumpmagic = sa->clumpmagic; + memset(buf, 0, sizeof buf); + packarena(da, buf); + if(ewritepart(dst, end, buf, blocksize) < 0) + return; + } + + memset(&h, 0, sizeof h); + h.version = da->version; + strcpy(h.name, da->name); + h.blocksize = da->blocksize; + h.size = da->size + 2*da->blocksize; + h.clumpmagic = da->clumpmagic; + memset(buf, 0, sizeof buf); + packarenahead(&h, buf); + if(ewritepart(dst, base - blocksize, buf, blocksize) < 0) + return; + + ds = nil; + if(sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0){ + /* start sha1 state with header */ + memset(&xds, 0, sizeof xds); + ds = &xds; + sha1(buf, blocksize, nil, ds); + shaoff = base; + } + + if(sa->diskstats.used != da->diskstats.used){ + di = base+rdown(da->diskstats.used, blocksize); + si = base+rup(sa->diskstats.used, blocksize); + if(ds && asha1(dst, shaoff, di, ds) < 0) + return; + if(copy(di, si, "data", ds) < 0) + return; + shaoff = si; + } + + clumpmax = sa->clumpmax; + di = end - da->diskstats.clumps/clumpmax * blocksize; + si = end - (sa->diskstats.clumps+clumpmax-1)/clumpmax * blocksize; + + if(sa->diskstats.sealed){ + /* + * might be a small hole between the end of the + * data and the beginning of the directory. + */ + v = base+rup(sa->diskstats.used, blocksize); + if(ds && asha1(dst, shaoff, v, ds) < 0) + return; + if(copy(v, si, "hole", ds) < 0) + return; + shaoff = si; + } + + if(da->diskstats.clumps != sa->diskstats.clumps){ + if(ds && asha1(dst, shaoff, si, ds) < 0) + return; + if(copy(si, di, "directory", ds) < 0) /* si < di because clumpinfo blocks grow down */ + return; + shaoff = di; + } + + da->ctime = sa->ctime; + da->wtime = sa->wtime; + da->diskstats = sa->diskstats; + da->diskstats.sealed = 0; + + memset(buf, 0, sizeof buf); + packarena(da, buf); + if(ewritepart(dst, end, buf, blocksize) < 0) + return; + + if(ds){ + asha1(dst, shaoff, end, ds); + da->diskstats.sealed = 1; + memset(buf, 0, sizeof buf); + packarena(da, buf); + sha1(buf, blocksize, da->score, ds); + if(scorecmp(sa->score, da->score) == 0){ + if(verbose) + print("%T arena %s: %V\n", sa->name, da->score); + scorecp(buf+blocksize-VtScoreSize, da->score); + if(ewritepart(dst, end, buf, blocksize) < 0) + return; + }else{ + print("%T arena %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score); + memset(&xds, 0, sizeof xds); + asha1(dst, base-blocksize, end, &xds); + sha1(buf, blocksize, da->score, &xds); + print("%T reseal: %V\n", da->score); + status = "errors"; + } + } +} + +void +mirrormany(ArenaPart *sp, ArenaPart *dp, char *range) +{ + int i, lo, hi; + char *s, *t; + Arena *sa, *da; + + if(range == nil){ + for(i=0; inarenas; i++){ + sa = sp->arenas[i]; + da = dp->arenas[i]; + mirror(sa, da); + } + return; + } + if(strcmp(range, "none") == 0) + return; + + for(s=range; *s; s=t){ + t = strchr(s, ','); + if(t) + *t++ = 0; + else + t = s+strlen(s); + if(*s == '-') + lo = 0; + else + lo = strtol(s, &s, 0); + hi = lo; + if(*s == '-'){ + s++; + if(*s == 0) + hi = sp->narenas-1; + else + hi = strtol(s, &s, 0); + } + if(*s != 0){ + print("%T bad arena range: %s\n", s); + continue; + } + for(i=lo; i<=hi; i++){ + sa = sp->arenas[i]; + da = dp->arenas[i]; + mirror(sa, da); + } + } +} + + +void +threadmain(int argc, char **argv) +{ + int i; + Arena *sa, *da; + ArenaPart *s, *d; + char *ranges; + + ventifmtinstall(); + + ARGBEGIN{ + case 'F': + force = 1; + break; + case 'v': + verbose++; + break; + default: + usage(); + }ARGEND + + if(argc != 2 && argc != 3) + usage(); + ranges = nil; + if(argc == 3) + ranges = argv[2]; + + if((src = initpart(argv[0], OREAD)) == nil) + sysfatal("initpart %s: %r", argv[0]); + if((dst = initpart(argv[1], ORDWR)) == nil) + sysfatal("initpart %s: %r", argv[1]); + if((s = initarenapart(src)) == nil) + sysfatal("initarenapart %s: %r", argv[0]); + for(i=0; inarenas; i++) + delarena(s->arenas[i]); + if((d = initarenapart(dst)) == nil) + sysfatal("loadarenapart %s: %r", argv[1]); + for(i=0; inarenas; i++) + delarena(d->arenas[i]); + + /* + * The arena geometries must match or all bets are off. + */ + if(s->narenas != d->narenas) + sysfatal("arena count mismatch: %d vs %d", s->narenas, d->narenas); + for(i=0; inarenas; i++){ + sa = s->arenas[i]; + da = d->arenas[i]; + if(sa->version != da->version) + sysfatal("arena %d: version mismatch: %d vs %d", i, sa->version, da->version); + if(sa->blocksize != da->blocksize) + sysfatal("arena %d: blocksize mismatch: %d vs %d", i, sa->blocksize, da->blocksize); + if(sa->size != da->size) + sysfatal("arena %d: size mismatch: %,lld vs %,lld", i, sa->size, da->size); + if(strcmp(sa->name, da->name) != 0) + sysfatal("arena %d: name mismatch: %s vs %s", i, sa->name, da->name); + } + + /* + * Mirror one arena at a time. + */ + writechan = chancreate(sizeof(void*), 0); + vtproc(writeproc, nil); + mirrormany(s, d, ranges); + sendp(writechan, nil); + threadexitsall(status); +} blob - 90c74ccceed97f90d3af4b5d3c0d54762ad88ebc blob + 111db0187fa47017c41c628517fbafdd4602b060 --- src/cmd/venti/srv/printarenas.c +++ src/cmd/venti/srv/printarenas.c @@ -36,7 +36,7 @@ shoulddump(char *name, int argc, char **argv) enum { - ClumpChunks = 32*1024 + ClumpChunks = 32*1024, }; void blob - /dev/null blob + 25418beb1ff3d66be453f916901b24ea242c3e0f (mode 644) --- /dev/null +++ src/cmd/venti/srv/printarenapart.c @@ -0,0 +1,160 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +uchar buf[64*1024]; + +void +usage(void) +{ + fprint(2, "usage: printarenapart arenafile [offset]\n"); + threadexitsall("usage"); +} + +static void +rdarena(Arena *arena, u64int offset) +{ + u64int a, aa, e; + u32int magic; + Clump cl; + uchar score[VtScoreSize]; + ZBlock *lump; + + printarena(2, arena); + + a = arena->base; + e = arena->base + arena->size; + if(offset != ~(u64int)0) { + if(offset >= e-a) + sysfatal("bad offset %llud >= %llud\n", + offset, e-a); + aa = offset; + } else + aa = 0; + + for(; aa < e; aa += ClumpSize+cl.info.size) { + magic = clumpmagic(arena, aa); + if(magic == ClumpFreeMagic) + break; + if(magic != arena->clumpmagic) { + fprint(2, "illegal clump magic number %#8.8ux offset %llud\n", + magic, aa); + break; + } + lump = loadclump(arena, aa, 0, &cl, score, 0); + if(lump == nil) { + fprint(2, "clump %llud failed to read: %r\n", aa); + break; + } + if(cl.info.type != VtCorruptType) { + scoremem(score, lump->data, cl.info.uncsize); + if(scorecmp(cl.info.score, score) != 0) { + fprint(2, "clump %llud has mismatched score\n", aa); + break; + } + if(vttypevalid(cl.info.type) < 0) { + fprint(2, "clump %llud has bad type %d\n", aa, cl.info.type); + break; + } + } + print("%22llud %V %3d %5d\n", aa, score, cl.info.type, cl.info.uncsize); + freezblock(lump); + } + print("end offset %llud\n", aa); +} + +void +threadmain(int argc, char *argv[]) +{ + char *file, *p, *name; + char *table; + u64int offset; + Part *part; + ArenaPart ap; + ArenaHead head; + Arena tail; + char ct[40], mt[40]; + + readonly = 1; /* for part.c */ + ARGBEGIN{ + default: + usage(); + break; + }ARGEND + + switch(argc) { + default: + usage(); + case 1: + file = argv[0]; + } + + ventifmtinstall(); + statsinit(); + + part = initpart(file, OREAD|ODIRECT); + if(part == nil) + sysfatal("can't open file %s: %r", file); + if(readpart(part, PartBlank, buf, sizeof buf) < 0) + sysfatal("can't read file %s: %r", file); + + if(unpackarenapart(&ap, buf) < 0) + sysfatal("corrupted arena part header: %r"); + + print("# arena part version=%d blocksize=%d arenabase=%d\n", + ap.version, ap.blocksize, ap.arenabase); + ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1); + ap.tabsize = ap.arenabase - ap.tabbase; + +print("A"); + table = malloc(ap.tabsize+1); + if(readpart(part, ap.tabbase, (uchar*)table, ap.tabsize) < 0) + sysfatal("read %s: %r", file); + table[ap.tabsize] = 0; + +print("A"); + partblocksize(part, ap.blocksize); + initdcache(8 * MaxDiskBlock); + +print("A"); +/* XXX - read the number of arenas from the first line */ + for(p=table; p && *p; p=strchr(p, '\n')){ + if(*p == '\n') + p++; + name = p; + p = strpbrk(p, " \t"); + if(p == nil){ + fprint(2, "bad line: %s\n", name); + break; + } +print("%p\n", p); + offset = strtoull(p, nil, 0); + if(readpart(part, offset, buf, sizeof buf) < 0){ + fprint(2, "%s: read %s: %r\n", argv0, file); + continue; + } + if(unpackarenahead(&head, buf) < 0){ + fprint(2, "%s: unpackarenahead: %r\n", argv0); + continue; + } + if(readpart(part, offset+head.size-head.blocksize, buf, head.blocksize) < 0){ + fprint(2, "%s: read %s: %r\n", argv0, file); + continue; + } + if(unpackarena(&tail, buf) < 0){ + fprint(2, "%s: unpackarena: %r\n", argv0); + continue; + } + print("arena %s %lld clumps=%,d cclumps=%,d used=%,lld uncsize=%,lld%s\n", + tail.name, offset, + tail.diskstats.clumps, tail.diskstats.cclumps, + tail.diskstats.used, tail.diskstats.uncsize, + tail.diskstats.sealed ? " sealed" : ""); + strcpy(ct, ctime(tail.ctime)); + ct[28] = 0; + strcpy(mt, ctime(tail.wtime)); + mt[28] = 0; + print("\tctime=%s\n\tmtime=%s\n", ct, mt); + } + threadexitsall(0); +} blob - 7ed9ba3a08f10c69146eadfff47d1e2c5254f7b5 blob + fc5e85e7102e2dbcac108c53b14083a7eb8dc723 --- src/cmd/venti/srv/sortientry.c +++ src/cmd/venti/srv/sortientry.c @@ -61,7 +61,7 @@ sortrawientries(Index *ix, Part *tmp, u64int *base, Bl u32int n; int i, ok; -/*ZZZ should allow configuration of bits, bucket size */ +/* ZZZ should allow configuration of bits, bucket size */ ib = initiebucks(tmp, 8, 64*1024); if(ib == nil){ seterr(EOk, "can't create sorting buckets: %r"); @@ -116,10 +116,7 @@ readarenainfo(IEBucks *ib, Arena *arena, u64int a, Blo ClumpInfo *ci, *cis; u32int clump; int i, n, ok, nskip; -/* static Biobuf bout; */ -/*ZZZ remove fprint? */ -/*fprint(2, "ra %s %d %d\n", arena->name, arena->memstats.clumps, arena->diskstats.clumps); */ if(arena->memstats.clumps) fprint(2, "\tarena %s: %d entries\n", arena->name, arena->memstats.clumps); else @@ -129,7 +126,6 @@ readarenainfo(IEBucks *ib, Arena *arena, u64int a, Blo ok = 0; nskip = 0; memset(&ie, 0, sizeof(IEntry)); -/* Binit(&bout, 1, OWRITE); */ for(clump = 0; clump < arena->memstats.clumps; clump += n){ n = ClumpChunks; if(n > arena->memstats.clumps - clump) @@ -148,18 +144,15 @@ readarenainfo(IEBucks *ib, Arena *arena, u64int a, Blo a += ci->size + ClumpSize; ie.ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; scorecp(ie.score, ci->score); - /* Bprint(&bout, "%22lld %V %3d %5d\n", */ - /* ie.ia.addr, ie.score, ie.ia.type, ie.ia.size); */ if(ci->type == VtCorruptType){ - /* print("! %V %22lld %3d %5d %3d\n", */ - /* ie.score, ie.ia.addr, ie.ia.type, ie.ia.size, ie.ia.blocks); */ + if(0) print("! %V %22lld %3d %5d %3d\n", + ie.score, ie.ia.addr, ie.ia.type, ie.ia.size, ie.ia.blocks); nskip++; }else sprayientry(ib, &ie); markbloomfilter(b, ie.score); } } -/* Bterm(&bout); */ free(cis); if(ok < 0) return TWID32; @@ -358,8 +351,8 @@ readiebuck(IEBucks *ib, int b) m = ib->bucks[b].used; if(m == 0) m = ib->usable; -/* if(ib->bucks[b].total) */ -/* fprint(2, "\tbucket %d: %d entries\n", b, ib->bucks[b].total/IEntrySize); */ + if(0) if(ib->bucks[b].total) + fprint(2, "\tbucket %d: %d entries\n", b, ib->bucks[b].total/IEntrySize); while(head != TWID32){ if(readpart(ib->part, (u64int)head * ib->size, &ib->buf[n], m+U32Size) < 0){ seterr(EOk, "can't read index sort bucket: %r"); blob - f578860adab94622db2d25b9add9e51d8967e90e blob + 874f7d27065398e0d6277cd5d18d54f235ba856c --- src/cmd/venti/srv/stats.c +++ src/cmd/venti/srv/stats.c @@ -80,7 +80,7 @@ Statdesc statdesc[NStat] = { "isect block write bytes", }, { "sum reads", }, - { "sum read bytes", } + { "sum read bytes", }, }; QLock statslock; blob - 7a5d6f9dd4daf5d139c5f4ee014a9f5dc589a2c5 blob + 89546f55626f00d85d325e63c43941293820500b --- src/cmd/venti/srv/syncarena.c +++ src/cmd/venti/srv/syncarena.c @@ -30,12 +30,11 @@ syncarena(Arena *arena, u64int start, u32int n, int zo ZBlock *lump; Clump cl; ClumpInfo ci; - static ClumpInfo zci = { -1 }; + static ClumpInfo zci = { .type = -1 }; u8int score[VtScoreSize]; u64int uncsize, used, aa; u32int clump, clumps, cclumps, magic; int err, flush, broken; - AState as; used = arena->memstats.used; clumps = arena->memstats.clumps; @@ -133,19 +132,21 @@ syncarena(Arena *arena, u64int start, u32int n, int zo flushdcache(); } +fprint(2, "arena %s: start=%lld fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n", + arena->name, + start, + fix, + flush, + used, arena->memstats.used, + clumps, arena->memstats.clumps, + cclumps, arena->memstats.cclumps, + uncsize, arena->memstats.uncsize); + if(used != arena->memstats.used || clumps != arena->memstats.clumps || cclumps != arena->memstats.cclumps || uncsize != arena->memstats.uncsize) err |= SyncHeader; - if(start && (err&SyncHeader)){ - trace(TraceProc, "syncarena setdcachestate"); - as.arena = arena; - as.aa = start+arena->memstats.used; - as.stats = arena->memstats; - setdcachestate(&as); - } - return err; } blob - 56bf1527267934c625868a9ad0be115d477f4378 blob + 72d45f18fc96a59c97c9fa2dc4256d0ae59c97b9 --- src/cmd/venti/srv/syncindex.c +++ src/cmd/venti/srv/syncindex.c @@ -48,6 +48,8 @@ threadmain(int argc, char *argv[]) ventifmtinstall(); if(initventi(argv[0], &conf) < 0) sysfatal("can't init venti: %r"); + if(mainindex->bloom && loadbloom(mainindex->bloom) < 0) + sysfatal("can't load bloom filter: %r"); if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); blob - 12b69ed2c3aa5a0f8ea2be71b7769903bf4473e4 blob + e214d712718f11eae3f8687ba4dcf0fee2fc1486 --- src/cmd/venti/srv/syncindex0.c +++ src/cmd/venti/srv/syncindex0.c @@ -121,6 +121,7 @@ int syncindex(Index *ix, int fix, int mustflush, int check) { Arena *arena; + AState as; u64int a; u32int clump; int i, e, e1, ok, ok1, flush; @@ -130,7 +131,12 @@ syncindex(Index *ix, int fix, int mustflush, int check for(i = 0; i < ix->narenas; i++){ trace(TraceProc, "syncindex start %d", i); arena = ix->arenas[i]; - clump = arena->memstats.clumps; + /* + * Syncarena will scan through the arena looking for blocks + * that have been forgotten. It will update arena->memstats.used, + * so save the currenct copy as the place to start the + * syncarenaindex scan. + */ a = arena->memstats.used; e = syncarena(arena, ix->amap[i].start, TWID32, fix, fix); e1 = e; @@ -138,15 +144,23 @@ syncindex(Index *ix, int fix, int mustflush, int check e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr); if(e1 == SyncHeader) fprint(2, "arena %s: header is out-of-date\n", arena->name); + clump = arena->diskstats.clumps; if(e1) ok = -1; else{ ok1 = syncarenaindex(ix, arena, clump, a + ix->amap[i].start, fix, &flush, check); if(ok1 < 0) fprint(2, "syncarenaindex: %r\n"); +fprint(2, "arena %s: wbarena in syncindex\n", arena->name); if(fix && ok1==0 && (e & SyncHeader) && wbarena(arena) < 0) fprint(2, "arena=%s header write failed: %r\n", arena->name); ok |= ok1; + +fprint(2, "arena %s: setdcachestate\n", arena->name); + as.arena = arena; + as.aa = ix->amap[i].start + arena->memstats.used; + as.stats = arena->memstats; + setdcachestate(&as); } } if(missing || wrong) blob - 587046cc5aea3b0583729932f6d85207d7cf89ee blob + 5530bd07d003f43c444940d600c9b41cae07a809 --- src/cmd/venti/srv/unwhack.c +++ src/cmd/venti/srv/unwhack.c @@ -23,7 +23,7 @@ static uchar lenval[1 << (DBigLenBits - 1)] = static uchar lenbits[] = { 0, 0, 0, - 2, 3, 5, 5 + 2, 3, 5, 5, }; static uchar offbits[16] = blob - 03fd9065117709e039ad8c4c5c9f18d1f0f8b5cb blob + 0fd0f04ff5c4552437cf034b94e78dee0f7a52e6 --- src/cmd/venti/srv/utils.c +++ src/cmd/venti/srv/utils.c @@ -148,6 +148,7 @@ emalloc(ulong n) sysfatal("out of memory allocating %lud", n); } memset(p, 0xa5, n); + setmalloctag(p, getcallerpc(&n)); if(0)print("emalloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&n)); return p; } @@ -164,6 +165,7 @@ ezmalloc(ulong n) sysfatal("out of memory allocating %lud", n); } memset(p, 0, n); + setmalloctag(p, getcallerpc(&n)); if(0)print("ezmalloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&n)); return p; } @@ -177,6 +179,7 @@ erealloc(void *p, ulong n) abort(); sysfatal("out of memory allocating %lud", n); } + setrealloctag(p, getcallerpc(&p)); if(0)print("erealloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&p)); return p; } @@ -190,6 +193,7 @@ estrdup(char *s) n = strlen(s) + 1; t = emalloc(n); memmove(t, s, n); + setmalloctag(t, getcallerpc(&s)); if(0)print("estrdup %p-%p by %lux\n", t, (char*)t+n, getcallerpc(&s)); return t; } @@ -231,6 +235,7 @@ ventifmtinstall(void) fmtinstall('F', vtfcallfmt); fmtinstall('H', encodefmt); fmtinstall('I', ientryfmt); + fmtinstall('T', vttimefmt); fmtinstall('V', vtscorefmt); } blob - 1e924aebb3d33394fc3fce0cc925261a88f5f8f2 blob + e9ca05364653a4486e854bc578ae14b8a0c4494c --- src/cmd/venti/srv/venti.c +++ src/cmd/venti/srv/venti.c @@ -105,6 +105,8 @@ threadmain(int argc, char *argv[]) fprint(2, "conf..."); if(initventi(configfile, &config) < 0) sysfatal("can't init server: %r"); + if(mainindex->bloom && loadbloom(mainindex->bloom) < 0) + sysfatal("can't load bloom filter: %r"); if(mem == 0) mem = config.mem; @@ -210,8 +212,8 @@ ventiserver(void *v) trace(TraceRpc, "<- %F", &r->tx); r->rx.msgtype = r->tx.msgtype+1; addstat(StatRpcTotal, 1); - /* print("req (arenas[0]=%p sects[0]=%p) %F\n", */ - /* mainindex->arenas[0], mainindex->sects[0], &r->tx); */ + if(0) print("req (arenas[0]=%p sects[0]=%p) %F\n", + mainindex->arenas[0], mainindex->sects[0], &r->tx); switch(r->tx.msgtype){ default: vtrerror(r, "unknown request"); blob - 5236c0934d87c1a1a1255f372f89aef895b773a3 blob + 2cdb7ba05356c28e70d3c04a0ea5b2e0bac11c5a --- src/cmd/venti/srv/verifyarena.c +++ src/cmd/venti/srv/verifyarena.c @@ -3,65 +3,102 @@ #include "fns.h" static int verbose; +static int fd; +static uchar *data; +static int blocksize; +static int sleepms; void usage(void) { - fprint(2, "usage: verifyarena [-v]\n"); + fprint(2, "usage: verifyarena [-b blocksize] [-s ms] [-v] [arenapart [name...]]\n"); threadexitsall(0); } -static void +static int +preadblock(uchar *buf, int n, vlong off) +{ + int nr, m; + + for(nr = 0; nr < n; nr += m){ + m = n - nr; + m = pread(fd, &buf[nr], m, off+nr); + if(m <= 0){ + if(m == 0) + werrstr("early eof"); + return -1; + } + } + return 0; +} + +static int readblock(uchar *buf, int n) { int nr, m; for(nr = 0; nr < n; nr += m){ m = n - nr; - m = read(0, &buf[nr], m); - if(m <= 0) - sysfatal("can't read arena from standard input: %r"); + m = read(fd, &buf[nr], m); + if(m <= 0){ + if(m == 0) + werrstr("early eof"); + return -1; + } } + return 0; } static void -verifyarena(void) +verifyarena(char *name, vlong len) { Arena arena; ArenaHead head; - ZBlock *b; DigestState s; u64int n, e; u32int bs; u8int score[VtScoreSize]; - fprint(2, "verify arena from standard input\n"); + fprint(2, "verify %s\n", name); memset(&arena, 0, sizeof arena); memset(&s, 0, sizeof s); /* - * read the little bit, which will included the header + * read a little bit, which will include the header */ - bs = MaxIoSize; - b = alloczblock(bs, 0, 0); - readblock(b->data, HeadSize); - sha1(b->data, HeadSize, nil, &s); - if(unpackarenahead(&head, b->data) < 0) - sysfatal("corrupted arena header: %r"); + if(readblock(data, HeadSize) < 0){ + fprint(2, "%s: reading header: %r\n", name); + return; + } + sha1(data, HeadSize, nil, &s); + if(unpackarenahead(&head, data) < 0){ + fprint(2, "%s: corrupt arena header: %r\n", name); + return; + } if(head.version != ArenaVersion4 && head.version != ArenaVersion5) - fprint(2, "warning: unknown arena version %d\n", head.version); + fprint(2, "%s: warning: unknown arena version %d\n", name, head.version); + if(len != 0 && len != head.size) + fprint(2, "%s: warning: unexpected length %lld != %lld\n", name, head.size, len); + if(strcmp(name, "") != 0 && strcmp(head.name, name) != 0) + fprint(2, "%s: warning: unexpected name %s\n", name, head.name); /* * now we know how much to read * read everything but the last block, which is special */ e = head.size - head.blocksize; + bs = blocksize; for(n = HeadSize; n < e; n += bs){ if(n + bs > e) bs = e - n; - readblock(b->data, bs); - sha1(b->data, bs, nil, &s); + if(readblock(data, bs) < 0){ + fprint(2, "%s: read data: %r\n", name); + return; + } + sha1(data, bs, nil, &s); + if(sleepms) + sleep(sleepms); } /* @@ -69,8 +106,11 @@ verifyarena(void) * the sum is calculated assuming the slot for the sum is zero. */ bs = head.blocksize; - readblock(b->data, bs); - sha1(b->data, bs-VtScoreSize, nil, &s); + if(readblock(data, bs) < 0){ + fprint(2, "%s: read last block: %r\n", name); + return; + } + sha1(data, bs-VtScoreSize, nil, &s); sha1(zeroscore, VtScoreSize, nil, &s); sha1(nil, 0, score, &s); @@ -78,37 +118,73 @@ verifyarena(void) * validity check on the trailer */ arena.blocksize = head.blocksize; - if(unpackarena(&arena, b->data) < 0) - sysfatal("corrupted arena trailer: %r"); - scorecp(arena.score, &b->data[arena.blocksize - VtScoreSize]); + if(unpackarena(&arena, data) < 0){ + fprint(2, "%s: corrupt arena trailer: %r\n", name); + return; + } + scorecp(arena.score, &data[arena.blocksize - VtScoreSize]); - if(namecmp(arena.name, head.name) != 0) - sysfatal("arena header and trailer names clash: %s vs. %s\n", head.name, arena.name); - if(arena.version != head.version) - sysfatal("arena header and trailer versions clash: %d vs. %d\n", head.version, arena.version); + if(namecmp(arena.name, head.name) != 0){ + fprint(2, "%s: wrong name in trailer: %s vs. %s\n", + name, head.name, arena.name); + return; + } + if(arena.version != head.version){ + fprint(2, "%s: wrong version in trailer: %d vs. %d\n", + name, head.version, arena.version); + return; + } arena.size = head.size - 2 * head.blocksize; /* * check for no checksum or the same */ - if(scorecmp(score, arena.score) != 0){ - if(scorecmp(zeroscore, arena.score) != 0) - fprint(2, "warning: mismatched checksums for arena=%s, found=%V calculated=%V", - arena.name, arena.score, score); - scorecp(arena.score, score); - }else - fprint(2, "matched score\n"); - + if(scorecmp(score, arena.score) == 0) + fprint(2, "%s: verified score\n", name); + else if(scorecmp(zeroscore, arena.score) == 0) + fprint(2, "%s: unsealed\n", name); + else{ + fprint(2, "%s: mismatch checksum - found=%V calculated=%V\n", + name, arena.score, score); + return; + } printarena(2, &arena); } +static int +shouldcheck(char *name, char **s, int n) +{ + int i; + + if(n == 0) + return 1; + + for(i=0; i", 0); + threadexitsall(nil); + } + + if((fd = open(argv[0], OREAD)) < 0) + sysfatal("open %s: %r", argv[0]); - if(argc != 0) - usage(); + if(preadblock(data, 8192, PartBlank) < 0) + sysfatal("read arena part header: %r"); + if(unpackarenapart(&ap, data) < 0) + sysfatal("corrupted arena part header: %r"); + fprint(2, "# arena part version=%d blocksize=%d arenabase=%d\n", + ap.version, ap.blocksize, ap.arenabase); + ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1); + ap.tabsize = ap.arenabase - ap.tabbase; + table = malloc(ap.tabsize+1); + if(preadblock((uchar*)table, ap.tabsize, ap.tabbase) < 0) + sysfatal("reading arena part directory: %r"); + table[ap.tabsize] = 0; + + nline = atoi(table); + p = strchr(table, '\n'); + if(p) + p++; + for(i=0; i= sizeof line){ + fprint(2, "warning: long arena table line: %s\n", p); + p = q; + continue; + } + strcpy(line, p); + memset(f, 0, sizeof f); + if(tokenize(line, f, nelem(f)) < 3){ + fprint(2, "warning: bad arena table line: %s\n", p); + p = q; + continue; + } + p = q; + if(shouldcheck(f[0], argv+1, argc-1)){ + start = strtoull(f[1], 0, 0); + stop = strtoull(f[2], 0, 0); + if(stop <= start){ + fprint(2, "%s: bad start,stop %lld,%lld\n", f[0], stop, start); + continue; + } + if(seek(fd, start, 0) < 0) + fprint(2, "%s: seek to start: %r\n", f[0]); + verifyarena(f[0], stop - start); + } + } + for(i=1; iclumpmagic) { - /* fprint(2, "illegal clump magic number %#8.8ux offset %llud\n", */ - /* magic, aa); */ + if(0) fprint(2, "illegal clump magic number %#8.8ux offset %llud\n", + magic, aa); break; } lump = loadclump(arena, aa, 0, &cl, score, 0); blob - 4cc96dd42a500ebab8b8e9ff2960267ac7a9f9d6 blob + 4aa11f45a8c02e1be7841246e2869206c0e8d163 --- src/cmd/venti/srv/zblock.c +++ src/cmd/venti/srv/zblock.c @@ -5,11 +5,13 @@ void fmtzbinit(Fmt *f, ZBlock *b) { - memset(f, 0, sizeof *f); - fmtlocaleinit(f, nil, nil, nil); + f->runes = 0; f->start = b->data; f->to = f->start; f->stop = (char*)f->start + b->len; + f->flush = nil; + f->farg = nil; + f->nfmt = 0; } #define ROUNDUP(p, n) ((void*)(((uintptr)(p)+(n)-1)&~(uintptr)((n)-1))) blob - 70c25b73d2d94f37f51d8a4679c2eb18bda496fa blob + 7602627cc5069e2a6423498579edbca3474210fd --- src/cmd/venti/srv/zeropart.c +++ src/cmd/venti/srv/zeropart.c @@ -10,10 +10,6 @@ zeropart(Part *part, int blocksize) int w; fprint(2, "clearing the partition\n"); -/*fprint(2, "NOT!\n"); */ -/*return; */ -/*b=alloczblock(MaxIoSize, 1, blocksize); */ -/*freezblock(b); */ b = alloczblock(MaxIoSize, 1, blocksize); w = 0;