2 * The locking here is getting a little out of hand.
/*
 * DCache declarations (fragment — the enclosing enum/struct braces are
 * outside this view). One global cache of fixed-size disk blocks, hashed
 * by (part, addr), with an LRU-ish heap of eviction victims.
 */
9 typedef struct DCache DCache;
/* hash-table geometry: HashSize is a power of two so HashMask works as a bit mask */
14 HashSize = 1<<HashLog,
15 HashMask = HashSize - 1,
21 RWLock dirtylock; /* must be held to inspect or set b->dirty */
27 DBlock *free; /* list of available lumps */
28 u32int now; /* ticks for usage timestamps */
29 int size; /* max. size of any block; allocated to each block */
30 DBlock **heads; /* hash table for finding address */
31 int nheap; /* number of available victims */
32 DBlock **heap; /* heap for locating victims */
33 int nblocks; /* number of blocks allocated */
34 DBlock *blocks; /* array of block descriptors */
35 DBlock **write; /* array of block pointers to be written */
36 u8int *mem; /* memory for all block descriptors */
37 int ndirty; /* number of dirty blocks */
38 int maxdirty; /* max. number of dirty blocks */
/*
 * Forward declarations for the static helpers below: heap maintenance
 * (downheap/upheap/fixheap/delheap), victim selection (bumpdblock),
 * and the background flush/write procs started by initdcache.
 */
43 static int downheap(int i, DBlock *b);
44 static int upheap(int i, DBlock *b);
45 static DBlock *bumpdblock(void);
46 static void delheap(DBlock *db);
47 static void fixheap(int i, DBlock *b);
48 static void _flushdcache(void);
49 static void flushproc(void*);
50 static void flushtimerproc(void*);
51 static void writeproc(void*);
/*
 * initdcache: size and allocate the global disk cache from `mem` bytes.
 * Fragment — some lines of the original body are not visible here.
 */
54 initdcache(u32int mem)
57 u32int nblocks, blocksize;
/* need room for at least two blocks or the cache is useless */
60 if(mem < maxblocksize * 2)
61 sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
63 sysfatal("no max. block size given for disk cache");
/* every slot is allocated at the largest block size any partition uses */
64 blocksize = maxblocksize;
65 nblocks = mem / blocksize;
/* all four rendezvous conditions share the single cache lock */
66 dcache.full.l = &dcache.lock;
67 dcache.flush.l = &dcache.lock;
68 dcache.anydirty.l = &dcache.lock;
69 dcache.flushdone.l = &dcache.lock;
70 dcache.nblocks = nblocks;
/* allow at most 3/4 of the cache to be dirty before writers block */
71 dcache.maxdirty = (nblocks * 3) / 4;
73 fprint(2, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
74 nblocks, blocksize, dcache.maxdirty);
75 dcache.size = blocksize;
76 dcache.heads = MKNZ(DBlock*, HashSize);
77 dcache.heap = MKNZ(DBlock*, nblocks);
78 dcache.blocks = MKNZ(DBlock, nblocks);
79 dcache.write = MKNZ(DBlock*, nblocks);
/* one contiguous slab; each descriptor points at its blocksize-sized slice */
80 dcache.mem = MKNZ(u8int, nblocks * blocksize);
83 for(i = 0; i < nblocks; i++){
84 b = &dcache.blocks[i];
85 b->data = &dcache.mem[i * blocksize];
/* buffered size 1 so writeproc can signal completion without blocking */
87 chaninit(&b->writedonechan, sizeof(void*), 1);
/* start the background flusher and its timer companion */
94 vtproc(flushproc, nil);
95 vtproc(flushtimerproc, nil);
/*
 * Multiplicative hash: multiply by a large odd constant and keep the top
 * HashLog bits. The line below it folds the 64-bit address into 32 bits
 * first — presumably this is the body of a pbhash()-style helper that then
 * applies hashit(h); the surrounding lines are not visible. TODO confirm.
 */
103 #define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
104 h = (addr >> 32) ^ addr;
/*
 * getdblock: return the cache block for (part, addr), reading it from disk
 * if `read` is set. Fragment — large parts of the body are missing, and the
 * tail (from the runlock line down) appears to belong to putdblock, the
 * release path; TODO confirm against the full file.
 */
109 getdblock(Part *part, u64int addr, int read)
114 size = part->blocksize;
/* a block bigger than the cache's slot size can never be cached */
115 if(size > dcache.size){
116 seterr(EAdmin, "block size %d too big for cache", size);
122 * look for the block in the cache
/* walk the hash chain for this address */
127 for(b = dcache.heads[h]; b != nil; b = b->next){
128 if(b->part == part && b->addr == addr){
131 qunlock(&stats.lock);
137 qunlock(&stats.lock);
140 * missed: locate the block with the oldest second to last use.
141 * remove it from the heap, and fix up the heap.
/* no victim available: wait for someone to release a block */
145 logerr(EAdmin, "all disk cache blocks in use");
146 rsleep(&dcache.full);
151 * the new block has no last use, so assume it happens sometime in the middle
152 ZZZ this is not reasonable
154 b->used = (b->used2 + dcache.now) / 2;
157 * rechain the block on the correct hash chain
159 b->next = dcache.heads[h];
/* bump the usage clock; TWID32 in b->heap means "not in the heap" */
172 b->used = dcache.now++;
173 if(b->heap != TWID32)
176 qunlock(&dcache.lock);
/* partial hit: zero-fill then read only the missing tail of the block */
183 memset(&b->data[b->size], 0, size - b->size);
185 if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
191 stats.pcbreads += size - b->size;
192 qunlock(&stats.lock);
/* NOTE(review): lines below look like putdblock's release path — verify */
209 runlock(&dcache.dirtylock);
/* last reference gone: make the block an eviction candidate again */
217 else if(--b->ref == 0){
218 if(b->heap == TWID32)
219 upheap(dcache.nheap++, b);
/* wake any getdblock callers stalled waiting for a free block */
220 rwakeupall(&dcache.full);
223 qunlock(&dcache.lock);
/*
 * dirtydblock: mark block b dirty with the given dirty type, taking the
 * read side of dirtylock so the flusher's wlock can quiesce all dirtiers.
 * Fragment — some lines of the original body are not visible here.
 */
228 dirtydblock(DBlock *b, int dirty)
237 * Because we use an rlock to keep track of how
238 * many blocks are being dirtied (and a wlock to
239 * stop dirtying), we cannot try to dirty two blocks
240 * at the same time in the same thread -- if the
241 * wlock happens between them, we get a deadlock.
243 * The only place in the code where we need to
244 * dirty multiple blocks at once is in splitiblock, when
245 * we split an index block. The new block has a dirty
246 * flag of DirtyIndexSplit already, because it has to get
247 * written to disk before the old block (DirtyIndex).
248 * So when we see the DirtyIndexSplit block, we don't
249 * acquire the rlock (and we don't set dirtying, so putdblock
250 * won't drop the rlock). This kludginess means that
251 * the caller will have to be sure to putdblock on the
252 * new block before putdblock on the old block.
254 if(dirty == DirtyIndexSplit){
255 /* caller should have another dirty block */
256 assert(!canwlock(&dcache.dirtylock));
257 /* this block should be clean */
258 assert(b->dirtying == 0);
259 assert(b->dirty == 0);
261 rlock(&dcache.dirtylock);
262 assert(b->dirtying == 0);
268 stats.absorbedwrites++;
269 stats.dirtydblocks++;
270 qunlock(&stats.lock);
273 * In general, shouldn't mark any block as more than one
274 * type, except that split index blocks are a subcase of index
275 * blocks. Only clean blocks ever get marked DirtyIndexSplit,
276 * though, so we don't need the opposite conjunction here.
280 assert(b->dirty == dirty
281 || (b->dirty==DirtyIndexSplit && dirty==DirtyIndex));
/* lazily start one writeproc per partition, the first time it gets dirty data */
286 if(p->writechan == nil){
287 fprint(2, "allocate write proc for part %s\n", p->name);
288 /* XXX hope this doesn't fail! */
289 p->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
290 vtproc(writeproc, p);
/* tell flushtimerproc there is now dirty data to age out */
295 rwakeupall(&dcache.anydirty);
297 qunlock(&dcache.lock);
/*
 * bumpdblock (fragment): pick a block to reuse — first from the free list,
 * otherwise evict the least-recently-used clean victim from the heap.
 * Several lines of the original body are not visible here.
 */
301 * remove some block from use and update the free list and counters
302 (free-list pop: — TODO confirm the assignment above this line)
312 dcache.free = b->next;
/* too much dirty data: force a flush rather than evict — presumably the
   branch body calls the flush path; not visible here */
316 if(dcache.ndirty >= dcache.maxdirty)
320 * remove blocks until we find one that is unused
321 * referenced blocks are left in the heap even though
322 * they can't be scavenged; this is simply a speed optimization
326 if(dcache.nheap == 0){
336 fprint(2, "bump %s at %llud\n", b->part->name, b->addr);
/* unchain the victim from its hash bucket; head and interior cases */
343 if(dcache.heads[h] != b)
344 sysfatal("bad hash chains in disk cache");
345 dcache.heads[h] = b->next;
347 b->prev->next = b->next;
349 b->next->prev = b->prev;
355 * delete an arbitrary block from the heap
/* TWID32 marks "not in heap": nothing to delete */
360 if(db->heap == TWID32)
/* move the last heap element into the hole, then restore heap order */
362 fixheap(db->heap, dcache.heap[--dcache.nheap]);
367 * push an element up or down to its correct new location
370 fixheap(int i, DBlock *b)
/* if upheap didn't move it, it may still need to sink: try downheap */
372 if(upheap(i, b) == i)
/*
 * upheap (fragment): sift b up from slot i until its parent is no older.
 * The `x - now` subtraction compares timestamps modulo 2^32 so the
 * ordering survives wraparound of the usage clock.
 */
377 upheap(int i, DBlock *b)
384 for(; i != 0; i = p){
387 if(b->used2 - now >= bb->used2 - now)
/*
 * downheap (fragment): sift b down from slot i toward the older child.
 * Same wraparound-safe `x - now` timestamp comparison as upheap.
 */
399 downheap(int i, DBlock *b)
408 if(k >= dcache.nheap)
/* pick the smaller (older) of the two children */
410 if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
412 if(b->used2 - now <= bb->used2 - now) is the stop condition below
413 if(b->used2 - now <= bb->used2 - now)
/*
 * findblock (fragment): consistency check — walk bb's hash chain verifying
 * the prev links, dying if the block is not where the table says it is.
 */
425 findblock(DBlock *bb)
431 h = pbhash(bb->addr);
432 for(b = dcache.heads[h]; b != nil; b = b->next){
434 sysfatal("bad prev link");
439 sysfatal("block missing from hash table");
/*
 * checkdcache (fragment): invariant audit of the whole cache — heap shape
 * and ordering, block/data correspondence, free-list sanity, and that
 * heap + free + referenced blocks account for every block.
 */
447 int i, k, refed, nfree;
/* every heap entry must know its own index and respect parent <= child */
452 for(i = 0; i < dcache.nheap; i++){
453 if(dcache.heap[i]->heap != i)
454 sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
455 if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
456 sysfatal("dc: bad heap ordering");
/* both children (left then right, per the two checks) must not be older */
458 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
459 sysfatal("dc: bad heap ordering");
461 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
462 sysfatal("dc: bad heap ordering");
/* each descriptor must still point at its fixed slice of dcache.mem */
466 for(i = 0; i < dcache.nblocks; i++){
467 b = &dcache.blocks[i];
468 if(b->data != &dcache.mem[i * size])
469 sysfatal("dc: mis-blocked at %d", i);
470 if(b->ref && b->heap == TWID32)
475 && dcache.heap[b->heap] != b)
476 sysfatal("dc: spurious heap value");
/* free blocks carry no address and are never in the heap */
480 for(b = dcache.free; b != nil; b = b->next){
481 if(b->addr != 0 || b->heap != TWID32)
482 sysfatal("dc: bad free list");
/* conservation: every block is in the heap, free, or referenced */
486 if(dcache.nheap + nfree + refed != dcache.nblocks)
487 sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
488 qunlock(&dcache.lock);
/*
 * flushdcache (fragment): kick the flusher and wait for the current round
 * to complete — wake flushproc, then sleep until flushround advances.
 * The lone rwakeupall at the end presumably belongs to _flushdcache, the
 * fire-and-forget variant; surrounding lines are not visible. TODO confirm.
 */
497 flushround = dcache.flushround;
498 rwakeupall(&dcache.flush);
499 while(flushround == dcache.flushround)
500 rsleep(&dcache.flushdone);
501 qunlock(&dcache.lock);
507 rwakeupall(&dcache.flush);
/*
 * parallelwrites (fragment): issue every block in [b, eb) with the given
 * dirty type to its partition's write proc, then collect the completions.
 * Both loops stop at the first block of a different dirty type, so the
 * caller must pass a run pre-sorted by dirty flag (see writeblockcmp).
 */
511 parallelwrites(DBlock **b, DBlock **eb, int dirty)
/* phase 1: queue all writes without waiting, one per partition channel */
515 for(p=b; p<eb && (*p)->dirty == dirty; p++)
516 sendp((*p)->part->writechan, *p);
/* phase 2: rendezvous with each block's completion channel, in order */
517 for(p=b; p<eb && (*p)->dirty == dirty; p++)
518 recvp(&(*p)->writedonechan);
524 * Sort first by dirty flag, then by partition, then by address in partition.
527 writeblockcmp(const void *va, const void *vb)
534 if(a->dirty != b->dirty)
535 return a->dirty - b->dirty;
/* NOTE(review): relational comparison of unrelated Part pointers is not
   strictly portable C, but only a consistent total order is needed here */
536 if(a->part != b->part){
537 if(a->part < b->part)
539 if(a->part > b->part)
542 if(a->addr < b->addr)
/*
 * flushtimerproc (fragment): background timer — wait until something is
 * dirty, then (after a delay not visible here) trigger a flush round if
 * no one else has started one in the meantime.
 */
548 flushtimerproc(void *v)
/* sleep until dirtydblock signals dirty data exists */
554 while(dcache.ndirty == 0)
555 rsleep(&dcache.anydirty);
556 round = dcache.flushround;
557 qunlock(&dcache.lock);
/* only fire if no flush round has happened since we sampled `round` */
562 if(round == dcache.flushround){
563 rwakeupall(&dcache.flush);
564 while(round == dcache.flushround)
565 rsleep(&dcache.flushdone);
567 qunlock(&dcache.lock);
/*
 * flushproc (fragment): the flush worker loop. Each round: announce the
 * previous round is done, sleep until kicked, quiesce dirtiers, collect
 * every dirty block, sort, write out in dirty-type stages, then clear
 * dirty bits and return freed blocks to the victim heap.
 */
582 rwakeupall(&dcache.flushdone);
583 rsleep(&dcache.flush);
584 qunlock(&dcache.lock);
586 fprint(2, "flushing dcache: t=0 ms\n");
590 * Because we don't record any dependencies at all, we must write out
591 * all blocks currently dirty. Thus we must lock all the blocks that
592 * are currently dirty.
594 * We grab dirtylock to stop the dirtying of new blocks.
595 * Then we wait until all the current blocks finish being dirtied.
596 * Now all the dirty blocks in the system are immutable (clean blocks
597 * might still get recycled), so we can plan our disk writes.
599 * In a better scheme, dirtiers might lock the block for writing in getdblock,
600 * so that flushproc could lock all the blocks here and then unlock them as it
601 * finishes with them.
604 fprint(2, "flushproc: wlock at t=%lud\n", (ulong)(nsec()/1000) - t0);
605 wlock(&dcache.dirtylock);
/* snapshot every dirty block into the preallocated write array */
607 fprint(2, "flushproc: build list at t=%lud\n", (ulong)(nsec()/1000) - t0);
608 write = dcache.write;
610 for(i=0; i<dcache.nblocks; i++){
611 b = &dcache.blocks[i];
/* sort by dirty type, then partition, then address (see writeblockcmp) */
616 qsort(write, n, sizeof(write[0]), writeblockcmp);
618 /* Write each stage of blocks out. */
619 fprint(2, "flushproc: write blocks at t=%lud\n", (ulong)(nsec()/1000) - t0);
/* stages are ordered: e.g. DirtyIndexSplit blocks must hit disk before
   DirtyIndex ones — presumably the DirtyMax enum encodes that order */
621 for(j=1; j<DirtyMax; j++){
622 fprint(2, "flushproc: flush dirty %d at t=%lud\n", j, (ulong)(nsec()/1000) - t0);
623 i += parallelwrites(write+i, write+n, j);
627 fprint(2, "flushproc: update dirty bits at t=%lud\n", (ulong)(nsec()/1000) - t0);
/* freshly-cleaned, unreferenced blocks become eviction candidates again */
633 if(b->ref == 0 && b->heap == TWID32){
634 upheap(dcache.nheap++, b);
635 rwakeupall(&dcache.full);
638 qunlock(&dcache.lock);
639 wunlock(&dcache.dirtylock);
641 fprint(2, "flushproc: done at t=%lud\n", (ulong)(nsec()/1000) - t0);
644 stats.dcacheflushes++;
645 stats.dcacheflushwrites += n;
646 qunlock(&stats.lock);
/*
 * writeproc (fragment): per-partition writer loop — receive a block from
 * the partition's write channel, write it to disk, and signal completion
 * on the block's done channel so parallelwrites can collect it.
 * NOTE(review): a failed writepart is only logged; the block is still
 * reported done, so the caller cannot distinguish success from failure.
 */
659 b = recvp(p->writechan);
660 if(writepart(p, b->addr, b->data, b->size) < 0)
661 fprint(2, "write error: %r\n"); /* XXX details! */
662 sendp(&b->writedonechan, b);