/*
 * Disk cache.
 *
 * Caches raw disk blocks. Getdblock() gets a block, putdblock puts it back.
 * Getdblock has a mode parameter that determines i/o and access to a block:
 * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
 * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
 * It is *not* marked dirty -- once changes have been made, they should be noted
 * by using dirtydblock() before putdblock().
 *
 * There is a global cache lock as well as a lock on each block.
 * Within a thread, the cache lock can be acquired while holding a block lock,
 * but not vice versa; and a block cannot be locked if you already hold the lock
 * on another block.
 *
 * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
 * For example, the DirtyArena blocks are all written to disk before any of the
 * DirtyArenaCib blocks.
 *
 * This code used to be in charge of flushing the dirty index blocks out to
 * disk, but updating the index turned out to benefit from extra care.
 * Now cached index blocks are never marked dirty. The index.c code takes
 * care of updating them behind our back, and uses _getdblock to update any
 * cached copies of the blocks as it changes them on disk.
 */
27 #include "stdinc.h"
28 #include "dat.h"
29 #include "fns.h"
typedef struct DCache DCache;

/* hash-table geometry: 512 buckets, masked hash */
enum
{
	HashLog		= 9,
	HashSize	= 1<<HashLog,
	HashMask	= HashSize - 1,
};
40 struct DCache
41 {
42 QLock lock;
43 RWLock dirtylock; /* must be held to inspect or set b->dirty */
44 Rendez full;
45 Round round;
46 DBlock *free; /* list of available lumps */
47 u32int now; /* ticks for usage timestamps */
48 int size; /* max. size of any block; allocated to each block */
49 DBlock **heads; /* hash table for finding address */
50 int nheap; /* number of available victims */
51 DBlock **heap; /* heap for locating victims */
52 int nblocks; /* number of blocks allocated */
53 DBlock *blocks; /* array of block descriptors */
54 DBlock **write; /* array of block pointers to be written */
55 u8int *mem; /* memory for all block descriptors */
56 int ndirty; /* number of dirty blocks */
57 int maxdirty; /* max. number of dirty blocks */
58 Channel *ra;
59 u8int *rabuf;
60 u32int ramax;
61 u32int rasize;
62 u64int raaddr;
63 Part *rapart;
65 AState diskstate;
66 AState state;
67 };
69 typedef struct Ra Ra;
70 struct Ra
71 {
72 Part *part;
73 u64int addr;
74 };
76 static DCache dcache;
78 static int downheap(int i, DBlock *b);
79 static int upheap(int i, DBlock *b);
80 static DBlock *bumpdblock(void);
81 static void delheap(DBlock *db);
82 static void fixheap(int i, DBlock *b);
83 static void flushproc(void*);
84 static void writeproc(void*);
85 static void raproc(void*);
87 void
88 initdcache(u32int mem)
89 {
90 DBlock *b, *last;
91 u32int nblocks, blocksize;
92 int i;
93 u8int *p;
95 if(mem < maxblocksize * 2)
96 sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
97 if(maxblocksize == 0)
98 sysfatal("no max. block size given for disk cache");
99 blocksize = maxblocksize;
100 nblocks = mem / blocksize;
101 dcache.full.l = &dcache.lock;
102 dcache.nblocks = nblocks;
103 dcache.maxdirty = (nblocks * 2) / 3;
104 trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
105 nblocks, blocksize, dcache.maxdirty);
106 dcache.size = blocksize;
107 dcache.heads = MKNZ(DBlock*, HashSize);
108 dcache.heap = MKNZ(DBlock*, nblocks);
109 dcache.blocks = MKNZ(DBlock, nblocks);
110 dcache.write = MKNZ(DBlock*, nblocks);
111 dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);
112 dcache.ra = chancreate(sizeof(Ra), 0);
114 last = nil;
115 p = (u8int*)(((ulong)dcache.mem+blocksize-1)&~(ulong)(blocksize-1));
116 for(i = 0; i < nblocks; i++){
117 b = &dcache.blocks[i];
118 b->data = &p[i * blocksize];
119 b->heap = TWID32;
120 b->writedonechan = chancreate(sizeof(void*), 1);
121 b->next = last;
122 last = b;
124 dcache.rabuf = &p[i*blocksize];
125 dcache.ramax = 128*blocksize;
126 dcache.raaddr = 0;
127 dcache.rapart = nil;
129 dcache.free = last;
130 dcache.nheap = 0;
131 setstat(StatDcacheSize, nblocks);
132 initround(&dcache.round, "dcache", 120*1000);
134 vtproc(flushproc, nil);
135 vtproc(delaykickroundproc, &dcache.round);
136 vtproc(raproc, nil);
139 void
140 setdcachestate(AState *a)
142 trace(TraceBlock, "setdcachestate %s 0x%llux clumps %d", a->arena ? a->arena->name : nil, a->aa, a->stats.clumps);
143 qlock(&dcache.lock);
144 dcache.state = *a;
145 qunlock(&dcache.lock);
148 AState
149 diskstate(void)
151 AState a;
153 qlock(&dcache.lock);
154 a = dcache.diskstate;
155 qunlock(&dcache.lock);
156 return a;
159 static void
160 raproc(void *v)
162 Ra ra;
163 DBlock *b;
165 USED(v);
166 while(recv(dcache.ra, &ra) == 1){
167 if(ra.part->size <= ra.addr)
168 continue;
169 b = _getdblock(ra.part, ra.addr, OREAD, 2);
170 putdblock(b);
174 void
175 dreadahead(Part *part, u64int addr, int miss)
177 Ra ra;
178 static struct {
179 Part *part;
180 u64int addr;
181 } lastmiss;
182 static struct {
183 Part *part;
184 u64int addr;
185 int dir;
186 } lastra;
188 return;
189 if(miss){
190 if(lastmiss.part==part && lastmiss.addr==addr-dcache.size){
191 XRa:
192 lastra.part = part;
193 lastra.dir = addr-lastmiss.addr;
194 lastra.addr = addr+lastra.dir;
195 ra.part = part;
196 ra.addr = lastra.addr;
197 nbsend(dcache.ra, &ra);
198 }else if(lastmiss.part==part && lastmiss.addr==addr+dcache.size){
199 addr -= dcache.size;
200 goto XRa;
202 }else{
203 if(lastra.part==part && lastra.addr==addr){
204 lastra.addr += lastra.dir;
205 ra.part = part;
206 ra.addr = lastra.addr;
207 nbsend(dcache.ra, &ra);
211 if(miss){
212 lastmiss.part = part;
213 lastmiss.addr = addr;
216 // fprint(2, "%s %llx %s\n", part->name, addr, miss ? "miss" : "hit");
219 int
220 rareadpart(Part *part, u64int addr, u8int *buf, uint n, int load)
222 uint nn;
223 static RWLock ralock;
225 rlock(&ralock);
226 if(dcache.rapart==part && dcache.raaddr <= addr && addr+n <= dcache.raaddr+dcache.rasize){
227 memmove(buf, dcache.rabuf+(addr-dcache.raaddr), n);
228 runlock(&ralock);
229 return 0;
231 if(load != 2 || addr >= part->size){ /* addr >= part->size: let readpart do the error */
232 runlock(&ralock);
233 return readpart(part, addr, buf, n);
236 runlock(&ralock);
237 wlock(&ralock);
238 fprint(2, "raread %s %llx\n", part->name, addr);
239 nn = dcache.ramax;
240 if(addr+nn > part->size)
241 nn = part->size - addr;
242 if(readpart(part, addr, dcache.rabuf, nn) < 0){
243 wunlock(&ralock);
244 return -1;
246 memmove(buf, dcache.rabuf, n);
247 dcache.rapart = part;
248 dcache.rasize = nn;
249 dcache.raaddr = addr;
250 wunlock(&ralock);
252 addstat(StatApartReadBytes, nn-n);
253 return 0;
256 static u32int
257 pbhash(u64int addr)
259 u32int h;
261 #define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
262 h = (addr >> 32) ^ addr;
263 return hashit(h);
266 DBlock*
267 getdblock(Part *part, u64int addr, int mode)
269 DBlock *b;
270 uint ms;
272 ms = msec();
273 b = _getdblock(part, addr, mode, 1);
274 if(mode == OREAD || mode == ORDWR)
275 addstat(StatDcacheRead, 1);
276 if(mode == OWRITE || mode == ORDWR)
277 addstat(StatDcacheWrite, 1);
278 ms = msec() - ms;
279 addstat2(StatDcacheLookup, 1, StatDcacheLookupTime, ms);
280 return b;
283 DBlock*
284 _getdblock(Part *part, u64int addr, int mode, int load)
286 DBlock *b;
287 u32int h, size;
289 trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
290 size = part->blocksize;
291 if(size > dcache.size){
292 seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
293 return nil;
295 h = pbhash(addr);
297 /*
298 * look for the block in the cache
299 */
300 //checkdcache();
301 qlock(&dcache.lock);
302 again:
303 for(b = dcache.heads[h]; b != nil; b = b->next){
304 if(b->part == part && b->addr == addr){
305 /*
306 qlock(&stats.lock);
307 stats.pchit++;
308 qunlock(&stats.lock);
309 */
310 if(load){
311 addstat(StatDcacheHit, 1);
312 if(load != 2 && mode != OWRITE)
313 dreadahead(part, b->addr, 0);
315 goto found;
319 /*
320 * missed: locate the block with the oldest second to last use.
321 * remove it from the heap, and fix up the heap.
322 */
323 if(!load){
324 qunlock(&dcache.lock);
325 return nil;
328 addstat(StatDcacheMiss, 1);
330 b = bumpdblock();
331 if(b == nil){
332 trace(TraceBlock, "all disk cache blocks in use");
333 addstat(StatDcacheStall, 1);
334 rsleep(&dcache.full);
335 addstat(StatDcacheStall, -1);
336 goto again;
339 assert(!b->dirty);
341 /*
342 * the new block has no last use, so assume it happens sometime in the middle
343 ZZZ this is not reasonable
344 */
345 b->used = (b->used2 + dcache.now) / 2;
347 /*
348 * rechain the block on the correct hash chain
349 */
350 b->next = dcache.heads[h];
351 dcache.heads[h] = b;
352 if(b->next != nil)
353 b->next->prev = b;
354 b->prev = nil;
356 b->addr = addr;
357 b->part = part;
358 b->size = 0;
359 if(load != 2 && mode != OWRITE)
360 dreadahead(part, b->addr, 1);
362 found:
363 b->ref++;
364 b->used2 = b->used;
365 b->used = dcache.now++;
366 if(b->heap != TWID32)
367 fixheap(b->heap, b);
369 qunlock(&dcache.lock);
370 //checkdcache();
372 trace(TraceBlock, "getdblock lock");
373 addstat(StatDblockStall, 1);
374 if(mode == OREAD)
375 rlock(&b->lock);
376 else
377 wlock(&b->lock);
378 addstat(StatDblockStall, -1);
379 trace(TraceBlock, "getdblock locked");
381 if(b->size != size){
382 if(mode == OREAD){
383 addstat(StatDblockStall, 1);
384 runlock(&b->lock);
385 wlock(&b->lock);
386 addstat(StatDblockStall, -1);
388 if(b->size < size){
389 if(mode == OWRITE)
390 memset(&b->data[b->size], 0, size - b->size);
391 else{
392 trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
393 if(rareadpart(part, addr + b->size, &b->data[b->size], size - b->size, load) < 0){
394 b->mode = ORDWR; /* so putdblock wunlocks */
395 putdblock(b);
396 return nil;
398 trace(TraceBlock, "getdblock readpartdone");
399 addstat(StatApartRead, 1);
400 addstat(StatApartReadBytes, size-b->size);
403 b->size = size;
404 if(mode == OREAD){
405 addstat(StatDblockStall, 1);
406 wunlock(&b->lock);
407 rlock(&b->lock);
408 addstat(StatDblockStall, -1);
412 b->mode = mode;
413 trace(TraceBlock, "getdblock exit");
414 return b;
417 void
418 putdblock(DBlock *b)
420 if(b == nil)
421 return;
423 trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);
425 if(b->mode == OREAD)
426 runlock(&b->lock);
427 else
428 wunlock(&b->lock);
430 //checkdcache();
431 qlock(&dcache.lock);
432 if(--b->ref == 0 && !b->dirty){
433 if(b->heap == TWID32)
434 upheap(dcache.nheap++, b);
435 rwakeupall(&dcache.full);
437 qunlock(&dcache.lock);
438 //checkdcache();
441 void
442 dirtydblock(DBlock *b, int dirty)
444 int odirty;
445 Part *p;
448 trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux", b->part->name, b->addr, dirty, getcallerpc(&b));
449 assert(b->ref != 0);
450 assert(b->mode==ORDWR || b->mode==OWRITE);
452 odirty = b->dirty;
453 if(b->dirty)
454 assert(b->dirty == dirty);
455 else
456 b->dirty = dirty;
458 p = b->part;
459 if(p->writechan == nil){
460 trace(TraceBlock, "dirtydblock allocwriteproc %s", p->name);
461 /* XXX hope this doesn't fail! */
462 p->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
463 vtproc(writeproc, p);
465 qlock(&dcache.lock);
466 if(!odirty){
467 dcache.ndirty++;
468 setstat(StatDcacheDirty, dcache.ndirty);
469 if(dcache.ndirty >= dcache.maxdirty)
470 kickround(&dcache.round, 0);
471 else
472 delaykickround(&dcache.round);
474 qunlock(&dcache.lock);
477 /*
478 * remove some block from use and update the free list and counters
479 */
480 static DBlock*
481 bumpdblock(void)
483 DBlock *b;
484 ulong h;
486 trace(TraceBlock, "bumpdblock enter");
487 b = dcache.free;
488 if(b != nil){
489 dcache.free = b->next;
490 return b;
493 if(dcache.ndirty >= dcache.maxdirty)
494 kickdcache();
496 /*
497 * remove blocks until we find one that is unused
498 * referenced blocks are left in the heap even though
499 * they can't be scavenged; this is simple a speed optimization
500 */
501 for(;;){
502 if(dcache.nheap == 0){
503 kickdcache();
504 trace(TraceBlock, "bumpdblock gotnothing");
505 return nil;
507 b = dcache.heap[0];
508 delheap(b);
509 if(!b->ref && !b->dirty)
510 break;
513 trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
515 /*
516 * unchain the block
517 */
518 if(b->prev == nil){
519 h = pbhash(b->addr);
520 if(dcache.heads[h] != b)
521 sysfatal("bad hash chains in disk cache");
522 dcache.heads[h] = b->next;
523 }else
524 b->prev->next = b->next;
525 if(b->next != nil)
526 b->next->prev = b->prev;
528 return b;
531 /*
532 * delete an arbitrary block from the heap
533 */
534 static void
535 delheap(DBlock *db)
537 if(db->heap == TWID32)
538 return;
539 fixheap(db->heap, dcache.heap[--dcache.nheap]);
540 db->heap = TWID32;
543 /*
544 * push an element up or down to it's correct new location
545 */
546 static void
547 fixheap(int i, DBlock *b)
549 if(upheap(i, b) == i)
550 downheap(i, b);
553 static int
554 upheap(int i, DBlock *b)
556 DBlock *bb;
557 u32int now;
558 int p;
560 now = dcache.now;
561 for(; i != 0; i = p){
562 p = (i - 1) >> 1;
563 bb = dcache.heap[p];
564 if(b->used2 - now >= bb->used2 - now)
565 break;
566 dcache.heap[i] = bb;
567 bb->heap = i;
570 dcache.heap[i] = b;
571 b->heap = i;
572 return i;
575 static int
576 downheap(int i, DBlock *b)
578 DBlock *bb;
579 u32int now;
580 int k;
582 now = dcache.now;
583 for(; ; i = k){
584 k = (i << 1) + 1;
585 if(k >= dcache.nheap)
586 break;
587 if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
588 k++;
589 bb = dcache.heap[k];
590 if(b->used2 - now <= bb->used2 - now)
591 break;
592 dcache.heap[i] = bb;
593 bb->heap = i;
596 dcache.heap[i] = b;
597 b->heap = i;
598 return i;
601 static void
602 findblock(DBlock *bb)
604 DBlock *b, *last;
605 int h;
607 last = nil;
608 h = pbhash(bb->addr);
609 for(b = dcache.heads[h]; b != nil; b = b->next){
610 if(last != b->prev)
611 sysfatal("bad prev link");
612 if(b == bb)
613 return;
614 last = b;
616 sysfatal("block missing from hash table");
619 void
620 checkdcache(void)
622 DBlock *b;
623 u32int size, now;
624 int i, k, refed, nfree;
626 qlock(&dcache.lock);
627 size = dcache.size;
628 now = dcache.now;
629 for(i = 0; i < dcache.nheap; i++){
630 if(dcache.heap[i]->heap != i)
631 sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
632 if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
633 sysfatal("dc: bad heap ordering");
634 k = (i << 1) + 1;
635 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
636 sysfatal("dc: bad heap ordering");
637 k++;
638 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
639 sysfatal("dc: bad heap ordering");
642 refed = 0;
643 for(i = 0; i < dcache.nblocks; i++){
644 b = &dcache.blocks[i];
645 if(b->data != &dcache.mem[i * size])
646 sysfatal("dc: mis-blocked at %d", i);
647 if(b->ref && b->heap == TWID32)
648 refed++;
649 if(b->addr)
650 findblock(b);
651 if(b->heap != TWID32
652 && dcache.heap[b->heap] != b)
653 sysfatal("dc: spurious heap value");
656 nfree = 0;
657 for(b = dcache.free; b != nil; b = b->next){
658 if(b->addr != 0 || b->heap != TWID32)
659 sysfatal("dc: bad free list");
660 nfree++;
663 if(dcache.nheap + nfree + refed != dcache.nblocks)
664 sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
665 qunlock(&dcache.lock);
668 void
669 flushdcache(void)
671 trace(TraceProc, "flushdcache enter");
672 kickround(&dcache.round, 1);
673 trace(TraceProc, "flushdcache exit");
676 void
677 kickdcache(void)
679 kickround(&dcache.round, 0);
682 static int
683 parallelwrites(DBlock **b, DBlock **eb, int dirty)
685 DBlock **p, **q;
686 for(p=b; p<eb && (*p)->dirty == dirty; p++){
687 assert(b<=p && p<eb);
688 sendp((*p)->part->writechan, *p);
690 q = p;
691 for(p=b; p<q; p++){
692 assert(b<=p && p<eb);
693 recvp((*p)->writedonechan);
696 return p-b;
699 /*
700 * Sort first by dirty flag, then by partition, then by address in partition.
701 */
702 static int
703 writeblockcmp(const void *va, const void *vb)
705 DBlock *a, *b;
707 a = *(DBlock**)va;
708 b = *(DBlock**)vb;
710 if(a->dirty != b->dirty)
711 return a->dirty - b->dirty;
712 if(a->part != b->part){
713 if(a->part < b->part)
714 return -1;
715 if(a->part > b->part)
716 return 1;
718 if(a->addr < b->addr)
719 return -1;
720 return 1;
723 static void
724 flushproc(void *v)
726 int i, j, n;
727 ulong t0;
728 DBlock *b, **write;
729 AState as;
731 USED(v);
732 threadsetname("flushproc");
733 for(;;){
734 waitforkick(&dcache.round);
736 trace(TraceWork, "start");
737 qlock(&dcache.lock);
738 as = dcache.state;
739 qunlock(&dcache.lock);
741 t0 = nsec()/1000;
743 trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
744 write = dcache.write;
745 n = 0;
746 for(i=0; i<dcache.nblocks; i++){
747 b = &dcache.blocks[i];
748 if(b->dirty)
749 write[n++] = b;
752 qsort(write, n, sizeof(write[0]), writeblockcmp);
754 /* Write each stage of blocks out. */
755 trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
756 i = 0;
757 for(j=1; j<DirtyMax; j++){
758 trace(TraceProc, "writeblocks.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
759 i += parallelwrites(write+i, write+n, j);
761 if(i != n){
762 fprint(2, "in flushproc i=%d n=%d\n", i, n);
763 for(i=0; i<n; i++)
764 fprint(2, "\tblock %d: dirty=%d\n", i, write[i]->dirty);
765 abort();
768 /* XXX
769 * the locking here is suspect. what if a block is redirtied
770 * after the write happens? we'll still decrement dcache.ndirty here.
771 */
772 trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
773 qlock(&dcache.lock);
774 dcache.diskstate = as;
775 for(i=0; i<n; i++){
776 b = write[i];
777 --dcache.ndirty;
778 if(b->ref == 0 && b->heap == TWID32){
779 upheap(dcache.nheap++, b);
780 rwakeupall(&dcache.full);
783 setstat(StatDcacheDirty, dcache.ndirty);
784 qunlock(&dcache.lock);
785 addstat(StatDcacheFlush, 1);
786 trace(TraceWork, "finish");
790 static void
791 writeproc(void *v)
793 DBlock *b;
794 Part *p;
796 p = v;
798 threadsetname("writeproc:%s", p->name);
799 for(;;){
800 b = recvp(p->writechan);
801 trace(TraceWork, "start");
802 assert(b->part == p);
803 trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
804 wlock(&b->lock);
805 trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
806 if(writepart(p, b->addr, b->data, b->size) < 0)
807 fprint(2, "write error: %r\n"); /* XXX details! */
808 addstat(StatApartWrite, 1);
809 addstat(StatApartWriteBytes, b->size);
810 b->dirty = 0;
811 wunlock(&b->lock);
812 trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
813 trace(TraceWork, "finish");
814 sendp(b->writedonechan, b);