Blob


/*
 * Disk cache.
 *
 * Caches raw disk blocks. Getdblock() gets a block, putdblock puts it back.
 * Getdblock has a mode parameter that determines i/o and access to a block:
 * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
 * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
 * It is *not* marked dirty -- once changes have been made, they should be noted
 * by using dirtydblock() before putdblock().
 *
 * There is a global cache lock as well as a lock on each block.
 * Within a thread, the cache lock can be acquired while holding a block lock,
 * but not vice versa; and a block cannot be locked if you already hold the lock
 * on another block.
 *
 * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
 * For example, the DirtyArena blocks are all written to disk before any of the
 * DirtyArenaCib blocks.
 *
 * This code used to be in charge of flushing the dirty index blocks out to
 * disk, but updating the index turned out to benefit from extra care.
 * Now cached index blocks are never marked dirty. The index.c code takes
 * care of updating them behind our back, and uses _getdblock to update any
 * cached copies of the blocks as it changes them on disk.
 */
#include "stdinc.h"
#include "dat.h"
#include "fns.h"
typedef struct DCache DCache;

/* Hash-table geometry for the address lookup in DCache.heads. */
enum
{
	HashLog		= 9,		/* log2 of the number of hash buckets */
	HashSize	= 1<<HashLog,
	HashMask	= HashSize - 1,
};
40 struct DCache
41 {
42 QLock lock;
43 RWLock dirtylock; /* must be held to inspect or set b->dirty */
44 Rendez full;
45 Round round;
46 DBlock *free; /* list of available lumps */
47 u32int now; /* ticks for usage timestamps */
48 int size; /* max. size of any block; allocated to each block */
49 DBlock **heads; /* hash table for finding address */
50 int nheap; /* number of available victims */
51 DBlock **heap; /* heap for locating victims */
52 int nblocks; /* number of blocks allocated */
53 DBlock *blocks; /* array of block descriptors */
54 DBlock **write; /* array of block pointers to be written */
55 u8int *mem; /* memory for all block descriptors */
56 int ndirty; /* number of dirty blocks */
57 int maxdirty; /* max. number of dirty blocks */
58 };
60 typedef struct Ra Ra;
61 struct Ra
62 {
63 Part *part;
64 u64int addr;
65 };
67 static DCache dcache;
69 static int downheap(int i, DBlock *b);
70 static int upheap(int i, DBlock *b);
71 static DBlock *bumpdblock(void);
72 static void delheap(DBlock *db);
73 static void fixheap(int i, DBlock *b);
74 static void flushproc(void*);
75 static void writeproc(void*);
77 void
78 initdcache(u32int mem)
79 {
80 DBlock *b, *last;
81 u32int nblocks, blocksize;
82 int i;
83 u8int *p;
85 if(mem < maxblocksize * 2)
86 sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
87 if(maxblocksize == 0)
88 sysfatal("no max. block size given for disk cache");
89 blocksize = maxblocksize;
90 nblocks = mem / blocksize;
91 dcache.full.l = &dcache.lock;
92 dcache.nblocks = nblocks;
93 dcache.maxdirty = (nblocks * 2) / 3;
94 trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
95 nblocks, blocksize, dcache.maxdirty);
96 dcache.size = blocksize;
97 dcache.heads = MKNZ(DBlock*, HashSize);
98 dcache.heap = MKNZ(DBlock*, nblocks);
99 dcache.blocks = MKNZ(DBlock, nblocks);
100 dcache.write = MKNZ(DBlock*, nblocks);
101 dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);
103 last = nil;
104 p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
105 for(i = 0; i < nblocks; i++){
106 b = &dcache.blocks[i];
107 b->data = &p[i * blocksize];
108 b->heap = TWID32;
109 b->writedonechan = chancreate(sizeof(void*), 1);
110 b->next = last;
111 last = b;
114 dcache.free = last;
115 dcache.nheap = 0;
116 setstat(StatDcacheSize, nblocks);
117 initround(&dcache.round, "dcache", 120*1000);
119 vtproc(flushproc, nil);
120 vtproc(delaykickroundproc, &dcache.round);
123 static u32int
124 pbhash(u64int addr)
126 u32int h;
128 #define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
129 h = (addr >> 32) ^ addr;
130 return hashit(h);
133 DBlock*
134 getdblock(Part *part, u64int addr, int mode)
136 DBlock *b;
138 b = _getdblock(part, addr, mode, 1);
139 if(mode == OREAD || mode == ORDWR)
140 addstat(StatDcacheRead, 1);
141 if(mode == OWRITE || mode == ORDWR)
142 addstat(StatDcacheWrite, 1);
143 return b;
146 DBlock*
147 _getdblock(Part *part, u64int addr, int mode, int load)
149 DBlock *b;
150 u32int h, size, ms;
152 ms = 0;
153 trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
154 size = part->blocksize;
155 if(size > dcache.size){
156 seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
157 if(load)
158 addstat(StatDcacheLookup, 1);
159 return nil;
161 h = pbhash(addr);
163 /*
164 * look for the block in the cache
165 */
166 qlock(&dcache.lock);
167 again:
168 for(b = dcache.heads[h]; b != nil; b = b->next){
169 if(b->part == part && b->addr == addr){
170 if(load)
171 addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
172 goto found;
176 /*
177 * missed: locate the block with the oldest second to last use.
178 * remove it from the heap, and fix up the heap.
179 */
180 if(!load){
181 qunlock(&dcache.lock);
182 return nil;
185 /*
186 * Only start timer here, on cache miss - calling msec() on plain cache hits
187 * makes cache hits system-call bound.
188 */
189 ms = msec();
190 addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);
192 b = bumpdblock();
193 if(b == nil){
194 trace(TraceBlock, "all disk cache blocks in use");
195 addstat(StatDcacheStall, 1);
196 rsleep(&dcache.full);
197 addstat(StatDcacheStall, -1);
198 goto again;
201 assert(!b->dirty);
203 /*
204 * the new block has no last use, so assume it happens sometime in the middle
205 ZZZ this is not reasonable
206 */
207 b->used = (b->used2 + dcache.now) / 2;
209 /*
210 * rechain the block on the correct hash chain
211 */
212 b->next = dcache.heads[h];
213 dcache.heads[h] = b;
214 if(b->next != nil)
215 b->next->prev = b;
216 b->prev = nil;
218 b->addr = addr;
219 b->part = part;
220 b->size = 0;
222 found:
223 b->ref++;
224 b->used2 = b->used;
225 b->used = dcache.now++;
226 if(b->heap != TWID32)
227 fixheap(b->heap, b);
229 if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
230 trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
231 part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
232 vtproc(writeproc, part);
234 qunlock(&dcache.lock);
236 trace(TraceBlock, "getdblock lock");
237 addstat(StatDblockStall, 1);
238 if(mode == OREAD)
239 rlock(&b->lock);
240 else
241 wlock(&b->lock);
242 addstat(StatDblockStall, -1);
243 trace(TraceBlock, "getdblock locked");
245 if(b->size != size){
246 if(mode == OREAD){
247 addstat(StatDblockStall, 1);
248 runlock(&b->lock);
249 wlock(&b->lock);
250 addstat(StatDblockStall, -1);
252 if(b->size < size){
253 if(mode == OWRITE)
254 memset(&b->data[b->size], 0, size - b->size);
255 else{
256 trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
257 diskaccess(0);
258 if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
259 b->mode = ORDWR; /* so putdblock wunlocks */
260 putdblock(b);
261 return nil;
263 trace(TraceBlock, "getdblock readpartdone");
264 addstat(StatApartRead, 1);
265 addstat(StatApartReadBytes, size-b->size);
268 b->size = size;
269 if(mode == OREAD){
270 addstat(StatDblockStall, 1);
271 wunlock(&b->lock);
272 rlock(&b->lock);
273 addstat(StatDblockStall, -1);
277 b->mode = mode;
278 trace(TraceBlock, "getdblock exit");
279 if(ms)
280 addstat(StatDcacheLookupTime, msec() - ms);
281 return b;
284 void
285 putdblock(DBlock *b)
287 if(b == nil)
288 return;
290 trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);
292 if(b->mode == OREAD)
293 runlock(&b->lock);
294 else
295 wunlock(&b->lock);
297 qlock(&dcache.lock);
298 if(--b->ref == 0 && !b->dirty){
299 if(b->heap == TWID32)
300 upheap(dcache.nheap++, b);
301 rwakeupall(&dcache.full);
303 qunlock(&dcache.lock);
306 void
307 dirtydblock(DBlock *b, int dirty)
309 int odirty;
311 trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
312 b->part->name, b->addr, dirty, getcallerpc(&b));
313 assert(b->ref != 0);
314 assert(b->mode==ORDWR || b->mode==OWRITE);
316 odirty = b->dirty;
317 if(b->dirty)
318 assert(b->dirty == dirty);
319 else
320 b->dirty = dirty;
322 qlock(&dcache.lock);
323 if(!odirty){
324 dcache.ndirty++;
325 setstat(StatDcacheDirty, dcache.ndirty);
326 if(dcache.ndirty >= dcache.maxdirty)
327 kickround(&dcache.round, 0);
328 else
329 delaykickround(&dcache.round);
331 qunlock(&dcache.lock);
334 static void
335 unchain(DBlock *b)
337 ulong h;
339 /*
340 * unchain the block
341 */
342 if(b->prev == nil){
343 h = pbhash(b->addr);
344 if(dcache.heads[h] != b)
345 sysfatal("bad hash chains in disk cache");
346 dcache.heads[h] = b->next;
347 }else
348 b->prev->next = b->next;
349 if(b->next != nil)
350 b->next->prev = b->prev;
353 /*
354 * remove some block from use and update the free list and counters
355 */
356 static DBlock*
357 bumpdblock(void)
359 DBlock *b;
361 trace(TraceBlock, "bumpdblock enter");
362 b = dcache.free;
363 if(b != nil){
364 dcache.free = b->next;
365 return b;
368 if(dcache.ndirty >= dcache.maxdirty)
369 kickdcache();
371 /*
372 * remove blocks until we find one that is unused
373 * referenced blocks are left in the heap even though
374 * they can't be scavenged; this is simple a speed optimization
375 */
376 for(;;){
377 if(dcache.nheap == 0){
378 kickdcache();
379 trace(TraceBlock, "bumpdblock gotnothing");
380 return nil;
382 b = dcache.heap[0];
383 delheap(b);
384 if(!b->ref && !b->dirty)
385 break;
388 trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
390 unchain(b);
391 return b;
394 void
395 emptydcache(void)
397 DBlock *b;
399 qlock(&dcache.lock);
400 while(dcache.nheap > 0){
401 b = dcache.heap[0];
402 delheap(b);
403 if(!b->ref && !b->dirty){
404 unchain(b);
405 b->next = dcache.free;
406 dcache.free = b;
409 qunlock(&dcache.lock);
412 /*
413 * delete an arbitrary block from the heap
414 */
415 static void
416 delheap(DBlock *db)
418 if(db->heap == TWID32)
419 return;
420 fixheap(db->heap, dcache.heap[--dcache.nheap]);
421 db->heap = TWID32;
424 /*
425 * push an element up or down to it's correct new location
426 */
427 static void
428 fixheap(int i, DBlock *b)
430 if(upheap(i, b) == i)
431 downheap(i, b);
434 static int
435 upheap(int i, DBlock *b)
437 DBlock *bb;
438 u32int now;
439 int p;
441 now = dcache.now;
442 for(; i != 0; i = p){
443 p = (i - 1) >> 1;
444 bb = dcache.heap[p];
445 if(b->used2 - now >= bb->used2 - now)
446 break;
447 dcache.heap[i] = bb;
448 bb->heap = i;
451 dcache.heap[i] = b;
452 b->heap = i;
453 return i;
456 static int
457 downheap(int i, DBlock *b)
459 DBlock *bb;
460 u32int now;
461 int k;
463 now = dcache.now;
464 for(; ; i = k){
465 k = (i << 1) + 1;
466 if(k >= dcache.nheap)
467 break;
468 if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
469 k++;
470 bb = dcache.heap[k];
471 if(b->used2 - now <= bb->used2 - now)
472 break;
473 dcache.heap[i] = bb;
474 bb->heap = i;
477 dcache.heap[i] = b;
478 b->heap = i;
479 return i;
482 static void
483 findblock(DBlock *bb)
485 DBlock *b, *last;
486 int h;
488 last = nil;
489 h = pbhash(bb->addr);
490 for(b = dcache.heads[h]; b != nil; b = b->next){
491 if(last != b->prev)
492 sysfatal("bad prev link");
493 if(b == bb)
494 return;
495 last = b;
497 sysfatal("block missing from hash table");
500 void
501 checkdcache(void)
503 DBlock *b;
504 u32int size, now;
505 int i, k, refed, nfree;
507 qlock(&dcache.lock);
508 size = dcache.size;
509 now = dcache.now;
510 for(i = 0; i < dcache.nheap; i++){
511 if(dcache.heap[i]->heap != i)
512 sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
513 if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
514 sysfatal("dc: bad heap ordering");
515 k = (i << 1) + 1;
516 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
517 sysfatal("dc: bad heap ordering");
518 k++;
519 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
520 sysfatal("dc: bad heap ordering");
523 refed = 0;
524 for(i = 0; i < dcache.nblocks; i++){
525 b = &dcache.blocks[i];
526 if(b->data != &dcache.mem[i * size])
527 sysfatal("dc: mis-blocked at %d", i);
528 if(b->ref && b->heap == TWID32)
529 refed++;
530 if(b->addr)
531 findblock(b);
532 if(b->heap != TWID32
533 && dcache.heap[b->heap] != b)
534 sysfatal("dc: spurious heap value");
537 nfree = 0;
538 for(b = dcache.free; b != nil; b = b->next){
539 if(b->addr != 0 || b->heap != TWID32)
540 sysfatal("dc: bad free list");
541 nfree++;
544 if(dcache.nheap + nfree + refed != dcache.nblocks)
545 sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
546 qunlock(&dcache.lock);
549 void
550 flushdcache(void)
552 trace(TraceProc, "flushdcache enter");
553 kickround(&dcache.round, 1);
554 trace(TraceProc, "flushdcache exit");
557 void
558 kickdcache(void)
560 kickround(&dcache.round, 0);
563 static int
564 parallelwrites(DBlock **b, DBlock **eb, int dirty)
566 DBlock **p, **q;
567 Part *part;
569 for(p=b; p<eb && (*p)->dirty == dirty; p++){
570 assert(b<=p && p<eb);
571 sendp((*p)->part->writechan, *p);
573 q = p;
574 for(p=b; p<q; p++){
575 assert(b<=p && p<eb);
576 recvp((*p)->writedonechan);
579 /*
580 * Flush the partitions that have been written to.
581 */
582 part = nil;
583 for(p=b; p<q; p++){
584 if(part != (*p)->part){
585 part = (*p)->part;
586 flushpart(part); /* what if it fails? */
590 return p-b;
593 /*
594 * Sort first by dirty flag, then by partition, then by address in partition.
595 */
596 static int
597 writeblockcmp(const void *va, const void *vb)
599 DBlock *a, *b;
601 a = *(DBlock**)va;
602 b = *(DBlock**)vb;
604 if(a->dirty != b->dirty)
605 return a->dirty - b->dirty;
606 if(a->part != b->part){
607 if(a->part < b->part)
608 return -1;
609 if(a->part > b->part)
610 return 1;
612 if(a->addr < b->addr)
613 return -1;
614 return 1;
617 static void
618 flushproc(void *v)
620 int i, j, n;
621 ulong t0;
622 DBlock *b, **write;
624 USED(v);
625 threadsetname("flushproc");
626 for(;;){
627 waitforkick(&dcache.round);
629 trace(TraceWork, "start");
630 t0 = nsec()/1000;
631 trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
633 write = dcache.write;
634 n = 0;
635 for(i=0; i<dcache.nblocks; i++){
636 b = &dcache.blocks[i];
637 if(b->dirty)
638 write[n++] = b;
641 qsort(write, n, sizeof(write[0]), writeblockcmp);
643 /* Write each stage of blocks out. */
644 trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
645 i = 0;
646 for(j=1; j<DirtyMax; j++){
647 trace(TraceProc, "writeblocks.%d t=%lud",
648 j, (ulong)(nsec()/1000)-t0);
649 i += parallelwrites(write+i, write+n, j);
651 if(i != n){
652 fprint(2, "in flushproc i=%d n=%d\n", i, n);
653 for(i=0; i<n; i++)
654 fprint(2, "\tblock %d: dirty=%d\n",
655 i, write[i]->dirty);
656 abort();
659 /*
660 * b->dirty is protected by b->lock while ndirty is protected
661 * by dcache.lock, so the --ndirty below is the delayed one
662 * from clearing b->dirty in the write proc. It may happen
663 * that some other proc has come along and redirtied b since
664 * the write. That's okay, it just means that ndirty may be
665 * one too high until we catch up and do the decrement.
666 */
667 trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
668 qlock(&dcache.lock);
669 for(i=0; i<n; i++){
670 b = write[i];
671 --dcache.ndirty;
672 if(b->ref == 0 && b->heap == TWID32){
673 upheap(dcache.nheap++, b);
674 rwakeupall(&dcache.full);
677 setstat(StatDcacheDirty, dcache.ndirty);
678 qunlock(&dcache.lock);
679 addstat(StatDcacheFlush, 1);
680 trace(TraceWork, "finish");
684 static void
685 writeproc(void *v)
687 DBlock *b;
688 Part *p;
690 p = v;
692 threadsetname("writeproc:%s", p->name);
693 for(;;){
694 b = recvp(p->writechan);
695 trace(TraceWork, "start");
696 assert(b->part == p);
697 trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
698 wlock(&b->lock);
699 trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
700 diskaccess(0);
701 if(writepart(p, b->addr, b->data, b->size) < 0)
702 fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
703 argv0, p->name, b->addr);
704 addstat(StatApartWrite, 1);
705 addstat(StatApartWriteBytes, b->size);
706 b->dirty = 0;
707 wunlock(&b->lock);
708 trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
709 trace(TraceWork, "finish");
710 sendp(b->writedonechan, b);