/*
 * The locking here is getting a little out of hand.
 */

#include "stdinc.h"
#include "dat.h"
#include "fns.h"

typedef struct DCache DCache;

enum
{
	HashLog = 9,
	HashSize = 1<<HashLog,
	HashMask = HashSize - 1,
};

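/*
 * Added note: with HashLog = 9 the table has 1<<9 = 512 buckets,
 * and HashMask = 511 reduces a hash value to a bucket index with a
 * single AND (see hashit below).
 */
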
struct DCache
{
	QLock	lock;
	RWLock	dirtylock;	/* must be held to inspect or set b->dirty */
	u32int	flushround;
	Rendez	anydirty;
	Rendez	full;
	Rendez	flush;
	Rendez	flushdone;
	DBlock	*free;		/* list of available lumps */
	u32int	now;		/* ticks for usage timestamps */
	int	size;		/* max. size of any block; allocated to each block */
	DBlock	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	DBlock	**heap;		/* heap for locating victims */
	int	nblocks;	/* number of blocks allocated */
	DBlock	*blocks;	/* array of block descriptors */
	DBlock	**write;	/* array of block pointers to be written */
	u8int	*mem;		/* memory for all block data */
	int	ndirty;		/* number of dirty blocks */
	int	maxdirty;	/* max. number of dirty blocks */
};

static DCache	dcache;

static int	downheap(int i, DBlock *b);
static int	upheap(int i, DBlock *b);
static DBlock	*bumpdblock(void);
static void	delheap(DBlock *db);
static void	fixheap(int i, DBlock *b);
static void	_flushdcache(void);
static void	flushproc(void*);
static void	flushtimerproc(void*);
static void	writeproc(void*);

void
initdcache(u32int mem)
{
	DBlock *b, *last;
	u32int nblocks, blocksize;
	int i;

	if(mem < maxblocksize * 2)
		sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
	if(maxblocksize == 0)
		sysfatal("no max. block size given for disk cache");
	blocksize = maxblocksize;
	nblocks = mem / blocksize;
	dcache.full.l = &dcache.lock;
	dcache.flush.l = &dcache.lock;
	dcache.anydirty.l = &dcache.lock;
	dcache.flushdone.l = &dcache.lock;
	dcache.nblocks = nblocks;
	dcache.maxdirty = (nblocks * 3) / 4;
	if(1)
		fprint(2, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
			nblocks, blocksize, dcache.maxdirty);
	dcache.size = blocksize;
	dcache.heads = MKNZ(DBlock*, HashSize);
	dcache.heap = MKNZ(DBlock*, nblocks);
	dcache.blocks = MKNZ(DBlock, nblocks);
	dcache.write = MKNZ(DBlock*, nblocks);
	dcache.mem = MKNZ(u8int, nblocks * blocksize);

	last = nil;
	for(i = 0; i < nblocks; i++){
		b = &dcache.blocks[i];
		b->data = &dcache.mem[i * blocksize];
		b->heap = TWID32;
		chaninit(&b->writedonechan, sizeof(void*), 1);
		b->next = last;
		last = b;
	}
	dcache.free = last;
	dcache.nheap = 0;

	vtproc(flushproc, nil);
	vtproc(flushtimerproc, nil);
}

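/*
 * Usage sketch (added for illustration; not part of the original
 * source). A server might size the cache from a configured memory
 * budget once the partitions have established maxblocksize:
 *
 *	u32int cachemem;
 *
 *	cachemem = 64*1024*1024;	// hypothetical 64MB budget
 *	initdcache(cachemem);		// needs at least 2*maxblocksize
 *
 * initdcache starts the flush procs itself, so it is called once
 * the thread machinery is up.
 */
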
static u32int
pbhash(u64int addr)
{
	u32int h;

#define hashit(c)	((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
	h = (addr >> 32) ^ addr;
	return hashit(h);
}

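/*
 * Added note: pbhash folds the 64-bit partition address to 32 bits
 * by xoring the two halves, then hashit multiplies by an arbitrary
 * odd constant and keeps the top HashLog bits of the 32-bit product
 * (a multiplicative hash). The final "& HashMask" looks redundant
 * for exact 32-bit arithmetic but guards against wider promotion.
 */
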
DBlock*
getdblock(Part *part, u64int addr, int read)
{
	DBlock *b;
	u32int h, size;

	size = part->blocksize;
	if(size > dcache.size){
		seterr(EAdmin, "block size %d too big for cache", size);
		return nil;
	}
	h = pbhash(addr);

	/*
	 * look for the block in the cache
	 */
//checkdcache();
	qlock(&dcache.lock);
again:
	for(b = dcache.heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr){
			qlock(&stats.lock);
			stats.pchit++;
			qunlock(&stats.lock);
			goto found;
		}
	}
	qlock(&stats.lock);
	stats.pcmiss++;
	qunlock(&stats.lock);

	/*
	 * missed: locate the block with the oldest second-to-last use.
	 * remove it from the heap, and fix up the heap.
	 */
	b = bumpdblock();
	if(b == nil){
		logerr(EAdmin, "all disk cache blocks in use");
		rsleep(&dcache.full);
		goto again;
	}

	/*
	 * the new block has no last use, so assume it happens sometime in the middle
	 * ZZZ this is not reasonable
	 */
	b->used = (b->used2 + dcache.now) / 2;

	/*
	 * rechain the block on the correct hash chain
	 */
	b->next = dcache.heads[h];
	dcache.heads[h] = b;
	if(b->next != nil)
		b->next->prev = b;
	b->prev = nil;

	b->addr = addr;
	b->part = part;
	b->size = 0;

found:
	b->ref++;
	b->used2 = b->used;
	b->used = dcache.now++;
	if(b->heap != TWID32)
		fixheap(b->heap, b);

	qunlock(&dcache.lock);
//checkdcache();

	qlock(&b->lock);
	if(b->size != size){
		if(b->size < size){
			if(!read)
				memset(&b->data[b->size], 0, size - b->size);
			else{
				if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
					putdblock(b);
					return nil;
				}
				qlock(&stats.lock);
				stats.pcreads++;
				stats.pcbreads += size - b->size;
				qunlock(&stats.lock);
			}
		}
		b->size = size;
	}

	return b;
}

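/*
 * Usage sketch (added for illustration; not part of the original
 * source). getdblock returns the block with b->lock held and a
 * reference counted; callers release it with putdblock. "read"
 * says whether existing disk contents matter:
 *
 *	DBlock *b;
 *
 *	b = getdblock(part, addr, 1);	// 1: read old contents from disk
 *	if(b == nil)
 *		return -1;
 *	// ... use b->data; call dirtydblock(b, type) if modifying ...
 *	putdblock(b);
 */
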
void
putdblock(DBlock *b)
{
	if(b == nil)
		return;

	if(b->dirtying){
		b->dirtying = 0;
		runlock(&dcache.dirtylock);
	}
	qunlock(&b->lock);

//checkdcache();
	qlock(&dcache.lock);
	if(b->dirty)
		delheap(b);
	else if(--b->ref == 0){
		if(b->heap == TWID32)
			upheap(dcache.nheap++, b);
		rwakeupall(&dcache.full);
	}
	qunlock(&dcache.lock);
//checkdcache();
}

void
dirtydblock(DBlock *b, int dirty)
{
	int odirty;
	Part *p;

	assert(b->ref != 0);

	/*
	 * Because we use an rlock to keep track of how
	 * many blocks are being dirtied (and a wlock to
	 * stop dirtying), we cannot try to dirty two blocks
	 * at the same time in the same thread -- if the
	 * wlock happens between them, we get a deadlock.
	 *
	 * The only place in the code where we need to
	 * dirty multiple blocks at once is in splitiblock, when
	 * we split an index block. The new block has a dirty
	 * flag of DirtyIndexSplit already, because it has to get
	 * written to disk before the old block (DirtyIndex).
	 * So when we see the DirtyIndexSplit block, we don't
	 * acquire the rlock (and we don't set dirtying, so putdblock
	 * won't drop the rlock). This kludginess means that
	 * the caller will have to be sure to putdblock on the
	 * new block before putdblock on the old block.
	 */
	if(dirty == DirtyIndexSplit){
		/* caller should have another dirty block */
		assert(!canwlock(&dcache.dirtylock));
		/* this block should be clean */
		assert(b->dirtying == 0);
		assert(b->dirty == 0);
	}else{
		rlock(&dcache.dirtylock);
		assert(b->dirtying == 0);
		b->dirtying = 1;
	}

	qlock(&stats.lock);
	if(b->dirty)
		stats.absorbedwrites++;
	stats.dirtydblocks++;
	qunlock(&stats.lock);

	/*
	 * In general, we shouldn't mark any block as more than one
	 * type, except that split index blocks are a subcase of index
	 * blocks. Only clean blocks ever get marked DirtyIndexSplit,
	 * though, so we don't need the opposite conjunction here.
	 */
	odirty = b->dirty;
	if(b->dirty)
		assert(b->dirty == dirty
			|| (b->dirty==DirtyIndexSplit && dirty==DirtyIndex));
	else
		b->dirty = dirty;

	p = b->part;
	if(p->writechan == nil){
		fprint(2, "allocate write proc for part %s\n", p->name);
		/* XXX hope this doesn't fail! */
		p->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
		vtproc(writeproc, p);
	}
	qlock(&dcache.lock);
	if(!odirty){
		dcache.ndirty++;
		rwakeupall(&dcache.anydirty);
	}
	qunlock(&dcache.lock);
}

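/*
 * Illustration (added; not part of the original source). Schematic
 * shape of the index-split protocol described above, as seen from
 * the caller (splitiblock lives elsewhere; names are illustrative):
 *
 *	dirtydblock(old, DirtyIndex);		// takes the rlock
 *	dirtydblock(new, DirtyIndexSplit);	// rlock already held; takes nothing
 *	// ... modify both blocks ...
 *	putdblock(new);		// new first: it never held the rlock
 *	putdblock(old);		// old->dirtying is set, so this drops the rlock
 */
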
/*
 * remove some block from use and update the free list and counters;
 * called with dcache.lock held.
 */
static DBlock*
bumpdblock(void)
{
	int flushed;
	DBlock *b;
	ulong h;

	b = dcache.free;
	if(b != nil){
		dcache.free = b->next;
		return b;
	}

	if(dcache.ndirty >= dcache.maxdirty)
		_flushdcache();

	/*
	 * remove blocks until we find one that is unused
	 * referenced blocks are left in the heap even though
	 * they can't be scavenged; this is simply a speed optimization
	 */
	flushed = 0;
	for(;;){
		if(dcache.nheap == 0){
			_flushdcache();
			return nil;
		}
		b = dcache.heap[0];
		delheap(b);
		if(!b->ref)
			break;
	}

	fprint(2, "bump %s at %llud\n", b->part->name, b->addr);

	/*
	 * unchain the block
	 */
	if(b->prev == nil){
		h = pbhash(b->addr);
		if(dcache.heads[h] != b)
			sysfatal("bad hash chains in disk cache");
		dcache.heads[h] = b->next;
	}else
		b->prev->next = b->next;
	if(b->next != nil)
		b->next->prev = b->prev;

	return b;
}

/*
 * delete an arbitrary block from the heap
 */
static void
delheap(DBlock *db)
{
	if(db->heap == TWID32)
		return;
	fixheap(db->heap, dcache.heap[--dcache.nheap]);
	db->heap = TWID32;
}

/*
 * push an element up or down to its correct new location
 */
static void
fixheap(int i, DBlock *b)
{
	if(upheap(i, b) == i)
		downheap(i, b);
}

static int
upheap(int i, DBlock *b)
{
	DBlock *bb;
	u32int now;
	int p;

	now = dcache.now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = dcache.heap[p];
		if(b->used2 - now >= bb->used2 - now)
			break;
		dcache.heap[i] = bb;
		bb->heap = i;
	}

	dcache.heap[i] = b;
	b->heap = i;
	return i;
}

static int
downheap(int i, DBlock *b)
{
	DBlock *bb;
	u32int now;
	int k;

	now = dcache.now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= dcache.nheap)
			break;
		if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
			k++;
		bb = dcache.heap[k];
		if(b->used2 - now <= bb->used2 - now)
			break;
		dcache.heap[i] = bb;
		bb->heap = i;
	}

	dcache.heap[i] = b;
	b->heap = i;
	return i;
}

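/*
 * Added note: the comparisons above use "used2 - now" rather than
 * raw used2 values so the ordering survives wraparound of the
 * 32-bit tick counter: in unsigned arithmetic the key is
 * 2^32 - age, so older blocks get smaller keys. E.g. with now = 5,
 * used2 = 0xfffffffe (set just before wraparound, age 7) yields
 * key 0xfffffff9, which sorts before used2 = 3 (age 2, key
 * 0xfffffffe), so the oldest block still rises to the top of the
 * min-heap.
 */
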
static void
findblock(DBlock *bb)
{
	DBlock *b, *last;
	int h;

	last = nil;
	h = pbhash(bb->addr);
	for(b = dcache.heads[h]; b != nil; b = b->next){
		if(last != b->prev)
			sysfatal("bad prev link");
		if(b == bb)
			return;
		last = b;
	}
	sysfatal("block missing from hash table");
}

void
checkdcache(void)
{
	DBlock *b;
	u32int size, now;
	int i, k, refed, nfree;

	qlock(&dcache.lock);
	size = dcache.size;
	now = dcache.now;
	for(i = 0; i < dcache.nheap; i++){
		if(dcache.heap[i]->heap != i)
			sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
		if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
			sysfatal("dc: bad heap ordering");
		k = (i << 1) + 1;
		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
			sysfatal("dc: bad heap ordering");
		k++;
		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
			sysfatal("dc: bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < dcache.nblocks; i++){
		b = &dcache.blocks[i];
		if(b->data != &dcache.mem[i * size])
			sysfatal("dc: mis-blocked at %d", i);
		if(b->ref && b->heap == TWID32)
			refed++;
		if(b->addr)
			findblock(b);
		if(b->heap != TWID32
		&& dcache.heap[b->heap] != b)
			sysfatal("dc: spurious heap value");
	}

	nfree = 0;
	for(b = dcache.free; b != nil; b = b->next){
		if(b->addr != 0 || b->heap != TWID32)
			sysfatal("dc: bad free list");
		nfree++;
	}

	if(dcache.nheap + nfree + refed != dcache.nblocks)
		sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
	qunlock(&dcache.lock);
}

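/*
 * Added note: checkdcache validates the cache invariants: heap
 * positions and ordering, hash-chain prev links (via findblock),
 * the block/data correspondence, and that every block is accounted
 * for as heaped, free, or referenced. The call sites in getdblock
 * and putdblock are commented out, presumably left as hooks to
 * re-enable when debugging cache corruption.
 */
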
void
flushdcache(void)
{
	u32int flushround;

	qlock(&dcache.lock);
	flushround = dcache.flushround;
	rwakeupall(&dcache.flush);
	while(flushround == dcache.flushround)
		rsleep(&dcache.flushdone);
	qunlock(&dcache.lock);
}

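/*
 * Added note: flushdcache is the synchronous entry point: it kicks
 * flushproc and waits until dcache.flushround advances, which
 * happens only after flushproc finishes a pass and comes back
 * around to the top of its loop. Tracking a round counter instead
 * of a flag lets concurrent callers all return once the next
 * completed round is observed, without missed wakeups.
 */
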
/* kick the flusher; called with dcache.lock held */
static void
_flushdcache(void)
{
	rwakeupall(&dcache.flush);
}

static int
parallelwrites(DBlock **b, DBlock **eb, int dirty)
{
	DBlock **p;

	for(p=b; p<eb && (*p)->dirty == dirty; p++)
		sendp((*p)->part->writechan, *p);
	for(p=b; p<eb && (*p)->dirty == dirty; p++)
		recvp(&(*p)->writedonechan);

	return p-b;
}

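/*
 * Added note: parallelwrites takes a run of blocks sorted by dirty
 * type and issues every write of that type before collecting any
 * completions, so blocks on different partitions are written
 * concurrently by the per-partition writeprocs. It returns the
 * number of blocks consumed, which is how flushproc (below) walks
 * the sorted array one dirty stage at a time:
 *
 *	i = 0;
 *	for(j=1; j<DirtyMax; j++)
 *		i += parallelwrites(write+i, write+n, j);
 */
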
/*
 * Sort first by dirty flag, then by partition, then by address in partition.
 */
static int
writeblockcmp(const void *va, const void *vb)
{
	DBlock *a, *b;

	a = *(DBlock**)va;
	b = *(DBlock**)vb;

	if(a->dirty != b->dirty)
		return a->dirty - b->dirty;
	if(a->part != b->part){
		if(a->part < b->part)
			return -1;
		if(a->part > b->part)
			return 1;
	}
	if(a->addr < b->addr)
		return -1;
	return 1;
}

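/*
 * Added note: the comparator never returns 0. That is safe because
 * the cache holds at most one block per (part, addr) pair, so two
 * distinct dirty blocks always differ in at least one key; qsort
 * never sees true equals.
 */
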
static void
flushtimerproc(void *v)
{
	u32int round;

	USED(v);
	for(;;){
		qlock(&dcache.lock);
		while(dcache.ndirty == 0)
			rsleep(&dcache.anydirty);
		round = dcache.flushround;
		qunlock(&dcache.lock);

		sleep(60*1000);

		qlock(&dcache.lock);
		if(round == dcache.flushround){
			rwakeupall(&dcache.flush);
			while(round == dcache.flushround)
				rsleep(&dcache.flushdone);
		}
		qunlock(&dcache.lock);
	}
}

static void
flushproc(void *v)
{
	int i, j, n;
	ulong t0;
	DBlock *b, **write;

	USED(v);
	for(;;){
		qlock(&dcache.lock);
		dcache.flushround++;
		rwakeupall(&dcache.flushdone);
		rsleep(&dcache.flush);
		qunlock(&dcache.lock);

		fprint(2, "flushing dcache: t=0 us\n");
		t0 = nsec()/1000;

		/*
		 * Because we don't record any dependencies at all, we must write out
		 * all blocks currently dirty. Thus we must lock all the blocks that
		 * are currently dirty.
		 *
		 * We grab dirtylock to stop the dirtying of new blocks.
		 * Then we wait until all the current blocks finish being dirtied.
		 * Now all the dirty blocks in the system are immutable (clean blocks
		 * might still get recycled), so we can plan our disk writes.
		 *
		 * In a better scheme, dirtiers might lock the block for writing in getdblock,
		 * so that flushproc could lock all the blocks here and then unlock them as it
		 * finishes with them.
		 */

		fprint(2, "flushproc: wlock at t=%lud\n", (ulong)(nsec()/1000) - t0);
		wlock(&dcache.dirtylock);

		fprint(2, "flushproc: build list at t=%lud\n", (ulong)(nsec()/1000) - t0);
		write = dcache.write;
		n = 0;
		for(i=0; i<dcache.nblocks; i++){
			b = &dcache.blocks[i];
			if(b->dirty)
				write[n++] = b;
		}

		qsort(write, n, sizeof(write[0]), writeblockcmp);

		/* Write each stage of blocks out. */
		fprint(2, "flushproc: write blocks at t=%lud\n", (ulong)(nsec()/1000) - t0);
		i = 0;
		for(j=1; j<DirtyMax; j++){
			fprint(2, "flushproc: flush dirty %d at t=%lud\n", j, (ulong)(nsec()/1000) - t0);
			i += parallelwrites(write+i, write+n, j);
		}
		assert(i == n);

		fprint(2, "flushproc: update dirty bits at t=%lud\n", (ulong)(nsec()/1000) - t0);
		qlock(&dcache.lock);
		for(i=0; i<n; i++){
			b = write[i];
			b->dirty = 0;
			--dcache.ndirty;
			if(b->ref == 0 && b->heap == TWID32){
				upheap(dcache.nheap++, b);
				rwakeupall(&dcache.full);
			}
		}
		qunlock(&dcache.lock);
		wunlock(&dcache.dirtylock);

		fprint(2, "flushproc: done at t=%lud\n", (ulong)(nsec()/1000) - t0);

		qlock(&stats.lock);
		stats.dcacheflushes++;
		stats.dcacheflushwrites += n;
		qunlock(&stats.lock);
	}
}

static void
writeproc(void *v)
{
	DBlock *b;
	Part *p;

	p = v;

	for(;;){
		b = recvp(p->writechan);
		if(writepart(p, b->addr, b->data, b->size) < 0)
			fprint(2, "write error: %r\n");	/* XXX details! */
		sendp(&b->writedonechan, b);
	}
}