op public repos

Blob

Date:: Sat Mar 13 04:35:13 2004 UTC
Message:: Add binary fraction tree index. The old index code is still supported too. Buildindex and checkindex need to be revisited, though they should be easy to adapt.
Actions:: History | Blame | Raw File
1 /*
2  * The locking here is getting a little out of hand.
3  */
4 
5 #include "stdinc.h"
6 #include "dat.h"
7 #include "fns.h"
8 
9 typedef struct DCache	DCache;
10 
11 enum
12 {
13 	HashLog		= 9,
14 	HashSize	= 1<<HashLog,
15 	HashMask	= HashSize - 1,
16 };
17 
18 struct DCache
19 {
20 	QLock		lock;
21 	RWLock		dirtylock;		/* must be held to inspect or set b->dirty */
22 	u32int		flushround;
23 	Rendez		anydirty;
24 	Rendez		full;
25 	Rendez		flush;
26 	Rendez		flushdone;
27 	DBlock		*free;			/* list of available lumps */
28 	u32int		now;			/* ticks for usage timestamps */
29 	int		size;			/* max. size of any block; allocated to each block */
30 	DBlock		**heads;		/* hash table for finding address */
31 	int		nheap;			/* number of available victims */
32 	DBlock		**heap;			/* heap for locating victims */
33 	int		nblocks;		/* number of blocks allocated */
34 	DBlock		*blocks;		/* array of block descriptors */
35 	DBlock		**write;		/* array of block pointers to be written */
36 	u8int		*mem;			/* memory for all block descriptors */
37 	int		ndirty;			/* number of dirty blocks */
38 	int		maxdirty;		/* max. number of dirty blocks */
39 };
40 
41 static DCache	dcache;
42 
43 static int	downheap(int i, DBlock *b);
44 static int	upheap(int i, DBlock *b);
45 static DBlock	*bumpdblock(void);
46 static void	delheap(DBlock *db);
47 static void	fixheap(int i, DBlock *b);
48 static void	_flushdcache(void);
49 static void	flushproc(void*);
50 static void	flushtimerproc(void*);
51 static void	writeproc(void*);
52 
53 void
54 initdcache(u32int mem)
55 {
56 	DBlock *b, *last;
57 	u32int nblocks, blocksize;
58 	int i;
59 
60 	if(mem < maxblocksize * 2)
61 		sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
62 	if(maxblocksize == 0)
63 		sysfatal("no max. block size given for disk cache");
64 	blocksize = maxblocksize;
65 	nblocks = mem / blocksize;
66 	dcache.full.l = &dcache.lock;
67 	dcache.flush.l = &dcache.lock;
68 	dcache.anydirty.l = &dcache.lock;
69 	dcache.flushdone.l = &dcache.lock;
70 	dcache.nblocks = nblocks;
71 	dcache.maxdirty = (nblocks * 3) / 4;
72 	if(1)
73 		fprint(2, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
74 			nblocks, blocksize, dcache.maxdirty);
75 	dcache.size = blocksize;
76 	dcache.heads = MKNZ(DBlock*, HashSize);
77 	dcache.heap = MKNZ(DBlock*, nblocks);
78 	dcache.blocks = MKNZ(DBlock, nblocks);
79 	dcache.write = MKNZ(DBlock*, nblocks);
80 	dcache.mem = MKNZ(u8int, nblocks * blocksize);
81 
82 	last = nil;
83 	for(i = 0; i < nblocks; i++){
84 		b = &dcache.blocks[i];
85 		b->data = &dcache.mem[i * blocksize];
86 		b->heap = TWID32;
87 		chaninit(&b->writedonechan, sizeof(void*), 1);
88 		b->next = last;
89 		last = b;
90 	}
91 	dcache.free = last;
92 	dcache.nheap = 0;
93 
94 	vtproc(flushproc, nil);
95 	vtproc(flushtimerproc, nil);
96 }
97 
98 static u32int
99 pbhash(u64int addr)
100 {
101 	u32int h;
102 
103 #define hashit(c)	((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
104 	h = (addr >> 32) ^ addr;
105 	return hashit(h);
106 }
107 
108 DBlock*
109 getdblock(Part *part, u64int addr, int read)
110 {
111 	DBlock *b;
112 	u32int h, size;
113 
114 	size = part->blocksize;
115 	if(size > dcache.size){
116 		seterr(EAdmin, "block size %d too big for cache", size);
117 		return nil;
118 	}
119 	h = pbhash(addr);
120 
121 	/*
122 	 * look for the block in the cache
123 	 */
124 //checkdcache();
125 	qlock(&dcache.lock);
126 again:
127 	for(b = dcache.heads[h]; b != nil; b = b->next){
128 		if(b->part == part && b->addr == addr){
129 			qlock(&stats.lock);
130 			stats.pchit++;
131 			qunlock(&stats.lock);
132 			goto found;
133 		}
134 	}
135 	qlock(&stats.lock);
136 	stats.pcmiss++;
137 	qunlock(&stats.lock);
138 
139 	/*
140 	 * missed: locate the block with the oldest second to last use.
141 	 * remove it from the heap, and fix up the heap.
142 	 */
143 	b = bumpdblock();
144 	if(b == nil){
145 		logerr(EAdmin, "all disk cache blocks in use");
146 		rsleep(&dcache.full);
147 		goto again;
148 	}
149 
150 	/*
151 	 * the new block has no last use, so assume it happens sometime in the middle
152 ZZZ this is not reasonable
153 	 */
154 	b->used = (b->used2 + dcache.now) / 2;
155 
156 	/*
157 	 * rechain the block on the correct hash chain
158 	 */
159 	b->next = dcache.heads[h];
160 	dcache.heads[h] = b;
161 	if(b->next != nil)
162 		b->next->prev = b;
163 	b->prev = nil;
164 
165 	b->addr = addr;
166 	b->part = part;
167 	b->size = 0;
168 
169 found:
170 	b->ref++;
171 	b->used2 = b->used;
172 	b->used = dcache.now++;
173 	if(b->heap != TWID32)
174 		fixheap(b->heap, b);
175 
176 	qunlock(&dcache.lock);
177 //checkdcache();
178 
179 	qlock(&b->lock);
180 	if(b->size != size){
181 		if(b->size < size){
182 			if(!read)
183 				memset(&b->data[b->size], 0, size - b->size);
184 			else{
185 				if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
186 					putdblock(b);
187 					return nil;
188 				}
189 				qlock(&stats.lock);
190 				stats.pcreads++;
191 				stats.pcbreads += size - b->size;
192 				qunlock(&stats.lock);
193 			}
194 		}
195 		b->size = size;
196 	}
197 
198 	return b;
199 }
200 
201 void
202 putdblock(DBlock *b)
203 {
204 	if(b == nil)
205 		return;
206 
207 	if(b->dirtying){
208 		b->dirtying = 0;
209 		runlock(&dcache.dirtylock);
210 	}
211 	qunlock(&b->lock);
212 
213 //checkdcache();
214 	qlock(&dcache.lock);
215 	if(b->dirty)
216 		delheap(b);
217 	else if(--b->ref == 0){
218 		if(b->heap == TWID32)
219 			upheap(dcache.nheap++, b);
220 		rwakeupall(&dcache.full);
221 	}
222 
223 	qunlock(&dcache.lock);
224 //checkdcache();
225 }
226 
227 void
228 dirtydblock(DBlock *b, int dirty)
229 {
230 	int odirty;
231 	Part *p;
232 
233 
234 	assert(b->ref != 0);
235 
236 	/*
237  	 * Because we use an rlock to keep track of how
238 	 * many blocks are being dirtied (and a wlock to
239 	 * stop dirtying), we cannot try to dirty two blocks
240 	 * at the same time in the same thread -- if the 
241 	 * wlock happens between them, we get a deadlock.
242 	 * 
243 	 * The only place in the code where we need to
244 	 * dirty multiple blocks at once is in splitiblock, when
245 	 * we split an index block.  The new block has a dirty
246 	 * flag of DirtyIndexSplit already, because it has to get
247 	 * written to disk before the old block (DirtyIndex).
248 	 * So when we see the DirtyIndexSplit block, we don't
249 	 * acquire the rlock (and we don't set dirtying, so putdblock
250 	 * won't drop the rlock).  This kludginess means that
251 	 * the caller will have to be sure to putdblock on the 
252 	 * new block before putdblock on the old block.
253 	 */
254 	if(dirty == DirtyIndexSplit){
255 		/* caller should have another dirty block */
256 		assert(!canwlock(&dcache.dirtylock));
257 		/* this block should be clean */
258 		assert(b->dirtying == 0);
259 		assert(b->dirty == 0);
260 	}else{
261 		rlock(&dcache.dirtylock);
262 		assert(b->dirtying == 0);
263 		b->dirtying = 1;
264 	}
265 
266 	qlock(&stats.lock);
267 	if(b->dirty)
268 		stats.absorbedwrites++;
269 	stats.dirtydblocks++;
270 	qunlock(&stats.lock);
271 
272 	/*
273 	 * In general, shouldn't mark any block as more than one
274 	 * type, except that split index blocks are a subcase of index
275 	 * blocks.  Only clean blocks ever get marked DirtyIndexSplit,
276 	 * though, so we don't need the opposite conjunction here.
277 	 */
278 	odirty = b->dirty;
279 	if(b->dirty)
280 		assert(b->dirty == dirty
281 			|| (b->dirty==DirtyIndexSplit && dirty==DirtyIndex));
282 	else
283 		b->dirty = dirty;
284 
285 	p = b->part;
286 	if(p->writechan == nil){
287 fprint(2, "allocate write proc for part %s\n", p->name);
288 		/* XXX hope this doesn't fail! */
289 		p->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
290 		vtproc(writeproc, p);
291 	}
292 	qlock(&dcache.lock);
293 	if(!odirty){
294 		dcache.ndirty++;
295 		rwakeupall(&dcache.anydirty);
296 	}
297 	qunlock(&dcache.lock);
298 }
299 
300 /*
301  * remove some block from use and update the free list and counters
302  */
303 static DBlock*
304 bumpdblock(void)
305 {
306 	int flushed;
307 	DBlock *b;
308 	ulong h;
309 
310 	b = dcache.free;
311 	if(b != nil){
312 		dcache.free = b->next;
313 		return b;
314 	}
315 
316 	if(dcache.ndirty >= dcache.maxdirty)
317 		_flushdcache();
318 
319 	/*
320 	 * remove blocks until we find one that is unused
321 	 * referenced blocks are left in the heap even though
322 	 * they can't be scavenged; this is simple a speed optimization
323 	 */
324 	flushed = 0;
325 	for(;;){
326 		if(dcache.nheap == 0){
327 			_flushdcache();
328 			return nil;
329 		}
330 		b = dcache.heap[0];
331 		delheap(b);
332 		if(!b->ref)
333 			break;
334 	}
335 
336 	fprint(2, "bump %s at %llud\n", b->part->name, b->addr);
337 
338 	/*
339 	 * unchain the block
340 	 */
341 	if(b->prev == nil){
342 		h = pbhash(b->addr);
343 		if(dcache.heads[h] != b)
344 			sysfatal("bad hash chains in disk cache");
345 		dcache.heads[h] = b->next;
346 	}else
347 		b->prev->next = b->next;
348 	if(b->next != nil)
349 		b->next->prev = b->prev;
350 
351 	return b;
352 }
353 
354 /*
355  * delete an arbitrary block from the heap
356  */
357 static void
358 delheap(DBlock *db)
359 {
360 	if(db->heap == TWID32)
361 		return;
362 	fixheap(db->heap, dcache.heap[--dcache.nheap]);
363 	db->heap = TWID32;
364 }
365 
366 /*
367  * push an element up or down to it's correct new location
368  */
369 static void
370 fixheap(int i, DBlock *b)
371 {
372 	if(upheap(i, b) == i)
373 		downheap(i, b);
374 }
375 
376 static int
377 upheap(int i, DBlock *b)
378 {
379 	DBlock *bb;
380 	u32int now;
381 	int p;
382 
383 	now = dcache.now;
384 	for(; i != 0; i = p){
385 		p = (i - 1) >> 1;
386 		bb = dcache.heap[p];
387 		if(b->used2 - now >= bb->used2 - now)
388 			break;
389 		dcache.heap[i] = bb;
390 		bb->heap = i;
391 	}
392 
393 	dcache.heap[i] = b;
394 	b->heap = i;
395 	return i;
396 }
397 
398 static int
399 downheap(int i, DBlock *b)
400 {
401 	DBlock *bb;
402 	u32int now;
403 	int k;
404 
405 	now = dcache.now;
406 	for(; ; i = k){
407 		k = (i << 1) + 1;
408 		if(k >= dcache.nheap)
409 			break;
410 		if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
411 			k++;
412 		bb = dcache.heap[k];
413 		if(b->used2 - now <= bb->used2 - now)
414 			break;
415 		dcache.heap[i] = bb;
416 		bb->heap = i;
417 	}
418 
419 	dcache.heap[i] = b;
420 	b->heap = i;
421 	return i;
422 }
423 
424 static void
425 findblock(DBlock *bb)
426 {
427 	DBlock *b, *last;
428 	int h;
429 
430 	last = nil;
431 	h = pbhash(bb->addr);
432 	for(b = dcache.heads[h]; b != nil; b = b->next){
433 		if(last != b->prev)
434 			sysfatal("bad prev link");
435 		if(b == bb)
436 			return;
437 		last = b;
438 	}
439 	sysfatal("block missing from hash table");
440 }
441 
442 void
443 checkdcache(void)
444 {
445 	DBlock *b;
446 	u32int size, now;
447 	int i, k, refed, nfree;
448 
449 	qlock(&dcache.lock);
450 	size = dcache.size;
451 	now = dcache.now;
452 	for(i = 0; i < dcache.nheap; i++){
453 		if(dcache.heap[i]->heap != i)
454 			sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
455 		if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
456 			sysfatal("dc: bad heap ordering");
457 		k = (i << 1) + 1;
458 		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
459 			sysfatal("dc: bad heap ordering");
460 		k++;
461 		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
462 			sysfatal("dc: bad heap ordering");
463 	}
464 
465 	refed = 0;
466 	for(i = 0; i < dcache.nblocks; i++){
467 		b = &dcache.blocks[i];
468 		if(b->data != &dcache.mem[i * size])
469 			sysfatal("dc: mis-blocked at %d", i);
470 		if(b->ref && b->heap == TWID32)
471 			refed++;
472 		if(b->addr)
473 			findblock(b);
474 		if(b->heap != TWID32
475 		&& dcache.heap[b->heap] != b)
476 			sysfatal("dc: spurious heap value");
477 	}
478 
479 	nfree = 0;
480 	for(b = dcache.free; b != nil; b = b->next){
481 		if(b->addr != 0 || b->heap != TWID32)
482 			sysfatal("dc: bad free list");
483 		nfree++;
484 	}
485 
486 	if(dcache.nheap + nfree + refed != dcache.nblocks)
487 		sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
488 	qunlock(&dcache.lock);
489 }
490 
491 void
492 flushdcache(void)
493 {
494 	u32int flushround;
495 
496 	qlock(&dcache.lock);
497 	flushround = dcache.flushround;
498 	rwakeupall(&dcache.flush);
499 	while(flushround == dcache.flushround)
500 		rsleep(&dcache.flushdone);
501 	qunlock(&dcache.lock);
502 }
503 
504 static void
505 _flushdcache(void)
506 {
507 	rwakeupall(&dcache.flush);
508 }
509 
510 static int
511 parallelwrites(DBlock **b, DBlock **eb, int dirty)
512 {
513 	DBlock **p;
514 
515 	for(p=b; p<eb && (*p)->dirty == dirty; p++)
516 		sendp((*p)->part->writechan, *p);
517 	for(p=b; p<eb && (*p)->dirty == dirty; p++)
518 		recvp(&(*p)->writedonechan);
519 
520 	return p-b;
521 }
522 
523 /*
524  * Sort first by dirty flag, then by partition, then by address in partition.
525  */
526 static int
527 writeblockcmp(const void *va, const void *vb)
528 {
529 	DBlock *a, *b;
530 
531 	a = *(DBlock**)va;
532 	b = *(DBlock**)vb;
533 
534 	if(a->dirty != b->dirty)
535 		return a->dirty - b->dirty;
536 	if(a->part != b->part){
537 		if(a->part < b->part)
538 			return -1;
539 		if(a->part > b->part)
540 			return 1;
541 	}
542 	if(a->addr < b->addr)
543 		return -1;
544 	return 1;
545 }
546 
547 static void
548 flushtimerproc(void *v)
549 {
550 	u32int round;
551 
552 	for(;;){
553 		qlock(&dcache.lock);
554 		while(dcache.ndirty == 0)
555 			rsleep(&dcache.anydirty);
556 		round = dcache.flushround;
557 		qunlock(&dcache.lock);
558 
559 		sleep(60*1000);
560 
561 		qlock(&dcache.lock);
562 		if(round == dcache.flushround){
563 			rwakeupall(&dcache.flush);
564 			while(round == dcache.flushround)
565 				rsleep(&dcache.flushdone);
566 		}
567 		qunlock(&dcache.lock);
568 	}
569 }
570 
571 static void
572 flushproc(void *v)
573 {
574 	int i, j, n;
575 	ulong t0;
576 	DBlock *b, **write;
577 
578 	USED(v);
579 	for(;;){
580 		qlock(&dcache.lock);
581 		dcache.flushround++;
582 		rwakeupall(&dcache.flushdone);
583 		rsleep(&dcache.flush);
584 		qunlock(&dcache.lock);
585 
586 		fprint(2, "flushing dcache: t=0 ms\n");
587 		t0 = nsec()/1000;
588 
589 		/*
590 		 * Because we don't record any dependencies at all, we must write out
591 		 * all blocks currently dirty.  Thus we must lock all the blocks that 
592 		 * are currently dirty.
593 		 *
594 		 * We grab dirtylock to stop the dirtying of new blocks.
595 		 * Then we wait until all the current blocks finish being dirtied.
596 		 * Now all the dirty blocks in the system are immutable (clean blocks
597 		 * might still get recycled), so we can plan our disk writes.
598 		 * 
599 		 * In a better scheme, dirtiers might lock the block for writing in getdblock,
600 		 * so that flushproc could lock all the blocks here and then unlock them as it
601 		 * finishes with them.
602 		 */ 
603 	
604 		fprint(2, "flushproc: wlock at t=%lud\n", (ulong)(nsec()/1000) - t0);
605 		wlock(&dcache.dirtylock);
606 
607 		fprint(2, "flushproc: build list at t=%lud\n", (ulong)(nsec()/1000) - t0);
608 		write = dcache.write;
609 		n = 0;
610 		for(i=0; i<dcache.nblocks; i++){
611 			b = &dcache.blocks[i];
612 			if(b->dirty)
613 				write[n++] = b;
614 		}
615 
616 		qsort(write, n, sizeof(write[0]), writeblockcmp);
617 
618 		/* Write each stage of blocks out. */
619 		fprint(2, "flushproc: write blocks at t=%lud\n", (ulong)(nsec()/1000) - t0);
620 		i = 0;
621 		for(j=1; j<DirtyMax; j++){
622 			fprint(2, "flushproc: flush dirty %d at t=%lud\n", j, (ulong)(nsec()/1000) - t0);
623 			i += parallelwrites(write+i, write+n, j);
624 		}
625 		assert(i == n);
626 
627 		fprint(2, "flushproc: update dirty bits at t=%lud\n", (ulong)(nsec()/1000) - t0);
628 		qlock(&dcache.lock);
629 		for(i=0; i<n; i++){
630 			b = write[i];
631 			b->dirty = 0;
632 			--dcache.ndirty;
633 			if(b->ref == 0 && b->heap == TWID32){
634 				upheap(dcache.nheap++, b);
635 				rwakeupall(&dcache.full);
636 			}
637 		}
638 		qunlock(&dcache.lock);
639 		wunlock(&dcache.dirtylock);
640 
641 		fprint(2, "flushproc: done at t=%lud\n", (ulong)(nsec()/1000) - t0);
642 
643 		qlock(&stats.lock);
644 		stats.dcacheflushes++;
645 		stats.dcacheflushwrites += n;
646 		qunlock(&stats.lock);
647 	}
648 }
649 
650 static void
651 writeproc(void *v)
652 {
653 	DBlock *b;
654 	Part *p;
655 
656 	p = v;
657 
658 	for(;;){
659 		b = recvp(p->writechan);
660 		if(writepart(p, b->addr, b->data, b->size) < 0)
661 			fprint(2, "write error: %r\n"); /* XXX details! */
662 		sendp(&b->writedonechan, b);
663 	}
664 }