Blob


1 /*
2 * Index, mapping scores to log positions.
3 *
4 * The index is made up of some number of index sections, each of
5 * which is typically stored on a different disk. The blocks in all the
6 * index sections are logically numbered, with each index section
7 * responsible for a range of blocks. Blocks are typically 8kB.
8 *
9 * Index Version 1:
10 *
11 * The N index blocks are treated as a giant hash table. The top 32 bits
12 * of score are used as the key for a lookup. Each index block holds
13 * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
14 *
15 * The index is sized so that a particular bucket is extraordinarily
16 * unlikely to overflow: assuming compressed data blocks are 4kB
17 * on disk, and assuming each block has a 40 byte index entry,
18 * the index data will be 1% of the total data. Since scores are essentially
19 * random, all buckets should be about the same fullness.
20 * A factor of 5 gives us a wide comfort boundary to account for
21 * random variation. So the index disk space should be 5% of the arena disk space.
22 *
23 * Problems with Index Version 1:
24 *
25 * Because the index size is chosen to handle the worst case load,
26 * the index is very sparse, especially when the Venti server is mostly empty.
27 * This has a few bad properties.
28 *
29 * Loading an index block (which typically requires a random disk seek)
30 * is a very expensive operation, yet it yields only a couple index entries.
31 * We're not making efficient use of the disk arm.
32 *
33 * Writing a block requires first checking to see if the block already
34 * exists on the server, which in turn requires an index lookup. When
35 * writing fresh data, these lookups will fail. The index entry cache
36 * cannot serve these, so they must go to disk, which is expensive.
37 *
38 * Thus both the reading and the writing of blocks are held back by
39 * the expense of accessing the index.
40 *
41 * Index Version 2:
42 *
43 * The index is sized to be exactly 2^M blocks. The index blocks are
44 * logically arranged in a (not exactly balanced) binary tree of depth at
45 * most M. The nodes are named by bit strings describing the path from
46 * the root to the node. The root is . (dot). The left child of the root is .0,
47 * the right child .1. The node you get to by starting at the root and going
48 * left right right left is .0110. At the beginning, there is only the root block.
49 * When a block with name .xxx fills, it splits into .xxx0 and .xxx1.
50 * All the index data is kept in the leaves of the tree.
51 *
52 * Index leaf blocks are laid out on disk by interpreting the bitstring as a
53 * binary fraction and multiplying by 2^M -- .0 is the first block, .1 is
54 * the block halfway into the index, .0110 is at position 6/16, and
55 * .xxx and .xxx0 map to the same block (but only one can be a leaf
56 * node at any given time, so this is okay!). A cheap implementation of
57 * this is to append zeros to the bit string to make it M bits long. That's
58 * the integer index block number.
59 *
60 * To find the leaf block that should hold a score, use the bits of the
61 * score one at a time to walk down the tree to a leaf node. If the tree
62 * has leaf nodes .0, .10, and .11, then score 0110101... ends up in bucket
63 * .0 while score 101110101... ends up in bucket .10. There is no leaf node
64 * .1 because it has split into .10 and .11.
65 *
66 * If we know which disk blocks are in use, we can reconstruct the interior
67 * of the tree: if .xxx1 is in use, then .xxx has been split. We keep an in-use
68 * bitmap of all index disk blocks to aid in reconstructing the interior of the
69 * tree. At one bit per index block, the bitmap is small enough to be kept
70 * in memory even on the largest of Venti servers.
71 *
72 * After the root block splits, the index blocks being used will always be
73 * at least half full (averaged across the entire index). So unlike Version 1,
74 * Index Version 2 is quite dense, alleviating the two problems above.
75 * Index block reads now return many index entries. More importantly,
76 * small servers can keep most of the index in the disk cache, making them
77 * more likely to handle negative lookups without going to disk.
78 *
79 * As the index becomes more full, Index Version 2's performance
80 * degrades gracefully into Index Version 1. V2 is really an optimization
81 * for little servers.
82 *
83 * Write Ordering for Index Version 2:
84 *
85 * Unlike Index Version 1, Version 2 must worry about write ordering
86 * within the index. What happens if the in-use bitmap is out of sync
87 * with the actual leaf nodes? What happens if .xxx splits into .xxx0 and
88 * .xxx1 but only one of the new blocks gets written to disk?
89 *
90 * We arrange that when .xxx splits, the .xxx1 block is written first,
91 * then the .xxx0 block, and finally the in-use bitmap entry for .xxx1.
92 * The split is committed by the writing of .xxx0. This ordering leaves
93 * two possible partial disk writes:
94 *
95 * (a) If .xxx1 is written but .xxx0 and the bitmap are not, then it's as if
96 * the split never happened -- we won't think .xxx1 is in use, and we
97 * won't go looking for it.
98 *
99 * (b) If .xxx1 and .xxx0 are written but the bitmap is not, then the first
100 * time we try to load .xxx, we'll get .xxx0 instead, realize the bitmap is
101 * out of date, and update the bitmap.
103 * Backwards Compatibility
105 * Because there are large Venti servers in use with Index V1, this code
106 * will read either index version, but when asked to create a new index,
107 * will only create V2.
108 */
110 #include "stdinc.h"
111 #include "dat.h"
112 #include "fns.h"
114 static int bucklook(u8int *score, int type, u8int *data, int n);
115 static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b);
116 static int okibucket(IBucket *ib, ISect *is);
117 static int initindex1(Index*);
118 static ISect *initisect1(ISect *is);
119 static int splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib);
121 #define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0)
123 //static QLock indexlock; //ZZZ
125 static char IndexMagic[] = "venti index configuration";
127 Index*
128 initindex(char *name, ISect **sects, int n)
130 IFile f;
131 Index *ix;
132 ISect *is;
133 u32int last, blocksize, tabsize;
134 int i;
136 if(n <= 0){
137 seterr(EOk, "no index sections to initialize index");
138 return nil;
140 ix = MKZ(Index);
141 if(ix == nil){
142 seterr(EOk, "can't initialize index: out of memory");
143 freeindex(ix);
144 return nil;
147 tabsize = sects[0]->tabsize;
148 if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0)
149 return nil;
150 if(parseindex(&f, ix) < 0){
151 freeifile(&f);
152 freeindex(ix);
153 return nil;
155 freeifile(&f);
156 if(namecmp(ix->name, name) != 0){
157 seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name);
158 return nil;
160 if(ix->nsects != n){
161 seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects);
162 freeindex(ix);
163 return nil;
165 ix->sects = sects;
166 last = 0;
167 blocksize = ix->blocksize;
168 for(i = 0; i < ix->nsects; i++){
169 is = sects[i];
170 if(namecmp(ix->name, is->index) != 0
171 || is->blocksize != blocksize
172 || is->tabsize != tabsize
173 || namecmp(is->name, ix->smap[i].name) != 0
174 || is->start != ix->smap[i].start
175 || is->stop != ix->smap[i].stop
176 || last != is->start
177 || is->start > is->stop){
178 seterr(ECorrupt, "inconsistent index sections in %s", ix->name);
179 freeindex(ix);
180 return nil;
182 last = is->stop;
184 ix->tabsize = tabsize;
185 ix->buckets = last;
187 if(initindex1(ix) < 0){
188 freeindex(ix);
189 return nil;
192 ix->arenas = MKNZ(Arena*, ix->narenas);
193 if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){
194 freeindex(ix);
195 return nil;
198 return ix;
201 static int
202 initindex1(Index *ix)
204 u32int buckets;
206 switch(ix->version){
207 case IndexVersion1:
208 ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
209 buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
210 if(buckets != ix->buckets){
211 seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
212 return -1;
214 break;
216 case IndexVersion2:
217 buckets = ix->buckets - ix->bitblocks;
218 if(ix->buckets < ix->bitblocks || (buckets&(buckets-1)))
219 seterr(ECorrupt, "bucket count not a power of two in %s", ix->name);
220 ix->maxdepth = u64log2(buckets);
221 ix->bitkeylog = u64log2(ix->blocksize*8);
222 ix->bitkeymask = (1<<ix->bitkeylog)-1;
223 break;
225 return 0;
228 int
229 wbindex(Index *ix)
231 Fmt f;
232 ZBlock *b;
233 int i;
235 if(ix->nsects == 0){
236 seterr(EOk, "no sections in index %s", ix->name);
237 return -1;
239 b = alloczblock(ix->tabsize, 1);
240 if(b == nil){
241 seterr(EOk, "can't write index configuration: out of memory");
242 return -1;
244 fmtzbinit(&f, b);
245 if(outputindex(&f, ix) < 0){
246 seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize);
247 freezblock(b);
248 return -1;
250 for(i = 0; i < ix->nsects; i++){
251 if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0){
252 seterr(EOk, "can't write index: %r");
253 freezblock(b);
254 return -1;
257 freezblock(b);
259 for(i = 0; i < ix->nsects; i++)
260 if(wbisect(ix->sects[i]) < 0)
261 return -1;
263 return 0;
266 /*
267 * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
268 * version, blocksize: u32int
269 * name: max. ANameSize string
270 * sections, arenas: AMap
271 */
272 int
273 outputindex(Fmt *f, Index *ix)
275 if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
276 || (ix->version==IndexVersion2 && fmtprint(f, "%ud\n", ix->bitblocks) < 0)
277 || outputamap(f, ix->smap, ix->nsects) < 0
278 || outputamap(f, ix->amap, ix->narenas) < 0)
279 return -1;
280 return 0;
283 int
284 parseindex(IFile *f, Index *ix)
286 AMapN amn;
287 u32int v;
288 char *s;
290 /*
291 * magic
292 */
293 s = ifileline(f);
294 if(s == nil || strcmp(s, IndexMagic) != 0){
295 seterr(ECorrupt, "bad index magic for %s", f->name);
296 return -1;
299 /*
300 * version
301 */
302 if(ifileu32int(f, &v) < 0){
303 seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
304 return -1;
306 ix->version = v;
307 if(ix->version != IndexVersion1 && ix->version != IndexVersion2){
308 seterr(ECorrupt, "bad version number in %s", f->name);
309 return -1;
312 /*
313 * name
314 */
315 if(ifilename(f, ix->name) < 0){
316 seterr(ECorrupt, "syntax error: bad index name in %s", f->name);
317 return -1;
320 /*
321 * block size
322 */
323 if(ifileu32int(f, &v) < 0){
324 seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
325 return -1;
327 ix->blocksize = v;
329 if(ix->version == IndexVersion2){
330 /*
331 * free bitmap size
332 */
333 if(ifileu32int(f, &v) < 0){
334 seterr(ECorrupt, "syntax error: bad bitmap size in %s", f->name);
335 return -1;
337 ix->bitblocks = v;
340 if(parseamap(f, &amn) < 0)
341 return -1;
342 ix->nsects = amn.n;
343 ix->smap = amn.map;
345 if(parseamap(f, &amn) < 0)
346 return -1;
347 ix->narenas = amn.n;
348 ix->amap = amn.map;
350 return 0;
353 /*
354 * initialize an entirely new index
355 */
356 Index *
357 newindex(char *name, ISect **sects, int n)
359 Index *ix;
360 AMap *smap;
361 u64int nb;
362 u32int div, ub, xb, fb, start, stop, blocksize, tabsize;
363 int i, j, version;
365 version = IndexVersion2;
367 if(n < 1){
368 seterr(EOk, "creating index with no index sections");
369 return nil;
372 /*
373 * compute the total buckets available in the index,
374 * and the total buckets which are used.
375 */
376 nb = 0;
377 blocksize = sects[0]->blocksize;
378 tabsize = sects[0]->tabsize;
379 for(i = 0; i < n; i++){
380 if(sects[i]->start != 0 || sects[i]->stop != 0
381 || sects[i]->index[0] != '\0'){
382 seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
383 return nil;
385 if(blocksize != sects[i]->blocksize){
386 seterr(EOk, "mismatched block sizes in index sections");
387 return nil;
389 if(tabsize != sects[i]->tabsize){
390 seterr(EOk, "mismatched config table sizes in index sections");
391 return nil;
393 nb += sects[i]->blocks;
396 /*
397 * check for duplicate names
398 */
399 for(i = 0; i < n; i++){
400 for(j = i + 1; j < n; j++){
401 if(namecmp(sects[i]->name, sects[j]->name) == 0){
402 seterr(EOk, "duplicate section name %s for index %s", sects[i]->name, name);
403 return nil;
408 if(nb >= ((u64int)1 << 32)){
409 seterr(EBug, "index too large");
410 return nil;
413 div = 0;
414 fb = 0;
415 if(version == IndexVersion1){
416 div = (((u64int)1 << 32) + nb - 1) / nb;
417 ub = (((u64int)1 << 32) - 1) / div + 1;
418 if(div < 100){
419 seterr(EBug, "index divisor too coarse");
420 return nil;
422 }else{
423 fb = (nb+blocksize*8-1)/(blocksize*8);
424 for(ub=1; ub<=((nb-fb)>>1); ub<<=1)
426 ub += fb;
428 if(ub > nb){
429 seterr(EBug, "index initialization math wrong");
430 return nil;
432 xb = nb - ub;
434 /*
435 * initialize each of the index sections
436 * and the section map table
437 */
438 smap = MKNZ(AMap, n);
439 if(smap == nil){
440 seterr(EOk, "can't create new index: out of memory");
441 return nil;
443 start = 0;
444 for(i = 0; i < n; i++){
445 stop = start + sects[i]->blocks - xb / n;
446 if(i == n - 1)
447 stop = ub;
448 sects[i]->start = start;
449 sects[i]->stop = stop;
450 namecp(sects[i]->index, name);
452 smap[i].start = start;
453 smap[i].stop = stop;
454 namecp(smap[i].name, sects[i]->name);
455 start = stop;
458 /*
459 * initialize the index itself
460 */
461 ix = MKZ(Index);
462 if(ix == nil){
463 seterr(EOk, "can't create new index: out of memory");
464 free(smap);
465 return nil;
467 ix->version = version;
468 namecp(ix->name, name);
469 ix->sects = sects;
470 ix->smap = smap;
471 ix->nsects = n;
472 ix->blocksize = blocksize;
473 ix->buckets = ub;
474 ix->tabsize = tabsize;
475 ix->div = div;
476 ix->bitblocks = fb;
478 if(initindex1(ix) < 0){
479 free(smap);
480 return nil;
483 return ix;
486 ISect*
487 initisect(Part *part)
489 ISect *is;
490 ZBlock *b;
491 int ok;
493 b = alloczblock(HeadSize, 0);
494 if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){
495 seterr(EAdmin, "can't read index section header: %r");
496 return nil;
499 is = MKZ(ISect);
500 if(is == nil){
501 freezblock(b);
502 return nil;
504 is->part = part;
505 ok = unpackisect(is, b->data);
506 freezblock(b);
507 if(ok < 0){
508 seterr(ECorrupt, "corrupted index section header: %r");
509 freeisect(is);
510 return nil;
513 if(is->version != ISectVersion){
514 seterr(EAdmin, "unknown index section version %d", is->version);
515 freeisect(is);
516 return nil;
519 return initisect1(is);
522 ISect*
523 newisect(Part *part, char *name, u32int blocksize, u32int tabsize)
525 ISect *is;
526 u32int tabbase;
528 is = MKZ(ISect);
529 if(is == nil)
530 return nil;
532 namecp(is->name, name);
533 is->version = ISectVersion;
534 is->part = part;
535 is->blocksize = blocksize;
536 is->start = 0;
537 is->stop = 0;
538 tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
539 is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1);
540 is->blocks = is->part->size / blocksize - is->blockbase / blocksize;
542 is = initisect1(is);
543 if(is == nil)
544 return nil;
546 return is;
549 /*
550 * initialize the computed parameters for an index
551 */
552 static ISect*
553 initisect1(ISect *is)
555 u64int v;
557 is->buckmax = (is->blocksize - IBucketSize) / IEntrySize;
558 is->blocklog = u64log2(is->blocksize);
559 if(is->blocksize != (1 << is->blocklog)){
560 seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize);
561 freeisect(is);
562 return nil;
564 partblocksize(is->part, is->blocksize);
565 is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1);
566 if(is->tabbase >= is->blockbase){
567 seterr(ECorrupt, "index section config table overlaps bucket storage");
568 freeisect(is);
569 return nil;
571 is->tabsize = is->blockbase - is->tabbase;
572 v = is->part->size & ~(u64int)(is->blocksize - 1);
573 if(is->blockbase + (u64int)is->blocks * is->blocksize != v){
574 seterr(ECorrupt, "invalid blocks in index section %s", is->name);
575 //ZZZZZZZZZ
576 // freeisect(is);
577 // return nil;
580 if(is->stop - is->start > is->blocks){
581 seterr(ECorrupt, "index section overflows available space");
582 freeisect(is);
583 return nil;
585 if(is->start > is->stop){
586 seterr(ECorrupt, "invalid index section range");
587 freeisect(is);
588 return nil;
591 return is;
594 int
595 wbisect(ISect *is)
597 ZBlock *b;
599 b = alloczblock(HeadSize, 1);
600 if(b == nil)
601 //ZZZ set error?
602 return -1;
604 if(packisect(is, b->data) < 0){
605 seterr(ECorrupt, "can't make index section header: %r");
606 freezblock(b);
607 return -1;
609 if(writepart(is->part, PartBlank, b->data, HeadSize) < 0){
610 seterr(EAdmin, "can't write index section header: %r");
611 freezblock(b);
612 return -1;
614 freezblock(b);
616 return 0;
619 void
620 freeisect(ISect *is)
622 if(is == nil)
623 return;
624 free(is);
627 void
628 freeindex(Index *ix)
630 int i;
632 if(ix == nil)
633 return;
634 free(ix->amap);
635 free(ix->arenas);
636 if(ix->sects)
637 for(i = 0; i < ix->nsects; i++)
638 freeisect(ix->sects[i]);
639 free(ix->sects);
640 free(ix->smap);
641 free(ix);
644 /*
645 * write a clump to an available arena in the index
646 * and return the address of the clump within the index.
647 ZZZ question: should this distinguish between an arena
648 filling up and real errors writing the clump?
649 */
650 u64int
651 writeiclump(Index *ix, Clump *c, u8int *clbuf)
653 u64int a;
654 int i;
656 for(i = ix->mapalloc; i < ix->narenas; i++){
657 a = writeaclump(ix->arenas[i], c, clbuf);
658 if(a != TWID64)
659 return a + ix->amap[i].start;
662 seterr(EAdmin, "no space left in arenas");
663 return TWID64;
666 /*
667 * convert an arena index to an relative arena address
668 */
669 Arena*
670 amapitoa(Index *ix, u64int a, u64int *aa)
672 int i, r, l, m;
674 l = 1;
675 r = ix->narenas - 1;
676 while(l <= r){
677 m = (r + l) / 2;
678 if(ix->amap[m].start <= a)
679 l = m + 1;
680 else
681 r = m - 1;
683 l--;
685 if(a > ix->amap[l].stop){
686 for(i=0; i<ix->narenas; i++)
687 print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop);
688 print("want arena %d for %llux\n", l, a);
689 seterr(ECrash, "unmapped address passed to amapitoa");
690 return nil;
693 if(ix->arenas[l] == nil){
694 seterr(ECrash, "unmapped arena selected in amapitoa");
695 return nil;
697 *aa = a - ix->amap[l].start;
698 return ix->arenas[l];
701 int
702 iaddrcmp(IAddr *ia1, IAddr *ia2)
704 return ia1->type != ia2->type
705 || ia1->size != ia2->size
706 || ia1->blocks != ia2->blocks
707 || ia1->addr != ia2->addr;
710 /*
711 * lookup the score in the partition
713 * nothing needs to be explicitly locked:
714 * only static parts of ix are used, and
715 * the bucket is locked by the DBlock lock.
716 */
717 int
718 loadientry(Index *ix, u8int *score, int type, IEntry *ie)
720 ISect *is;
721 DBlock *b;
722 IBucket ib;
723 u32int buck;
724 int h, ok;
726 ok = -1;
728 qlock(&stats.lock);
729 stats.indexreads++;
730 qunlock(&stats.lock);
731 b = loadibucket(ix, score, &is, &buck, &ib);
732 if(b == nil)
733 return -1;
735 if(okibucket(&ib, is) < 0)
736 goto out;
738 h = bucklook(score, type, ib.data, ib.n);
739 if(h & 1){
740 h ^= 1;
741 unpackientry(ie, &ib.data[h]);
742 ok = 0;
743 goto out;
746 out:
747 putdblock(b);
748 return ok;
751 /*
752 * insert or update an index entry into the appropriate bucket
753 */
754 int
755 storeientry(Index *ix, IEntry *ie)
757 ISect *is;
758 DBlock *b;
759 IBucket ib;
760 u32int buck;
761 int h, ok;
763 ok = 0;
765 qlock(&stats.lock);
766 stats.indexwreads++;
767 qunlock(&stats.lock);
769 top:
770 b = loadibucket(ix, ie->score, &is, &buck, &ib);
771 if(b == nil)
772 return -1;
774 if(okibucket(&ib, is) < 0)
775 goto out;
777 h = bucklook(ie->score, ie->ia.type, ib.data, ib.n);
778 if(h & 1){
779 h ^= 1;
780 dirtydblock(b, DirtyIndex);
781 packientry(ie, &ib.data[h]);
782 ok = writebucket(is, buck, &ib, b);
783 goto out;
786 if(ib.n < is->buckmax){
787 dirtydblock(b, DirtyIndex);
788 memmove(&ib.data[h + IEntrySize], &ib.data[h], ib.n * IEntrySize - h);
789 ib.n++;
791 packientry(ie, &ib.data[h]);
792 ok = writebucket(is, buck, &ib, b);
793 goto out;
796 /* block is full -- not supposed to happen in V1 */
797 if(ix->version == IndexVersion1){
798 seterr(EAdmin, "index bucket %ud on %s is full\n", buck, is->part->name);
799 ok = -1;
800 goto out;
803 /* in V2, split the block and try again; splitiblock puts b */
804 if(splitiblock(ix, b, is, buck, &ib) < 0)
805 return -1;
806 goto top;
808 out:
809 putdblock(b);
810 return ok;
813 static int
814 writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b)
816 assert(b->dirty == DirtyIndex || b->dirty == DirtyIndexSplit);
818 if(buck >= is->blocks){
819 seterr(EAdmin, "index write out of bounds: %d >= %d\n",
820 buck, is->blocks);
821 return -1;
823 qlock(&stats.lock);
824 stats.indexwrites++;
825 qunlock(&stats.lock);
826 packibucket(ib, b->data);
827 // return writepart(is->part, is->blockbase + ((u64int)buck << is->blocklog), b->data, is->blocksize);
828 return 0;
831 static int
832 okibucket(IBucket *ib, ISect *is)
834 if(ib->n <= is->buckmax)
835 return 0;
837 seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, depth=%lud range=[%lud,%lud)",
838 ib->n, is->buckmax, ib->depth, is->start, is->stop);
839 return -1;
842 /*
843 * look for score within data;
844 * return 1 | byte index of matching index,
845 * or 0 | index of least element > score
846 */
847 static int
848 bucklook(u8int *score, int otype, u8int *data, int n)
850 int i, r, l, m, h, c, cc, type;
852 type = vttodisktype(otype);
853 l = 0;
854 r = n - 1;
855 while(l <= r){
856 m = (r + l) >> 1;
857 h = m * IEntrySize;
858 for(i = 0; i < VtScoreSize; i++){
859 c = score[i];
860 cc = data[h + i];
861 if(c != cc){
862 if(c > cc)
863 l = m + 1;
864 else
865 r = m - 1;
866 goto cont;
869 cc = data[h + IEntryTypeOff];
870 if(type != cc){
871 if(type > cc)
872 l = m + 1;
873 else
874 r = m - 1;
875 goto cont;
877 return h | 1;
878 cont:;
881 return l * IEntrySize;
884 /*
885 * compare two IEntries; consistent with bucklook
886 */
887 int
888 ientrycmp(const void *vie1, const void *vie2)
890 u8int *ie1, *ie2;
891 int i, v1, v2;
893 ie1 = (u8int*)vie1;
894 ie2 = (u8int*)vie2;
895 for(i = 0; i < VtScoreSize; i++){
896 v1 = ie1[i];
897 v2 = ie2[i];
898 if(v1 != v2){
899 if(v1 < v2)
900 return -1;
901 return 0;
904 v1 = ie1[IEntryTypeOff];
905 v2 = ie2[IEntryTypeOff];
906 if(v1 != v2){
907 if(v1 < v2)
908 return -1;
909 return 0;
911 return -1;
914 /*
915 * find the number of the index section holding bucket #buck
916 */
917 static int
918 indexsect0(Index *ix, u32int buck)
920 int r, l, m;
922 l = 1;
923 r = ix->nsects - 1;
924 while(l <= r){
925 m = (r + l) >> 1;
926 if(ix->sects[m]->start <= buck)
927 l = m + 1;
928 else
929 r = m - 1;
931 return l - 1;
934 /*
935 * load the index block at bucket #buck
936 */
937 static DBlock*
938 loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int read)
940 ISect *is;
941 DBlock *b;
943 is = ix->sects[indexsect0(ix, buck)];
944 if(buck < is->start || is->stop <= buck){
945 seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
946 return nil;
949 buck -= is->start;
950 if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), read)) == nil)
951 return nil;
953 if(pis)
954 *pis = is;
955 if(pbuck)
956 *pbuck = buck;
957 if(ib)
958 unpackibucket(ib, b->data);
959 return b;
962 /*
963 * find the number of the index section holding score
964 */
965 static int
966 indexsect1(Index *ix, u8int *score)
968 return indexsect0(ix, hashbits(score, 32) / ix->div);
971 /*
972 * load the index block responsible for score.
973 */
974 static DBlock*
975 loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
977 return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, 1);
980 static u32int
981 keytobuck(Index *ix, u32int key, int d)
983 /* clear all but top d bits; can't depend on boundary case shifts */
984 if(d == 0)
985 key = 0;
986 else if(d != 32)
987 key &= ~((1<<(32-d))-1);
989 /* truncate to maxdepth bits */
990 if(ix->maxdepth != 32)
991 key >>= 32 - ix->maxdepth;
993 return ix->bitblocks + key;
996 /*
997 * to check whether .xxx has split, check whether .xxx1 is in use.
998 * it might be worth caching the block for future lookups, but for now
999 * let's assume the dcache is good enough.
1001 static int
1002 bitmapop(Index *ix, u32int key, int d, int set)
1004 DBlock *b;
1005 int inuse;
1006 u32int key1, buck1;
1008 if(d >= ix->maxdepth)
1009 return 0;
1012 /* construct .xxx1 in bucket number format */
1013 key1 = key | (1<<(32-d-1));
1014 buck1 = keytobuck(ix, key1, d+1);
1016 if(0) fprint(2, "key %d/%0*ub key1 %d/%0*ub buck1 %08ux\n",
1017 d, d, KEY(key, d), d+1, d+1, KEY(key1, d+1), buck1);
1019 /* check whether buck1 is in use */
1021 if((b = loadibucket0(ix, buck1 >> ix->bitkeylog, nil, nil, nil, 1)) == nil){
1022 seterr(ECorrupt, "cannot load in-use bitmap block");
1023 fprint(2, "loadibucket: %r\n");
1024 return -1;
1026 if(0) fprint(2, "buck1 %08ux bitkeymask %08ux bitkeylog %d\n", buck1, ix->bitkeymask, ix->bitkeylog);
1027 buck1 &= ix->bitkeymask;
1028 inuse = ((u32int*)b->data)[buck1>>5] & (1<<(buck1&31));
1029 if(set && !inuse){
1030 dirtydblock(b, DirtyIndexBitmap);
1031 ((u32int*)b->data)[buck1>>5] |= (1<<(buck1&31));
1033 putdblock(b);
1034 return inuse;
1037 static int
1038 issplit(Index *ix, u32int key, int d)
1040 return bitmapop(ix, key, d, 0);
1043 static int
1044 marksplit(Index *ix, u32int key, int d)
1046 return bitmapop(ix, key, d, 1);
1050 * find the number of the index section holding score.
1051 * it's not terrible to be wrong once in a while, so we just
1052 * do what the bitmap tells us and don't worry about the
1053 * bitmap being out of date.
1055 static int
1056 indexsect2(Index *ix, u8int *score)
1058 u32int key;
1059 int d, is;
1061 key = hashbits(score, 32);
1062 for(d=0; d<=ix->maxdepth; d++){
1063 is = issplit(ix, key, d);
1064 if(is == -1)
1065 return 0; /* must return something valid! */
1066 if(!is)
1067 break;
1070 if(d > ix->maxdepth){
1071 seterr(EBug, "index bitmap inconsistent with maxdepth");
1072 return 0; /* must return something valid! */
1075 return indexsect0(ix, keytobuck(ix, key, d));
1079 * load the index block responsible for score.
1080 * (unlike indexsect2, must be correct always.)
1082 static DBlock*
1083 loadibucket2(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
1085 u32int key;
1086 int d, try, is;
1087 DBlock *b;
1089 for(try=0; try<32; try++){
1090 key = hashbits(score, 32);
1091 for(d=0; d<=ix->maxdepth; d++){
1092 is = issplit(ix, key, d);
1093 if(is == -1)
1094 return nil;
1095 if(!is)
1096 break;
1098 if(d > ix->maxdepth){
1099 seterr(EBug, "index bitmap inconsistent with maxdepth");
1100 return nil;
1103 if((b = loadibucket0(ix, keytobuck(ix, key, d), pis, pbuck, ib, 1)) == nil)
1104 return nil;
1106 if(ib->depth == d)
1107 return b;
1109 if(ib->depth < d){
1110 fprint(2, "loaded block %ud for %d/%0*ub got %d/%0*ub\n",
1111 *pbuck, d, d, KEY(key,d), ib->depth, ib->depth, KEY(key, ib->depth));
1112 seterr(EBug, "index block has smaller depth than expected -- cannot happen");
1113 putdblock(b);
1114 return nil;
1118 * ib->depth > d, meaning the bitmap was out of date.
1119 * fix the bitmap and try again. if we actually updated
1120 * the bitmap in splitiblock, this would only happen if
1121 * venti crashed at an inopportune moment. but this way
1122 * the code gets tested more.
1124 if(0) fprint(2, "update bitmap: %d/%0*ub is split\n", d, d, KEY(key,d));
1125 putdblock(b);
1126 if(marksplit(ix, key, d) < 0)
1127 return nil;
1129 seterr(EBug, "loadibucket2 failed to sync bitmap with disk!");
1130 return nil;
1133 static int
1134 splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib)
1136 int i, d;
1137 u8int *score;
1138 u32int buck0, buck1, key0, key1, key, dmask;
1139 DBlock *b0, *b1;
1140 IBucket ib0, ib1;
1142 if(ib->depth == ix->maxdepth){
1143 if(0) fprint(2, "depth=%d == maxdepth\n", ib->depth);
1144 seterr(EAdmin, "index bucket %ud on %s is full\n", buck, is->part->name);
1145 putdblock(b);
1146 return -1;
1149 buck = is->start+buck - ix->bitblocks;
1150 d = ib->depth+1;
1151 buck0 = buck;
1152 buck1 = buck0 | (1<<(ix->maxdepth-d));
1153 if(ix->maxdepth == 32){
1154 key0 = buck0;
1155 key1 = buck1;
1156 }else{
1157 key0 = buck0 << (32-ix->maxdepth);
1158 key1 = buck1 << (32-ix->maxdepth);
1160 buck0 += ix->bitblocks;
1161 buck1 += ix->bitblocks;
1162 USED(buck0);
1163 USED(key1);
1165 if(d == 32)
1166 dmask = TWID32;
1167 else
1168 dmask = TWID32 ^ ((1<<(32-d))-1);
1171 * Since we hold the lock for b, the bitmap
1172 * thinks buck1 doesn't exist, and the bit
1173 * for buck1 can't be updated without first
1174 * locking and splitting b, it's safe to try to
1175 * acquire the lock on buck1 without dropping b.
1176 * No one else will go after it too.
1178 * Also, none of the rest of the code ever locks
1179 * more than one block at a time, so it's okay if
1180 * we do.
1182 if((b1 = loadibucket0(ix, buck1, nil, nil, &ib1, 0)) == nil){
1183 putdblock(b);
1184 return -1;
1186 b0 = b;
1187 ib0 = *ib;
1190 * Funny locking going on here -- see dirtydblock.
1191 * We must putdblock(b1) before putdblock(b0).
1193 dirtydblock(b0, DirtyIndex);
1194 dirtydblock(b1, DirtyIndexSplit);
1197 * Split the block contents.
1198 * The block is already sorted, so it's pretty easy:
1199 * the first part stays, the second part goes to b1.
1201 ib0.n = 0;
1202 ib0.depth = d;
1203 ib1.n = 0;
1204 ib1.depth = d;
1205 for(i=0; i<ib->n; i++){
1206 score = ib->data+i*IEntrySize;
1207 key = hashbits(score, 32);
1208 if((key&dmask) != key0)
1209 break;
1211 ib0.n = i;
1212 ib1.n = ib->n - ib0.n;
1213 memmove(ib1.data, ib0.data+ib0.n*IEntrySize, ib1.n*IEntrySize);
1214 if(0) fprint(2, "splitiblock %d in %d/%0*ub => %d in %d/%0*ub + %d in %d/%0*ub\n",
1215 ib->n, d-1, d-1, key0>>(32-d+1),
1216 ib0.n, d, d, key0>>(32-d),
1217 ib1.n, d, d, key1>>(32-d));
1219 packibucket(&ib0, b0->data);
1220 packibucket(&ib1, b1->data);
1222 /* order matters! see comment above. */
1223 putdblock(b1);
1224 putdblock(b0);
1227 * let the recovery code take care of updating the bitmap.
1230 qlock(&stats.lock);
1231 stats.indexsplits++;
1232 qunlock(&stats.lock);
1234 return 0;
1237 int
1238 indexsect(Index *ix, u8int *score)
1240 if(ix->version == IndexVersion1)
1241 return indexsect1(ix, score);
1242 return indexsect2(ix, score);
1245 DBlock*
1246 loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
1248 if(ix->version == IndexVersion1)
1249 return loadibucket1(ix, score, pis, pbuck, ib);
1250 return loadibucket2(ix, score, pis, pbuck, ib);