Blob


1 /*
2 * Index, mapping scores to log positions.
3 *
4 * The index is made up of some number of index sections, each of
5 * which is typically stored on a different disk. The blocks in all the
6 * index sections are logically numbered, with each index section
7 * responsible for a range of blocks. Blocks are typically 8kB.
8 *
9 * The N index blocks are treated as a giant hash table. The top 32 bits
10 * of score are used as the key for a lookup. Each index block holds
11 * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
12 *
13 * The index is sized so that a particular bucket is extraordinarily
14 * unlikely to overflow: assuming compressed data blocks are 4kB
15 * on disk, and assuming each block has a 40 byte index entry,
16 * the index data will be 1% of the total data. Since scores are essentially
17 * random, all buckets should be about the same fullness.
18 * A factor of 5 gives us a wide comfort boundary to account for
19 * random variation. So the index disk space should be 5% of the arena disk space.
20 */
22 #include "stdinc.h"
23 #include "dat.h"
24 #include "fns.h"
26 //static int bucklook(u8int *score, int type, u8int *data, int n);
27 //static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b);
28 //static int okibucket(IBucket *ib, ISect *is);
29 static int initindex1(Index*);
30 static ISect *initisect1(ISect *is);
31 //static int splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib);
33 #define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0)
35 //static QLock indexlock; //ZZZ
37 static char IndexMagic[] = "venti index configuration";
39 Index*
40 initindex(char *name, ISect **sects, int n)
41 {
42 IFile f;
43 Index *ix;
44 ISect *is;
45 u32int last, blocksize, tabsize;
46 int i;
48 if(n <= 0){
49 fprint(2, "bad n\n");
50 seterr(EOk, "no index sections to initialize index");
51 return nil;
52 }
53 ix = MKZ(Index);
54 if(ix == nil){
55 fprint(2, "no mem\n");
56 seterr(EOk, "can't initialize index: out of memory");
57 freeindex(ix);
58 return nil;
59 }
61 tabsize = sects[0]->tabsize;
62 if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0)
63 return nil;
64 if(parseindex(&f, ix) < 0){
65 freeifile(&f);
66 freeindex(ix);
67 return nil;
68 }
69 freeifile(&f);
70 if(namecmp(ix->name, name) != 0){
71 seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name);
72 return nil;
73 }
74 if(ix->nsects != n){
75 seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects);
76 freeindex(ix);
77 return nil;
78 }
79 ix->sects = sects;
80 last = 0;
81 blocksize = ix->blocksize;
82 for(i = 0; i < ix->nsects; i++){
83 is = sects[i];
84 if(namecmp(ix->name, is->index) != 0
85 || is->blocksize != blocksize
86 || is->tabsize != tabsize
87 || namecmp(is->name, ix->smap[i].name) != 0
88 || is->start != ix->smap[i].start
89 || is->stop != ix->smap[i].stop
90 || last != is->start
91 || is->start > is->stop){
92 seterr(ECorrupt, "inconsistent index sections in %s", ix->name);
93 freeindex(ix);
94 return nil;
95 }
96 last = is->stop;
97 }
98 ix->tabsize = tabsize;
99 ix->buckets = last;
101 if(initindex1(ix) < 0){
102 freeindex(ix);
103 return nil;
106 ix->arenas = MKNZ(Arena*, ix->narenas);
107 if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){
108 freeindex(ix);
109 return nil;
112 return ix;
115 static int
116 initindex1(Index *ix)
118 u32int buckets;
120 ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
121 buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
122 if(buckets != ix->buckets){
123 seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
124 return -1;
127 return 0;
130 int
131 wbindex(Index *ix)
133 Fmt f;
134 ZBlock *b;
135 int i;
137 if(ix->nsects == 0){
138 seterr(EOk, "no sections in index %s", ix->name);
139 return -1;
141 b = alloczblock(ix->tabsize, 1, ix->blocksize);
142 if(b == nil){
143 seterr(EOk, "can't write index configuration: out of memory");
144 return -1;
146 fmtzbinit(&f, b);
147 if(outputindex(&f, ix) < 0){
148 seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize);
149 freezblock(b);
150 return -1;
152 for(i = 0; i < ix->nsects; i++){
153 if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0){
154 seterr(EOk, "can't write index: %r");
155 freezblock(b);
156 return -1;
159 freezblock(b);
161 for(i = 0; i < ix->nsects; i++)
162 if(wbisect(ix->sects[i]) < 0)
163 return -1;
165 return 0;
168 /*
169 * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
170 * version, blocksize: u32int
171 * name: max. ANameSize string
172 * sections, arenas: AMap
173 */
174 int
175 outputindex(Fmt *f, Index *ix)
177 if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
178 || outputamap(f, ix->smap, ix->nsects) < 0
179 || outputamap(f, ix->amap, ix->narenas) < 0)
180 return -1;
181 return 0;
184 int
185 parseindex(IFile *f, Index *ix)
187 AMapN amn;
188 u32int v;
189 char *s;
191 /*
192 * magic
193 */
194 s = ifileline(f);
195 if(s == nil || strcmp(s, IndexMagic) != 0){
196 seterr(ECorrupt, "bad index magic for %s", f->name);
197 return -1;
200 /*
201 * version
202 */
203 if(ifileu32int(f, &v) < 0){
204 seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
205 return -1;
207 ix->version = v;
208 if(ix->version != IndexVersion){
209 seterr(ECorrupt, "bad version number in %s", f->name);
210 return -1;
213 /*
214 * name
215 */
216 if(ifilename(f, ix->name) < 0){
217 seterr(ECorrupt, "syntax error: bad index name in %s", f->name);
218 return -1;
221 /*
222 * block size
223 */
224 if(ifileu32int(f, &v) < 0){
225 seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
226 return -1;
228 ix->blocksize = v;
230 if(parseamap(f, &amn) < 0)
231 return -1;
232 ix->nsects = amn.n;
233 ix->smap = amn.map;
235 if(parseamap(f, &amn) < 0)
236 return -1;
237 ix->narenas = amn.n;
238 ix->amap = amn.map;
240 return 0;
243 /*
244 * initialize an entirely new index
245 */
246 Index *
247 newindex(char *name, ISect **sects, int n)
249 Index *ix;
250 AMap *smap;
251 u64int nb;
252 u32int div, ub, xb, fb, start, stop, blocksize, tabsize;
253 int i, j;
255 if(n < 1){
256 seterr(EOk, "creating index with no index sections");
257 return nil;
260 /*
261 * compute the total buckets available in the index,
262 * and the total buckets which are used.
263 */
264 nb = 0;
265 blocksize = sects[0]->blocksize;
266 tabsize = sects[0]->tabsize;
267 for(i = 0; i < n; i++){
268 if(sects[i]->start != 0 || sects[i]->stop != 0
269 || sects[i]->index[0] != '\0'){
270 seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
271 return nil;
273 if(blocksize != sects[i]->blocksize){
274 seterr(EOk, "mismatched block sizes in index sections");
275 return nil;
277 if(tabsize != sects[i]->tabsize){
278 seterr(EOk, "mismatched config table sizes in index sections");
279 return nil;
281 nb += sects[i]->blocks;
284 /*
285 * check for duplicate names
286 */
287 for(i = 0; i < n; i++){
288 for(j = i + 1; j < n; j++){
289 if(namecmp(sects[i]->name, sects[j]->name) == 0){
290 seterr(EOk, "duplicate section name %s for index %s", sects[i]->name, name);
291 return nil;
296 if(nb >= ((u64int)1 << 32)){
297 seterr(EBug, "index too large");
298 return nil;
301 fb = 0;
302 div = (((u64int)1 << 32) + nb - 1) / nb;
303 ub = (((u64int)1 << 32) - 1) / div + 1;
304 if(div < 100){
305 seterr(EBug, "index divisor too coarse");
306 return nil;
308 if(ub > nb){
309 seterr(EBug, "index initialization math wrong");
310 return nil;
312 xb = nb - ub;
314 /*
315 * initialize each of the index sections
316 * and the section map table
317 */
318 smap = MKNZ(AMap, n);
319 if(smap == nil){
320 seterr(EOk, "can't create new index: out of memory");
321 return nil;
323 start = 0;
324 for(i = 0; i < n; i++){
325 stop = start + sects[i]->blocks - xb / n;
326 if(i == n - 1)
327 stop = ub;
328 sects[i]->start = start;
329 sects[i]->stop = stop;
330 namecp(sects[i]->index, name);
332 smap[i].start = start;
333 smap[i].stop = stop;
334 namecp(smap[i].name, sects[i]->name);
335 start = stop;
338 /*
339 * initialize the index itself
340 */
341 ix = MKZ(Index);
342 if(ix == nil){
343 seterr(EOk, "can't create new index: out of memory");
344 free(smap);
345 return nil;
347 ix->version = IndexVersion;
348 namecp(ix->name, name);
349 ix->sects = sects;
350 ix->smap = smap;
351 ix->nsects = n;
352 ix->blocksize = blocksize;
353 ix->buckets = ub;
354 ix->tabsize = tabsize;
355 ix->div = div;
356 ix->bitblocks = fb;
358 if(initindex1(ix) < 0){
359 free(smap);
360 return nil;
363 return ix;
366 ISect*
367 initisect(Part *part)
369 ISect *is;
370 ZBlock *b;
371 int ok;
373 b = alloczblock(HeadSize, 0, 0);
374 if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){
375 seterr(EAdmin, "can't read index section header: %r");
376 return nil;
379 is = MKZ(ISect);
380 if(is == nil){
381 freezblock(b);
382 return nil;
384 is->part = part;
385 ok = unpackisect(is, b->data);
386 freezblock(b);
387 if(ok < 0){
388 seterr(ECorrupt, "corrupted index section header: %r");
389 freeisect(is);
390 return nil;
393 if(is->version != ISectVersion1 && is->version != ISectVersion2){
394 seterr(EAdmin, "unknown index section version %d", is->version);
395 freeisect(is);
396 return nil;
399 return initisect1(is);
402 ISect*
403 newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize)
405 ISect *is;
406 u32int tabbase;
408 is = MKZ(ISect);
409 if(is == nil)
410 return nil;
412 namecp(is->name, name);
413 is->version = vers;
414 is->part = part;
415 is->blocksize = blocksize;
416 is->start = 0;
417 is->stop = 0;
418 tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
419 is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1);
420 is->blocks = is->part->size / blocksize - is->blockbase / blocksize;
421 is->bucketmagic = 0;
422 if(is->version == ISectVersion2){
423 do{
424 is->bucketmagic = fastrand();
425 }while(is->bucketmagic==0);
427 is = initisect1(is);
428 if(is == nil)
429 return nil;
431 return is;
434 /*
435 * initialize the computed parameters for an index
436 */
437 static ISect*
438 initisect1(ISect *is)
440 u64int v;
442 is->buckmax = (is->blocksize - IBucketSize) / IEntrySize;
443 is->blocklog = u64log2(is->blocksize);
444 if(is->blocksize != (1 << is->blocklog)){
445 seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize);
446 freeisect(is);
447 return nil;
449 partblocksize(is->part, is->blocksize);
450 is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1);
451 if(is->tabbase >= is->blockbase){
452 seterr(ECorrupt, "index section config table overlaps bucket storage");
453 freeisect(is);
454 return nil;
456 is->tabsize = is->blockbase - is->tabbase;
457 v = is->part->size & ~(u64int)(is->blocksize - 1);
458 if(is->blockbase + (u64int)is->blocks * is->blocksize != v){
459 seterr(ECorrupt, "invalid blocks in index section %s", is->name);
460 //ZZZZZZZZZ
461 // freeisect(is);
462 // return nil;
465 if(is->stop - is->start > is->blocks){
466 seterr(ECorrupt, "index section overflows available space");
467 freeisect(is);
468 return nil;
470 if(is->start > is->stop){
471 seterr(ECorrupt, "invalid index section range");
472 freeisect(is);
473 return nil;
476 return is;
479 int
480 wbisect(ISect *is)
482 ZBlock *b;
484 b = alloczblock(HeadSize, 1, 0);
485 if(b == nil)
486 //ZZZ set error?
487 return -1;
489 if(packisect(is, b->data) < 0){
490 seterr(ECorrupt, "can't make index section header: %r");
491 freezblock(b);
492 return -1;
494 if(writepart(is->part, PartBlank, b->data, HeadSize) < 0){
495 seterr(EAdmin, "can't write index section header: %r");
496 freezblock(b);
497 return -1;
499 freezblock(b);
501 return 0;
504 void
505 freeisect(ISect *is)
507 if(is == nil)
508 return;
509 free(is);
512 void
513 freeindex(Index *ix)
515 int i;
517 if(ix == nil)
518 return;
519 free(ix->amap);
520 free(ix->arenas);
521 if(ix->sects)
522 for(i = 0; i < ix->nsects; i++)
523 freeisect(ix->sects[i]);
524 free(ix->sects);
525 free(ix->smap);
526 free(ix);
529 /*
530 * write a clump to an available arena in the index
531 * and return the address of the clump within the index.
532 ZZZ question: should this distinguish between an arena
533 filling up and real errors writing the clump?
534 */
535 u64int
536 writeiclump(Index *ix, Clump *c, u8int *clbuf, u64int *pa)
538 u64int a;
539 int i;
541 trace(TraceLump, "writeiclump enter");
542 for(i = ix->mapalloc; i < ix->narenas; i++){
543 a = writeaclump(ix->arenas[i], c, clbuf, ix->amap[i].start, pa);
544 if(a != TWID64){
545 ix->mapalloc = i; /* assuming write is atomic, race is okay */
546 trace(TraceLump, "writeiclump exit");
547 return a;
551 seterr(EAdmin, "no space left in arenas");
552 trace(TraceLump, "writeiclump failed");
553 return TWID64;
556 /*
557 * convert an arena index to an relative arena address
558 */
559 Arena*
560 amapitoa(Index *ix, u64int a, u64int *aa)
562 int i, r, l, m;
564 l = 1;
565 r = ix->narenas - 1;
566 while(l <= r){
567 m = (r + l) / 2;
568 if(ix->amap[m].start <= a)
569 l = m + 1;
570 else
571 r = m - 1;
573 l--;
575 if(a > ix->amap[l].stop){
576 for(i=0; i<ix->narenas; i++)
577 print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop);
578 print("want arena %d for %llux\n", l, a);
579 seterr(ECrash, "unmapped address passed to amapitoa");
580 return nil;
583 if(ix->arenas[l] == nil){
584 seterr(ECrash, "unmapped arena selected in amapitoa");
585 return nil;
587 *aa = a - ix->amap[l].start;
588 return ix->arenas[l];
591 int
592 iaddrcmp(IAddr *ia1, IAddr *ia2)
594 return ia1->type != ia2->type
595 || ia1->size != ia2->size
596 || ia1->blocks != ia2->blocks
597 || ia1->addr != ia2->addr;
600 /*
601 * lookup the score in the partition
603 * nothing needs to be explicitly locked:
604 * only static parts of ix are used, and
605 * the bucket is locked by the DBlock lock.
606 */
607 int
608 loadientry(Index *ix, u8int *score, int type, IEntry *ie)
610 ISect *is;
611 DBlock *b;
612 IBucket ib;
613 u32int buck;
614 int h, ok;
616 ok = -1;
618 trace(TraceLump, "loadientry enter");
620 /*
621 qlock(&stats.lock);
622 stats.indexreads++;
623 qunlock(&stats.lock);
624 */
626 if(!inbloomfilter(mainindex->bloom, score)){
627 trace(TraceLump, "loadientry bloomhit");
628 return -1;
631 trace(TraceLump, "loadientry loadibucket");
632 b = loadibucket(ix, score, &is, &buck, &ib);
633 trace(TraceLump, "loadientry loadedibucket");
634 if(b == nil)
635 return -1;
637 if(okibucket(&ib, is) < 0){
638 trace(TraceLump, "loadientry badbucket");
639 goto out;
642 h = bucklook(score, type, ib.data, ib.n);
643 if(h & 1){
644 h ^= 1;
645 trace(TraceLump, "loadientry found");
646 unpackientry(ie, &ib.data[h]);
647 ok = 0;
648 goto out;
650 trace(TraceLump, "loadientry notfound");
651 addstat(StatBloomFalseMiss, 1);
652 out:
653 putdblock(b);
654 trace(TraceLump, "loadientry exit");
655 return ok;
658 int
659 okibucket(IBucket *ib, ISect *is)
661 if(ib->n <= is->buckmax)
662 return 0;
664 seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, range=[%lud,%lud)",
665 ib->n, is->buckmax, is->start, is->stop);
666 return -1;
669 /*
670 * look for score within data;
671 * return 1 | byte index of matching index,
672 * or 0 | index of least element > score
673 */
674 int
675 bucklook(u8int *score, int otype, u8int *data, int n)
677 int i, r, l, m, h, c, cc, type;
679 type = vttodisktype(otype);
680 l = 0;
681 r = n - 1;
682 while(l <= r){
683 m = (r + l) >> 1;
684 h = m * IEntrySize;
685 for(i = 0; i < VtScoreSize; i++){
686 c = score[i];
687 cc = data[h + i];
688 if(c != cc){
689 if(c > cc)
690 l = m + 1;
691 else
692 r = m - 1;
693 goto cont;
696 cc = data[h + IEntryTypeOff];
697 if(type != cc){
698 if(type > cc)
699 l = m + 1;
700 else
701 r = m - 1;
702 goto cont;
704 return h | 1;
705 cont:;
708 return l * IEntrySize;
711 /*
712 * compare two IEntries; consistent with bucklook
713 */
714 int
715 ientrycmp(const void *vie1, const void *vie2)
717 u8int *ie1, *ie2;
718 int i, v1, v2;
720 ie1 = (u8int*)vie1;
721 ie2 = (u8int*)vie2;
722 for(i = 0; i < VtScoreSize; i++){
723 v1 = ie1[i];
724 v2 = ie2[i];
725 if(v1 != v2){
726 if(v1 < v2)
727 return -1;
728 return 1;
731 v1 = ie1[IEntryTypeOff];
732 v2 = ie2[IEntryTypeOff];
733 if(v1 != v2){
734 if(v1 < v2)
735 return -1;
736 return 1;
738 return 0;
741 /*
742 * find the number of the index section holding bucket #buck
743 */
744 int
745 indexsect0(Index *ix, u32int buck)
747 int r, l, m;
749 l = 1;
750 r = ix->nsects - 1;
751 while(l <= r){
752 m = (r + l) >> 1;
753 if(ix->sects[m]->start <= buck)
754 l = m + 1;
755 else
756 r = m - 1;
758 return l - 1;
761 /*
762 * load the index block at bucket #buck
763 */
764 static DBlock*
765 loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int mode)
767 ISect *is;
768 DBlock *b;
770 is = ix->sects[indexsect0(ix, buck)];
771 if(buck < is->start || is->stop <= buck){
772 seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
773 return nil;
776 buck -= is->start;
777 if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), mode)) == nil)
778 return nil;
780 if(pis)
781 *pis = is;
782 if(pbuck)
783 *pbuck = buck;
784 if(ib)
785 unpackibucket(ib, b->data, is->bucketmagic);
786 return b;
789 /*
790 * find the number of the index section holding score
791 */
792 static int
793 indexsect1(Index *ix, u8int *score)
795 return indexsect0(ix, hashbits(score, 32) / ix->div);
798 /*
799 * load the index block responsible for score.
800 */
801 static DBlock*
802 loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
804 return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, OREAD);
807 int
808 indexsect(Index *ix, u8int *score)
810 return indexsect1(ix, score);
813 DBlock*
814 loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
816 return loadibucket1(ix, score, pis, pbuck, ib);