2 * Index, mapping scores to log positions.
4 * The index is made up of some number of index sections, each of
5 * which is typically stored on a different disk. The blocks in all the
6 * index sections are logically numbered, with each index section
7 * responsible for a range of blocks. Blocks are typically 8kB.
9 * The N index blocks are treated as a giant hash table. The top 32 bits
10 * of score are used as the key for a lookup. Each index block holds
11 * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
13 * The index is sized so that a particular bucket is extraordinarily
14 * unlikely to overflow: assuming compressed data blocks are 4kB
15 * on disk, and assuming each block has a 40 byte index entry,
16 * the index data will be 1% of the total data. Since scores are essentially
17 * random, all buckets should be about the same fullness.
18 * A factor of 5 gives us a wide comfort boundary to account for
19 * random variation. So the index disk space should be 5% of the arena disk space.
26 //static int bucklook(u8int *score, int type, u8int *data, int n);
27 //static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b);
28 //static int okibucket(IBucket *ib, ISect *is);
29 static int initindex1(Index*);
30 static ISect *initisect1(ISect *is);
31 //static int splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib);
33 #define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0)
35 //static QLock indexlock; //ZZZ
37 static char IndexMagic[] = "venti index configuration";
40 initindex(char *name, ISect **sects, int n)
45 u32int last, blocksize, tabsize;
50 seterr(EOk, "no index sections to initialize index");
55 fprint(2, "no mem\n");
56 seterr(EOk, "can't initialize index: out of memory");
61 tabsize = sects[0]->tabsize;
62 if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0)
64 if(parseindex(&f, ix) < 0){
70 if(namecmp(ix->name, name) != 0){
71 seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name);
75 seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects);
81 blocksize = ix->blocksize;
82 for(i = 0; i < ix->nsects; i++){
84 if(namecmp(ix->name, is->index) != 0
85 || is->blocksize != blocksize
86 || is->tabsize != tabsize
87 || namecmp(is->name, ix->smap[i].name) != 0
88 || is->start != ix->smap[i].start
89 || is->stop != ix->smap[i].stop
91 || is->start > is->stop){
92 seterr(ECorrupt, "inconsistent index sections in %s", ix->name);
98 ix->tabsize = tabsize;
101 if(initindex1(ix) < 0){
106 ix->arenas = MKNZ(Arena*, ix->narenas);
107 if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){
116 initindex1(Index *ix)
120 ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
121 buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
122 if(buckets != ix->buckets){
123 seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
138 seterr(EOk, "no sections in index %s", ix->name);
141 b = alloczblock(ix->tabsize, 1, ix->blocksize);
143 seterr(EOk, "can't write index configuration: out of memory");
147 if(outputindex(&f, ix) < 0){
148 seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize);
152 for(i = 0; i < ix->nsects; i++){
153 if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0){
154 seterr(EOk, "can't write index: %r");
161 for(i = 0; i < ix->nsects; i++)
162 if(wbisect(ix->sects[i]) < 0)
169 * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
170 * version, blocksize: u32int
171 * name: max. ANameSize string
172 * sections, arenas: AMap
175 outputindex(Fmt *f, Index *ix)
177 if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
178 || outputamap(f, ix->smap, ix->nsects) < 0
179 || outputamap(f, ix->amap, ix->narenas) < 0)
185 parseindex(IFile *f, Index *ix)
195 if(s == nil || strcmp(s, IndexMagic) != 0){
196 seterr(ECorrupt, "bad index magic for %s", f->name);
203 if(ifileu32int(f, &v) < 0){
204 seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
208 if(ix->version != IndexVersion){
209 seterr(ECorrupt, "bad version number in %s", f->name);
216 if(ifilename(f, ix->name) < 0){
217 seterr(ECorrupt, "syntax error: bad index name in %s", f->name);
224 if(ifileu32int(f, &v) < 0){
225 seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
230 if(parseamap(f, &amn) < 0)
235 if(parseamap(f, &amn) < 0)
244 * initialize an entirely new index
247 newindex(char *name, ISect **sects, int n)
252 u32int div, ub, xb, fb, start, stop, blocksize, tabsize;
256 seterr(EOk, "creating index with no index sections");
261 * compute the total buckets available in the index,
262 * and the total buckets which are used.
265 blocksize = sects[0]->blocksize;
266 tabsize = sects[0]->tabsize;
267 for(i = 0; i < n; i++){
268 if(sects[i]->start != 0 || sects[i]->stop != 0
269 || sects[i]->index[0] != '\0'){
270 seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
273 if(blocksize != sects[i]->blocksize){
274 seterr(EOk, "mismatched block sizes in index sections");
277 if(tabsize != sects[i]->tabsize){
278 seterr(EOk, "mismatched config table sizes in index sections");
281 nb += sects[i]->blocks;
285 * check for duplicate names
287 for(i = 0; i < n; i++){
288 for(j = i + 1; j < n; j++){
289 if(namecmp(sects[i]->name, sects[j]->name) == 0){
290 seterr(EOk, "duplicate section name %s for index %s", sects[i]->name, name);
296 if(nb >= ((u64int)1 << 32)){
297 seterr(EBug, "index too large");
302 div = (((u64int)1 << 32) + nb - 1) / nb;
303 ub = (((u64int)1 << 32) - 1) / div + 1;
305 seterr(EBug, "index divisor too coarse");
309 seterr(EBug, "index initialization math wrong");
315 * initialize each of the index sections
316 * and the section map table
318 smap = MKNZ(AMap, n);
320 seterr(EOk, "can't create new index: out of memory");
324 for(i = 0; i < n; i++){
325 stop = start + sects[i]->blocks - xb / n;
328 sects[i]->start = start;
329 sects[i]->stop = stop;
330 namecp(sects[i]->index, name);
332 smap[i].start = start;
334 namecp(smap[i].name, sects[i]->name);
339 * initialize the index itself
343 seterr(EOk, "can't create new index: out of memory");
347 ix->version = IndexVersion;
348 namecp(ix->name, name);
352 ix->blocksize = blocksize;
354 ix->tabsize = tabsize;
358 if(initindex1(ix) < 0){
367 initisect(Part *part)
373 b = alloczblock(HeadSize, 0, 0);
374 if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){
375 seterr(EAdmin, "can't read index section header: %r");
385 ok = unpackisect(is, b->data);
388 seterr(ECorrupt, "corrupted index section header: %r");
393 if(is->version != ISectVersion1 && is->version != ISectVersion2){
394 seterr(EAdmin, "unknown index section version %d", is->version);
399 return initisect1(is);
403 newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize)
412 namecp(is->name, name);
415 is->blocksize = blocksize;
418 tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
419 is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1);
420 is->blocks = is->part->size / blocksize - is->blockbase / blocksize;
422 if(is->version == ISectVersion2){
424 is->bucketmagic = fastrand();
425 }while(is->bucketmagic==0);
435 * initialize the computed parameters for an index
438 initisect1(ISect *is)
442 is->buckmax = (is->blocksize - IBucketSize) / IEntrySize;
443 is->blocklog = u64log2(is->blocksize);
444 if(is->blocksize != (1 << is->blocklog)){
445 seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize);
449 partblocksize(is->part, is->blocksize);
450 is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1);
451 if(is->tabbase >= is->blockbase){
452 seterr(ECorrupt, "index section config table overlaps bucket storage");
456 is->tabsize = is->blockbase - is->tabbase;
457 v = is->part->size & ~(u64int)(is->blocksize - 1);
458 if(is->blockbase + (u64int)is->blocks * is->blocksize != v){
459 seterr(ECorrupt, "invalid blocks in index section %s", is->name);
465 if(is->stop - is->start > is->blocks){
466 seterr(ECorrupt, "index section overflows available space");
470 if(is->start > is->stop){
471 seterr(ECorrupt, "invalid index section range");
484 b = alloczblock(HeadSize, 1, 0);
489 if(packisect(is, b->data) < 0){
490 seterr(ECorrupt, "can't make index section header: %r");
494 if(writepart(is->part, PartBlank, b->data, HeadSize) < 0){
495 seterr(EAdmin, "can't write index section header: %r");
522 for(i = 0; i < ix->nsects; i++)
523 freeisect(ix->sects[i]);
530 * write a clump to an available arena in the index
531 * and return the address of the clump within the index.
532 ZZZ question: should this distinguish between an arena
533 filling up and real errors writing the clump?
536 writeiclump(Index *ix, Clump *c, u8int *clbuf, u64int *pa)
541 trace(TraceLump, "writeiclump enter");
542 for(i = ix->mapalloc; i < ix->narenas; i++){
543 a = writeaclump(ix->arenas[i], c, clbuf, ix->amap[i].start, pa);
545 ix->mapalloc = i; /* assuming write is atomic, race is okay */
546 trace(TraceLump, "writeiclump exit");
551 seterr(EAdmin, "no space left in arenas");
552 trace(TraceLump, "writeiclump failed");
557 * convert an arena index to an relative arena address
560 amapitoa(Index *ix, u64int a, u64int *aa)
568 if(ix->amap[m].start <= a)
575 if(a > ix->amap[l].stop){
576 for(i=0; i<ix->narenas; i++)
577 print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop);
578 print("want arena %d for %llux\n", l, a);
579 seterr(ECrash, "unmapped address passed to amapitoa");
583 if(ix->arenas[l] == nil){
584 seterr(ECrash, "unmapped arena selected in amapitoa");
587 *aa = a - ix->amap[l].start;
588 return ix->arenas[l];
592 iaddrcmp(IAddr *ia1, IAddr *ia2)
594 return ia1->type != ia2->type
595 || ia1->size != ia2->size
596 || ia1->blocks != ia2->blocks
597 || ia1->addr != ia2->addr;
601 * lookup the score in the partition
603 * nothing needs to be explicitly locked:
604 * only static parts of ix are used, and
605 * the bucket is locked by the DBlock lock.
608 loadientry(Index *ix, u8int *score, int type, IEntry *ie)
618 trace(TraceLump, "loadientry enter");
623 qunlock(&stats.lock);
626 if(!inbloomfilter(mainindex->bloom, score)){
627 trace(TraceLump, "loadientry bloomhit");
631 trace(TraceLump, "loadientry loadibucket");
632 b = loadibucket(ix, score, &is, &buck, &ib);
633 trace(TraceLump, "loadientry loadedibucket");
637 if(okibucket(&ib, is) < 0){
638 trace(TraceLump, "loadientry badbucket");
642 h = bucklook(score, type, ib.data, ib.n);
645 trace(TraceLump, "loadientry found");
646 unpackientry(ie, &ib.data[h]);
650 trace(TraceLump, "loadientry notfound");
651 addstat(StatBloomFalseMiss, 1);
654 trace(TraceLump, "loadientry exit");
659 okibucket(IBucket *ib, ISect *is)
661 if(ib->n <= is->buckmax)
664 seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, range=[%lud,%lud)",
665 ib->n, is->buckmax, is->start, is->stop);
670 * look for score within data;
671 * return 1 | byte index of matching index,
672 * or 0 | index of least element > score
675 bucklook(u8int *score, int otype, u8int *data, int n)
677 int i, r, l, m, h, c, cc, type;
679 type = vttodisktype(otype);
685 for(i = 0; i < VtScoreSize; i++){
696 cc = data[h + IEntryTypeOff];
708 return l * IEntrySize;
712 * compare two IEntries; consistent with bucklook
715 ientrycmp(const void *vie1, const void *vie2)
722 for(i = 0; i < VtScoreSize; i++){
731 v1 = ie1[IEntryTypeOff];
732 v2 = ie2[IEntryTypeOff];
742 * find the number of the index section holding bucket #buck
745 indexsect0(Index *ix, u32int buck)
753 if(ix->sects[m]->start <= buck)
762 * load the index block at bucket #buck
765 loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int mode)
770 is = ix->sects[indexsect0(ix, buck)];
771 if(buck < is->start || is->stop <= buck){
772 seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
777 if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), mode)) == nil)
785 unpackibucket(ib, b->data, is->bucketmagic);
790 * find the number of the index section holding score
793 indexsect1(Index *ix, u8int *score)
795 return indexsect0(ix, hashbits(score, 32) / ix->div);
799 * load the index block responsible for score.
802 loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
804 return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, OREAD);
808 indexsect(Index *ix, u8int *score)
810 return indexsect1(ix, score);
814 loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
816 return loadibucket1(ix, score, pis, pbuck, ib);