Blob


1 /*
2 * Index, mapping scores to log positions.
3 *
4 * The index is made up of some number of index sections, each of
5 * which is typically stored on a different disk. The blocks in all the
6 * index sections are logically numbered, with each index section
7 * responsible for a range of blocks. Blocks are typically 8kB.
8 *
9 * The N index blocks are treated as a giant hash table. The top 32 bits
10 * of score are used as the key for a lookup. Each index block holds
11 * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
12 *
13 * The index is sized so that a particular bucket is extraordinarily
14 * unlikely to overflow: assuming compressed data blocks are 4kB
15 * on disk, and assuming each block has a 40 byte index entry,
16 * the index data will be 1% of the total data. Since scores are essentially
17 * random, all buckets should be about the same fullness.
18 * A factor of 5 gives us a wide comfort boundary to account for
19 * random variation. So the index disk space should be 5% of the arena disk space.
20 */
22 #include "stdinc.h"
23 #include "dat.h"
24 #include "fns.h"
26 static int initindex1(Index*);
27 static ISect *initisect1(ISect *is);
29 #define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0)
31 static char IndexMagic[] = "venti index configuration";
33 Index*
34 initindex(char *name, ISect **sects, int n)
35 {
36 IFile f;
37 Index *ix;
38 ISect *is;
39 u32int last, blocksize, tabsize;
40 int i;
42 if(n <= 0){
43 fprint(2, "bad n\n");
44 seterr(EOk, "no index sections to initialize index");
45 return nil;
46 }
47 ix = MKZ(Index);
48 if(ix == nil){
49 fprint(2, "no mem\n");
50 seterr(EOk, "can't initialize index: out of memory");
51 freeindex(ix);
52 return nil;
53 }
55 tabsize = sects[0]->tabsize;
56 if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0)
57 return nil;
58 if(parseindex(&f, ix) < 0){
59 freeifile(&f);
60 freeindex(ix);
61 return nil;
62 }
63 freeifile(&f);
64 if(namecmp(ix->name, name) != 0){
65 seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name);
66 return nil;
67 }
68 if(ix->nsects != n){
69 seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects);
70 freeindex(ix);
71 return nil;
72 }
73 ix->sects = sects;
74 last = 0;
75 blocksize = ix->blocksize;
76 for(i = 0; i < ix->nsects; i++){
77 is = sects[i];
78 if(namecmp(is->index, ix->name) != 0) {
79 seterr(ECorrupt, "%s: index name is %s, not %s",
80 sects[i]->part->name, is->index, ix->name);
81 bad:
82 freeindex(ix);
83 return nil;
84 }
85 if(is->blocksize != blocksize) {
86 seterr(ECorrupt, "%s: blocksize is %d, not %d",
87 sects[i]->part->name, (int)is->blocksize, (int)blocksize);
88 goto bad;
89 }
90 if(is->tabsize != tabsize) {
91 seterr(ECorrupt, "%s: tabsize is %d, not %d",
92 sects[i]->part->name, (int)is->tabsize, (int)tabsize);
93 goto bad;
94 }
95 if(namecmp(is->name, ix->smap[i].name) != 0) {
96 seterr(ECorrupt, "%s: name is %s, not %s",
97 sects[i]->part->name, is->name, ix->smap[i].name);
98 goto bad;
99 }
100 if(is->start != ix->smap[i].start || is->stop != ix->smap[i].stop) {
101 seterr(ECorrupt, "%s: range is %lld,%lld, not %lld,%lld",
102 sects[i]->part->name, is->start, is->stop,
103 ix->smap[i].start, ix->smap[i].stop);
104 goto bad;
106 if(is->start > is->stop) {
107 seterr(ECorrupt, "%s: invalid range %lld,%lld",
108 sects[i]->part->name, is->start, is->stop);
109 goto bad;
111 if(is->start != last || is->start > is->stop) {
112 seterr(ECorrupt, "%s: range %lld-%lld, but last section ended at %lld",
113 sects[i]->part->name, is->start, is->stop, last);
114 goto bad;
116 last = is->stop;
118 ix->tabsize = tabsize;
119 ix->buckets = last;
121 if(initindex1(ix) < 0){
122 freeindex(ix);
123 return nil;
126 ix->arenas = MKNZ(Arena*, ix->narenas);
127 if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){
128 freeindex(ix);
129 return nil;
132 return ix;
135 static int
136 initindex1(Index *ix)
138 u32int buckets;
140 ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
141 buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
142 if(buckets != ix->buckets){
143 seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
144 return -1;
147 return 0;
150 int
151 wbindex(Index *ix)
153 Fmt f;
154 ZBlock *b;
155 int i;
157 if(ix->nsects == 0){
158 seterr(EOk, "no sections in index %s", ix->name);
159 return -1;
161 b = alloczblock(ix->tabsize, 1, ix->blocksize);
162 if(b == nil){
163 seterr(EOk, "can't write index configuration: out of memory");
164 return -1;
166 fmtzbinit(&f, b);
167 if(outputindex(&f, ix) < 0){
168 seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize);
169 freezblock(b);
170 return -1;
172 for(i = 0; i < ix->nsects; i++){
173 if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0
174 || flushpart(ix->sects[i]->part) < 0){
175 seterr(EOk, "can't write index: %r");
176 freezblock(b);
177 return -1;
180 freezblock(b);
182 for(i = 0; i < ix->nsects; i++)
183 if(wbisect(ix->sects[i]) < 0)
184 return -1;
186 return 0;
189 /*
190 * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
191 * version, blocksize: u32int
192 * name: max. ANameSize string
193 * sections, arenas: AMap
194 */
195 int
196 outputindex(Fmt *f, Index *ix)
198 if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
199 || outputamap(f, ix->smap, ix->nsects) < 0
200 || outputamap(f, ix->amap, ix->narenas) < 0)
201 return -1;
202 return 0;
205 int
206 parseindex(IFile *f, Index *ix)
208 AMapN amn;
209 u32int v;
210 char *s;
212 /*
213 * magic
214 */
215 s = ifileline(f);
216 if(s == nil || strcmp(s, IndexMagic) != 0){
217 seterr(ECorrupt, "bad index magic for %s", f->name);
218 return -1;
221 /*
222 * version
223 */
224 if(ifileu32int(f, &v) < 0){
225 seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
226 return -1;
228 ix->version = v;
229 if(ix->version != IndexVersion){
230 seterr(ECorrupt, "bad version number in %s", f->name);
231 return -1;
234 /*
235 * name
236 */
237 if(ifilename(f, ix->name) < 0){
238 seterr(ECorrupt, "syntax error: bad index name in %s", f->name);
239 return -1;
242 /*
243 * block size
244 */
245 if(ifileu32int(f, &v) < 0){
246 seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
247 return -1;
249 ix->blocksize = v;
251 if(parseamap(f, &amn) < 0)
252 return -1;
253 ix->nsects = amn.n;
254 ix->smap = amn.map;
256 if(parseamap(f, &amn) < 0)
257 return -1;
258 ix->narenas = amn.n;
259 ix->amap = amn.map;
261 return 0;
264 /*
265 * initialize an entirely new index
266 */
267 Index *
268 newindex(char *name, ISect **sects, int n)
270 Index *ix;
271 AMap *smap;
272 u64int nb;
273 u32int div, ub, xb, start, stop, blocksize, tabsize;
274 int i, j;
276 if(n < 1){
277 seterr(EOk, "creating index with no index sections");
278 return nil;
281 /*
282 * compute the total buckets available in the index,
283 * and the total buckets which are used.
284 */
285 nb = 0;
286 blocksize = sects[0]->blocksize;
287 tabsize = sects[0]->tabsize;
288 for(i = 0; i < n; i++){
289 /*
290 * allow index, start, and stop to be set if index is correct
291 * and start and stop are what we would have picked.
292 * this allows calling fmtindex to reformat the index after
293 * replacing a bad index section with a freshly formatted one.
294 * start and stop are checked below.
295 */
296 if(sects[i]->index[0] != '\0' && strcmp(sects[i]->index, name) != 0){
297 seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
298 return nil;
300 if(blocksize != sects[i]->blocksize){
301 seterr(EOk, "%s has block size %d, but %s has %d",
302 sects[0]->part->name, (int)blocksize,
303 sects[i]->part->name, (int)sects[i]->blocksize);
304 return nil;
306 if(tabsize != sects[i]->tabsize){
307 seterr(EOk, "%s has table size %d, but %s has %d",
308 sects[0]->part->name, (int)tabsize,
309 sects[i]->part->name, (int)sects[i]->tabsize);
310 return nil;
312 nb += sects[i]->blocks;
315 /*
316 * check for duplicate names
317 */
318 for(i = 0; i < n; i++){
319 for(j = i + 1; j < n; j++){
320 if(namecmp(sects[i]->name, sects[j]->name) == 0){
321 seterr(EOk, "%s and %s both have section name %s",
322 sects[i]->part->name,
323 sects[j]->part->name,
324 sects[i]->name);
325 return nil;
330 if(nb >= ((u64int)1 << 32)){
331 seterr(EBug, "index too large");
332 return nil;
335 div = (((u64int)1 << 32) + nb - 1) / nb;
336 ub = (((u64int)1 << 32) - 1) / div + 1;
337 if(div < 100){
338 seterr(EBug, "index divisor too coarse [%lld buckets]", nb);
339 return nil;
341 if(ub > nb){
342 seterr(EBug, "index initialization math wrong");
343 return nil;
345 xb = nb - ub;
347 /*
348 * initialize each of the index sections
349 * and the section map table
350 */
351 smap = MKNZ(AMap, n);
352 if(smap == nil){
353 seterr(EOk, "can't create new index: out of memory");
354 return nil;
356 start = 0;
357 for(i = 0; i < n; i++){
358 stop = start + sects[i]->blocks - xb / n;
359 if(i == n - 1)
360 stop = ub;
362 if(sects[i]->start != 0 || sects[i]->stop != 0)
363 if(sects[i]->start != start || sects[i]->stop != stop){
364 seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
365 return nil;
368 sects[i]->start = start;
369 sects[i]->stop = stop;
370 namecp(sects[i]->index, name);
372 smap[i].start = start;
373 smap[i].stop = stop;
374 namecp(smap[i].name, sects[i]->name);
375 start = stop;
378 /*
379 * initialize the index itself
380 */
381 ix = MKZ(Index);
382 if(ix == nil){
383 seterr(EOk, "can't create new index: out of memory");
384 free(smap);
385 return nil;
387 ix->version = IndexVersion;
388 namecp(ix->name, name);
389 ix->sects = sects;
390 ix->smap = smap;
391 ix->nsects = n;
392 ix->blocksize = blocksize;
393 ix->buckets = ub;
394 ix->tabsize = tabsize;
395 ix->div = div;
397 if(initindex1(ix) < 0){
398 free(smap);
399 return nil;
402 return ix;
405 ISect*
406 initisect(Part *part)
408 ISect *is;
409 ZBlock *b;
410 int ok;
412 b = alloczblock(HeadSize, 0, 0);
413 if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){
414 seterr(EAdmin, "can't read index section header: %r");
415 return nil;
418 is = MKZ(ISect);
419 if(is == nil){
420 freezblock(b);
421 return nil;
423 is->part = part;
424 ok = unpackisect(is, b->data);
425 freezblock(b);
426 if(ok < 0){
427 seterr(ECorrupt, "corrupted index section header: %r");
428 freeisect(is);
429 return nil;
432 if(is->version != ISectVersion1 && is->version != ISectVersion2){
433 seterr(EAdmin, "unknown index section version %d", is->version);
434 freeisect(is);
435 return nil;
438 return initisect1(is);
441 ISect*
442 newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize)
444 ISect *is;
445 u32int tabbase;
447 is = MKZ(ISect);
448 if(is == nil)
449 return nil;
451 namecp(is->name, name);
452 is->version = vers;
453 is->part = part;
454 is->blocksize = blocksize;
455 is->start = 0;
456 is->stop = 0;
457 tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
458 is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1);
459 is->blocks = is->part->size / blocksize - is->blockbase / blocksize;
460 is->bucketmagic = 0;
461 if(is->version == ISectVersion2){
462 do{
463 is->bucketmagic = fastrand();
464 }while(is->bucketmagic==0);
466 is = initisect1(is);
467 if(is == nil)
468 return nil;
470 return is;
473 /*
474 * initialize the computed parameters for an index
475 */
476 static ISect*
477 initisect1(ISect *is)
479 u64int v;
481 is->buckmax = (is->blocksize - IBucketSize) / IEntrySize;
482 is->blocklog = u64log2(is->blocksize);
483 if(is->blocksize != (1 << is->blocklog)){
484 seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize);
485 freeisect(is);
486 return nil;
488 partblocksize(is->part, is->blocksize);
489 is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1);
490 if(is->tabbase >= is->blockbase){
491 seterr(ECorrupt, "index section config table overlaps bucket storage");
492 freeisect(is);
493 return nil;
495 is->tabsize = is->blockbase - is->tabbase;
496 v = is->part->size & ~(u64int)(is->blocksize - 1);
497 if(is->blockbase + (u64int)is->blocks * is->blocksize != v){
498 seterr(ECorrupt, "invalid blocks in index section %s", is->name);
499 /* ZZZ what to do?
500 freeisect(is);
501 return nil;
502 */
505 if(is->stop - is->start > is->blocks){
506 seterr(ECorrupt, "index section overflows available space");
507 freeisect(is);
508 return nil;
510 if(is->start > is->stop){
511 seterr(ECorrupt, "invalid index section range");
512 freeisect(is);
513 return nil;
516 return is;
519 int
520 wbisect(ISect *is)
522 ZBlock *b;
524 b = alloczblock(HeadSize, 1, 0);
525 if(b == nil){
526 /* ZZZ set error? */
527 return -1;
530 if(packisect(is, b->data) < 0){
531 seterr(ECorrupt, "can't make index section header: %r");
532 freezblock(b);
533 return -1;
535 if(writepart(is->part, PartBlank, b->data, HeadSize) < 0 || flushpart(is->part) < 0){
536 seterr(EAdmin, "can't write index section header: %r");
537 freezblock(b);
538 return -1;
540 freezblock(b);
542 return 0;
545 void
546 freeisect(ISect *is)
548 if(is == nil)
549 return;
550 free(is);
553 void
554 freeindex(Index *ix)
556 int i;
558 if(ix == nil)
559 return;
560 free(ix->amap);
561 free(ix->arenas);
562 if(ix->sects)
563 for(i = 0; i < ix->nsects; i++)
564 freeisect(ix->sects[i]);
565 free(ix->sects);
566 free(ix->smap);
567 free(ix);
570 /*
571 * write a clump to an available arena in the index
572 * and return the address of the clump within the index.
573 ZZZ question: should this distinguish between an arena
574 filling up and real errors writing the clump?
575 */
576 u64int
577 writeiclump(Index *ix, Clump *c, u8int *clbuf)
579 u64int a;
580 int i;
581 IAddr ia;
582 AState as;
584 trace(TraceLump, "writeiclump enter");
585 qlock(&ix->writing);
586 for(i = ix->mapalloc; i < ix->narenas; i++){
587 a = writeaclump(ix->arenas[i], c, clbuf);
588 if(a != TWID64){
589 ix->mapalloc = i;
590 ia.addr = ix->amap[i].start + a;
591 ia.type = c->info.type;
592 ia.size = c->info.uncsize;
593 ia.blocks = (c->info.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
594 as.arena = ix->arenas[i];
595 as.aa = ia.addr;
596 as.stats = as.arena->memstats;
597 insertscore(c->info.score, &ia, IEDirty, &as);
598 qunlock(&ix->writing);
599 trace(TraceLump, "writeiclump exit");
600 return ia.addr;
603 qunlock(&ix->writing);
605 seterr(EAdmin, "no space left in arenas");
606 trace(TraceLump, "writeiclump failed");
607 return TWID64;
610 /*
611 * convert an arena index to an relative arena address
612 */
613 Arena*
614 amapitoa(Index *ix, u64int a, u64int *aa)
616 int i, r, l, m;
618 l = 1;
619 r = ix->narenas - 1;
620 while(l <= r){
621 m = (r + l) / 2;
622 if(ix->amap[m].start <= a)
623 l = m + 1;
624 else
625 r = m - 1;
627 l--;
629 if(a > ix->amap[l].stop){
630 for(i=0; i<ix->narenas; i++)
631 print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop);
632 print("want arena %d for %llux\n", l, a);
633 seterr(ECrash, "unmapped address passed to amapitoa");
634 return nil;
637 if(ix->arenas[l] == nil){
638 seterr(ECrash, "unmapped arena selected in amapitoa");
639 return nil;
641 *aa = a - ix->amap[l].start;
642 return ix->arenas[l];
645 /*
646 * convert an arena index to the bounds of the containing arena group.
647 */
648 Arena*
649 amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g)
651 u64int aa;
652 Arena *arena;
654 arena = amapitoa(ix, a, &aa);
655 if(arena == nil)
656 return nil;
657 if(arenatog(arena, aa, gstart, glimit, g) < 0)
658 return nil;
659 *gstart += a - aa;
660 *glimit += a - aa;
661 return arena;
664 int
665 iaddrcmp(IAddr *ia1, IAddr *ia2)
667 return ia1->type != ia2->type
668 || ia1->size != ia2->size
669 || ia1->blocks != ia2->blocks
670 || ia1->addr != ia2->addr;
673 /*
674 * lookup the score in the partition
676 * nothing needs to be explicitly locked:
677 * only static parts of ix are used, and
678 * the bucket is locked by the DBlock lock.
679 */
680 int
681 loadientry(Index *ix, u8int *score, int type, IEntry *ie)
683 ISect *is;
684 DBlock *b;
685 IBucket ib;
686 u32int buck;
687 int h, ok;
689 ok = -1;
691 trace(TraceLump, "loadientry enter");
693 /*
694 qlock(&stats.lock);
695 stats.indexreads++;
696 qunlock(&stats.lock);
697 */
699 if(!inbloomfilter(mainindex->bloom, score)){
700 trace(TraceLump, "loadientry bloomhit");
701 return -1;
704 trace(TraceLump, "loadientry loadibucket");
705 b = loadibucket(ix, score, &is, &buck, &ib);
706 trace(TraceLump, "loadientry loadedibucket");
707 if(b == nil)
708 return -1;
710 if(okibucket(&ib, is) < 0){
711 trace(TraceLump, "loadientry badbucket");
712 goto out;
715 h = bucklook(score, type, ib.data, ib.n);
716 if(h & 1){
717 h ^= 1;
718 trace(TraceLump, "loadientry found");
719 unpackientry(ie, &ib.data[h]);
720 ok = 0;
721 goto out;
723 trace(TraceLump, "loadientry notfound");
724 addstat(StatBloomFalseMiss, 1);
725 out:
726 putdblock(b);
727 trace(TraceLump, "loadientry exit");
728 return ok;
731 int
732 okibucket(IBucket *ib, ISect *is)
734 if(ib->n <= is->buckmax)
735 return 0;
737 seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, range=[%lud,%lud)",
738 ib->n, is->buckmax, is->start, is->stop);
739 return -1;
742 /*
743 * look for score within data;
744 * return 1 | byte index of matching index,
745 * or 0 | index of least element > score
746 */
747 int
748 bucklook(u8int *score, int otype, u8int *data, int n)
750 int i, r, l, m, h, c, cc, type;
752 if(otype == -1)
753 type = -1;
754 else
755 type = vttodisktype(otype);
756 l = 0;
757 r = n - 1;
758 while(l <= r){
759 m = (r + l) >> 1;
760 h = m * IEntrySize;
761 for(i = 0; i < VtScoreSize; i++){
762 c = score[i];
763 cc = data[h + i];
764 if(c != cc){
765 if(c > cc)
766 l = m + 1;
767 else
768 r = m - 1;
769 goto cont;
772 cc = data[h + IEntryTypeOff];
773 if(type != cc && type != -1){
774 if(type > cc)
775 l = m + 1;
776 else
777 r = m - 1;
778 goto cont;
780 return h | 1;
781 cont:;
784 return l * IEntrySize;
787 /*
788 * compare two IEntries; consistent with bucklook
789 */
790 int
791 ientrycmp(const void *vie1, const void *vie2)
793 u8int *ie1, *ie2;
794 int i, v1, v2;
796 ie1 = (u8int*)vie1;
797 ie2 = (u8int*)vie2;
798 for(i = 0; i < VtScoreSize; i++){
799 v1 = ie1[i];
800 v2 = ie2[i];
801 if(v1 != v2){
802 if(v1 < v2)
803 return -1;
804 return 1;
807 v1 = ie1[IEntryTypeOff];
808 v2 = ie2[IEntryTypeOff];
809 if(v1 != v2){
810 if(v1 < v2)
811 return -1;
812 return 1;
814 return 0;
817 /*
818 * find the number of the index section holding bucket #buck
819 */
820 int
821 indexsect0(Index *ix, u32int buck)
823 int r, l, m;
825 l = 1;
826 r = ix->nsects - 1;
827 while(l <= r){
828 m = (r + l) >> 1;
829 if(ix->sects[m]->start <= buck)
830 l = m + 1;
831 else
832 r = m - 1;
834 return l - 1;
837 /*
838 * load the index block at bucket #buck
839 */
840 static DBlock*
841 loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int mode)
843 ISect *is;
844 DBlock *b;
846 is = ix->sects[indexsect0(ix, buck)];
847 if(buck < is->start || is->stop <= buck){
848 seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
849 return nil;
852 buck -= is->start;
853 if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), mode)) == nil)
854 return nil;
856 if(pis)
857 *pis = is;
858 if(pbuck)
859 *pbuck = buck;
860 if(ib)
861 unpackibucket(ib, b->data, is->bucketmagic);
862 return b;
865 /*
866 * find the number of the index section holding score
867 */
868 int
869 indexsect1(Index *ix, u8int *score)
871 return indexsect0(ix, hashbits(score, 32) / ix->div);
874 /*
875 * load the index block responsible for score.
876 */
877 static DBlock*
878 loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
880 return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, OREAD);
883 int
884 indexsect(Index *ix, u8int *score)
886 return indexsect1(ix, score);
889 DBlock*
890 loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
892 return loadibucket1(ix, score, pis, pbuck, ib);