Blob


1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
5 typedef struct ASum ASum;
7 struct ASum
8 {
9 Arena *arena;
10 ASum *next;
11 };
13 static void sealarena(Arena *arena);
14 static int okarena(Arena *arena);
15 static int loadarena(Arena *arena);
16 static CIBlock *getcib(Arena *arena, int clump, int writing, CIBlock *rock);
17 static void putcib(Arena *arena, CIBlock *cib);
18 static void sumproc(void *);
20 static QLock sumlock;
21 static Rendez sumwait;
22 static ASum *sumq;
23 static uchar zero[8192];
25 int arenasumsleeptime;
27 int
28 initarenasum(void)
29 {
30 sumwait.l = &sumlock;
32 if(vtproc(sumproc, nil) < 0){
33 seterr(EOk, "can't start arena checksum slave: %r");
34 return -1;
35 }
36 return 0;
37 }
39 /*
40 * make an Arena, and initialize it based upon the disk header and trailer.
41 */
42 Arena*
43 initarena(Part *part, u64int base, u64int size, u32int blocksize)
44 {
45 Arena *arena;
47 arena = MKZ(Arena);
48 arena->part = part;
49 arena->blocksize = blocksize;
50 arena->clumpmax = arena->blocksize / ClumpInfoSize;
51 arena->base = base + blocksize;
52 arena->size = size - 2 * blocksize;
54 if(loadarena(arena) < 0){
55 seterr(ECorrupt, "arena header or trailer corrupted");
56 freearena(arena);
57 return nil;
58 }
59 if(okarena(arena) < 0){
60 freearena(arena);
61 return nil;
62 }
64 if(arena->diskstats.sealed && scorecmp(zeroscore, arena->score)==0)
65 backsumarena(arena);
67 return arena;
68 }
70 void
71 freearena(Arena *arena)
72 {
73 if(arena == nil)
74 return;
75 free(arena);
76 }
78 Arena*
79 newarena(Part *part, u32int vers, char *name, u64int base, u64int size, u32int blocksize)
80 {
81 int bsize;
82 Arena *arena;
84 if(nameok(name) < 0){
85 seterr(EOk, "illegal arena name", name);
86 return nil;
87 }
88 arena = MKZ(Arena);
89 arena->part = part;
90 arena->version = vers;
91 if(vers == ArenaVersion4)
92 arena->clumpmagic = _ClumpMagic;
93 else{
94 do
95 arena->clumpmagic = fastrand();
96 while(arena->clumpmagic==_ClumpMagic || arena->clumpmagic==0);
97 }
98 arena->blocksize = blocksize;
99 arena->clumpmax = arena->blocksize / ClumpInfoSize;
100 arena->base = base + blocksize;
101 arena->size = size - 2 * blocksize;
103 namecp(arena->name, name);
105 bsize = sizeof zero;
106 if(bsize > arena->blocksize)
107 bsize = arena->blocksize;
109 if(wbarena(arena)<0 || wbarenahead(arena)<0
110 || writepart(arena->part, arena->base, zero, bsize)<0){
111 freearena(arena);
112 return nil;
115 return arena;
118 int
119 readclumpinfo(Arena *arena, int clump, ClumpInfo *ci)
121 CIBlock *cib, r;
123 cib = getcib(arena, clump, 0, &r);
124 if(cib == nil)
125 return -1;
126 unpackclumpinfo(ci, &cib->data->data[cib->offset]);
127 putcib(arena, cib);
128 return 0;
131 int
132 readclumpinfos(Arena *arena, int clump, ClumpInfo *cis, int n)
134 CIBlock *cib, r;
135 int i;
137 for(i = 0; i < n; i++){
138 cib = getcib(arena, clump + i, 0, &r);
139 if(cib == nil)
140 break;
141 unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]);
142 putcib(arena, cib);
144 return i;
147 /*
148 * write directory information for one clump
149 * must be called the arena locked
150 */
151 int
152 writeclumpinfo(Arena *arena, int clump, ClumpInfo *ci)
154 CIBlock *cib, r;
156 cib = getcib(arena, clump, 1, &r);
157 if(cib == nil)
158 return -1;
159 dirtydblock(cib->data, DirtyArenaCib);
160 packclumpinfo(ci, &cib->data->data[cib->offset]);
161 putcib(arena, cib);
162 return 0;
165 u64int
166 arenadirsize(Arena *arena, u32int clumps)
168 return ((clumps / arena->clumpmax) + 1) * arena->blocksize;
171 /*
172 * read a clump of data
173 * n is a hint of the size of the data, not including the header
174 * make sure it won't run off the end, then return the number of bytes actually read
175 */
176 u32int
177 readarena(Arena *arena, u64int aa, u8int *buf, long n)
179 DBlock *b;
180 u64int a;
181 u32int blocksize, off, m;
182 long nn;
184 if(n == 0)
185 return -1;
187 qlock(&arena->lock);
188 a = arena->size - arenadirsize(arena, arena->memstats.clumps);
189 qunlock(&arena->lock);
190 if(aa >= a){
191 seterr(EOk, "reading beyond arena clump storage: clumps=%d aa=%lld a=%lld -1 clumps=%lld\n", arena->memstats.clumps, aa, a, arena->size - arenadirsize(arena, arena->memstats.clumps - 1));
192 return -1;
194 if(aa + n > a)
195 n = a - aa;
197 blocksize = arena->blocksize;
198 a = arena->base + aa;
199 off = a & (blocksize - 1);
200 a -= off;
201 nn = 0;
202 for(;;){
203 b = getdblock(arena->part, a, OREAD);
204 if(b == nil)
205 return -1;
206 m = blocksize - off;
207 if(m > n - nn)
208 m = n - nn;
209 memmove(&buf[nn], &b->data[off], m);
210 putdblock(b);
211 nn += m;
212 if(nn == n)
213 break;
214 off = 0;
215 a += blocksize;
217 return n;
220 /*
221 * write some data to the clump section at a given offset
222 * used to fix up corrupted arenas.
223 */
224 u32int
225 writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n)
227 DBlock *b;
228 u64int a;
229 u32int blocksize, off, m;
230 long nn;
231 int ok;
233 if(n == 0)
234 return -1;
236 qlock(&arena->lock);
237 a = arena->size - arenadirsize(arena, arena->memstats.clumps);
238 if(aa >= a || aa + n > a){
239 qunlock(&arena->lock);
240 seterr(EOk, "writing beyond arena clump storage");
241 return -1;
244 blocksize = arena->blocksize;
245 a = arena->base + aa;
246 off = a & (blocksize - 1);
247 a -= off;
248 nn = 0;
249 for(;;){
250 b = getdblock(arena->part, a, off != 0 || off + n < blocksize ? ORDWR : OWRITE);
251 if(b == nil){
252 qunlock(&arena->lock);
253 return -1;
255 dirtydblock(b, DirtyArena);
256 m = blocksize - off;
257 if(m > n - nn)
258 m = n - nn;
259 memmove(&b->data[off], &clbuf[nn], m);
260 // ok = writepart(arena->part, a, b->data, blocksize);
261 ok = 0;
262 putdblock(b);
263 if(ok < 0){
264 qunlock(&arena->lock);
265 return -1;
267 nn += m;
268 if(nn == n)
269 break;
270 off = 0;
271 a += blocksize;
273 qunlock(&arena->lock);
274 return n;
277 /*
278 * allocate space for the clump and write it,
279 * updating the arena directory
280 ZZZ question: should this distinguish between an arena
281 filling up and real errors writing the clump?
282 */
283 u64int
284 writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64int start, u64int *pa)
286 DBlock *b;
287 u64int a, aa;
288 u32int clump, n, nn, m, off, blocksize;
289 int ok;
290 AState as;
292 n = c->info.size + ClumpSize + U32Size;
293 qlock(&arena->lock);
294 aa = arena->memstats.used;
295 if(arena->memstats.sealed
296 || aa + n + U32Size + arenadirsize(arena, arena->memstats.clumps + 1) > arena->size){
297 if(!arena->memstats.sealed){
298 trace(0, "seal memstats %s", arena->name);
299 arena->memstats.sealed = 1;
300 as.arena = arena;
301 as.aa = start+aa;
302 as.stats = arena->memstats;
303 setdcachestate(&as);
305 qunlock(&arena->lock);
306 return TWID64;
308 if(packclump(c, &clbuf[0], arena->clumpmagic) < 0){
309 qunlock(&arena->lock);
310 return TWID64;
313 /*
314 * write the data out one block at a time
315 */
316 blocksize = arena->blocksize;
317 a = arena->base + aa;
318 off = a & (blocksize - 1);
319 a -= off;
320 nn = 0;
321 for(;;){
322 b = getdblock(arena->part, a, off != 0 ? ORDWR : OWRITE);
323 if(b == nil){
324 qunlock(&arena->lock);
325 return TWID64;
327 dirtydblock(b, DirtyArena);
328 m = blocksize - off;
329 if(m > n - nn)
330 m = n - nn;
331 memmove(&b->data[off], &clbuf[nn], m);
332 // ok = writepart(arena->part, a, b->data, blocksize);
333 ok = 0;
334 putdblock(b);
335 if(ok < 0){
336 qunlock(&arena->lock);
337 return TWID64;
339 nn += m;
340 if(nn == n)
341 break;
342 off = 0;
343 a += blocksize;
346 arena->memstats.used += c->info.size + ClumpSize;
347 arena->memstats.uncsize += c->info.uncsize;
348 if(c->info.size < c->info.uncsize)
349 arena->memstats.cclumps++;
351 clump = arena->memstats.clumps++;
352 if(arena->memstats.clumps == 0)
353 sysfatal("clumps wrapped");
354 arena->wtime = now();
355 if(arena->ctime == 0)
356 arena->ctime = arena->wtime;
358 writeclumpinfo(arena, clump, &c->info);
360 /* set up for call to setdcachestate */
361 as.arena = arena;
362 as.aa = start+arena->memstats.used;
363 as.stats = arena->memstats;
365 /* update this before calling setdcachestate so it cannot be behind dcache.diskstate */
366 *pa = start+aa;
367 setdcachestate(&as);
368 qunlock(&arena->lock);
370 return aa;
373 int
374 atailcmp(ATailStats *a, ATailStats *b)
376 /* good test */
377 if(a->used < b->used)
378 return -1;
379 if(a->used > b->used)
380 return 1;
382 /* suspect tests - why order this way? (no one cares) */
383 if(a->clumps < b->clumps)
384 return -1;
385 if(a->clumps > b->clumps)
386 return 1;
387 if(a->cclumps < b->cclumps)
388 return -1;
389 if(a->cclumps > b->cclumps)
390 return 1;
391 if(a->uncsize < b->uncsize)
392 return -1;
393 if(a->uncsize > b->uncsize)
394 return 1;
395 if(a->sealed < b->sealed)
396 return -1;
397 if(a->sealed > b->sealed)
398 return 1;
400 /* everything matches */
401 return 0;
404 void
405 setatailstate(AState *as)
407 int i, j, osealed;
408 Arena *a;
409 Index *ix;
411 trace(0, "setatailstate %s 0x%llux clumps %d", as->arena->name, as->aa, as->stats.clumps);
413 ix = mainindex;
414 for(i=0; i<ix->narenas; i++)
415 if(ix->arenas[i] == as->arena)
416 break;
417 if(i==ix->narenas || as->aa < ix->amap[i].start || as->aa >= ix->amap[i].stop || as->arena != ix->arenas[i]){
418 fprint(2, "funny settailstate 0x%llux\n", as->aa);
419 return;
422 for(j=i; --j>=0; ){
423 a = ix->arenas[j];
424 if(atailcmp(&a->diskstats, &a->memstats) == 0)
425 break;
427 for(j++; j<=i; j++){
428 a = ix->arenas[j];
429 qlock(&a->lock);
430 osealed = a->diskstats.sealed;
431 if(j == i)
432 a->diskstats = as->stats;
433 else
434 a->diskstats = a->memstats;
435 wbarena(a);
436 if(a->diskstats.sealed != osealed && !a->inqueue)
437 sealarena(a);
438 qunlock(&a->lock);
442 /*
443 * once sealed, an arena never has any data added to it.
444 * it should only be changed to fix errors.
445 * this also syncs the clump directory.
446 */
447 static void
448 sealarena(Arena *arena)
450 arena->inqueue = 1;
451 backsumarena(arena);
454 void
455 backsumarena(Arena *arena)
457 ASum *as;
459 if(sumwait.l == nil)
460 return;
462 as = MK(ASum);
463 if(as == nil)
464 return;
465 qlock(&sumlock);
466 as->arena = arena;
467 as->next = sumq;
468 sumq = as;
469 rwakeup(&sumwait);
470 qunlock(&sumlock);
473 static void
474 sumproc(void *unused)
476 ASum *as;
477 Arena *arena;
479 USED(unused);
481 for(;;){
482 qlock(&sumlock);
483 while(sumq == nil)
484 rsleep(&sumwait);
485 as = sumq;
486 sumq = as->next;
487 qunlock(&sumlock);
488 arena = as->arena;
489 free(as);
491 sumarena(arena);
495 void
496 sumarena(Arena *arena)
498 ZBlock *b;
499 DigestState s;
500 u64int a, e;
501 u32int bs;
502 u8int score[VtScoreSize];
504 bs = MaxIoSize;
505 if(bs < arena->blocksize)
506 bs = arena->blocksize;
508 /*
509 * read & sum all blocks except the last one
510 */
511 memset(&s, 0, sizeof s);
512 b = alloczblock(bs, 0, arena->part->blocksize);
513 e = arena->base + arena->size;
514 for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){
515 sleep(arenasumsleeptime);
516 if(a + bs > e)
517 bs = arena->blocksize;
518 if(readpart(arena->part, a, b->data, bs) < 0)
519 goto ReadErr;
520 addstat(StatSumRead, 1);
521 addstat(StatSumReadBytes, bs);
522 sha1(b->data, bs, nil, &s);
525 /*
526 * the last one is special, since it may already have the checksum included
527 */
528 bs = arena->blocksize;
529 if(readpart(arena->part, e, b->data, bs) < 0){
530 ReadErr:
531 logerr(EOk, "sumarena can't sum %s, read at %lld failed: %r", arena->name, a);
532 freezblock(b);
533 return;
535 addstat(StatSumRead, 1);
536 addstat(StatSumReadBytes, bs);
538 sha1(b->data, bs-VtScoreSize, nil, &s);
539 sha1(zeroscore, VtScoreSize, nil, &s);
540 sha1(nil, 0, score, &s);
542 /*
543 * check for no checksum or the same
545 * the writepart is okay because we flushed the dcache in sealarena
546 */
547 if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0){
548 if(scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0)
549 logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V",
550 arena->name, &b->data[bs - VtScoreSize], score);
551 scorecp(&b->data[bs - VtScoreSize], score);
552 if(writepart(arena->part, e, b->data, bs) < 0)
553 logerr(EOk, "sumarena can't write sum for %s: %r", arena->name);
555 freezblock(b);
557 qlock(&arena->lock);
558 scorecp(arena->score, score);
559 qunlock(&arena->lock);
562 /*
563 * write the arena trailer block to the partition
564 */
565 int
566 wbarena(Arena *arena)
568 DBlock *b;
569 int bad;
571 if((b = getdblock(arena->part, arena->base + arena->size, OWRITE)) == nil){
572 logerr(EAdmin, "can't write arena trailer: %r");
573 return -1;
575 dirtydblock(b, DirtyArenaTrailer);
576 bad = okarena(arena)<0 || packarena(arena, b->data)<0;
577 putdblock(b);
578 if(bad)
579 return -1;
580 return 0;
583 int
584 wbarenahead(Arena *arena)
586 ZBlock *b;
587 ArenaHead head;
588 int bad;
590 namecp(head.name, arena->name);
591 head.version = arena->version;
592 head.size = arena->size + 2 * arena->blocksize;
593 head.blocksize = arena->blocksize;
594 head.clumpmagic = arena->clumpmagic;
595 b = alloczblock(arena->blocksize, 1, arena->part->blocksize);
596 if(b == nil){
597 logerr(EAdmin, "can't write arena header: %r");
598 ///ZZZ add error message?
599 return -1;
601 /*
602 * this writepart is okay because it only happens
603 * during initialization.
604 */
605 bad = packarenahead(&head, b->data)<0 ||
606 writepart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize)<0;
607 freezblock(b);
608 if(bad)
609 return -1;
610 return 0;
613 /*
614 * read the arena header and trailer blocks from disk
615 */
616 static int
617 loadarena(Arena *arena)
619 ArenaHead head;
620 ZBlock *b;
622 b = alloczblock(arena->blocksize, 0, arena->part->blocksize);
623 if(b == nil)
624 return -1;
625 if(readpart(arena->part, arena->base + arena->size, b->data, arena->blocksize) < 0){
626 freezblock(b);
627 return -1;
629 if(unpackarena(arena, b->data) < 0){
630 freezblock(b);
631 return -1;
633 if(arena->version != ArenaVersion4 && arena->version != ArenaVersion5){
634 seterr(EAdmin, "unknown arena version %d", arena->version);
635 freezblock(b);
636 return -1;
638 scorecp(arena->score, &b->data[arena->blocksize - VtScoreSize]);
640 if(readpart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize) < 0){
641 logerr(EAdmin, "can't read arena header: %r");
642 freezblock(b);
643 return 0;
645 if(unpackarenahead(&head, b->data) < 0)
646 logerr(ECorrupt, "corrupted arena header: %r");
647 else if(namecmp(arena->name, head.name)!=0
648 || arena->clumpmagic != head.clumpmagic
649 || arena->version != head.version
650 || arena->blocksize != head.blocksize
651 || arena->size + 2 * arena->blocksize != head.size){
652 if(namecmp(arena->name, head.name)!=0)
653 logerr(ECorrupt, "arena tail name %s head %s",
654 arena->name, head.name);
655 else if(arena->clumpmagic != head.clumpmagic)
656 logerr(ECorrupt, "arena tail clumpmagic 0x%lux head 0x%lux",
657 (ulong)arena->clumpmagic, (ulong)head.clumpmagic);
658 else if(arena->version != head.version)
659 logerr(ECorrupt, "arena tail version %d head version %d",
660 arena->version, head.version);
661 else if(arena->blocksize != head.blocksize)
662 logerr(ECorrupt, "arena tail block size %d head %d",
663 arena->blocksize, head.blocksize);
664 else if(arena->size+2*arena->blocksize != head.size)
665 logerr(ECorrupt, "arena tail size %lud head %lud",
666 (ulong)arena->size+2*arena->blocksize, head.size);
667 else
668 logerr(ECorrupt, "arena header inconsistent with arena data");
670 freezblock(b);
672 return 0;
675 static int
676 okarena(Arena *arena)
678 u64int dsize;
679 int ok;
681 ok = 0;
682 dsize = arenadirsize(arena, arena->diskstats.clumps);
683 if(arena->diskstats.used + dsize > arena->size){
684 seterr(ECorrupt, "arena used > size");
685 ok = -1;
688 if(arena->diskstats.cclumps > arena->diskstats.clumps)
689 logerr(ECorrupt, "arena has more compressed clumps than total clumps");
691 if(arena->diskstats.uncsize + arena->diskstats.clumps * ClumpSize + arena->blocksize < arena->diskstats.used)
692 logerr(ECorrupt, "arena uncompressed size inconsistent with used space %lld %d %lld", arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used);
694 if(arena->ctime > arena->wtime)
695 logerr(ECorrupt, "arena creation time after last write time");
697 return ok;
700 static CIBlock*
701 getcib(Arena *arena, int clump, int writing, CIBlock *rock)
703 int mode;
704 CIBlock *cib;
705 u32int block, off;
707 if(clump >= arena->memstats.clumps){
708 seterr(EOk, "clump directory access out of range");
709 return nil;
711 block = clump / arena->clumpmax;
712 off = (clump - block * arena->clumpmax) * ClumpInfoSize;
713 cib = rock;
714 cib->block = block;
715 cib->offset = off;
717 if(writing){
718 if(off == 0 && clump == arena->memstats.clumps-1)
719 mode = OWRITE;
720 else
721 mode = ORDWR;
722 }else
723 mode = OREAD;
725 cib->data = getdblock(arena->part,
726 arena->base + arena->size - (block + 1) * arena->blocksize, mode);
727 if(cib->data == nil)
728 return nil;
729 return cib;
732 static void
733 putcib(Arena *arena, CIBlock *cib)
735 putdblock(cib->data);
736 cib->data = nil;