2 * Check and fix an arena partition.
4 * This is a lot grittier than the rest of Venti because
5 * it can't just give up if a byte here or there is wrong.
7 * The rule here (hopefully followed!) is that block corruption
8 * only ever has a local effect -- there are no blocks that you
9 * can wipe out that will cause large portions of
10 * uncorrupted data blocks to be useless.
18 #define ROUNDUP(x,n) (((x)+(n)-1)&~((n)-1))
20 #pragma varargck type "z" uvlong
21 #pragma varargck type "z" vlong
22 #pragma varargck type "t" uint
43 uchar zero[MaxDiskBlock];
51 void checkarena(vlong, int);
56 fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n");
61 * Format number in simplest way that is okay with unittoull.
68 x = va_arg(fmt->args, vlong);
70 return fmtstrcpy(fmt, "0");
72 return fmtprint(fmt, "%lldG", x/G);
74 return fmtprint(fmt, "%lldM", x/M);
76 return fmtprint(fmt, "%lldK", x/K);
77 return fmtprint(fmt, "%lld", x);
81 * Format time like ctime without newline.
89 t = va_arg(fmt->args, uint);
90 strcpy(buf, ctime(t));
92 return fmtstrcpy(fmt, buf);
96 * Coalesce messages about unreadable sectors into larger ranges.
97 * bad(0, 0) flushes the buffer.
100 bad(char *msg, vlong o, int len)
102 static vlong lb0, lb1;
113 if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){
115 print("%s %#llux+%#llux (%,lld+%,lld)\n",
116 lmsg, lb0, lb1-lb0, lb0, lb1-lb0);
124 * Read in the len bytes of data at the offset. If that fails for whatever reason,
125 * fill it with garbage but print an error.
128 readdisk(uchar *buf, vlong offset, int len)
132 if(offset >= partend){
133 memset(buf, 0xFB, len);
137 if(offset+len > partend){
138 memset(buf, 0xFB, len);
139 len = partend - offset;
142 if(readpart(part, offset, buf, len) >= 0)
146 * The read failed. Clear the buffer to nonsense, and
147 * then try reading in smaller pieces. If that fails,
148 * read in even smaller pieces. And so on down to sectors.
150 memset(buf, 0xFD, len);
151 for(i=0; i<len; i+=64*K){
155 if(readpart(part, offset+i, buf+i, n) >= 0)
157 for(j=i; j<len && j<i+64*K; j+=4*K){
161 if(readpart(part, offset+j, buf+j, n) >= 0)
163 for(k=j; k<len && k<j+4*K; k+=512){
164 if(readpart(part, offset+k, buf+k, 512) >= 0)
166 bad("disk read failed at", k, 512);
176 * Buffer to support running SHA1 hash of the disk.
178 typedef struct Shabuf Shabuf;
191 sbdebug(Shabuf *sb, char *file)
199 if((fd = create(file, OWRITE, 0666)) < 0)
209 sbupdate(Shabuf *sb, uchar *p, vlong offset, int len)
214 if(sb->rollback && !sb->hist){
217 sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist);
218 memset(sb->hist, 0, sizeof sb->hist[0]);
223 if(sb->offset < offset || sb->offset >= offset+len){
224 if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n",
225 p, offset, len, sb->offset);
228 x = sb->offset - offset;
229 if(0) print("sbupdate %p %#llux+%d skip %d\n",
236 assert(sb->offset == offset);
239 pwrite(sb->fd, p, len, offset - sb->r0);
242 sha1(p, len, nil, &sb->state);
247 /* save state every 4M so we can roll back quickly */
253 sha1(p, n, nil, &sb->state);
262 print("oops! x=%d nhist=%d\n", x, sb->nhist);
264 sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist);
266 sb->hist[x] = sb->state;
272 sbdiskhash(Shabuf *sb, vlong eoffset)
274 static uchar dbuf[4*M];
277 while(sb->offset < eoffset){
279 if(sb->offset+n > eoffset)
280 n = eoffset - sb->offset;
281 readdisk(dbuf, sb->offset, n);
282 sbupdate(sb, dbuf, sb->offset, n);
287 sbrollback(Shabuf *sb, vlong offset)
293 if(!sb->rollback || !sb->r0){
294 print("cannot rollback sha\n");
297 if(offset >= sb->offset)
302 print("cannot rollback sha\n");
305 sb->state = sb->hist[x];
306 sb->offset = sb->r0 + x*4*M;
307 assert(sb->offset <= offset);
311 d.length = sb->offset - sb->r0;
312 dirfwstat(sb->fd, &d);
317 sbscore(Shabuf *sb, uchar *score)
323 sha1(nil, 0, score, &sb->state);
327 * If we're fixing arenas, then editing this memory edits the disk!
328 * It will be written back out as new data is paged in.
335 static void pageout(void);
337 pagein(vlong offset, int len)
340 if(offset >= partend){
341 memset(buf, 0xFB, sizeof buf);
345 if(offset+len > partend){
346 memset(buf, 0xFB, sizeof buf);
347 len = partend - offset;
351 readdisk(buf, offset, len);
352 memmove(sbuf, buf, len);
359 if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){
363 if(writepart(part, bufoffset, buf, buflen) < 0)
364 print("disk write failed at %#llux+%#ux (%,lld+%,d)\n",
365 bufoffset, buflen, bufoffset, buflen);
370 zerorange(vlong offset, int len)
375 enum { MinBlock = 4*K, MaxBlock = 8*K };
378 if(bufoffset <= offset && offset+len <= bufoffset+buflen){
379 memset(buf+(offset-bufoffset), 0, len);
387 if(i+len < MaxBlock){
388 pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1));
389 memset(buf+i, 0, len);
391 pagein(offset-i, MaxBlock);
392 memset(buf+i, 0, MaxBlock-i);
393 offset += MaxBlock-i;
395 while(len >= MaxBlock){
396 pagein(offset, MaxBlock);
397 memset(buf, 0, MaxBlock);
401 pagein(offset, (len+MinBlock-1)&~(MinBlock-1));
408 * read/write integers
411 p16(uchar *p, u16int u)
413 p[0] = (u>>8) & 0xFF;
421 return (p[0]<<8)|p[1];
425 p32(uchar *p, u32int u)
427 p[0] = (u>>24) & 0xFF;
428 p[1] = (u>>16) & 0xFF;
429 p[2] = (u>>8) & 0xFF;
436 return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3];
441 p64(uchar *p, u64int u)
451 return ((u64int)u32(p)<<32) | u32(p+4);
455 vlongcmp(const void *va, const void *vb)
468 /* D and S are in draw.h */
480 typedef struct Info Info;
528 Info tailinfo4a[] = {
566 Info tailinfo5a[] = {
591 showdiffs(uchar *want, uchar *have, int len, Info *info)
595 while(len > 0 && (n=info->len&N) > 0){
596 if(memcmp(have, want, n) != 0){
599 print("\t%s: correct=%d disk=%d\n",
600 info->name, *want, *have);
603 print("\t%s: correct=%#ux disk=%#ux\n",
604 info->name, u32(want), u32(have));
607 print("\t%s: correct=%,ud disk=%,ud\n",
608 info->name, u32(want), u32(have));
611 print("\t%s: correct=%t\n\t\tdisk=%t\n",
612 info->name, u32(want), u32(have));
615 print("\t%s: correct=%z disk=%z\n",
616 info->name, (uvlong)u32(want), (uvlong)u32(have));
619 print("\t%s: correct=%,lld disk=%,lld\n",
620 info->name, u64(want), u64(have));
623 print("\t%s: correct=%z disk=%z\n",
624 info->name, u64(want), u64(have));
627 print("\t%s: correct=%s disk=%.*s\n",
628 info->name, (char*)want,
629 utfnlen((char*)have, ANameSize-1),
633 print("\t%s: correct=%.*H disk=%.*H\n",
634 info->name, n, want, n, have);
643 if(len > 0 && memcmp(have, want, len) != 0){
644 if(memcmp(want, zero, len) != 0)
645 print("!!\textra want data in showdiffs (bug in fixarenas)\n");
647 print("\tnon-zero data on disk after structure\n");
649 print("want: %.*H\n", len, want);
650 print("have: %.*H\n", len, have);
656 * Does part begin with an arena?
661 return u32(pagein(0, Block)) == ArenaHeadMagic;
664 static int tabsizes[] = { 16*1024, 64*1024, 512*1024, 768*1024, };
666 * Poke around on the disk to guess what the ArenaPart numbers are.
671 int i, j, n, bestn, ndiff, nhead, ntail;
673 u64int diff[100], head[20], tail[20];
674 u64int offset, bestdiff;
676 ap.version = ArenaPartVersion;
678 if(arenasize == 0 || ap.blocksize == 0){
680 * The ArenaPart block at offset PartBlank may be corrupt or just wrong.
681 * Instead, look for the individual arena headers and tails, which there
682 * are many of, and once we've seen enough, infer the spacing.
684 * Of course, nothing in the file format requires that arenas be evenly
685 * spaced, but fmtarenas always does that for us.
689 for(offset=PartBlank; offset<partend; offset+=4*M){
690 p = pagein(offset, 4*M);
691 for(sp=p, ep=p+4*M; p<ep; p+=K){
692 if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){
694 print("arena head at %#llx\n", offset+(p-sp));
695 head[nhead++] = offset+(p-sp);
697 if(u32(p) == ArenaMagic && ntail < nelem(tail)){
698 tail[ntail++] = offset+(p-sp);
700 print("arena tail at %#llx\n", offset+(p-sp));
703 if(nhead == nelem(head) && ntail == nelem(tail))
706 if(nhead < 3 && ntail < 3)
707 sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail);
710 * Arena size is likely the most common
711 * inter-head or inter-tail spacing.
714 for(i=1; i<nhead; i++)
715 diff[ndiff++] = head[i] - head[i-1];
716 for(i=1; i<ntail; i++)
717 diff[ndiff++] = tail[i] - tail[i-1];
718 qsort(diff, ndiff, sizeof diff[0], vlongcmp);
721 for(i=1, n=1; i<=ndiff; i++, n++){
722 if(i==ndiff || diff[i] != diff[i-1]){
725 bestdiff = diff[i-1];
730 print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
731 if(arenasize != 0 && arenasize != bestdiff)
732 print("using user-specified size %z instead\n", arenasize);
734 arenasize = bestdiff;
737 * The arena tail for an arena is arenasize-blocksize from the head.
740 for(i=j=0; i<nhead && j<ntail; ){
741 if(tail[j] < head[i]){
745 if(tail[j] < head[i]+arenasize){
746 diff[ndiff++] = head[i]+arenasize - tail[j];
753 sysfatal("too few intact arenas: %d head, tail pairs", ndiff);
754 qsort(diff, ndiff, sizeof diff[0], vlongcmp);
757 for(i=1, n=1; i<=ndiff; i++, n++){
758 if(i==ndiff || diff[i] != diff[i-1]){
761 bestdiff = diff[i-1];
766 print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
767 if(ap.blocksize != 0 && ap.blocksize != bestdiff)
768 print("using user-specified size %z instead\n", (vlong)ap.blocksize);
770 ap.blocksize = bestdiff;
771 if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1))
772 sysfatal("block size not a power of two");
773 if(ap.blocksize > MaxDiskBlock)
774 sysfatal("block size too big (max=%d)", MaxDiskBlock);
777 * Use head/tail information to deduce arena base.
780 for(i=0; i<nhead; i++)
781 diff[ndiff++] = head[i]%arenasize;
782 for(i=0; i<ntail; i++)
783 diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize;
784 qsort(diff, ndiff, sizeof diff[0], vlongcmp);
787 for(i=1, n=1; i<=ndiff; i++, n++){
788 if(i==ndiff || diff[i] != diff[i-1]){
791 bestdiff = diff[i-1];
796 ap.arenabase = bestdiff;
799 ap.tabbase = ROUNDUP(PartBlank+HeadSize, ap.blocksize);
801 * XXX pick up table, check arenabase.
802 * XXX pick up table, record base name.
806 * Somewhat standard computation.
807 * Fmtarenas used to use 64k tab, now uses 512k tab.
809 if(ap.arenabase == 0){
810 print("trying standard arena bases...\n");
811 for(i=0; i<nelem(tabsizes); i++){
812 ap.arenabase = ROUNDUP(PartBlank+HeadSize+tabsizes[i], ap.blocksize);
813 p = pagein(ap.arenabase, Block);
814 if(u32(p) == ArenaHeadMagic)
818 p = pagein(ap.arenabase, Block);
819 print("arena base likely %z%s\n", (vlong)ap.arenabase,
820 u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : "");
822 ap.tabsize = ap.arenabase - ap.tabbase;
826 * Check the arena partition blocks and then the arenas listed in range.
829 checkarenas(char *range)
832 int i, lo, hi, narena;
833 uchar dbuf[HeadSize];
838 partend -= partend%ap.blocksize;
840 memset(dbuf, 0, sizeof dbuf);
841 packarenapart(&ap, dbuf);
842 p = pagein(PartBlank, Block);
843 if(memcmp(p, dbuf, HeadSize) != 0){
844 print("on-disk arena part superblock incorrect\n");
845 showdiffs(dbuf, p, HeadSize, partinfo);
847 memmove(p, dbuf, HeadSize);
849 narena = (partend-ap.arenabase + arenasize-1)/arenasize;
851 for(i=0; i<narena; i++)
852 checkarena(ap.arenabase+(vlong)i*arenasize, i);
853 }else if(strcmp(range, "none") == 0){
856 /* parse, e.g., -4,8-9,10- */
857 for(s=range; *s; s=t){
866 lo = strtol(s, &s, 0);
873 hi = strtol(s, &s, 0);
876 print("bad arena range: %s\n", s);
879 for(i=lo; i<=hi; i++)
880 checkarena(ap.arenabase+(vlong)i*arenasize, i);
886 * Is there a clump here at p?
889 isclump(uchar *p, Clump *cl, u32int *pmagic)
893 uchar score[VtScoreSize], *bp;
903 cl->info.type = vtfromdisktype(*p);
904 if(cl->info.type == 0xFF)
907 cl->info.size = u16(p);
909 cl->info.uncsize = u16(p);
910 if(cl->info.size > cl->info.uncsize)
913 scorecp(cl->info.score, p);
917 cl->creator = u32(p);
922 switch(cl->encoding){
924 if(cl->info.size != cl->info.uncsize)
926 scoremem(score, p, cl->info.size);
927 if(scorecmp(score, cl->info.score) != 0)
931 if(cl->info.size >= cl->info.uncsize)
934 n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size);
935 if(n != cl->info.uncsize)
937 scoremem(score, ubuf, cl->info.uncsize);
938 if(scorecmp(score, cl->info.score) != 0)
946 /* it all worked out in the end */
952 * All ClumpInfos seen in this arena.
953 * Kept in binary tree so we can look up by score.
955 typedef struct Cit Cit;
975 ltreewalk(int *p, uchar *score)
982 i = scorecmp(cibuf[*p].ci.score, score);
986 p = &cibuf[*p].right;
993 addcibuf(ClumpInfo *ci, vlong corrupt)
997 if(ncibuf == mcibuf){
999 cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]);
1001 cit = &cibuf[ncibuf];
1005 cit->corrupt = corrupt;
1007 *ltreewalk(&ciroot, ci->score) = ncibuf;
1012 addcicorrupt(vlong len)
1014 static ClumpInfo zci;
1016 addcibuf(&zci, len);
1020 haveclump(uchar *score)
1029 i = scorecmp(cibuf[p].ci.score, score);
1040 matchci(ClumpInfo *ci, uchar *p)
1042 if(ci->type != vtfromdisktype(p[0]))
1044 if(ci->size != u16(p+1))
1046 if(ci->uncsize != u16(p+3))
1048 if(scorecmp(ci->score, p+5) != 0)
1054 sealedarena(uchar *p, int blocksize)
1070 print("arena tail says not sealed\n");
1073 if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){
1074 print("arena tail followed by non-zero data\n");
1077 if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){
1078 print("arena score zero\n");
1085 okayname(char *name, int n)
1089 if(nameok(name) < 0)
1091 sprint(buf, "%d", n);
1094 if(strlen(name) < strlen(buf)
1095 || strcmp(name+strlen(name)-strlen(buf), buf) != 0)
1101 clumpinfocmp(ClumpInfo *a, ClumpInfo *b)
1103 if(a->type != b->type)
1104 return a->type - b->type;
1105 if(a->size != b->size)
1106 return a->size - b->size;
1107 if(a->uncsize != b->uncsize)
1108 return a->uncsize - b->uncsize;
1109 return scorecmp(a->score, b->score);
1113 loadci(vlong offset, Arena *arena, int nci)
1117 ClumpInfo *bci, *ci;
1119 per = arena->blocksize/ClumpInfoSize;
1120 bci = vtmalloc(nci*sizeof bci[0]);
1122 offset += arena->size - arena->blocksize;
1124 for(i=0; i<nci; i+=per){
1126 sp = pagein(offset-4*M, 4*M);
1129 p -= arena->blocksize;
1130 offset -= arena->blocksize;
1131 for(j=0; j<per && i+j<nci; j++)
1132 unpackclumpinfo(ci++, p+j*ClumpInfoSize);
1138 writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci)
1143 per = arena->blocksize/ClumpInfoSize;
1144 offset += arena->size - arena->blocksize;
1146 for(i=0; i<nci; i+=per){
1148 sp = pagein(offset-4*M, 4*M);
1151 p -= arena->blocksize;
1152 offset -= arena->blocksize;
1153 memset(p, 0, arena->blocksize);
1154 for(j=0; j<per && i+j<nci; j++)
1155 packclumpinfo(ci++, p+j*ClumpInfoSize);
1162 loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena)
1164 char dname[ANameSize];
1165 static char lastbase[ANameSize];
1171 * Fmtarenas makes all arenas the same size
1172 * except the last, which may be smaller.
1173 * It uses the same block size for arenas as for
1174 * the arena partition blocks.
1176 arena->size = arenasize;
1177 if(offset0+arena->size > partend)
1178 arena->size = partend - offset0;
1179 head->size = arena->size;
1181 arena->blocksize = ap.blocksize;
1182 head->blocksize = arena->blocksize;
1185 * Look for clump magic and name in head/tail blocks.
1186 * All the other info we will reconstruct just in case.
1188 p = pagein(offset0, arena->blocksize);
1189 memset(&ohead, 0, sizeof ohead);
1190 if(unpackarenahead(&ohead, p) >= 0){
1191 head->version = ohead.version;
1192 head->clumpmagic = ohead.clumpmagic;
1193 if(okayname(ohead.name, anum))
1194 strcpy(head->name, ohead.name);
1197 p = pagein(offset0+arena->size-arena->blocksize,
1199 memset(&oarena, 0, sizeof oarena);
1200 if(unpackarena(&oarena, p) >= 0){
1201 arena->version = oarena.version;
1202 arena->clumpmagic = oarena.clumpmagic;
1203 if(okayname(oarena.name, anum))
1204 strcpy(arena->name, oarena.name);
1205 arena->diskstats.clumps = oarena.diskstats.clumps;
1206 print("old arena: sealed=%d\n", oarena.diskstats.sealed);
1207 arena->diskstats.sealed = oarena.diskstats.sealed;
1210 /* Head trumps arena. */
1212 arena->version = head->version;
1213 arena->clumpmagic = head->clumpmagic;
1215 if(arena->version == 0)
1216 arena->version = ArenaVersion5;
1219 snprint(arena->name, ANameSize, "%s", basename);
1221 snprint(arena->name, ANameSize, "%s%d", basename, anum);
1222 }else if(lastbase[0])
1223 snprint(arena->name, ANameSize, "%s%d", lastbase, anum);
1224 else if(head->name[0])
1225 strcpy(arena->name, head->name);
1226 else if(arena->name[0] == 0)
1227 sysfatal("cannot determine base name for arena; use -n");
1228 strcpy(lastbase, arena->name);
1229 sprint(dname, "%d", anum);
1230 lastbase[strlen(lastbase)-strlen(dname)] = 0;
1232 /* Was working in arena, now copy to head. */
1233 head->version = arena->version;
1234 memmove(head->name, arena->name, sizeof head->name);
1235 head->blocksize = arena->blocksize;
1236 head->size = arena->size;
1240 shahead(Shabuf *sb, vlong offset0, ArenaHead *head)
1242 uchar headbuf[MaxDiskBlock];
1244 sb->offset = offset0;
1245 memset(headbuf, 0, sizeof headbuf);
1246 packarenahead(head, headbuf);
1247 sbupdate(sb, headbuf, offset0, head->blocksize);
1251 newclumpmagic(int version)
1255 if(version == ArenaVersion4)
1259 }while(m==0 || m == _ClumpMagic);
1264 * Poke around in the arena to find the clump data
1265 * and compute the relevant statistics.
1268 guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena,
1269 uchar *oldscore, uchar *score)
1271 uchar dbuf[MaxDiskBlock];
1272 int needtozero, clumps, nb1, nb2, minclumps;
1273 int inbad, n, ncib, printed, sealing, smart;
1276 vlong boffset, eoffset, lastclumpend, leaked;
1277 vlong offset, toffset, totalcorrupt, v;
1279 ClumpInfo *bci, *ci, *eci, *xci;
1280 Cit *bcit, *cit, *ecit;
1281 Shabuf oldsha, newsha;
1284 * We expect to find an arena, with data, between offset
1285 * and offset+arenasize. With any luck, the data starts at
1286 * offset+ap.blocksize. The blocks have variable size and
1287 * aren't padded at all, which doesn't give us any alignment
1288 * constraints. The blocks are compressed or high entropy,
1289 * but the headers are pretty low entropy (except the score):
1291 * type[1] (range 0 thru 9, 13)
1293 * uncsize[2] (<= size)
1295 * so we can look for these. We check the scores as we go,
1296 * so we can't make any wrong turns. If we find ourselves
1297 * in a dead end, scan forward looking for a new start.
1301 memset(head, 0, sizeof *head);
1302 memset(arena, 0, sizeof *arena);
1303 memset(oldscore, 0, VtScoreSize);
1304 memset(score, 0, VtScoreSize);
1305 memset(&oldsha, 0, sizeof oldsha);
1306 memset(&newsha, 0, sizeof newsha);
1307 newsha.rollback = 1;
1310 sbdebug(&oldsha, "old.sha");
1311 sbdebug(&newsha, "new.sha");
1314 loadarenabasics(offset0, anum, head, arena);
1316 /* start the clump hunt */
1321 boffset = offset0 + arena->blocksize;
1323 eoffset = offset0+arena->size - arena->blocksize;
1325 sp = pagein(offset0, 4*M);
1327 if(arena->diskstats.sealed){
1328 oldsha.offset = offset0;
1329 sbupdate(&oldsha, sp, offset0, 4*M);
1332 p = sp + (boffset - offset0);
1333 ncib = arena->blocksize / ClumpInfoSize; /* ci per block in index */
1334 lastclumpend = offset;
1339 while(offset < eoffset){
1341 * Shift buffer if we're running out of room.
1345 * Start the post SHA1 buffer. By now we should know the
1346 * clumpmagic and arena version, so we can create a
1347 * correct head block to get things going.
1349 if(sealing && fix && newsha.offset == 0){
1350 newsha.offset = offset0;
1351 if(arena->clumpmagic == 0){
1352 if(arena->version == 0)
1353 arena->version = ArenaVersion5;
1354 arena->clumpmagic = newclumpmagic(arena->version);
1356 head->clumpmagic = arena->clumpmagic;
1357 shahead(&newsha, offset0, head);
1361 sbdiskhash(&newsha, bufoffset);
1362 sbupdate(&newsha, buf, bufoffset, 4*M-256*K);
1364 pagein(bufoffset+n, 4*M);
1366 if(arena->diskstats.sealed)
1367 sbupdate(&oldsha, buf, bufoffset, 4*M);
1371 * Check for a clump at p, which is at offset in the disk.
1372 * Duplicate clumps happen in corrupted disks
1373 * (the same pattern gets written many times in a row)
1374 * and should never happen during regular use.
1377 if((n = isclump(p, &cl, &magic)) > 0){
1379 * If we were in the middle of some corrupted data,
1380 * flush a warning about it and then add any clump
1381 * info blocks as necessary.
1385 v = offset-lastclumpend;
1387 zerorange(lastclumpend, v);
1388 sbrollback(&newsha, lastclumpend);
1389 print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n",
1390 lastclumpend, v, v);
1394 nb1 = (minclumps+ncib-1)/ncib;
1395 minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize);
1396 nb2 = (minclumps+ncib-1)/ncib;
1397 eoffset -= (nb2-nb1)*arena->blocksize;
1400 if(haveclump(cl.info.score))
1401 print("warning: duplicate clump %d %V at %#llux+%#d\n", cl.info.type, cl.info.score, offset, n);
1404 * If clumps use different magic numbers, we don't care.
1405 * We'll just use the first one we find and make the others
1408 if(arena->clumpmagic == 0){
1409 print("clump type %d size %d score %V magic %x\n",
1410 cl.info.type, cl.info.size, cl.info.score, magic);
1411 arena->clumpmagic = magic;
1412 if(magic == _ClumpMagic)
1413 arena->version = ArenaVersion4;
1415 arena->version = ArenaVersion5;
1417 if(magic != arena->clumpmagic)
1418 p32(p, arena->clumpmagic);
1420 arena->ctime = cl.time;
1423 * Record the clump, update arena stats,
1424 * grow clump info blocks if needed.
1427 print("\tclump %d: %d %V at %#llux+%#ux (%d)\n",
1428 clumps, cl.info.type, cl.info.score, offset, n, n);
1429 addcibuf(&cl.info, 0);
1430 if(minclumps%ncib == 0)
1431 eoffset -= arena->blocksize;
1434 if(cl.encoding != ClumpENone)
1435 arena->diskstats.cclumps++;
1436 arena->diskstats.uncsize += cl.info.uncsize;
1437 arena->wtime = cl.time;
1440 * Move to next clump.
1444 lastclumpend = offset;
1447 * Overwrite malformed clump data with zeros later.
1448 * For now, just record whether it needs to be overwritten.
1449 * Bad regions must be of size at least ClumpSize.
1450 * Postponing the overwriting keeps us from writing past
1451 * the end of the arena data (which might be directory data)
1457 if(memcmp(p, zero, ClumpSize) != 0)
1460 offset += ClumpSize;
1473 print("readable clumps: %d; min. directory entries: %d\n",
1475 arena->diskstats.used = lastclumpend - boffset;
1476 leaked = eoffset - lastclumpend;
1478 print("used from %#llux to %#llux = %,lld (%,lld unused)\n",
1479 boffset, lastclumpend, arena->diskstats.used, leaked);
1482 * Finish the SHA1 of the old data.
1484 if(arena->diskstats.sealed){
1485 sbdiskhash(&oldsha, toffset);
1486 readdisk(dbuf, toffset, arena->blocksize);
1487 scorecp(dbuf+arena->blocksize-VtScoreSize, zero);
1488 sbupdate(&oldsha, dbuf, toffset, arena->blocksize);
1489 sbscore(&oldsha, oldscore);
1493 * If we still don't know the clump magic, the arena
1494 * must be empty. It still needs a value, so make
1497 if(arena->version == 0)
1498 arena->version = ArenaVersion5;
1499 if(arena->clumpmagic == 0){
1500 if(arena->version == ArenaVersion4)
1501 arena->clumpmagic = _ClumpMagic;
1504 arena->clumpmagic = fastrand();
1505 while(arena->clumpmagic==_ClumpMagic
1506 ||arena->clumpmagic==0);
1508 head->clumpmagic = arena->clumpmagic;
1512 * Guess at number of clumpinfo blocks to load.
1513 * If we guess high, it's no big deal. If we guess low,
1514 * we'll be forced into rewriting the whole directory.
1515 * Still not such a big deal.
1517 if(clumps == 0 || arena->diskstats.used == totalcorrupt)
1519 if(clumps < arena->diskstats.clumps)
1520 clumps = arena->diskstats.clumps;
1523 clumps += totalcorrupt/
1524 ((arena->diskstats.used - totalcorrupt)/clumps);
1525 clumps += totalcorrupt/2000;
1526 if(clumps < minclumps)
1529 clumps -= clumps%ncib;
1532 * Can't write into the actual data.
1534 v = offset0 + arena->size - arena->blocksize;
1535 v -= (clumps+ncib-1)/ncib * arena->blocksize;
1536 if(v < lastclumpend){
1537 v = offset0 + arena->size - arena->blocksize;
1538 clumps = (v-lastclumpend)/arena->blocksize * ncib;
1541 if(clumps < minclumps)
1542 print("cannot happen?\n");
1545 * Check clumpinfo blocks against directory we created.
1546 * The tricky part is handling the corrupt sections of arena.
1547 * If possible, we re-mark just the affected directory entries
1548 * rather than slide everything down.
1550 * Allocate clumps+1 directory entries and check that we don't need
1551 * the last one at the end.
1553 bci = loadci(offset0, arena, clumps+1);
1556 ecit = cibuf+ncibuf;
1558 smart = 0; /* Somehow the smart code doesn't do corrupt clumps right. */
1562 for(cit=bcit; cit<ecit && ci<eci; cit++){
1567 * If we can, just mark existing entries as corrupt.
1570 for(xci=ci; n>0 && xci<eci; xci++)
1571 n -= ClumpSize+xci->size;
1572 if(n > 0 || xci >= eci)
1575 for(; ci<xci; ci++){
1576 if(verbose && ci->type != VtCorruptType){
1578 print("marking directory %d-%d as corrupt\n",
1579 (int)(ci-bci), (int)(xci-bci));
1582 print("\ttype=%d size=%d uncsize=%d score=%V\n",
1583 ci->type, ci->size, ci->uncsize, ci->score);
1585 ci->type = VtCorruptType;
1589 print("\trewriting clump directory\n");
1591 * Otherwise, blaze a new trail.
1594 while(n > 0 && ci < eci){
1596 sysfatal("bad math in clump corrupt");
1597 if(n <= VtMaxLumpSize+ClumpSize)
1600 m = VtMaxLumpSize+ClumpSize;
1604 ci->type = VtCorruptType;
1605 ci->size = m-ClumpSize;
1606 ci->uncsize = m-ClumpSize;
1607 memset(ci->score, 0, VtScoreSize);
1614 if(clumpinfocmp(&cit->ci, ci) != 0){
1615 if(verbose && (smart || verbose>1)){
1616 print("clumpinfo %d\n", (int)(ci-bci));
1617 print("\twant: %d %d %d %V\n",
1618 cit->ci.type, cit->ci.size,
1619 cit->ci.uncsize, cit->ci.score);
1620 print("\thave: %d %d %d %V\n",
1622 ci->uncsize, ci->score);
1629 if(ci >= eci || cit < ecit){
1630 print("ran out of space editing existing directory; rewriting\n");
1631 print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit);
1632 assert(smart); /* can't happen second time thru */
1638 arena->diskstats.clumps = ci-bci;
1639 eoffset = writeci(offset0, arena, bci, ci-bci);
1641 sbrollback(&newsha, v);
1642 print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal);
1643 if(lastclumpend > eoffset)
1644 print("arena directory overwrote blocks! cannot happen!\n");
1647 print("arena directory has %d bad or missing entries\n", nbad);
1649 if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){
1650 if(arena->diskstats.sealed)
1651 print("unsealing arena\n");
1653 memset(oldscore, 0, VtScoreSize);
1657 * Finish the SHA1 of the new data - only meaningful
1658 * if we've been writing to disk (`fix').
1660 arena->diskstats.sealed = sealing;
1661 arena->memstats = arena->diskstats;
1663 uchar tbuf[MaxDiskBlock];
1665 sbdiskhash(&newsha, toffset);
1666 memset(tbuf, 0, sizeof tbuf);
1667 packarena(arena, tbuf);
1668 sbupdate(&newsha, tbuf, toffset, arena->blocksize);
1669 sbscore(&newsha, score);
1674 dumparena(vlong offset, int anum, Arena *arena)
1680 snprint(buf, sizeof buf, "%s.%d", dumpbase, anum);
1681 if((fd = create(buf, OWRITE, 0666)) < 0){
1682 fprint(2, "create %s: %r\n", buf);
1685 e = offset+arena->size;
1686 for(o=offset; o<e; o+=n){
1690 if(pwrite(fd, pagein(o, n), n, o-offset) != n){
1691 fprint(2, "write %s at %#llux: %r\n", buf, o-offset);
1698 checkarena(vlong offset, int anum)
1700 uchar dbuf[MaxDiskBlock];
1701 uchar *p, oldscore[VtScoreSize], score[VtScoreSize];
1702 Arena arena, oarena;
1707 print("# arena %d: offset %#llux\n", anum, offset);
1709 if(offset >= partend){
1710 print("arena offset out of bounds\n");
1714 guessarena(offset, anum, &head, &arena, oldscore, score);
1717 print("#\tversion=%d name=%s blocksize=%d size=%z",
1718 head.version, head.name, head.blocksize, head.size);
1720 print(" clumpmagic=%#.8ux", head.clumpmagic);
1721 print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n",
1722 arena.diskstats.clumps, arena.diskstats.cclumps,
1723 arena.diskstats.used, arena.diskstats.uncsize);
1724 print("#\tctime=%t\n", arena.ctime);
1725 print("#\twtime=%t\n", arena.wtime);
1726 if(arena.diskstats.sealed)
1727 print("#\tsealed score=%V\n", score);
1731 dumparena(offset, anum, &arena);
1735 memset(dbuf, 0, sizeof dbuf);
1736 packarenahead(&head, dbuf);
1737 p = pagein(offset, arena.blocksize);
1738 if(memcmp(dbuf, p, arena.blocksize) != 0){
1739 print("on-disk arena header incorrect\n");
1740 showdiffs(dbuf, p, arena.blocksize,
1741 arena.version==ArenaVersion4 ? headinfo4 : headinfo5);
1743 memmove(p, dbuf, arena.blocksize);
1745 memset(dbuf, 0, sizeof dbuf);
1746 packarena(&arena, dbuf);
1747 if(arena.diskstats.sealed)
1748 scorecp(dbuf+arena.blocksize-VtScoreSize, score);
1749 p = pagein(offset+arena.size-arena.blocksize, arena.blocksize);
1750 memset(&oarena, 0, sizeof oarena);
1751 unpackarena(&oarena, p);
1752 if(arena.version == ArenaVersion4){
1763 if(oarena.diskstats.sealed){
1765 * some arenas were sealed with the extension
1766 * before we adopted the convention that if it didn't
1767 * add new information it gets dropped.
1769 _packarena(&arena, dbuf, 1);
1772 if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){
1773 print("on-disk arena tail incorrect\n");
1774 showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt);
1776 if(arena.diskstats.sealed){
1777 if(oarena.diskstats.sealed)
1778 if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){
1779 print("on-disk arena seal score incorrect\n");
1780 print("\tcorrect=%V\n", oldscore);
1781 print("\t disk=%V\n", p+arena.blocksize-VtScoreSize);
1783 if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){
1784 print("%ssealing arena%s: %V\n",
1785 oarena.diskstats.sealed ? "re" : "",
1786 scorecmp(oldscore, score) == 0 ?
1787 "" : " after changes", score);
1790 memmove(p, dbuf, arena.blocksize);
1804 an = vtmallocz(sizeof *an);
1805 for(o=ap.arenabase; o<partend; o+=arenasize){
1806 p = pagein(o, Block);
1807 if(unpackarenahead(&h, p) >= 0){
1808 an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]);
1809 m = &an->map[an->n++];
1812 strcpy(m->name, h.name);
1829 fmtprint(&fmt, "%ud\n", an->n);
1830 for(i=0; i<an->n; i++)
1831 fmtprint(&fmt, "%s\t%lld\t%lld\n",
1832 an->map[i].name, an->map[i].start, an->map[i].stop);
1833 s = fmtstrflush(&fmt);
1835 if(len > ap.tabsize){
1836 print("arena partition map too long: need %z bytes have %z\n",
1837 (vlong)len, (vlong)ap.tabsize);
1841 if(ap.tabsize >= 4*M){ /* can't happen - max arenas is 2000 */
1842 print("arena partition map *way* too long\n");
1846 p = pagein(ap.tabbase, ap.tabsize);
1847 if(memcmp(p, s, len) != 0){
1848 print("arena partition map incorrect; rewriting.\n");
1854 int mainstacksize = 512*1024;
1857 threadmain(int argc, char **argv)
1868 arenasize = unittoull(EARGF(usage()));
1871 ap.blocksize = unittoull(EARGF(usage()));
1879 basename = EARGF(usage());
1885 dumpbase = EARGF(usage());
1891 if(argc != 1 && argc != 2)
1897 fmtinstall('z', zfmt);
1898 fmtinstall('t', tfmt);
1901 part = initpart(file, mode|ODIRECT);
1903 sysfatal("can't open %s: %r", file);
1904 partend = part->size;
1908 threadexitsall(nil);
1910 checkarenas(argc > 1 ? argv[1] : nil);
1912 threadexitsall(nil);