2 * Manage tree of VtFiles stored in the block cache.
4 * The single point of truth for the info about the VtFiles themselves
5 * is the block data. Because of this, there is no explicit locking of
6 * VtFile structures, and indeed there may be more than one VtFile
7 * structure for a given Venti file. They synchronize through the
10 * This is a bit simpler than fossil because there are no epochs
11 * or tags or anything else. Just mutable local blocks and immutable
24 static char EBadEntry[] = "bad VtEntry";
25 static char ENotDir[] = "walk in non-directory";
26 static char ETooBig[] = "file too big";
27 static char EBadAddr[] = "bad address";
28 static char ELabelMismatch[] = "label mismatch";
30 static int sizetodepth(uvlong s, int psize, int dsize);
31 static VtBlock *fileload(VtFile *r, VtEntry *e);
32 static int shrinkdepth(VtFile*, VtBlock*, VtEntry*, int);
33 static int shrinksize(VtFile*, VtEntry*, uvlong);
34 static int growdepth(VtFile*, VtBlock*, VtEntry*, int);
36 #define ISLOCKED(r) ((r)->b != nil)
37 #define DEPTH(t) ((t)&VtTypeDepthMask)
40 vtfilealloc(VtCache *c, VtBlock *b, VtFile *p, u32int offset, int mode)
47 assert(p==nil || ISLOCKED(p));
53 epb = p->dsize / VtEntrySize;
55 if(b->type != VtDirType)
59 * a non-active entry is the only thing that
60 * can legitimately happen here. all the others
63 if(vtentryunpack(&e, b->data, offset % epb) < 0){
64 fprint(2, "vtentryunpack failed\n");
67 if(!(e.flags & VtEntryActive)){
68 if(0)fprint(2, "not active\n");
71 if(e.psize < 256 || e.dsize < 256){
72 fprint(2, "psize %ud dsize %ud\n", e.psize, e.dsize);
76 if(DEPTH(e.type) < sizetodepth(e.size, e.psize, e.dsize)){
77 fprint(2, "depth %ud size %llud psize %ud dsize %ud\n",
78 DEPTH(e.type), e.size, e.psize, e.dsize);
82 size = vtcacheblocksize(c);
83 if(e.dsize > size || e.psize > size){
84 fprint(2, "psize %ud dsize %ud blocksize %ud\n", e.psize, e.dsize, size);
88 r = vtmallocz(sizeof(VtFile));
93 r->dir = (e.type & VtTypeBaseMask) == VtDirType;
98 assert(mode == VtOREAD || p->mode == VtORDWR);
102 memmove(r->score, b->score, VtScoreSize);
114 vtfileroot(VtCache *c, u32int addr, int mode)
119 b = vtcachelocal(c, addr, VtDirType);
123 r = vtfilealloc(c, b, nil, 0, mode);
129 vtfileopenroot(VtCache *c, VtEntry *e)
134 b = vtcacheallocblock(c, VtDirType);
138 vtentrypack(e, b->data, 0);
139 f = vtfilealloc(c, b, nil, 0, VtORDWR);
145 vtfilecreateroot(VtCache *c, int psize, int dsize, int type)
149 memset(&e, 0, sizeof e);
150 e.flags = VtEntryActive;
154 memmove(e.score, vtzeroscore, VtScoreSize);
156 return vtfileopenroot(c, &e);
160 vtfileopen(VtFile *r, u32int offset, int mode)
171 bn = offset/(r->dsize/VtEntrySize);
173 b = vtfileblock(r, bn, mode);
176 r = vtfilealloc(r->c, b, r, offset, mode);
182 vtfilecreate(VtFile *r, int psize, int dsize, int dir)
199 epb = r->dsize/VtEntrySize;
201 size = vtfilegetdirsize(r);
203 * look at a random block to see if we can find an empty entry
205 offset = lnrand(size+1);
206 offset -= offset % epb;
208 /* try the given block and then try the last block */
211 b = vtfileblock(r, bn, VtORDWR);
214 for(i=offset%r->epb; i<epb; i++){
215 if(vtentryunpack(&e, b->data, i) < 0)
217 if((e.flags&VtEntryActive) == 0 && e.gen != ~0)
222 fprint(2, "vtfilecreate: cannot happen\n");
223 werrstr("vtfilecreate: cannot happen");
230 /* found an entry - gen already set */
233 e.flags = VtEntryActive;
234 e.type = dir ? VtDirType : VtDataType;
236 memmove(e.score, vtzeroscore, VtScoreSize);
237 vtentrypack(&e, b->data, i);
241 if(vtfilesetdirsize(r, offset+1) < 0){
247 rr = vtfilealloc(r->c, b, r, offset, VtORDWR);
253 vtfilekill(VtFile *r, int doremove)
263 if(doremove==0 && e.size == 0){
264 /* already truncated */
276 e.flags &= ~VtEntryLocal;
279 memmove(e.score, vtzeroscore, VtScoreSize);
280 vtentrypack(&e, b->data, r->offset % r->epb);
292 vtfileremove(VtFile *r)
294 return vtfilekill(r, 1);
298 vtfiletruncate(VtFile *r)
300 return vtfilekill(r, 0);
304 vtfilegetsize(VtFile *r)
319 shrinksize(VtFile *r, VtEntry *e, uvlong size)
321 int i, depth, type, isdir, ppb;
323 uchar score[VtScoreSize];
326 b = vtcacheglobal(r->c, e->score, e->type);
331 ppb = e->psize/VtScoreSize;
334 for(i=0; i+1<depth; i++)
339 if(b->addr == NilBlock){
340 /* not worth copying the block just so we can zero some of it */
346 * invariant: each pointer in the tree rooted at b accounts for ptrsz bytes
349 /* zero the pointers to unnecessary blocks */
350 i = (size+ptrsz-1)/ptrsz;
352 memmove(b->data+i*VtScoreSize, vtzeroscore, VtScoreSize);
354 /* recurse (go around again) on the partially necessary block */
363 memmove(score, b->data+i*VtScoreSize, VtScoreSize);
365 b = vtcacheglobal(r->c, score, type);
370 if(b->addr == NilBlock){
376 * No one ever truncates BtDir blocks.
378 if(depth==0 && !isdir && e->dsize > size)
379 memset(b->data+size, 0, e->dsize-size);
385 vtfilesetsize(VtFile *r, uvlong size)
393 return vtfiletruncate(r);
395 if(size > VtMaxFileSize || size > ((uvlong)MaxBlock)*r->dsize){
410 depth = sizetodepth(size, e.psize, e.dsize);
411 edepth = DEPTH(e.type);
413 if(shrinkdepth(r, b, &e, depth) < 0){
417 }else if(depth > edepth){
418 if(growdepth(r, b, &e, depth) < 0){
425 shrinksize(r, &e, size);
428 vtentrypack(&e, b->data, r->offset % r->epb);
435 vtfilesetdirsize(VtFile *r, u32int ds)
441 epb = r->dsize/VtEntrySize;
443 size = (uvlong)r->dsize*(ds/epb);
444 size += VtEntrySize*(ds%epb);
445 return vtfilesetsize(r, size);
449 vtfilegetdirsize(VtFile *r)
456 epb = r->dsize/VtEntrySize;
458 size = vtfilegetsize(r);
459 ds = epb*(size/r->dsize);
460 ds += (size%r->dsize)/VtEntrySize;
465 vtfilegetentry(VtFile *r, VtEntry *e)
479 vtfilesetentry(VtFile *r, VtEntry *e)
485 b = fileload(r, &ee);
488 vtentrypack(e, b->data, r->offset % r->epb);
494 blockwalk(VtBlock *p, int index, VtCache *c, int mode, VtEntry *e)
510 score = p->data+index*VtScoreSize;
513 //print("walk from %V/%d ty %d to %V ty %d\n", p->score, index, p->type, score, type);
515 if(mode == VtOWRITE && vtglobaltolocal(score) == NilBlock){
516 b = vtcacheallocblock(c, type);
520 b = vtcacheglobal(c, score, type);
522 if(b == nil || mode == VtOREAD)
525 if(vtglobaltolocal(b->score) != NilBlock)
533 e->flags |= VtEntryLocal;
535 b = vtblockcopy(b/*, e->tag, fs->ehi, fs->elo*/);
540 if(p->type == VtDirType){
541 memmove(e->score, b->score, VtScoreSize);
542 vtentrypack(e, p->data, index);
544 memmove(p->data+index*VtScoreSize, b->score, VtScoreSize);
550 * Change the depth of the VtFile r.
551 * The entry e for r is contained in block p.
554 growdepth(VtFile *r, VtBlock *p, VtEntry *e, int depth)
560 assert(depth <= VtPointerDepth);
562 b = vtcacheglobal(r->c, e->score, e->type);
569 * Keep adding layers until we get to the right depth
570 * or an error occurs.
572 while(DEPTH(e->type) < depth){
573 bb = vtcacheallocblock(r->c, e->type+1);
576 memmove(bb->data, b->score, VtScoreSize);
577 memmove(e->score, bb->score, VtScoreSize);
579 e->flags |= VtEntryLocal;
584 vtentrypack(e, p->data, r->offset % r->epb);
587 if(DEPTH(e->type) == depth)
593 shrinkdepth(VtFile *r, VtBlock *p, VtEntry *e, int depth)
595 VtBlock *b, *nb, *ob, *rb;
599 assert(depth <= VtPointerDepth);
601 rb = vtcacheglobal(r->c, e->score, e->type);
606 * Walk down to the new root block.
607 * We may stop early, but something is better than nothing.
613 for(; DEPTH(e->type) > depth; e->type--){
614 nb = vtcacheglobal(r->c, b->data, e->type-1);
617 if(ob!=nil && ob!=rb)
629 * Right now, e points at the root block rb, b is the new root block,
630 * and ob points at b. To update:
632 * (i) change e to point at b
633 * (ii) zero the pointer ob -> b
634 * (iii) free the root block
636 * p (the block containing e) must be written before
641 memmove(e->score, b->score, VtScoreSize);
642 vtentrypack(e, p->data, r->offset % r->epb);
645 memmove(ob->data, vtzeroscore, VtScoreSize);
649 if(ob!=nil && ob!=rb)
653 if(DEPTH(e->type) == depth)
659 mkindices(VtEntry *e, u32int bn, int *index)
665 memset(index, 0, VtPointerDepth*sizeof(int));
667 np = e->psize/VtScoreSize;
668 for(i=0; bn > 0; i++){
669 if(i >= VtPointerDepth){
670 werrstr("bad address 0x%lux", (ulong)bn);
680 vtfileblock(VtFile *r, u32int bn, int mode)
683 int index[VtPointerDepth+1];
689 assert(bn != NilBlock);
695 i = mkindices(&e, bn, index);
698 if(i > DEPTH(e.type)){
700 werrstr("bad address 0x%lux", (ulong)bn);
704 if(growdepth(r, b, &e, i) < 0)
708 assert(b->type == VtDirType);
710 index[DEPTH(e.type)] = r->offset % r->epb;
712 /* mode for intermediate block */
717 for(i=DEPTH(e.type); i>=0; i--){
718 bb = blockwalk(b, index[i], r->c, i==0 ? mode : m, &e);
731 vtfileblockscore(VtFile *r, u32int bn, uchar score[VtScoreSize])
734 int index[VtPointerDepth+1];
739 assert(bn != NilBlock);
745 i = mkindices(&e, bn, index);
750 if(i > DEPTH(e.type)){
751 memmove(score, vtzeroscore, VtScoreSize);
756 index[DEPTH(e.type)] = r->offset % r->epb;
758 for(i=DEPTH(e.type); i>=1; i--){
759 bb = blockwalk(b, index[i], r->c, VtOREAD, &e);
764 if(memcmp(b->score, vtzeroscore, VtScoreSize) == 0)
768 memmove(score, b->data+index[0]*VtScoreSize, VtScoreSize);
773 fprint(2, "vtfileblockhash: %r\n");
779 vtfileincref(VtFile *r)
787 vtfileclose(VtFile *r)
800 vtfileclose(r->parent);
801 memset(r, ~0, sizeof(*r));
806 * Retrieve the block containing the entry for r.
807 * If a snapshot has happened, we might need
808 * to get a new copy of the block. We avoid this
809 * in the common case by caching the score for
810 * the block and the last epoch in which it was valid.
812 * We use r->mode to tell the difference between active
813 * file system VtFiles (VtORDWR) and VtFiles for the
814 * snapshot file system (VtOREAD).
817 fileloadblock(VtFile *r, int mode)
827 assert(r->mode == VtORDWR);
829 b = vtcacheglobal(r->c, r->score, VtDirType);
834 assert(r->parent != nil);
835 if(vtfilelock(r->parent, VtORDWR) < 0)
837 b = vtfileblock(r->parent, r->offset/r->epb, VtORDWR);
838 vtfileunlock(r->parent);
841 memmove(r->score, b->score, VtScoreSize);
847 werrstr("read/write lock of read-only file");
850 addr = vtglobaltolocal(r->score);
852 return vtcacheglobal(r->c, r->score, VtDirType);
854 b = vtcachelocal(r->c, addr, VtDirType);
859 * If it failed because the epochs don't match, the block has been
860 * archived and reclaimed. Rewalk from the parent and get the
861 * new pointer. This can't happen in the VtORDWR case
862 * above because blocks in the current epoch don't get
863 * reclaimed. The fact that we're VtOREAD means we're
864 * a snapshot. (Or else the file system is read-only, but then
865 * the archiver isn't going around deleting blocks.)
867 rerrstr(e, sizeof e);
868 if(strcmp(e, ELabelMismatch) == 0){
869 if(vtfilelock(r->parent, VtOREAD) < 0)
871 b = vtfileblock(r->parent, r->offset/r->epb, VtOREAD);
872 vtfileunlock(r->parent);
874 fprint(2, "vtfilealloc: lost %V found %V\n",
876 memmove(r->score, b->score, VtScoreSize);
885 vtfilelock(VtFile *r, int mode)
892 b = fileloadblock(r, mode);
896 * The fact that we are holding b serves as the
897 * lock entitling us to write to r->b.
905 * Lock two (usually sibling) VtFiles. This needs special care
906 * because the Entries for both vtFiles might be in the same block.
907 * We also try to lock blocks in left-to-right order within the tree.
910 vtfilelock2(VtFile *r, VtFile *rr, int mode)
915 return vtfilelock(r, mode);
920 if(r->parent==rr->parent && r->offset/r->epb == rr->offset/rr->epb){
921 b = fileloadblock(r, mode);
926 }else if(r->parent==rr->parent || r->offset > rr->offset){
927 bb = fileloadblock(rr, mode);
928 b = fileloadblock(r, mode);
930 b = fileloadblock(r, mode);
931 bb = fileloadblock(rr, mode);
933 if(b == nil || bb == nil){
942 * The fact that we are holding b and bb serves
943 * as the lock entitling us to write to r->b and rr->b.
951 vtfileunlock(VtFile *r)
956 fprint(2, "vtfileunlock: already unlocked\n");
965 fileload(VtFile *r, VtEntry *e)
971 if(vtentryunpack(e, b->data, r->offset % r->epb) < 0)
978 sizetodepth(uvlong s, int psize, int dsize)
983 /* determine pointer depth */
984 np = psize/VtScoreSize;
985 s = (s + dsize - 1)/dsize;
986 for(d = 0; s > 1; d++)
992 vtfileread(VtFile *f, void *data, long count, vlong offset)
1000 vtfilegetentry(f, &e);
1003 if(count < 0 || offset < 0){
1004 werrstr("vtfileread: bad offset or count");
1007 if(offset >= e.size)
1010 if(offset+count > e.size)
1011 count = e.size - offset;
1013 frag = offset % e.dsize;
1014 if(frag+count > e.dsize)
1015 count = e.dsize - frag;
1017 b = vtfileblock(f, offset/e.dsize, VtOREAD);
1021 memmove(data, b->data+frag, count);
1027 filewrite1(VtFile *f, void *data, long count, vlong offset)
1033 vtfilegetentry(f, &e);
1034 if(count < 0 || offset < 0){
1035 werrstr("vtfilewrite: bad offset or count");
1039 frag = offset % e.dsize;
1040 if(frag+count > e.dsize)
1041 count = e.dsize - frag;
1044 if(frag == 0 && count == e.dsize)
1047 b = vtfileblock(f, offset/e.dsize, m);
1051 memmove(b->data+frag, data, count);
1053 if(offset+count > e.size){
1054 vtfilegetentry(f, &e);
1055 e.size = offset+count;
1056 vtfilesetentry(f, &e);
1064 vtfilewrite(VtFile *f, void *data, long count, vlong offset)
1068 assert(ISLOCKED(f));
1073 m = filewrite1(f, (char*)data+tot, count-tot, offset+tot);
1084 flushblock(VtCache *c, VtBlock *bb, uchar score[VtScoreSize], int ppb, int epb,
1092 addr = vtglobaltolocal(score);
1093 if(addr == NilBlock)
1098 if(memcmp(b->score, score, VtScoreSize) != 0)
1101 if((b = vtcachelocal(c, addr, type)) == nil)
1109 for(i=0; i<epb; i++){
1110 if(vtentryunpack(&e, b->data, i) < 0)
1112 if(flushblock(c, nil, e.score, e.psize/VtScoreSize, e.dsize/VtEntrySize,
1118 default: /* VtPointerTypeX */
1119 for(i=0; i<ppb; i++){
1120 if(flushblock(c, nil, b->data+VtScoreSize*i, ppb, epb, type-1) < 0)
1126 if(vtblockwrite(b) < 0)
1128 memmove(score, b->score, VtScoreSize);
1140 vtfileflush(VtFile *f)
1146 assert(ISLOCKED(f));
1147 b = fileload(f, &e);
1148 if(!(e.flags&VtEntryLocal)){
1153 ret = flushblock(f->c, nil, e.score, e.psize/VtScoreSize, e.dsize/VtEntrySize,
1160 vtentrypack(&e, b->data, f->offset % f->epb);
1166 vtfileflushbefore(VtFile *r, u64int offset)
1170 int i, base, depth, ppb, epb, ok;
1171 int index[VtPointerDepth+1], index1[VtPointerDepth+1], j, ret;
1172 VtBlock *bi[VtPointerDepth+2];
1175 assert(ISLOCKED(r));
1179 b = fileload(r, &e);
1184 memset(bi, 0, sizeof bi);
1185 depth = DEPTH(e.type);
1187 i = mkindices(&e, (offset-1)/e.dsize, index);
1192 mkindices(&e, offset/e.dsize, index1);
1193 ppb = e.psize / VtScoreSize;
1194 epb = e.dsize / VtEntrySize;
1196 index[depth] = r->offset % r->epb;
1197 for(i=depth; i>=0; i--){
1198 bb = blockwalk(b, index[i], r->c, VtORDWR, &e);
1206 base = e.type&~VtTypeDepthMask;
1207 for(i=0; i<depth; i++){
1209 /* bottom: data or dir block */
1210 ok = offset%e.dsize == 0;
1212 /* middle: pointer blocks */
1215 * flush everything up to the break
1217 for(j=0; j<index[i-1]; j++)
1218 if(flushblock(r->c, nil, b->data+j*VtScoreSize, ppb, epb, base+i-1) < 0)
1221 * if the rest of the block is already flushed,
1222 * we can flush the whole block.
1226 if(vtglobaltolocal(b->data+j*VtScoreSize) != NilBlock)
1233 score = bi[i+1]->data+index[i]*VtScoreSize;
1234 if(flushblock(r->c, bi[i], score, ppb, epb, base+i) < 0)
1240 /* top: entry. do this always so that the score is up-to-date */
1241 vtentrypack(&e, bi[depth+1]->data, index[depth]);
1242 for(i=0; i<nelem(bi); i++)