/*
 * Forward declarations for file-local helpers.
 * NOTE(review): this file is an elided excerpt; every function below
 * is missing interior lines, so comments are kept to what is visible.
 */
7 static int sizeToDepth(uvlong s, int psize, int dsize);
8 static u32int tagGen(void);
9 static Block *sourceLoad(Source *r, Entry *e);
10 static int sourceShrinkDepth(Source*, Block*, Entry*, int);
11 static int sourceShrinkSize(Source*, Entry*, uvlong);
12 static int sourceGrowDepth(Source*, Block*, Entry*, int);
/* A source is "locked" when it holds a reference to its entry block r->b. */
14 #define sourceIsLocked(r) ((r)->b != nil)
/*
 * sourceAlloc: build a Source for the directory entry at slot
 * (offset % epb) inside block b of parent source p.  The packed
 * entry is validated before allocation: it must unpack, be active,
 * have sane psize/dsize (>= 256 and <= fs->blockSize), have a depth
 * consistent with its size, and — if local — carry a nonzero tag.
 * Error paths (elided here) appear to print via consPrint and bail.
 */
17 sourceAlloc(Fs *fs, Block *b, Source *p, u32int offset, int mode, int issnapshot)
25 	assert(p==nil || sourceIsLocked(p));
/* entries per block of the parent's data blocks */
31 		epb = p->dsize / VtEntrySize;
33 	if(b->l.type != BtDir)
37 	 * a non-active entry is the only thing that
38 	 * can legitimately happen here. all the others
41 	if(!entryUnpack(&e, b->data, offset % epb)){
42 		pname = sourceName(p);
43 		consPrint("%s: %s %V: sourceAlloc: entryUnpack failed\n",
44 			fs->name, pname, b->score);
47 	if(!(e.flags & VtEntryActive)){
48 		pname = sourceName(p);
/* deliberately silenced: inactive entries are expected (see comment above) */
49 		if(0) consPrint("%s: %s %V: sourceAlloc: not active\n",
50 			fs->name, pname, e.score);
53 	if(e.psize < 256 || e.dsize < 256){
54 		pname = sourceName(p);
55 		consPrint("%s: %s %V: sourceAlloc: psize %ud or dsize %ud < 256\n",
56 			fs->name, pname, e.score, e.psize, e.dsize);
/* depth must be at least what the claimed size requires */
60 	if(e.depth < sizeToDepth(e.size, e.psize, e.dsize)){
61 		pname = sourceName(p);
62 		consPrint("%s: %s %V: sourceAlloc: depth %ud size %llud "
63 			"psize %ud dsize %ud\n", fs->name, pname,
64 			e.score, e.depth, e.size, e.psize, e.dsize);
/* local blocks are addressed by tag; a zero tag is corrupt */
68 	if((e.flags & VtEntryLocal) && e.tag == 0){
69 		pname = sourceName(p);
70 		consPrint("%s: %s %V: sourceAlloc: flags %#ux tag %#ux\n",
71 			fs->name, pname, e.score, e.flags, e.tag);
75 	if(e.dsize > fs->blockSize || e.psize > fs->blockSize){
76 		pname = sourceName(p);
77 		consPrint("%s: %s %V: sourceAlloc: psize %ud or dsize %ud "
78 			"> blocksize %ud\n", fs->name, pname, e.score,
79 			e.psize, e.dsize, fs->blockSize);
80 		goto Bad;
84 	if(mode == OReadWrite){
/* presumably a snapshotted entry forces read-only access — elided; confirm */
89 	}else if(e.snap != 0){
99 	r = vtmallocz(sizeof(Source));
102 	r->issnapshot = issnapshot;
105 	r->dir = (e.flags & _VtEntryDir) != 0;
110 	assert(mode == OReadOnly || p->mode == OReadWrite);
115 	// consPrint("sourceAlloc: have %V be.%d fse.%d %s\n", b->score,
116 	//	b->l.epoch, r->fs->ehi, mode == OReadWrite? "rw": "ro");
/* cache the entry block's score/epoch for later revalidation */
117 	memmove(r->score, b->score, VtScoreSize);
118 	r->scoreEpoch = b->l.epoch;
123 	// consPrint("%s: sourceAlloc: %p -> %V %d\n", r, r->score, r->offset);
/*
 * sourceRoot: open the root source at local address addr.
 * For OReadWrite the root block must be in the current epoch.
 */
133 sourceRoot(Fs *fs, u32int addr, int mode)
138 	b = cacheLocalData(fs->cache, addr, BtDir, RootTag, mode, 0);
142 	if(mode == OReadWrite && b->l.epoch != fs->ehi){
143 		consPrint("sourceRoot: fs->ehi = %ud, b->l = %L\n",
/* root has no parent source, entry slot 0 */
150 	r = sourceAlloc(fs, b, nil, 0, mode, 0);
/*
 * sourceOpen: open the child source stored at directory slot
 * `offset` of directory source r.  Loads the data block holding
 * the entry, then delegates validation to sourceAlloc.
 */
156 sourceOpen(Source *r, ulong offset, int mode, int issnapshot)
161 	assert(sourceIsLocked(r));
162 	if(r->mode == OReadWrite)
163 		assert(r->epoch == r->b->l.epoch);
/* block number containing the requested entry */
169 	bn = offset/(r->dsize/VtEntrySize);
171 	b = sourceBlock(r, bn, mode);
174 	r = sourceAlloc(r->fs, b, r, offset, mode, issnapshot);
/*
 * sourceCreate: allocate a new entry (file or directory if dir!=0)
 * inside directory source r.  Probes a random block for a free
 * slot, falls back to the last block (elided), packs a fresh
 * entry, grows the directory size if needed, and returns the new
 * child via sourceAlloc.
 */
180 sourceCreate(Source *r, int dsize, int dir, u32int offset)
188 	assert(sourceIsLocked(r));
195 	epb = r->dsize/VtEntrySize;
/* pointer-block payload must be a whole number of scores */
196 	psize = (dsize/VtScoreSize)*VtScoreSize;
198 	size = sourceGetDirSize(r);
201 	 * look at a random block to see if we can find an empty entry
/* round the random slot down to a block boundary */
203 		offset = lnrand(size+1);
204 		offset -= offset % epb;
207 	/* try the given block and then try the last block */
210 	b = sourceBlock(r, bn, OReadWrite);
213 	for(i=offset%r->epb; i<epb; i++){
214 		entryUnpack(&e, b->data, i);
/* free slot: inactive and gen not exhausted (~0 means never reuse) */
215 		if((e.flags&VtEntryActive) == 0 && e.gen != ~0)
220 	fprint(2, "sourceCreate: cannot happen\n");
221 	werrstr("sourceCreate: cannot happen");
228 	/* found an entry - gen already set */
231 	assert(psize && dsize);
232 	e.flags = VtEntryActive;
234 		e.flags |= _VtEntryDir;
237 	memmove(e.score, vtzeroscore, VtScoreSize);
241 	entryPack(&e, b->data, i);
/* ensure the directory extends past the new slot */
246 	if(!sourceSetDirSize(r, offset+1)){
252 	rr = sourceAlloc(r->fs, b, r, offset, OReadWrite, 0);
/*
 * sourceKill: truncate (doremove==0) or remove (doremove!=0) the
 * storage behind source r.  Records the link being cut, zeroes the
 * entry's score, and lets blockRemoveLink reclaim the tree.
 */
258 sourceKill(Source *r, int doremove)
266 	assert(sourceIsLocked(r));
267 	b = sourceLoad(r, &e);
/* may only mutate blocks in the current epoch */
271 	assert(b->l.epoch == r->fs->ehi);
273 	if(doremove==0 && e.size == 0){
274 		/* already truncated */
279 	/* remember info on link we are removing */
280 	addr = globalToLocal(e.score);
281 	type = entryType(&e);
/* entry now points at nothing; no longer local */
291 	e.flags &= ~VtEntryLocal;
296 	memmove(e.score, vtzeroscore, VtScoreSize);
297 	entryPack(&e, b->data, r->offset % r->epb);
/* recursively free the old tree (elided paths handle snapshots etc.) */
300 		blockRemoveLink(b, addr, type, tag, 1);
/* sourceRemove: delete the source entirely (kill with doremove=1). */
312 sourceRemove(Source *r)
314 	return sourceKill(r, 1);
/* sourceTruncate: drop the source's data but keep the entry (doremove=0). */
318 sourceTruncate(Source *r)
320 	return sourceKill(r, 0);
/* sourceGetSize: read the byte size recorded in r's entry. */
324 sourceGetSize(Source *r)
329 	assert(sourceIsLocked(r));
330 	b = sourceLoad(r, &e);
/*
 * sourceShrinkSize: free the tail of the tree rooted at entry e so
 * that only `size` bytes remain.  Walks down the pointer levels,
 * zeroing pointers to wholly-unneeded subtrees (and unlinking
 * them), then recurses into the partially-needed block; finally
 * zeroes the trailing bytes of the last data block.
 */
339 sourceShrinkSize(Source *r, Entry *e, uvlong size)
344 	uchar score[VtScoreSize];
348 	b = cacheGlobal(r->fs->cache, e->score, type, e->tag, OReadWrite);
/* ptrsz: bytes accounted for by one pointer at the current level */
353 	ppb = e->psize/VtScoreSize;
354 	for(i=0; i+1<e->depth; i++)
357 	while(type&BtLevelMask){
358 		if(b->addr == NilBlock || b->l.epoch != r->fs->ehi){
359 			/* not worth copying the block just so we can zero some of it */
365 		 * invariant: each pointer in the tree rooted at b accounts for ptrsz bytes
368 		/* zero the pointers to unnecessary blocks */
/* first pointer index that is entirely beyond the new size */
369 		i = (size+ptrsz-1)/ptrsz;
371 			addr = globalToLocal(b->data+i*VtScoreSize);
372 			memmove(b->data+i*VtScoreSize, vtzeroscore, VtScoreSize);
375 				blockRemoveLink(b, addr, type-1, e->tag, 1);
378 		/* recurse (go around again) on the partially necessary block */
387 		memmove(score, b->data+i*VtScoreSize, VtScoreSize);
389 		b = cacheGlobal(r->fs->cache, score, type, e->tag, OReadWrite);
394 	if(b->addr == NilBlock || b->l.epoch != r->fs->ehi){
400 	 * No one ever truncates BtDir blocks.
/* zero the now-unused tail of the final data block */
402 	if(type == BtData && e->dsize > size){
403 		memset(b->data+size, 0, e->dsize-size);
/*
 * sourceSetSize: set r's logical size.  Size 0 is a truncate;
 * oversize requests are rejected (elided).  Adjusts tree depth up
 * or down as needed, shrinks the tail, and repacks the entry.
 */
411 sourceSetSize(Source *r, uvlong size)
417 	assert(sourceIsLocked(r));
419 		return sourceTruncate(r);
/* cap: both the venti limit and what MaxBlock data blocks can hold */
421 	if(size > VtMaxFileSize || size > ((uvlong)MaxBlock)*r->dsize){
426 	b = sourceLoad(r, &e);
436 	depth = sizeToDepth(size, e.psize, e.dsize);
439 		if(!sourceShrinkDepth(r, b, &e, depth)){
443 	}else if(depth > e.depth){
444 		if(!sourceGrowDepth(r, b, &e, depth)){
/* free blocks past the new end before recording the size */
451 		sourceShrinkSize(r, &e, size);
454 	entryPack(&e, b->data, r->offset % r->epb);
/*
 * sourceSetDirSize: set a directory's size given a count of
 * entries ds, converting whole blocks plus a partial block of
 * entries into bytes.
 */
462 sourceSetDirSize(Source *r, ulong ds)
467 	assert(sourceIsLocked(r));
468 	epb = r->dsize/VtEntrySize;
470 	size = (uvlong)r->dsize*(ds/epb);
471 	size += VtEntrySize*(ds%epb);
472 	return sourceSetSize(r, size);
/*
 * sourceGetDirSize: inverse of sourceSetDirSize — convert the
 * directory's byte size back into a count of entries.
 */
476 sourceGetDirSize(Source *r)
482 	assert(sourceIsLocked(r));
483 	epb = r->dsize/VtEntrySize;
485 	size = sourceGetSize(r);
486 	ds = epb*(size/r->dsize);
487 	ds += (size%r->dsize)/VtEntrySize;
/* sourceGetEntry: unpack r's entry into *e via sourceLoad. */
492 sourceGetEntry(Source *r, Entry *e)
496 	assert(sourceIsLocked(r));
497 	b = sourceLoad(r, e);
/*
 * sourceSetEntry: overwrite r's packed entry with *e.
506  * Must be careful with this. Doesn't record
507  * dependencies, so don't introduce any!
 */
510 sourceSetEntry(Source *r, Entry *e)
515 	assert(sourceIsLocked(r));
/* oe: old entry, loaded so (elided code) can compare/clean up */
516 	b = sourceLoad(r, &oe);
519 	entryPack(e, b->data, r->offset%r->epb);
/*
 * blockWalk: follow pointer `index` out of block p (a BtDir entry
 * or a pointer-block score) down one level.  For OReadWrite, the
 * child is copied into the current epoch if needed and the parent
 * pointer is updated with a write dependency; the old child link
 * is removed.
 */
527 blockWalk(Block *p, int index, int mode, Fs *fs, Entry *e)
533 	uchar oscore[VtScoreSize], score[VtScoreSize];
/* leaf-of-pointer-tree case: p is a BtDir block, child named by e->score */
538 	if((p->l.type & BtLevelMask) == 0){
539 		assert(p->l.type == BtDir);
541 		b = cacheGlobal(c, e->score, type, e->tag, mode);
543 		type = p->l.type - 1;
544 		b = cacheGlobal(c, p->data + index*VtScoreSize, type, e->tag, mode);
548 		b->pc = getcallerpc(&p);
/* read-only (or failed) walks stop here: no copy-on-write needed */
550 	if(b == nil || mode == OReadOnly)
553 	if(p->l.epoch != fs->ehi){
554 		fprint(2, "blockWalk: parent not writable\n");
/* child already in current epoch: nothing to copy */
557 	if(b->l.epoch == fs->ehi)
566 		assert(p->l.type == BtDir);
568 		e->flags |= VtEntryLocal;
/* copy-on-write: bring child into the active epoch */
572 	b = blockCopy(b, e->tag, fs->ehi, fs->elo);
576 	b->pc = getcallerpc(&p);
577 	assert(b->l.epoch == fs->ehi);
580 	memmove(score, b->score, VtScoreSize);
/* update the parent's pointer to the copy, recording a dependency */
581 	if(p->l.type == BtDir){
582 		memmove(e->score, b->score, VtScoreSize);
583 		entryPack(e, p->data, index);
584 		blockDependency(p, b, index, nil, &oe);
586 		memmove(oscore, p->data+index*VtScoreSize, VtScoreSize);
587 		memmove(p->data+index*VtScoreSize, b->score, VtScoreSize);
588 		blockDependency(p, b, index, oscore, nil);
/* drop the link to the pre-copy child */
593 		blockRemoveLink(p, addr, type, e->tag, 0);
/*
599  * Change the depth of the source r.
600  * The entry e for r is contained in block p.
 * sourceGrowDepth: add pointer levels above the current root until
 * e->depth == depth, layering new pointer blocks whose first score
 * points at the old root.  Returns whether the target depth was
 * reached.
 */
603 sourceGrowDepth(Source *r, Block *p, Entry *e, int depth)
610 	assert(sourceIsLocked(r));
611 	assert(depth <= VtPointerDepth);
614 	b = cacheGlobal(r->fs->cache, e->score, type, e->tag, OReadWrite);
625 	 * Keep adding layers until we get to the right depth
626 	 * or an error occurs.
628 	while(e->depth < depth){
629 		bb = cacheAllocBlock(r->fs->cache, type+1, tag, r->fs->ehi, r->fs->elo);
632 //fprint(2, "alloc %lux grow %V\n", bb->addr, b->score);
/* new root's first pointer names the old root; entry now names new root */
633 		memmove(bb->data, b->score, VtScoreSize);
634 		memmove(e->score, bb->score, VtScoreSize);
638 	e->flags |= VtEntryLocal;
/* new root must be written before the block it points to is relied on */
639 		blockDependency(bb, b, 0, vtzeroscore, nil);
645 	entryPack(e, p->data, r->offset % r->epb);
646 	blockDependency(p, b, r->offset % r->epb, nil, &oe);
650 	return e->depth == depth;
/*
 * sourceShrinkDepth: remove pointer levels so the tree rooted at
 * entry e has exactly `depth` levels, walking down via the first
 * pointer of each level, repointing e at the new root, zeroing the
 * old parent pointer, and freeing the old root.  Write ordering is
 * enforced with blockDependency (see elided comment at line 705).
 */
654 sourceShrinkDepth(Source *r, Block *p, Entry *e, int depth)
656 	Block *b, *nb, *ob, *rb;
661 	assert(sourceIsLocked(r));
662 	assert(depth <= VtPointerDepth);
665 	rb = cacheGlobal(r->fs->cache, e->score, type, e->tag, OReadWrite);
674 	 * Walk down to the new root block.
675 	 * We may stop early, but something is better than nothing.
681 	/* BUG: explain type++. i think it is a real bug */
682 	for(d=e->depth; d > depth; d--, type++){
/* follow the first score in the current block down one level */
683 		nb = cacheGlobal(r->fs->cache, b->data, type-1, tag, OReadWrite);
686 		if(ob!=nil && ob!=rb)
698 	 * Right now, e points at the root block rb, b is the new root block,
699 	 * and ob points at b. To update:
701 	 *	(i) change e to point at b
702 	 *	(ii) zero the pointer ob -> b
703 	 *	(iii) free the root block
705 	 * p (the block containing e) must be written before
711 	/* might have been local and now global; reverse cannot happen */
712 	if(globalToLocal(b->score) == NilBlock)
713 		e->flags &= ~VtEntryLocal;
714 	memmove(e->score, b->score, VtScoreSize);
715 	entryPack(e, p->data, r->offset % r->epb);
716 	blockDependency(p, b, r->offset % r->epb, nil, &oe);
/* step (ii): ob must not be written before p (dependency below) */
720 		memmove(ob->data, vtzeroscore, VtScoreSize);
721 		blockDependency(ob, p, 0, b->score, nil);
725 	if(rb->addr != NilBlock)
726 		blockRemoveLink(p, rb->addr, rb->l.type, rb->l.tag, 1);
729 	if(ob!=nil && ob!=rb)
/*
737  * Normally we return the block at the given number.
738  * If early is set, we stop earlier in the tree. Setting early
739  * to 1 gives us the block that contains the pointer to bn.
 * _sourceBlock: walk from r's entry down to data block bn,
 * computing the per-level child index first, growing depth for
 * writes that need it, then descending with blockWalk.
 */
742 _sourceBlock(Source *r, ulong bn, int mode, int early, ulong tag)
745 	int index[VtPointerDepth+1];
750 	assert(sourceIsLocked(r));
751 	assert(bn != NilBlock);
753 	/* mode for intermediate block */
758 	b = sourceLoad(r, &e);
/* snapshots refuse to touch data marked no-archive */
761 	if(r->issnapshot && (e.flags & VtEntryNoArchive)){
763 		werrstr(ENotArchived);
770 	else if(e.tag != tag){
771 		fprint(2, "tag mismatch\n");
772 		werrstr("tag mismatch");
/* decompose bn into a per-level index (base np = pointers per block) */
777 	np = e.psize/VtScoreSize;
778 	memset(index, 0, sizeof(index));
779 	for(i=0; bn > 0; i++){
780 		if(i >= VtPointerDepth){
/* a read past current depth can't succeed; a write grows the tree */
789 		if(mode == OReadOnly){
793 		if(!sourceGrowDepth(r, b, &e, i))
/* topmost index is the entry's slot in its directory block */
797 	index[e.depth] = r->offset % r->epb;
799 	for(i=e.depth; i>=early; i--){
800 		bb = blockWalk(b, index[i], m, r->fs, &e);
806 	b->pc = getcallerpc(&r);
/* sourceBlock: public wrapper — full walk (early=0), default tag. */
814 sourceBlock(Source *r, ulong bn, int mode)
818 	b = _sourceBlock(r, bn, mode, 0, 0);
820 		b->pc = getcallerpc(&r);
/*
 * sourceClose: release r, closing the parent chain; the struct is
 * poisoned with ~0 before being freed (elided) to catch stale use.
 */
825 sourceClose(Source *r)
838 		sourceClose(r->parent);
839 	memset(r, ~0, sizeof(*r));
/*
844  * Retrieve the block containing the entry for r.
845  * If a snapshot has happened, we might need
846  * to get a new copy of the block. We avoid this
847  * in the common case by caching the score for
848  * the block and the last epoch in which it was valid.
850  * We use r->mode to tell the difference between active
851  * file system sources (OReadWrite) and sources for the
852  * snapshot file system (OReadOnly).
 */
855 sourceLoadBlock(Source *r, int mode)
865 		assert(r->mode == OReadWrite);
867 		 * This needn't be true -- we might bump the low epoch
868 		 * to reclaim some old blocks, but since this score is
869 		 * OReadWrite, the blocks must all still be open, so none
870 		 * are reclaimed. Thus it's okay that the epoch is so low.
872 		assert(r->epoch >= r->fs->elo);
/* fast path: cached score still belongs to the current epoch */
874 		if(r->epoch == r->fs->ehi){
875 			b = cacheGlobal(r->fs->cache, r->score, BtDir, r->tag, OReadWrite);
878 			assert(r->epoch == b->l.epoch);
/* stale epoch: rewalk from the parent to get the fresh copy */
881 		assert(r->parent != nil);
882 		if(!sourceLock(r->parent, OReadWrite))
884 		b = sourceBlock(r->parent, r->offset/r->epb, OReadWrite);
885 		sourceUnlock(r->parent);
888 		assert(b->l.epoch == r->fs->ehi);
889 	//	fprint(2, "sourceLoadBlock %p %V => %V\n", r, r->score, b->score);
890 		memmove(r->score, b->score, VtScoreSize);
891 		r->scoreEpoch = b->l.epoch;
893 		r->epoch = r->fs->ehi;
/* read-only path: global (venti) scores need no epoch check */
897 	addr = globalToLocal(r->score);
899 		return cacheGlobal(r->fs->cache, r->score, BtDir, r->tag, mode);
901 	b = cacheLocalData(r->fs->cache, addr, BtDir, r->tag, mode, r->scoreEpoch);
906 	 * If it failed because the epochs don't match, the block has been
907 	 * archived and reclaimed. Rewalk from the parent and get the
908 	 * new pointer. This can't happen in the OReadWrite case
909 	 * above because blocks in the current epoch don't get
910 	 * reclaimed. The fact that we're OReadOnly means we're
911 	 * a snapshot. (Or else the file system is read-only, but then
912 	 * the archiver isn't going around deleting blocks.)
914 	rerrstr(e, sizeof e);
915 	if(strcmp(e, ELabelMismatch) == 0){
916 		if(!sourceLock(r->parent, OReadOnly))
918 		b = sourceBlock(r->parent, r->offset/r->epb, OReadOnly);
919 		sourceUnlock(r->parent);
921 		fprint(2, "sourceAlloc: lost %V found %V\n",
923 		memmove(r->score, b->score, VtScoreSize);
924 		r->scoreEpoch = b->l.epoch;
/*
 * sourceLock: lock r by loading (and holding) its entry block.
 */
933 sourceLock(Source *r, int mode)
940 	b = sourceLoadBlock(r, mode);
944 	 * The fact that we are holding b serves as the
945 	 * lock entitling us to write to r->b.
949 	if(r->mode == OReadWrite)
950 		assert(r->epoch == r->b->l.epoch);
/*
955  * Lock two (usually sibling) sources. This needs special care
956  * because the Entries for both sources might be in the same block.
957  * We also try to lock blocks in left-to-right order within the tree.
 */
960 sourceLock2(Source *r, Source *rr, int mode)
965 		return sourceLock(r, mode);
/* same entry block for both: load once and share the score/epoch */
970 	if(r->parent==rr->parent && r->offset/r->epb == rr->offset/rr->epb){
971 		b = sourceLoadBlock(r, mode);
974 		if(memcmp(r->score, rr->score, VtScoreSize) != 0){
975 			memmove(rr->score, b->score, VtScoreSize);
976 			rr->scoreEpoch = b->l.epoch;
978 			rr->epoch = rr->fs->ehi;
/* distinct blocks: acquire in left-to-right tree order to avoid deadlock */
982 	}else if(r->parent==rr->parent || r->offset > rr->offset){
983 		bb = sourceLoadBlock(rr, mode);
984 		b = sourceLoadBlock(r, mode);
986 		b = sourceLoadBlock(r, mode);
987 		bb = sourceLoadBlock(rr, mode);
989 	if(b == nil || bb == nil){
998 	 * The fact that we are holding b and bb serves
999 	 * as the lock entitling us to write to r->b and rr->b.
/* sourceUnlock: release r's entry block; complains on double unlock. */
1007 sourceUnlock(Source *r)
1012 		fprint(2, "sourceUnlock: already unlocked\n");
/*
 * sourceLoad: unpack r's entry from its (locked) entry block into
 * *e, validating the generation number against r->gen so a slot
 * reused for a different file is detected.
 */
1021 sourceLoad(Source *r, Entry *e)
1025 	assert(sourceIsLocked(r));
1027 	if(!entryUnpack(e, b->data, r->offset % r->epb))
1029 	if(e->gen != r->gen){
/*
 * sizeToDepth: number of pointer levels needed to address a file
 * of s bytes with dsize-byte data blocks and psize-byte pointer
 * blocks (np scores per pointer block).
 */
1038 sizeToDepth(uvlong s, int psize, int dsize)
1043 	/* determine pointer depth */
1044 	np = psize/VtScoreSize;
/* number of data blocks, rounded up */
1045 	s = (s + dsize - 1)/dsize;
1046 	for(d = 0; s > 1; d++)
1047 		s = (s + np - 1)/np;
/* sourceName: human-readable name of the file backing s, for messages. */
1065 sourceName(Source *s)
1067 	return fileName(s->file);